In [1]:
import sys
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import *
import pandas as pd
import numpy as np
from pyspark.sql.types import DateType
from pyspark.sql.functions import pandas_udf, PandasUDFType,create_map, lit, col, to_date, concat,row_number
from itertools import chain
import os
from pyspark.sql.window import Window



# Create (or reuse) the Spark session used by the whole notebook
spark = (
    SparkSession
    .builder
    .appName("spark-cleansing")
    .getOrCreate()
)
sc = spark.sparkContext
sc.setLogLevel("WARN")  # only warnings and errors are logged from here on

####################################
# Input file
####################################
# The location can be overridden with the BANK_MARKETING_CSV environment variable so
# the notebook is runnable on other machines; the default is the original local path.
csv_file = os.environ.get(
    "BANK_MARKETING_CSV",
    "/home/azril/bank_campaign/datasets/bank_marketing.csv",
)

####################################
# Read CSV data
####################################
# Semicolon-separated file with a header row. No schema is supplied and schema
# inference is not enabled, so every column is loaded as a string.
df = (
    spark.read
    .format("csv")
    .option("sep", ";")
    .option("header", True)
    .load(csv_file)
)

23/07/31 12:22:28 WARN Utils: Your hostname, pop-os resolves to a loopback address: 127.0.1.1; using 192.168.100.54 instead (on interface wlp3s0)
23/07/31 12:22:28 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/07/31 12:22:29 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

In [2]:
# Shape of the raw frame: count() runs a Spark job, the column count comes from the schema.
rows, cols = df.count(), len(df.columns)

print(
    f'Dimensions of Data: {(rows, cols)}',
    f'Rows of Data: {rows}',
    f'Columns of Data: {cols}',
    sep='\n',
)

Dimensions of Data: (41188, 21)
Rows of Data: 41188
Columns of Data: 21


In [3]:
# Collapse the three "basic.<n>y" education levels into a single "basic" category.
# The dot is escaped so it only matches a literal "." (the previous patterns used a bare
# ".", which matches any character), and the pattern is anchored at the end of the value,
# mirroring the former endswith('4y' / '6y' / '9y') guards. All other values, including
# nulls, pass through unchanged.
df_transform1 = df.withColumn(
    "education",
    F.regexp_replace(F.col("education"), r"basic\.(4y|6y|9y)$", "basic"),
)


In [4]:
# Preview the first five rows after the education levels were collapsed
df_transform1.show(n=5)

+---+---------+-------+-----------+-------+-------+----+---------+-----+-----------+--------+--------+-----+--------+-----------+------------+--------------+-------------+---------+-----------+---+
|age|      job|marital|  education|default|housing|loan|  contact|month|day_of_week|duration|campaign|pdays|previous|   poutcome|emp.var.rate|cons.price.idx|cons.conf.idx|euribor3m|nr.employed|  y|
+---+---------+-------+-----------+-------+-------+----+---------+-----+-----------+--------+--------+-----+--------+-----------+------------+--------------+-------------+---------+-----------+---+
| 56|housemaid|married|      basic|     no|     no|  no|telephone|  may|        mon|     261|       1|  999|       0|nonexistent|         1.1|        93.994|        -36.4|    4.857|       5191| no|
| 57| services|married|high.school|unknown|     no|  no|telephone|  may|        mon|     149|       1|  999|       0|nonexistent|         1.1|        93.994|        -36.4|    4.857|       5191| no|
| 37| serv

In [5]:
# Old column name -> new column name. Dotted names are replaced with underscores,
# and 'default' / 'y' get more descriptive names.
column_renames = {
    'emp.var.rate': 'emp_var_rate',
    'cons.price.idx': 'cons_price_idx',
    'cons.conf.idx': 'cons_conf_idx',
    'nr.employed': 'nr_employed',
    'default': 'credit',
    'y': 'subcribed',  # NOTE(review): spelling kept as-is; later cells reference 'subcribed'
}

# Apply the renames one at a time, in the order listed above
df_transform2 = df_transform1
for old_name, new_name in column_renames.items():
    df_transform2 = df_transform2.withColumnRenamed(old_name, new_name)


In [6]:
# Preview the first five rows with the renamed columns
df_transform2.show(n=5)

+---+---------+-------+-----------+-------+-------+----+---------+-----+-----------+--------+--------+-----+--------+-----------+------------+--------------+-------------+---------+-----------+---------+
|age|      job|marital|  education| credit|housing|loan|  contact|month|day_of_week|duration|campaign|pdays|previous|   poutcome|emp_var_rate|cons_price_idx|cons_conf_idx|euribor3m|nr_employed|subcribed|
+---+---------+-------+-----------+-------+-------+----+---------+-----+-----------+--------+--------+-----+--------+-----------+------------+--------------+-------------+---------+-----------+---------+
| 56|housemaid|married|      basic|     no|     no|  no|telephone|  may|        mon|     261|       1|  999|       0|nonexistent|         1.1|        93.994|        -36.4|    4.857|       5191|       no|
| 57| services|married|high.school|unknown|     no|  no|telephone|  may|        mon|     149|       1|  999|       0|nonexistent|         1.1|        93.994|        -36.4|    4.857|   

In [7]:
# Discard every row that has a null in at least one column
df_transform3 = df_transform2.dropna(how="any")

In [8]:
# Shape of the frame after rows containing nulls were dropped
rows, cols = df_transform3.count(), len(df_transform3.columns)

print(
    f'Dimensions of Data: {(rows, cols)}',
    f'Rows of Data: {rows}',
    f'Columns of Data: {cols}',
    sep='\n',
)

Dimensions of Data: (41188, 21)
Rows of Data: 41188
Columns of Data: 21


In [25]:


# Number the rows starting at 0: row_number() is 1-based, hence the "- 1".
# The window has no partitioning, so Spark evaluates it on a single partition
# (this is what triggers the WindowExec warnings in the output).
window = Window.orderBy(monotonically_increasing_id())
client_id_expr = row_number().over(window) - 1
df_transform3 = df_transform3.withColumn("client_id", client_id_expr)

# Put client_id first and keep the remaining columns in their existing order
other_columns = [name for name in df_transform3.columns if name != "client_id"]
df_transform4 = df_transform3.select("client_id", *other_columns)

In [10]:
# Display the last 5 rows (returned to the driver as a list of Row objects)
df_transform4.tail(num=5)

23/07/31 12:22:41 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/07/31 12:22:41 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/07/31 12:22:41 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/07/31 12:22:41 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/07/31 12:22:41 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
                                                                                

[Row(client_id=41183, age='73', job='retired', marital='married', education='professional.course', credit='no', housing='yes', loan='no', contact='cellular', month='nov', day_of_week='fri', duration='334', campaign='1', pdays='999', previous='0', poutcome='nonexistent', emp_var_rate='-1.1', cons_price_idx='94.767', cons_conf_idx='-50.8', euribor3m='1.028', nr_employed='4963.6', subcribed='yes'),
 Row(client_id=41184, age='46', job='blue-collar', marital='married', education='professional.course', credit='no', housing='no', loan='no', contact='cellular', month='nov', day_of_week='fri', duration='383', campaign='1', pdays='999', previous='0', poutcome='nonexistent', emp_var_rate='-1.1', cons_price_idx='94.767', cons_conf_idx='-50.8', euribor3m='1.028', nr_employed='4963.6', subcribed='no'),
 Row(client_id=41185, age='56', job='retired', marital='married', education='university.degree', credit='no', housing='yes', loan='no', contact='cellular', month='nov', day_of_week='fri', duration='18

In [26]:
# Sanity checks on the two columns used to build the date:
# rows where either part is null, then rows where either part is the literal 'unknown'.
missing_date_part = F.col('month').isNull() | F.col('day_of_week').isNull()
unknown_date_part = (F.col('month') == 'unknown') | (F.col('day_of_week') == 'unknown')

df_transform4.where(missing_date_part).show()
df_transform4.where(unknown_date_part).show()


23/07/31 12:33:27 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/07/31 12:33:27 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/07/31 12:33:27 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/07/31 12:33:27 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/07/31 12:33:27 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+---------+---+---+-------+---------+------+-------+----+-------+-----+-----------+--------+--------+-----+--------+--------+------------+--------------+-------------+---------+-----------+---------+
|client_id|age|job|marital|education|credit|housing|loan|contact|month|day_of_week|duration|campaign|pdays|previous|poutcome|emp_var_rate|cons_price_idx|cons_conf_idx|euribor3m|nr_employed|subcribed|
+---------+---+---+-------+---------+------+-------+----+-------+-----+-----------+--------+--------+-----+--------+--------+------------+--------------+-------------+---------+-----------+---------+
+---------+---+---+-------+---------+------+-------+----+-------+-----+-----------+--------+--------+-----+--------+--------+------------+--------------+-------------+---------+-----------+---------+



23/07/31 12:33:28 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/07/31 12:33:28 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/07/31 12:33:28 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/07/31 12:33:28 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/07/31 12:33:28 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+---------+---+---+-------+---------+------+-------+----+-------+-----+-----------+--------+--------+-----+--------+--------+------------+--------------+-------------+---------+-----------+---------+
|client_id|age|job|marital|education|credit|housing|loan|contact|month|day_of_week|duration|campaign|pdays|previous|poutcome|emp_var_rate|cons_price_idx|cons_conf_idx|euribor3m|nr_employed|subcribed|
+---------+---+---+-------+---------+------+-------+----+-------+-----+-----------+--------+--------+-----+--------+--------+------------+--------------+-------------+---------+-----------+---------+
+---------+---+---+-------+---------+------+-------+----+-------+-----+-----------+--------+--------+-----+--------+--------+------------+--------------+-------------+---------+-----------+---------+



In [27]:
# Lookup tables translating the categorical labels found in the dataframe to numeric codes
month_dict = {"jan": 1, "feb": 2, "mar": 3, "apr": 4, "may": 5, "jun": 6,
              "jul": 7, "aug": 8, "sep": 9, "oct": 10, "nov": 11, "dec": 12}

day_dict = {"mon": 1, "tue": 2, "wed": 3, "thu": 4, "fri": 5}

subscribed_dict = {"no": 0, "yes": 1}


def _to_map_expr(mapping):
    """Build a Spark map column from a Python dict; keys and values become literals."""
    # create_map expects a flat key1, value1, key2, value2, ... sequence
    return create_map([lit(x) for x in chain(*mapping.items())])


month_mapping_expr = _to_map_expr(month_dict)
day_mapping_expr = _to_map_expr(day_dict)
subscribed_mapping_expr = _to_map_expr(subscribed_dict)

# NOTE(review): this cell overwrites df_transform4 in place. Re-running it on already
# mapped data would look up the numeric strings in the label maps (presumably yielding
# nulls), so re-run the cell that builds df_transform4 first.
#
# Map the labels to their codes. `mapping[column]` is used instead of
# `mapping.getItem(column)`, whose Column-key form is deprecated since Spark 3.0.
# month and day_of_week are cast to string (via the type name, so no StringType import
# is required) because they are concatenated into a date string below; subcribed stays
# an integer 0/1 flag.
df_transform4 = (
    df_transform4
    .withColumn('month', month_mapping_expr[col('month')].cast("string"))
    .withColumn('day_of_week', day_mapping_expr[col('day_of_week')].cast("string"))
    .withColumn('subcribed', subscribed_mapping_expr[col('subcribed')])
)

# Create the date column as "2022-<month>-<day_of_week>".
# NOTE(review): the weekday code (1-5) is used as the day-of-month here, so the resulting
# date does not in general fall on the recorded weekday; confirm this is intended.
df_transform4 = df_transform4.withColumn(
    "date",
    to_date(
        concat(lit("2022-"), col("month"), lit("-"), col("day_of_week")),
        "yyyy-MM-dd",
    ),
)


In [28]:
# Preview the first five rows, including the numeric codes and the derived date column
df_transform4.show(n=5)

23/07/31 12:33:49 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/07/31 12:33:49 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/07/31 12:33:49 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+---------+---+---------+-------+-----------+-------+-------+----+---------+-----+-----------+--------+--------+-----+--------+-----------+------------+--------------+-------------+---------+-----------+---------+----------+
|client_id|age|      job|marital|  education| credit|housing|loan|  contact|month|day_of_week|duration|campaign|pdays|previous|   poutcome|emp_var_rate|cons_price_idx|cons_conf_idx|euribor3m|nr_employed|subcribed|      date|
+---------+---+---------+-------+-----------+-------+-------+----+---------+-----+-----------+--------+--------+-----+--------+-----------+------------+--------------+-------------+---------+-----------+---------+----------+
|        0| 56|housemaid|married|      basic|     no|     no|  no|telephone|    5|          1|     261|       1|  999|       0|nonexistent|         1.1|        93.994|        -36.4|    4.857|       5191|        0|2022-05-01|
|        1| 57| services|married|high.school|unknown|     no|  no|telephone|    5|          1|     1

23/07/31 12:33:49 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/07/31 12:33:49 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


In [29]:
# Display the last five rows of the mapped frame as a list of Row objects
df_transform4.tail(num=5)

23/07/31 12:34:00 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/07/31 12:34:00 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/07/31 12:34:00 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/07/31 12:34:00 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/07/31 12:34:00 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


[Row(client_id=41183, age='73', job='retired', marital='married', education='professional.course', credit='no', housing='yes', loan='no', contact='cellular', month='11', day_of_week='5', duration='334', campaign='1', pdays='999', previous='0', poutcome='nonexistent', emp_var_rate='-1.1', cons_price_idx='94.767', cons_conf_idx='-50.8', euribor3m='1.028', nr_employed='4963.6', subcribed=1, date=datetime.date(2022, 11, 5)),
 Row(client_id=41184, age='46', job='blue-collar', marital='married', education='professional.course', credit='no', housing='no', loan='no', contact='cellular', month='11', day_of_week='5', duration='383', campaign='1', pdays='999', previous='0', poutcome='nonexistent', emp_var_rate='-1.1', cons_price_idx='94.767', cons_conf_idx='-50.8', euribor3m='1.028', nr_employed='4963.6', subcribed=0, date=datetime.date(2022, 11, 5)),
 Row(client_id=41185, age='56', job='retired', marital='married', education='university.degree', credit='no', housing='yes', loan='no', contact='ce