In [1]:
import os
import sys

# Set the Java location and make both PySpark interpreter variables point at
# the Python executable that is running this kernel.
# NOTE(review): the JAVA_HOME value looks like a relative or truncated path —
# confirm it resolves on the machine running the notebook.
os.environ.update(
    {
        "JAVA_HOME": "JDK 8/Contents/Home",
        "PYSPARK_PYTHON": sys.executable,
        "PYSPARK_DRIVER_PYTHON": sys.executable,
    }
)

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql.types import IntegerType
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, OneHotEncoder, VectorAssembler, StringIndexer, IDF,HashingTF
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator, RegressionEvaluator
from pyspark.ml import Pipeline
from pyspark.ml.tuning import CrossValidator,TrainValidationSplit, ParamGridBuilder

In [4]:
# Obtain the notebook's SparkSession (getOrCreate reuses an existing one if
# there is one, otherwise it starts a new session under this app name).
spark = (
    SparkSession.builder
    .appName('Credit_Linear_Regression_Model')
    .getOrCreate()
)

25/02/22 16:24:07 WARN Utils: Your hostname, Toms-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 10.180.174.193 instead (on interface en0)
25/02/22 16:24:07 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/02/22 16:24:08 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
# Location of the credit-card CSV. The path used to be hard-coded to one
# user's Downloads folder; set the CREDIT_CARD_CSV environment variable to
# point the notebook at another copy. When it is unset, the original path is
# used, so existing runs behave exactly as before.
path = os.environ.get("CREDIT_CARD_CSV", "/Users/tomdursley/Downloads/Credit_card.csv")

# The first row supplies the column names; Spark infers each column's type.
Credit_sparkdf = spark.read.csv(path, header=True, inferSchema=True)

# Preview the loaded rows.
Credit_sparkdf.show()

                                                                                

+-------+------+---------+-------------+--------+-------------+--------------------+--------------------+--------------------+-----------------+--------------+-------------+------------+----------+-----+--------+---------------+--------------+
| Ind_ID|GENDER|Car_Owner|Propert_Owner|CHILDREN|Annual_income|         Type_Income|           EDUCATION|      Marital_status|     Housing_type|Birthday_count|Employed_days|Mobile_phone|Work_Phone|Phone|EMAIL_ID|Type_Occupation|Family_Members|
+-------+------+---------+-------------+--------+-------------+--------------------+--------------------+--------------------+-----------------+--------------+-------------+------------+----------+-----+--------+---------------+--------------+
|5008827|     M|        Y|            Y|       0|     180000.0|           Pensioner|    Higher education|             Married|House / apartment|        -18772|       365243|           1|         0|    0|       0|           NULL|             2|
|5009744|     F|        

In [6]:
# For every column, cast the "is null" flag to 0/1 and sum it, giving a
# one-row frame whose cells are the per-column null counts.
null_tallies = [
    F.sum(F.col(column_name).isNull().cast("int")).alias(column_name)
    for column_name in Credit_sparkdf.columns
]
null_count = Credit_sparkdf.select(null_tallies)

print("the number of null values is: ")

null_count.show()

the number of null values is: 
+------+------+---------+-------------+--------+-------------+-----------+---------+--------------+------------+--------------+-------------+------------+----------+-----+--------+---------------+--------------+
|Ind_ID|GENDER|Car_Owner|Propert_Owner|CHILDREN|Annual_income|Type_Income|EDUCATION|Marital_status|Housing_type|Birthday_count|Employed_days|Mobile_phone|Work_Phone|Phone|EMAIL_ID|Type_Occupation|Family_Members|
+------+------+---------+-------------+--------+-------------+-----------+---------+--------------+------------+--------------+-------------+------------+----------+-----+--------+---------------+--------------+
|     0|     7|        0|            0|       0|           23|          0|        0|             0|           0|            22|            0|           0|         0|    0|       0|            488|             0|
+------+------+---------+-------------+--------+-------------+-----------+---------+--------------+------------+---------

In [7]:
# Share of rows affected, as a percentage.
# 23 matches the Annual_income null count reported by the previous cell;
# 1548 is presumably the total number of rows — TODO confirm against the data.
rows_with_null, total_rows = 23, 1548
print((rows_with_null / total_rows) * 100)

1.4857881136950903


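After reviewing the null counts, we decided to replace the nulls in Type_Occupation with "N/A" rather than drop those rows, because the column is very useful for analysis. We then removed the individuals who still had nulls in any other column, which is about 50 rows (at most 52: 7 in GENDER, 23 in Annual_income and 22 in Birthday_count).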

In [8]:
# Label missing occupations as "N/A" so those rows are kept, then discard any
# row that still contains a null in another column.
Credit_sparkdf = Credit_sparkdf.fillna({"Type_Occupation": "N/A"}).dropna()

# Recount nulls per column to confirm nothing is missing any more.
remaining_null_tallies = [
    F.sum(F.col(column_name).isNull().cast("int")).alias(column_name)
    for column_name in Credit_sparkdf.columns
]
null_count = Credit_sparkdf.select(remaining_null_tallies)
print("After handling missing values:")
null_count.show()

After handling missing values:
+------+------+---------+-------------+--------+-------------+-----------+---------+--------------+------------+--------------+-------------+------------+----------+-----+--------+---------------+--------------+
|Ind_ID|GENDER|Car_Owner|Propert_Owner|CHILDREN|Annual_income|Type_Income|EDUCATION|Marital_status|Housing_type|Birthday_count|Employed_days|Mobile_phone|Work_Phone|Phone|EMAIL_ID|Type_Occupation|Family_Members|
+------+------+---------+-------------+--------+-------------+-----------+---------+--------------+------------+--------------+-------------+------------+----------+-----+--------+---------------+--------------+
|     0|     0|        0|            0|       0|            0|          0|        0|             0|           0|             0|            0|           0|         0|    0|       0|              0|             0|
+------+------+---------+-------------+--------+-------------+-----------+---------+--------------+------------+---------

In [9]:
# Age in whole years: absolute value of Birthday_count divided by 365, rounded
# and cast to int.
# (assumes Birthday_count is a day offset from the birth date — TODO confirm)
age_years = F.round(F.abs(F.col("Birthday_count")) / 365).cast("int")
Credit_sparkdf = Credit_sparkdf.withColumn("Age_Years", age_years)

# Convert Employed_days to employment status:
#   positive -> "Unemployed"
#   negative -> "Employed for <abs(days) / 365> years"
#   zero     -> "Currently Employed"
employed_days = F.col("Employed_days")
years_employed_label = F.concat(
    F.lit("Employed for "),
    F.abs(employed_days) / 365,
    F.lit(" years"),
)
employment_status = (
    F.when(employed_days > 0, "Unemployed")
    .when(employed_days < 0, years_employed_label)
    .otherwise("Currently Employed")
)
Credit_sparkdf = Credit_sparkdf.withColumn("Employment_Status", employment_status)

In [16]:
# Contact-flag columns, plus the two raw day counts that Age_Years and
# Employment_Status were derived from.
columns_to_drop = [
    "Mobile_phone",
    "Work_Phone",
    "Phone",
    "EMAIL_ID",
    "Birthday_count",
    "Employed_days",
]
cleaned_data = Credit_sparkdf.drop(*columns_to_drop)

# Preview the first five rows of the trimmed frame.
cleaned_data.show(5)

+-------+------+---------+-------------+--------+-------------+--------------------+--------------------+--------------+-----------------+---------------+--------------+---------+--------------------+
| Ind_ID|GENDER|Car_Owner|Propert_Owner|CHILDREN|Annual_income|         Type_Income|           EDUCATION|Marital_status|     Housing_type|Type_Occupation|Family_Members|Age_Years|   Employment_Status|
+-------+------+---------+-------------+--------+-------------+--------------------+--------------------+--------------+-----------------+---------------+--------------+---------+--------------------+
|5008827|     M|        Y|            Y|       0|     180000.0|           Pensioner|    Higher education|       Married|House / apartment|            N/A|             2|       51|          Unemployed|
|5009744|     F|        Y|            N|       0|     315000.0|Commercial associate|    Higher education|       Married|House / apartment|            N/A|             2|       37|Employed for 1.60

In [11]:
# String-valued columns that are indexed and one-hot encoded before assembly.
categorical_features = [
    "GENDER",
    "Car_Owner",
    "Propert_Owner",
    "Type_Income",
    "Housing_type",
    "Type_Occupation",
    "EDUCATION",
]

# Numeric columns passed straight to the assembler.
# Ind_ID is deliberately excluded: it is a row identifier, so feeding it to a
# regression only adds an arbitrary number that the model can latch onto.
# NOTE(review): Employed_days is 365243 on the sample Pensioner row shown when
# the CSV is loaded — this looks like a placeholder value; confirm how it
# should be handled before treating the column as a plain numeric feature.
numeric_features = [
    "CHILDREN",
    "Annual_income",
    "Age_Years",
    "Employed_days",
    "Family_Members",
]


In [12]:
# Build the encoding stages: each categorical column gets a StringIndexer
# writing "<name> Index", followed by a OneHotEncoder reading that column and
# writing "<name> Vec".
stages = []
for cat_feature in categorical_features:
    index_column = f"{cat_feature} Index"
    vector_column = f"{cat_feature} Vec"
    stages.append(StringIndexer(inputCol=cat_feature, outputCol=index_column))
    stages.append(OneHotEncoder(inputCol=index_column, outputCol=vector_column))

In [13]:
# Model inputs: the numeric columns followed by the one-hot "<name> Vec"
# columns produced by the encoding stages.
encoded_columns = [f"{cat} Vec" for cat in categorical_features]
all_features = numeric_features + encoded_columns

# Combine every input column into a single "features" vector and add the
# assembler as the last pipeline stage.
assembler = VectorAssembler(inputCols=all_features, outputCol="features")
stages.append(assembler)

In [14]:
# Chain the indexers, encoders and assembler into one pipeline.
pipeline = Pipeline(stages=stages)

In [15]:
# Fit the pipeline on the cleaned frame, then apply the fitted stages to the
# same frame to add the index, vector and "features" columns.
fitted_pipeline = pipeline.fit(Credit_sparkdf)
transformed_data = fitted_pipeline.transform(Credit_sparkdf)

# Preview the first five transformed rows.
transformed_data.show(5)

25/02/22 16:24:26 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+-------+------+---------+-------------+--------+-------------+--------------------+--------------------+--------------+-----------------+--------------+-------------+------------+----------+-----+--------+---------------+--------------+---------+--------------------+------------+-------------+---------------+-------------+-------------------+-----------------+-----------------+---------------+------------------+----------------+---------------------+-------------------+---------------+-------------+--------------------+
| Ind_ID|GENDER|Car_Owner|Propert_Owner|CHILDREN|Annual_income|         Type_Income|           EDUCATION|Marital_status|     Housing_type|Birthday_count|Employed_days|Mobile_phone|Work_Phone|Phone|EMAIL_ID|Type_Occupation|Family_Members|Age_Years|   Employment_Status|GENDER Index|   GENDER Vec|Car_Owner Index|Car_Owner Vec|Propert_Owner Index|Propert_Owner Vec|Type_Income Index|Type_Income Vec|Housing_type Index|Housing_type Vec|Type_Occupation Index|Type_Occupation Ve