# Credit Risk Assessment Using PySpark

# Modelling   Stage II

In [12]:
#Entrypoint 2.x
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("CreditRiskModeling").enableHiveSupport().getOrCreate()

sc = spark.sparkContext

In [13]:

from pyspark.sql.functions import col
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.sql import functions as F
from pyspark.sql.window import Window


## Load Dataset

In [14]:
# Load dataset
file_path = "hdfs:///user/talentum/processed_data/cleaned_data_scalled/*.csv"
df = spark.read.csv(file_path, header=True, inferSchema=True)

# Display the schema and some initial data
df.printSchema()
df.select(df.columns[0:5]).show(5)
df.select(df.columns[5:9]).show(5)
df.select(df.columns[9:13]).show(5)


root
 |-- SeriousDlqin2yrs: integer (nullable = true)
 |-- RevolvingUtilizationOfUnsecuredLines: double (nullable = true)
 |-- age: integer (nullable = true)
 |-- NumberOfTime30-59DaysPastDueNotWorse: integer (nullable = true)
 |-- DebtRatio: double (nullable = true)
 |-- MonthlyIncome: double (nullable = true)
 |-- NumberOfOpenCreditLinesAndLoans: integer (nullable = true)
 |-- NumberOfTimes90DaysLate: integer (nullable = true)
 |-- NumberRealEstateLoansOrLines: integer (nullable = true)
 |-- NumberOfTime60-89DaysPastDueNotWorse: integer (nullable = true)
 |-- NumberOfDependents: integer (nullable = true)
 |-- DebtRatioCategory: string (nullable = true)

+----------------+------------------------------------+---+------------------------------------+-----------+
|SeriousDlqin2yrs|RevolvingUtilizationOfUnsecuredLines|age|NumberOfTime30-59DaysPastDueNotWorse|  DebtRatio|
+----------------+------------------------------------+---+------------------------------------+-----------+
|        

## One-Hot Encoding and Feature Assembling

In [15]:

# One-Hot Encoding for debtratiocategory
indexer = StringIndexer(inputCol="DebtRatioCategory", outputCol="debtratiocategory_index")
encoder = OneHotEncoder(inputCol="debtratiocategory_index", outputCol="debtratiocategory_vec")

# Assemble features
assembler = VectorAssembler(
    inputCols=[col for col in df.columns if col != "SeriousDlqin2yrs" and col != "DebtRatioCategory"],
    outputCol="features"
)


##  Split Data into Train and Test Sets

In [16]:
# Split the data into train and test sets
train_df, test_df = df.randomSplit([0.8, 0.2], seed=42)

## Model Definitions

In [17]:

# Define the models
log_reg = LogisticRegression(labelCol="SeriousDlqin2yrs", featuresCol="features")
rf = RandomForestClassifier(labelCol="SeriousDlqin2yrs", featuresCol="features")
gbt = GBTClassifier(labelCol="SeriousDlqin2yrs", featuresCol="features")


## Hyperparameter Tuning and Cross-Validation

In [18]:

# Hyperparameter tuning using cross-validation
param_grid_lr = (ParamGridBuilder()
                 .addGrid(log_reg.regParam, [0.01, 0.1 ])
                 .addGrid(log_reg.elasticNetParam, [0.0, 0.5])
                 .build())

param_grid_rf = (ParamGridBuilder()
                 .addGrid(rf.numTrees, [100, 200])
                 .addGrid(rf.maxDepth, [5, 10])
                 .build())

param_grid_gbt = (ParamGridBuilder()
                  .addGrid(gbt.maxDepth, [5, 10])
                  .addGrid(gbt.maxIter, [5, 10])
                  .build())


# Define the evaluator
evaluator = BinaryClassificationEvaluator(labelCol="SeriousDlqin2yrs")

# Define the cross-validators
cv_lr = CrossValidator(estimator=log_reg, estimatorParamMaps=param_grid_lr, evaluator=evaluator, numFolds=2)
cv_rf = CrossValidator(estimator=rf, estimatorParamMaps=param_grid_rf, evaluator=evaluator, numFolds=2)
cv_gbt = CrossValidator(estimator=gbt, estimatorParamMaps=param_grid_gbt, evaluator=evaluator, numFolds=2)


## Model Training

In [19]:

# Create pipeline
pipeline_lr = Pipeline(stages=[indexer, encoder, assembler, cv_lr])
pipeline_rf = Pipeline(stages=[indexer, encoder, assembler, cv_rf])
pipeline_gbt = Pipeline(stages=[indexer, encoder, assembler, cv_gbt])

# Train the models on the training set
model_lr = pipeline_lr.fit(train_df)
model_rf = pipeline_rf.fit(train_df)
model_gbt = pipeline_gbt.fit(train_df)


## Model Evaluation

In [9]:

# Evaluate the models on the test set
auc_lr = evaluator.evaluate(model_lr.transform(test_df))
auc_rf = evaluator.evaluate(model_rf.transform(test_df))
auc_gbt = evaluator.evaluate(model_gbt.transform(test_df))

# Print the results
print(f"AUC for Logistic Regression: {auc_lr}")
print(f"AUC for Random Forest: {auc_rf}")
print(f"AUC for Gradient Boosted Decision Tree: {auc_gbt}")


AUC for Logistic Regression: 0.8543626111531084
AUC for Random Forest: 0.8444228318392916
AUC for Gradient Boosted Decision Tree: 0.8550442272383626


## Best Model and Parameters

In [20]:
# Function to filter parameters based on the parameter grid
def filter_best_params(best_model, param_grid):
    param_names = [param.name for param in param_grid]
    best_params = {}
    for param in best_model.extractParamMap():
        if param.name in param_names:
            best_params[param.name] = best_model.extractParamMap()[param]
    return best_params


# Find the best model and its parameters
best_model_lr = model_lr.stages[-1].bestModel
best_model_rf = model_rf.stages[-1].bestModel
best_model_gbt = model_gbt.stages[-1].bestModel

# Extracting the relevant best parameters based on the param grid
best_params_lr = filter_best_params(best_model_lr, [log_reg.regParam, log_reg.elasticNetParam])
best_params_rf = filter_best_params(best_model_rf, [rf.numTrees, rf.maxDepth])
best_params_gbt = filter_best_params(best_model_gbt, [gbt.maxDepth, gbt.maxIter])

# Print the best parameters for each model
print("Best parameters for Logistic Regression:", best_params_lr)
print("Best parameters for Random Forest:", best_params_rf)
print("Best parameters for Gradient Boosted Decision Tree:", best_params_gbt)


Best parameters for Logistic Regression: {'elasticNetParam': 0.0, 'regParam': 0.01}
Best parameters for Random Forest: {'maxDepth': 5, 'numTrees': 100}
Best parameters for Gradient Boosted Decision Tree: {'maxDepth': 5, 'maxIter': 5}


## Saving the best model by Training it with best parameters by creating Pipeline

In [26]:
# Re-create the GBTClassifier with the best parameters
gbt_best = GBTClassifier(labelCol="SeriousDlqin2yrs", featuresCol="features", maxDepth=5, maxIter=5)

# Create a pipeline with the re-created GBT model
pipeline_best_gbt = Pipeline(stages=[indexer, encoder, assembler, gbt_best])

# Retrain the model on the entire training dataset
best_model_gbt = pipeline_best_gbt.fit(train_df)

In [31]:
# Define the path to save the retrained GBT model
path_best_gbt = "hdfs:///user/talentum/models/"

import subprocess
# Define the HDFS command
command = "hdfs dfs -rm -r models"
# Run the command
subprocess.run(command, shell=True, check=True)

# Save the retrained model
best_model_gbt.save(path_best_gbt)

print("Best models saved successfully.")

Best models saved successfully.
