# Credit_Risk_Assessment Model Testing On New Data

## Stage III


   Predicting serious delinquency within two years (`SeriousDlqin2yrs`) on a new dataset.

## Cleaning and Preprocessing

### Setting SparkContext and SparkSession

In [58]:
# Spark 2.x entry point: build (or reuse) a SparkSession with Hive support enabled.
from pyspark.sql import SparkSession

session_builder = SparkSession.builder.appName("Predicting on New Data")
spark = session_builder.enableHiveSupport().getOrCreate()

# Keep a handle on the SparkContext that backs the session.
sc = spark.sparkContext

### Loading csv file into spark dataframe

In [59]:
# Location of the CSV file that will be scored.
file_path = "file:///home/talentum/shared/CDAC_PROJECT/data/GiveMeSomeCredit-testing.csv"

# Treat the first row as a header and let Spark infer the column types.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(file_path)
)

### Understand the data types and structure

In [75]:
# Report how many columns were loaded, then drop the "_c0" column.
column_count = len(df.columns)
print(column_count)
df = df.drop("_c0")


12


### Data type conversion

In [61]:
# MonthlyIncome and NumberOfDependents arrive with a string datatype.
# Cast NumberOfDependents to integer and MonthlyIncome to double.
from pyspark.sql.types import IntegerType, DoubleType

for column_name, target_type in (
    ("NumberOfDependents", IntegerType()),
    ("MonthlyIncome", DoubleType()),
):
    df = df.withColumn(column_name, df[column_name].cast(target_type))

### Missing values

In [62]:
# The dataframe contains literal "NA" strings; turn them into nulls.
df = df.replace('NA', None)

# Build a one-row dataframe holding the number of null entries per column.
from pyspark.sql.functions import col, when, isnull, count
null_count_columns = [
    count(when(col(c).isNull(), c)).alias(c)
    for c in df.columns
]
missing_values = df.select(null_count_columns)


### Handling missing values

In [63]:
# MonthlyIncome and NumberOfDependents contain null values; impute them here.
# NOTE(review): the median and mode are computed from this dataset itself, not
# from the data the saved model was trained on - confirm that this is intended.
from pyspark.sql import Window

# 1. Impute MonthlyIncome with the median.
#    approxQuantile ignores nulls and returns an empty list when the column has
#    no non-null values, so guard the lookup instead of indexing blindly.
median_quantiles = df.approxQuantile("MonthlyIncome", [0.5], 0.0)
median_income = median_quantiles[0] if median_quantiles else None
if median_income is not None:
    df = df.withColumn(
        "MonthlyIncome",
        when(col("MonthlyIncome").isNull(), median_income).otherwise(col("MonthlyIncome")),
    )

# 2. Impute NumberOfDependents with the mode.
#    Nulls are excluded before counting: otherwise the null group itself can be
#    the most frequent "value", which would turn the imputation into a no-op.
#    Ties are broken by the smallest value so that re-runs give the same result.
mode_row = (
    df.filter(col("NumberOfDependents").isNotNull())
    .groupBy("NumberOfDependents")
    .count()
    .orderBy(col("count").desc(), col("NumberOfDependents").asc())
    .first()
)
mode_dependents = mode_row[0] if mode_row is not None else None
if mode_dependents is not None:
    df = df.withColumn(
        "NumberOfDependents",
        when(col("NumberOfDependents").isNull(), mode_dependents).otherwise(col("NumberOfDependents")),
    )

# Display the cleaned and transformed data

### Outlier detection and handling

In [64]:

from pyspark.sql.functions import col, when, round


# Function to cap outliers using IQR
def cap_outliers(col_name, df, relative_error=0.05, iqr_multiplier=1.5):
    """Cap the values of one column to its IQR fences.

    The fences are ``Q1 - iqr_multiplier * IQR`` and ``Q3 + iqr_multiplier * IQR``,
    where Q1/Q3 are the approximate 25th/75th percentiles of the column.
    Values below the lower fence are replaced by the lower fence, values above
    the upper fence by the upper fence; everything else is left unchanged.

    Args:
        col_name: Name of the numeric column to cap.
        df: DataFrame that contains ``col_name``.
        relative_error: Relative error passed to ``approxQuantile``
            (defaults to 0.05, the value this notebook has always used).
        iqr_multiplier: Width of the fences in IQR units (defaults to 1.5).

    Returns:
        A DataFrame with ``col_name`` capped, or ``df`` itself when the
        quantiles cannot be computed.
    """
    quantiles = df.approxQuantile(col_name, [0.25, 0.75], relative_error)
    # approxQuantile yields no quantiles when there is nothing to measure
    # (e.g. the column holds only nulls); capping is then meaningless, so
    # return the input untouched instead of failing on quantiles[0].
    if len(quantiles) < 2:
        return df

    q1, q3 = quantiles[0], quantiles[1]
    iqr = q3 - q1
    lower_bound = q1 - iqr_multiplier * iqr
    upper_bound = q3 + iqr_multiplier * iqr

    column = col(col_name)
    capped = (
        when(column < lower_bound, lower_bound)
        .when(column > upper_bound, upper_bound)
        .otherwise(column)
    )
    return df.withColumn(col_name, capped)


# Keep only the records whose age is at least 21.
df = df.filter(col("age") >= 21)

# Cap each outlier-prone column with the IQR helper, one column at a time.
iqr_columns = ["age", "DebtRatio", "MonthlyIncome", "RevolvingUtilizationOfUnsecuredLines"]
for outlier_column in iqr_columns:
    df = cap_outliers(outlier_column, df)

# Capping may leave a fractional age; round it and store it as an integer.
rounded_age = round(col("age"), 0)
df = df.withColumn("age", rounded_age.cast("integer"))

# Drop the records whose past-due counters exceed the chosen thresholds.
within_thresholds = (
    (col("NumberOfTimes90DaysLate") <= 8)
    & (col("NumberOfTime60-89DaysPastDueNotWorse") <= 12)
    & (col("NumberOfTime30-59DaysPastDueNotWorse") <= 24)
)
df = df.filter(within_thresholds)





### Feature engineering

In [65]:
# Derive 'DebtRatioCategory': "Low" below 0.2, "Medium" below 0.5, otherwise "High".
debt_ratio = col("DebtRatio")
debt_ratio_category = (
    when(debt_ratio < 0.2, "Low")
    .when(debt_ratio < 0.5, "Medium")
    .otherwise("High")
)
df = df.withColumn("DebtRatioCategory", debt_ratio_category)


# Loading the Saved Model and Predicting on New Data


In [66]:
from pyspark.ml import PipelineModel

# Restore the fitted pipeline that was saved to HDFS.
model_path = "hdfs:///user/talentum/models/"
model = PipelineModel.load(model_path)

# Run the pipeline, then expose its prediction as the 'SeriousDlqin2yrs' label.
df_predictions = model.transform(df)

# Columns produced by the pipeline that are not wanted in the final result.
pipeline_columns = [
    "prediction",
    "features",
    "rawPrediction",
    "probability",
    "debtratiocategory_index",
    "debtratiocategory_vec",
]
df_result = (
    df_predictions
    .withColumn("SeriousDlqin2yrs", col("prediction"))
    .drop(*pipeline_columns)
)

# Preview the predicted label next to the monthly income.
preview = df_result.select("SeriousDlqin2yrs", "MonthlyIncome")
preview.show(10)


+----------------+-------------+
|SeriousDlqin2yrs|MonthlyIncome|
+----------------+-------------+
|             0.0|       3041.0|
|             1.0|       5400.0|
|             1.0|       5400.0|
|             0.0|       4955.0|
|             1.0|       2500.0|
|             0.0|       5400.0|
|             1.0|       7625.0|
|             1.0|      11376.0|
|             0.0|       2950.0|
|             0.0|       5400.0|
+----------------+-------------+
only showing top 10 rows

root
 |-- SeriousDlqin2yrs: double (nullable = false)
 |-- RevolvingUtilizationOfUnsecuredLines: double (nullable = true)
 |-- age: integer (nullable = true)
 |-- NumberOfTime30-59DaysPastDueNotWorse: double (nullable = true)
 |-- DebtRatio: double (nullable = true)
 |-- MonthlyIncome: double (nullable = true)
 |-- NumberOfOpenCreditLinesAndLoans: integer (nullable = true)
 |-- NumberOfTimes90DaysLate: integer (nullable = true)
 |-- NumberRealEstateLoansOrLines: integer (nullable = true)
 |-- NumberOfTime60

# Saving the Predicted Dataset to HDFS

In [67]:
# HDFS directory that receives the prediction output.
hdfs_output_dir = "hdfs:///user/talentum/Prediction_data/"

# Collapse to one partition and write CSV with a header, overwriting earlier output.
single_partition = df_result.coalesce(1)
single_partition.write.csv(hdfs_output_dir, mode="overwrite", header=True)

# Shell command to merge the HDFS output into one local file:
# hdfs dfs -getmerge Prediction_data/ ~/shared/CDAC_PROJECT/Prediction_data/Prediction_data.csv


In [74]:
# Count how many records were assigned to each predicted label.
label_counts = df_result.groupBy("SeriousDlqin2yrs").count()
label_counts.show()

+----------------+-----+
|SeriousDlqin2yrs|count|
+----------------+-----+
|             0.0|98143|
|             1.0| 3043|
+----------------+-----+

