## LendingClub - Deployment
In this notebook, we develop a routine that iterates through all the downloaded loan data files (in CSV format) copied to the "./Dataset/Processing" folder and performs prediction on each data file.

For each input data file, the loans predicted as likely to default are then saved to a "xxx_Predictions.csv" file (where "xxx" is the input file name without its extension) in the "./Dataset/Predictions" folder for further processing.

In [3]:
from pyspark.sql.functions import col, udf
from pyspark.sql.types import *
from pyspark.ml import Pipeline
from pyspark.ml import PipelineModel
import os
import shutil

# Selected Features for model input.
# NOTE(review): this list is not referenced anywhere in this cell; it is kept
# because other cells may read it -- confirm before removing.
selectedFeatures = ["int_rate", "loan_amnt", "total_pymnt", "out_prncp", "policy_code"]

# Folder scanned for input data files, and folder receiving the filtered predictions.
fileDirectory = './Dataset/Processing/'
predictionsDirectory = './Dataset/Predictions/'


def percent_to_fraction(x):
    """Convert a percentage string with a "%" suffix (e.g. "13.56%") to a fraction.

    Args:
        x: percentage string, or None for a missing value.

    Returns:
        The numeric value divided by 100.0, or 0.0 when x is None.

    Raises:
        ValueError: if the text left after stripping "%" is not a valid float.
    """
    return 0.0 if x is None else float(x.strip('%')) / 100.0


# Loop-invariant setup: build the UDF and load the saved pipeline once instead
# of once per input file.
func3 = udf(percent_to_fraction, FloatType())
model = PipelineModel.load("./Model/LoanStats_model")

# pandas' to_csv does not create missing parent folders.
os.makedirs(predictionsDirectory, exist_ok=True)

# sorted() gives a deterministic processing order across runs.
for fname in sorted(os.listdir(fileDirectory)):

    inputPath = os.path.join(fileDirectory, fname)

    # Only regular CSV files are processed; sub-folders and stray files
    # (e.g. notebook checkpoints) are skipped.
    if not (os.path.isfile(inputPath) and fname.lower().endswith('.csv')):
        continue

    # extract data file name (without extension) from the directory entry
    filename = os.path.splitext(fname)[0]

    # Load LoanStatsDF dataset; rows that do not parse are dropped (DROPMALFORMED)
    LoanStatsDF = spark.read.csv(inputPath, header=True, inferSchema=True, mode="DROPMALFORMED")

    # Convert the 'int_rate' percentage string column to Float datatype
    LoanStatsML_df = LoanStatsDF.withColumn('int_rate', func3(col('int_rate')))

    # Perform prediction on LoanStatsML_df data with the saved model
    predictions = model.transform(LoanStatsML_df)

    # Keep only the rows predicted as class 1.0 (compared numerically rather
    # than against the string "1.0")
    predictions_Filter = predictions.filter(col('prediction') == 1.0)

    # Collect the filtered rows to the driver and write "<filename>_Predictions.csv"
    outputPath = os.path.join(predictionsDirectory, filename + "_Predictions.csv")
    predictions_Filter.toPandas().to_csv(outputPath, sep=',')