# Random Forest with Hyperparameter Tuning

In [None]:
import pyspark
import sys

In [None]:
import pyspark.sql.functions as fn

In [None]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler, StringIndexer

In [None]:
# Check spark app name
# NOTE(review): `spark` is never created in the visible cells — presumably a
# SparkSession provided by the notebook runtime; confirm before running elsewhere.
spark.sparkContext.appName

In [None]:
# Set Spark's Arrow option for PySpark. Per the Spark docs this key governs
# Arrow-based transfers between Spark and pandas; no pandas conversion appears
# in the visible cells, so it may be unused here.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", True)

In [None]:
# print runtime versions
# Python version
sys.version

In [None]:
# Spark version
spark.version

### Exploring Data

In [None]:
# load classification_data.csv into Spark dataframe
# header=True takes column names from the first row; inferSchema=True asks Spark
# to infer column types from the data
df = spark.read.csv('data/classification_data.csv', header=True, inferSchema=True)

In [None]:
# check the shape of the data: (row count, column count)
df.count(),len(df.columns)

In [None]:
# column names and inferred types
df.printSchema()

In [None]:
# First 5 rows of the dataset
df.show(5)

In [None]:
# Exploratory Data Analysis
# summary statistics of the columns
df.describe().show()

In [None]:
# number of rows per value of the target column `label`
df.groupBy('label').count().show()

In [None]:
# number of rows per loan purpose
df.groupBy('loan_purpose').count().show()

### Feature Engineering

In [None]:
loan_purpose_indexer = StringIndexer(inputCol="loan_purpose", outputCol="loan_index").fit(df)
df = loan_purpose_indexer.transform(df)

In [None]:
df.select(['loan_purpose','loan_index']).show(5,False)

In [None]:
df.columns

In [None]:
feature_cols = df.columns[2:-2]
feature_cols += ['loan_index']
feature_cols

In [None]:
df_assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
df = df_assembler.transform(df)

In [None]:
df.printSchema()

In [None]:
df.select(['features','label']).show(10,False)

In [None]:
# select data for building model
model_df=df.select(['features','label'])

### Split Data - Train & Test sets

In [None]:
# use Random Forest to train on the training set
train_df, test_df = model_df.randomSplit([0.70, 0.30], seed=42)

In [None]:
train_df.count(), len(train_df.columns)

In [None]:
test_df.count(), len(test_df.columns)

### Build Random Forest Model

In [None]:
rf = RandomForestClassifier()
rf_model = rf.fit(train_df)

In [None]:
# predict on the test set
model_predictions = rf_model.transform(test_df)

In [None]:
# print prediction
model_predictions.show(10)

### Evaluate Model

In [None]:
# Select (prediction, true label) to compute AUC
evaluator = BinaryClassificationEvaluator(
    labelCol='label')
rf_auc = evaluator.evaluate(model_predictions)

In [None]:
rf_auc

In [None]:
rf_accuracy = MulticlassClassificationEvaluator(labelCol='label',
               metricName='accuracy').evaluate(model_predictions)

In [None]:
'The accuracy of RF on test data is {0:.0%}'.format(rf_accuracy)

### Hyperparameter Tuning

In [None]:
evaluator = BinaryClassificationEvaluator()
rf = RandomForestClassifier()

In [None]:
#paramGrid = (ParamGridBuilder()
#             .addGrid(rf.maxDepth, [5,10,20,25,30])
#             .addGrid(rf.maxBins, [20,30,40 ])
#             .addGrid(rf.numTrees, [5, 20,50])
#             .build())

In [None]:
paramGrid = (ParamGridBuilder()
             .addGrid(rf.maxDepth, [10,20])
             .addGrid(rf.maxBins, [20,30])
             .addGrid(rf.numTrees, [5,20])
             .build())

In [None]:
cv = CrossValidator(estimator=rf, estimatorParamMaps=paramGrid, 
                    evaluator=evaluator, numFolds=5)

In [None]:
%%time
cv_model = cv.fit(train_df)

In [None]:
best_rf_model = cv_model.bestModel

In [None]:
f'Best Param(maxDepth): {best_rf_model._java_obj.getMaxDepth()}'

In [None]:
f'Best Param(maxBins): {best_rf_model._java_obj.getMaxBins()}'

In [None]:
f'Best Param(NumTrees): {best_rf_model._java_obj.getNumTrees()}'

### Evaluate Tuned Model

In [None]:
# Generate predictions for entire dataset
model_predictions = best_rf_model.transform(test_df)

In [None]:
best_rf_auc = evaluator.evaluate(model_predictions)

In [None]:
best_rf_auc

In [None]:
true_pos=model_predictions.filter(model_predictions['label']==1).filter(model_predictions['prediction']==1).count()

In [None]:
actual_pos=model_predictions.filter(model_predictions['label']==1).count()

In [None]:
pred_pos=model_predictions.filter(model_predictions['prediction']==1).count()

In [None]:
#Recall 
float(true_pos)/(actual_pos)

In [None]:
#Precision on test Data 
float(true_pos)/(pred_pos)