# Random Forest with Hyperparameter Tuning

In [1]:
import pyspark
import sys

In [2]:
import pyspark.sql.functions as fn

In [3]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler, StringIndexer

In [4]:
# Check spark app name
spark.sparkContext.appName

'PySparkShell'

In [5]:
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", True)

In [6]:
# print runtime versions
# Python version
sys.version

'3.8.10 (default, Mar 15 2022, 12:22:08) \n[GCC 9.4.0]'

In [7]:
# Spark version
spark.version

'3.2.1'

### Exploring Data

In [8]:
# load classification_data.csv into Spark dataframe
df = spark.read.csv('data/classification_data.csv', header=True, inferSchema=True)

                                                                                

In [9]:
# check the shape of the data 
df.count(),len(df.columns)

(46751, 12)

In [10]:
df.printSchema()

root
 |-- loan_id: string (nullable = true)
 |-- loan_purpose: string (nullable = true)
 |-- is_first_loan: integer (nullable = true)
 |-- total_credit_card_limit: integer (nullable = true)
 |-- avg_percentage_credit_card_limit_used_last_year: double (nullable = true)
 |-- saving_amount: integer (nullable = true)
 |-- checking_amount: integer (nullable = true)
 |-- is_employed: integer (nullable = true)
 |-- yearly_salary: integer (nullable = true)
 |-- age: integer (nullable = true)
 |-- dependent_number: integer (nullable = true)
 |-- label: integer (nullable = true)



In [11]:
# First 5 rows of Iris dataset
df.show(5)

+-------+------------+-------------+-----------------------+-----------------------------------------------+-------------+---------------+-----------+-------------+---+----------------+-----+
|loan_id|loan_purpose|is_first_loan|total_credit_card_limit|avg_percentage_credit_card_limit_used_last_year|saving_amount|checking_amount|is_employed|yearly_salary|age|dependent_number|label|
+-------+------------+-------------+-----------------------+-----------------------------------------------+-------------+---------------+-----------+-------------+---+----------------+-----+
|    A_1|    personal|            1|                   7900|                                            0.8|         1103|           6393|          1|        16400| 42|               4|    0|
|    A_2|    personal|            0|                   3300|                                           0.29|         2588|            832|          1|        75500| 56|               1|    0|
|    A_3|    personal|            0|    

                                                                                

In [12]:
# Exploratory Data Analysis
df.describe().show()

[Stage 6:>                                                          (0 + 1) / 1]

+-------+-------+------------+------------------+-----------------------+-----------------------------------------------+------------------+------------------+------------------+------------------+------------------+------------------+-------------------+
|summary|loan_id|loan_purpose|     is_first_loan|total_credit_card_limit|avg_percentage_credit_card_limit_used_last_year|     saving_amount|   checking_amount|       is_employed|     yearly_salary|               age|  dependent_number|              label|
+-------+-------+------------+------------------+-----------------------+-----------------------------------------------+------------------+------------------+------------------+------------------+------------------+------------------+-------------------+
|  count|  46751|       46751|             46751|                  46751|                                          46751|             46751|             46751|             46751|             46751|             46751|             467

                                                                                

In [13]:
df.groupBy('label').count().show()

+-----+-----+
|label|count|
+-----+-----+
|    1|16201|
|    0|30550|
+-----+-----+



In [14]:
df.groupBy('loan_purpose').count().show()

+------------+-----+
|loan_purpose|count|
+------------+-----+
|      others| 6763|
|   emergency| 7562|
|    property|11388|
|  operations|10580|
|    personal|10458|
+------------+-----+



### Feature Engineering

In [15]:
loan_purpose_indexer = StringIndexer(inputCol="loan_purpose", outputCol="loan_index").fit(df)
df = loan_purpose_indexer.transform(df)

                                                                                

In [16]:
df.select(['loan_purpose','loan_index']).show(5,False)

+------------+----------+
|loan_purpose|loan_index|
+------------+----------+
|personal    |2.0       |
|personal    |2.0       |
|personal    |2.0       |
|personal    |2.0       |
|emergency   |3.0       |
+------------+----------+
only showing top 5 rows



In [17]:
df.columns

['loan_id',
 'loan_purpose',
 'is_first_loan',
 'total_credit_card_limit',
 'avg_percentage_credit_card_limit_used_last_year',
 'saving_amount',
 'checking_amount',
 'is_employed',
 'yearly_salary',
 'age',
 'dependent_number',
 'label',
 'loan_index']

In [18]:
feature_cols = df.columns[2:-2]
feature_cols += ['loan_index']
feature_cols

['is_first_loan',
 'total_credit_card_limit',
 'avg_percentage_credit_card_limit_used_last_year',
 'saving_amount',
 'checking_amount',
 'is_employed',
 'yearly_salary',
 'age',
 'dependent_number',
 'loan_index']

In [19]:
df_assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
df = df_assembler.transform(df)

In [20]:
df.printSchema()

root
 |-- loan_id: string (nullable = true)
 |-- loan_purpose: string (nullable = true)
 |-- is_first_loan: integer (nullable = true)
 |-- total_credit_card_limit: integer (nullable = true)
 |-- avg_percentage_credit_card_limit_used_last_year: double (nullable = true)
 |-- saving_amount: integer (nullable = true)
 |-- checking_amount: integer (nullable = true)
 |-- is_employed: integer (nullable = true)
 |-- yearly_salary: integer (nullable = true)
 |-- age: integer (nullable = true)
 |-- dependent_number: integer (nullable = true)
 |-- label: integer (nullable = true)
 |-- loan_index: double (nullable = false)
 |-- features: vector (nullable = true)



In [21]:
df.select(['features','label']).show(10,False)

+--------------------------------------------------------+-----+
|features                                                |label|
+--------------------------------------------------------+-----+
|[1.0,7900.0,0.8,1103.0,6393.0,1.0,16400.0,42.0,4.0,2.0] |0    |
|[0.0,3300.0,0.29,2588.0,832.0,1.0,75500.0,56.0,1.0,2.0] |0    |
|[0.0,7600.0,0.9,1651.0,8868.0,1.0,59000.0,46.0,1.0,2.0] |0    |
|[1.0,3400.0,0.38,1269.0,6863.0,1.0,26000.0,55.0,8.0,2.0]|0    |
|[0.0,2600.0,0.89,1310.0,3423.0,1.0,9700.0,41.0,4.0,3.0] |1    |
|[0.0,7600.0,0.51,1040.0,2406.0,1.0,22900.0,52.0,0.0,1.0]|0    |
|[1.0,6900.0,0.82,2408.0,5556.0,1.0,34800.0,48.0,4.0,1.0]|0    |
|[0.0,5700.0,0.56,1933.0,4139.0,1.0,32500.0,64.0,2.0,2.0]|0    |
|[1.0,3400.0,0.95,3866.0,4131.0,1.0,13300.0,23.0,3.0,2.0]|0    |
|[0.0,2900.0,0.91,88.0,2725.0,1.0,21100.0,52.0,1.0,2.0]  |1    |
+--------------------------------------------------------+-----+
only showing top 10 rows



In [22]:
# select data for building model
model_df=df.select(['features','label'])

### Split Data - Train & Test sets

In [23]:
# use Random Forest to train on the training set
train_df, test_df = model_df.randomSplit([0.70, 0.30], seed=42)

In [24]:
train_df.count(), len(train_df.columns)

(32776, 2)

In [25]:
test_df.count(), len(test_df.columns)

                                                                                

(13975, 2)

### Build Random Forest Model

In [26]:
rf = RandomForestClassifier()
rf_model = rf.fit(train_df)

                                                                                

In [27]:
# predict on the test set
model_predictions = rf_model.transform(test_df)

In [28]:
# print prediction
model_predictions.show(10)

+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|(10,[1,2,3,4,7],[...|    0|[16.6838849289974...|[0.83419424644987...|       0.0|
|(10,[1,2,3,4,7],[...|    0|[19.2409223242261...|[0.96204611621130...|       0.0|
|(10,[1,2,3,4,7],[...|    1|[5.52256473103276...|[0.27612823655163...|       1.0|
|(10,[1,2,3,4,7],[...|    0|[18.8323430040495...|[0.94161715020247...|       0.0|
|[0.0,500.0,0.59,9...|    1|[3.64939484193013...|[0.18246974209650...|       1.0|
|[0.0,500.0,0.64,1...|    1|[2.12679355088605...|[0.10633967754430...|       1.0|
|[0.0,500.0,0.69,1...|    1|[3.23323701493719...|[0.16166185074685...|       1.0|
|[0.0,500.0,0.76,5...|    1|[2.12679355088605...|[0.10633967754430...|       1.0|
|[0.0,500.0,0.77,1...|    1|[3.31208121245175...|[0.16560406062258...|       1.0|
|[0.0,500.0,0.78

### Evaluate Model

In [29]:
# Select (prediction, true label) to compute AUC
evaluator = BinaryClassificationEvaluator(
    labelCol='label')
rf_auc = evaluator.evaluate(model_predictions)

In [30]:
rf_auc

0.962829631887776

In [31]:
rf_accuracy = MulticlassClassificationEvaluator(labelCol='label',
               metricName='accuracy').evaluate(model_predictions)

In [32]:
'The accuracy of RF on test data is {0:.0%}'.format(rf_accuracy)

'The accuracy of RF on test data is 90%'

### Hyperparameter Tuning

In [33]:
evaluator = BinaryClassificationEvaluator()
rf = RandomForestClassifier()

In [34]:
#paramGrid = (ParamGridBuilder()
#             .addGrid(rf.maxDepth, [5,10,20,25,30])
#             .addGrid(rf.maxBins, [20,30,40 ])
#             .addGrid(rf.numTrees, [5, 20,50])
#             .build())

In [35]:
paramGrid = (ParamGridBuilder()
             .addGrid(rf.maxDepth, [10,20])
             .addGrid(rf.maxBins, [20,30])
             .addGrid(rf.numTrees, [5,20])
             .build())

In [36]:
cv = CrossValidator(estimator=rf, estimatorParamMaps=paramGrid, 
                    evaluator=evaluator, numFolds=5)

In [37]:
%%time
cv_model = cv.fit(train_df)

22/05/09 16:22:13 WARN DAGScheduler: Broadcasting large task binary with size 1079.8 KiB
22/05/09 16:22:14 WARN DAGScheduler: Broadcasting large task binary with size 1639.7 KiB
22/05/09 16:22:14 WARN DAGScheduler: Broadcasting large task binary with size 1109.6 KiB
22/05/09 16:22:17 WARN DAGScheduler: Broadcasting large task binary with size 1084.8 KiB
22/05/09 16:22:17 WARN DAGScheduler: Broadcasting large task binary with size 1626.3 KiB
22/05/09 16:22:18 WARN DAGScheduler: Broadcasting large task binary with size 1124.8 KiB
22/05/09 16:22:19 WARN DAGScheduler: Broadcasting large task binary with size 1096.8 KiB
22/05/09 16:22:19 WARN DAGScheduler: Broadcasting large task binary with size 1345.8 KiB
22/05/09 16:22:19 WARN DAGScheduler: Broadcasting large task binary with size 1582.0 KiB
22/05/09 16:22:19 WARN DAGScheduler: Broadcasting large task binary with size 1789.5 KiB
22/05/09 16:22:20 WARN DAGScheduler: Broadcasting large task binary with size 1971.5 KiB
22/05/09 16:22:20 WAR

CPU times: user 659 ms, sys: 223 ms, total: 882 ms
Wall time: 2min 15s


In [38]:
best_rf_model = cv_model.bestModel

In [39]:
f'Best Param(maxDepth): {best_rf_model._java_obj.getMaxDepth()}'

'Best Param(maxDepth): 10'

In [40]:
f'Best Param(maxBins): {best_rf_model._java_obj.getMaxBins()}'

'Best Param(maxBins): 30'

In [41]:
f'Best Param(NumTrees): {best_rf_model._java_obj.getNumTrees()}'

'Best Param(NumTrees): 20'

### Evaluate Tuned Model

In [42]:
# Generate predictions for entire dataset
model_predictions = best_rf_model.transform(test_df)

In [43]:
best_rf_auc = evaluator.evaluate(model_predictions)

22/05/09 16:24:25 WARN DAGScheduler: Broadcasting large task binary with size 1172.3 KiB


In [44]:
best_rf_auc

0.9690665508778045

In [45]:
true_pos=model_predictions.filter(model_predictions['label']==1).filter(model_predictions['prediction']==1).count()

22/05/09 16:24:25 WARN DAGScheduler: Broadcasting large task binary with size 1178.7 KiB


In [46]:
actual_pos=model_predictions.filter(model_predictions['label']==1).count()

In [47]:
pred_pos=model_predictions.filter(model_predictions['prediction']==1).count()

22/05/09 16:24:26 WARN DAGScheduler: Broadcasting large task binary with size 1180.8 KiB


In [48]:
#Recall 
float(true_pos)/(actual_pos)

0.9105196982397318

In [49]:
#Precision on test Data 
float(true_pos)/(pred_pos)

0.849960876369327