In [97]:
# Must be included at the beginning of each new notebook. Remember to change the app name.
import findspark
findspark.init('/home/ubuntu/spark-2.1.1-bin-hadoop2.7')
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('algorithm').getOrCreate()

In [152]:
import pandas as pd
import numpy as np
# Load training data. 
data = pd.read_csv("data/credit.csv")

In [153]:
# Let's get an idea of what the data looks like. 
data.dtypes

BAD          int64
LOAN         int64
MORTDUE    float64
VALUE      float64
REASON      object
JOB         object
YOJ        float64
DEROG      float64
DELINQ     float64
CLAGE      float64
NINQ       float64
CLNO       float64
DEBTINC    float64
dtype: object

In [160]:
data.head(n=2)

Unnamed: 0,LOAN,MORTDUE,VALUE,REASON,JOB,DEROG,DELINQ,CLAGE,NINQ,property_level,STATUS
0,1100,25860.0,39025.0,0,0,0.0,0.0,94.366667,1.0,1,DEFAULT
1,1300,70053.0,68400.0,0,0,0.0,2.0,121.833333,0.0,2,DEFAULT


In [154]:
# Remove uncorrelated factor by analyzing using corr() function
data.drop('CLNO', axis=1, inplace=True)

# Deal with missing values
import numpy as np
mean = data.mean()
data.replace(np.nan, mean, inplace=True)

# Unclear definition
data.drop('DEBTINC', axis=1, inplace=True)

# Uncorrelated from feature selection method using statistical method
data.drop('YOJ', axis=1, inplace=True)

# Derivate a new variable using value of property
data['property_level'] = np.where(data['VALUE']<=66500,1,np.where(data['VALUE']<=90000,2,3))

In [155]:
# Checking atrributes
data.dtypes

BAD                 int64
LOAN                int64
MORTDUE           float64
VALUE             float64
REASON             object
JOB                object
DEROG             float64
DELINQ            float64
CLAGE             float64
NINQ              float64
property_level      int64
dtype: object

In [103]:
# Checking NaN missing values
data.isna().sum()

BAD                 0
LOAN                0
MORTDUE             0
VALUE               0
REASON            252
JOB               279
DEROG               0
DELINQ              0
CLAGE               0
NINQ                0
property_level      0
dtype: int64

In [156]:
data['REASON'] = pd.factorize(data['REASON'] )[0]
data['JOB'] = pd.factorize(data['JOB'] )[0]

In [105]:
# Checking atrributes
data.dtypes

BAD                 int64
LOAN                int64
MORTDUE           float64
VALUE             float64
REASON              int64
JOB                 int64
DEROG             float64
DELINQ            float64
CLAGE             float64
NINQ              float64
property_level      int64
dtype: object

In [157]:
data.loc[data.BAD == 1, 'STATUS'] = 'DEFAULT'
data.loc[data.BAD == 0, 'STATUS'] = 'PAID'

In [158]:
# Remove BAD
data.drop('BAD', axis=1, inplace=True)

In [159]:
# Checking atrributes
data.dtypes

LOAN                int64
MORTDUE           float64
VALUE             float64
REASON              int64
JOB                 int64
DEROG             float64
DELINQ            float64
CLAGE             float64
NINQ              float64
property_level      int64
STATUS             object
dtype: object

In [110]:
data.to_csv('data/spark.csv', index=False)

In [111]:
# Load training data. 
data = spark.read.csv('data/spark.csv',inferSchema=True,header=True)

In [112]:
# Let's get an idea of what the data looks like. 
data.printSchema()

root
 |-- LOAN: integer (nullable = true)
 |-- MORTDUE: double (nullable = true)
 |-- VALUE: double (nullable = true)
 |-- REASON: integer (nullable = true)
 |-- JOB: integer (nullable = true)
 |-- DEROG: double (nullable = true)
 |-- DELINQ: double (nullable = true)
 |-- CLAGE: double (nullable = true)
 |-- NINQ: double (nullable = true)
 |-- property_level: integer (nullable = true)
 |-- STATUS: string (nullable = true)



In [113]:
data.head()

Row(LOAN=1100, MORTDUE=25860.0, VALUE=39025.0, REASON=0, JOB=0, DEROG=0.0, DELINQ=0.0, CLAGE=94.366666667, NINQ=1.0, property_level=1, STATUS='DEFAULT')

In [114]:
# A few things we need to do before Spark can accept the data!
# It needs to be in the form of two columns: "label" and "features".

# Import VectorAssembler and Vectors
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [115]:
# Let's visualise the columns to help with assembly. 
data.columns

['LOAN',
 'MORTDUE',
 'VALUE',
 'REASON',
 'JOB',
 'DEROG',
 'DELINQ',
 'CLAGE',
 'NINQ',
 'property_level',
 'STATUS']

In [116]:
# Combine all features into one vector named features.
assembler = VectorAssembler(
  inputCols=['LOAN',
             'MORTDUE',
             'VALUE',
             'REASON',
             'JOB',
             'DEROG',
             'DELINQ',
             'CLAGE',
             'NINQ',
             'property_level'],
              outputCol="features")

In [117]:
# Let's transform the data. 
output = assembler.transform(data)

In [118]:
# Let's import the string indexer (similar to the logistic regression exercises).
from pyspark.ml.feature import StringIndexer

In [119]:
indexer = StringIndexer(inputCol="STATUS", outputCol="STATUSIndex")
output_fixed = indexer.fit(output).transform(output)

In [120]:
# Let's select the two columns we want. Features (which contains vectors), and the predictor.
final_data = output_fixed.select("features",'STATUSIndex')

In [121]:
# Split the training and testing set.
train_data,test_data = final_data.randomSplit([0.8,0.2])

In [134]:
# Let's import the relevant classifiers. 
from pyspark.ml.classification import DecisionTreeClassifier,GBTClassifier,RandomForestClassifier,LogisticRegression
from pyspark.ml import Pipeline

In [133]:
# Adjust parameters for decision trees
dtc1 = DecisionTreeClassifier(labelCol='STATUSIndex',featuresCol='features',maxDepth=5,impurity='gini',seed=1)
dtc2 = DecisionTreeClassifier(labelCol='STATUSIndex',featuresCol='features',maxDepth=10,impurity='gini',seed=2)
dtc3 = DecisionTreeClassifier(labelCol='STATUSIndex',featuresCol='features',maxDepth=10,impurity='entropy',seed=3)

In [137]:
# Train the models
dtc1_model = dtc1.fit(train_data)
dtc2_model = dtc2.fit(train_data)
dtc3_model = dtc3.fit(train_data)

In [139]:
# Interpret the predictions
dtc1_predictions = dtc1_model.transform(test_data)
dtc2_predictions = dtc2_model.transform(test_data)
dtc3_predictions = dtc3_model.transform(test_data)

In [141]:
# Let's import the evaluator.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Select (prediction, true label) and compute test error. 
acc_evaluator = MulticlassClassificationEvaluator(labelCol="STATUSIndex", predictionCol="prediction", metricName="accuracy")
f1_evaluator = MulticlassClassificationEvaluator(labelCol="STATUSIndex", predictionCol="prediction", metricName="f1")

In [142]:
# Measure the  evaluation
dtc1_acc = acc_evaluator.evaluate(dtc1_predictions)
dtc2_acc = acc_evaluator.evaluate(dtc2_predictions)
dtc3_acc = acc_evaluator.evaluate(dtc3_predictions)

dtc1_f1 = f1_evaluator.evaluate(dtc1_predictions)
dtc2_f1 = f1_evaluator.evaluate(dtc2_predictions)
dtc3_f1 = f1_evaluator.evaluate(dtc3_predictions)

In [143]:
# Let's do something a bit more complex in terms of printing, just so it's formatted nicer. 
print("Here are the results!")
print('-'*40)
print('The first single decision tree has an accuracy of: {0:2.2f}%'.format(dtc1_acc*100))
print('-'*40)
print('The first single decision tree has a F1 value of: {0:2.2f}%'.format(dtc1_f1*100))
print('-'*40)
print('The second single decision tree has an accuracy of: {0:2.2f}%'.format(dtc2_acc*100))
print('-'*40)
print('The second single decision tree has a F1 value of: {0:2.2f}%'.format(dtc2_f1*100))
print('-'*40)
print('The third single decision tree has an accuracy of: {0:2.2f}%'.format(dtc3_acc*100))
print('-'*40)
print('The third single decision tree has a F1 value of: {0:2.2f}%'.format(dtc3_f1*100))

Here are the results!
----------------------------------------
The first single decision tree has an accuracy of: 82.68%
----------------------------------------
The first single decision tree has a F1 value of: 78.51%
----------------------------------------
The second single decision tree has an accuracy of: 87.35%
----------------------------------------
The second single decision tree has a F1 value of: 86.23%
----------------------------------------
The third single decision tree has an accuracy of: 87.35%
----------------------------------------
The third single decision tree has a F1 value of: 86.29%


In [164]:
# Figure out feature importances
a = dtc1_model.featureImportances
print(a)
b = dtc2_model.featureImportances
print(b)
c = dtc3_model.featureImportances
print(c)

(10,[0,1,2,4,5,6,7,8],[0.06321645830295264,0.05440781538455242,0.05885879600272049,0.017588143151336166,0.1362758718207821,0.4394795650788705,0.1529606001771753,0.07721275008161028])
(10,[0,1,2,3,4,5,6,7,8],[0.14524140601021768,0.12110459811574657,0.11370163219331754,0.026346753903612626,0.04532059660517683,0.08528384853645937,0.2182960207664732,0.14290749482805407,0.10179764904094202])
(10,[0,1,2,3,4,5,6,7,8],[0.14153887904144227,0.1236567498399055,0.08385149577093144,0.03226327706840093,0.0494570874922656,0.09278898056742525,0.20063534388140117,0.16794429043680784,0.10786389590142006])


In [135]:
# Adjust parameters for random forest
rfc1 = RandomForestClassifier(labelCol='STATUSIndex',featuresCol='features',numTrees=20,subsamplingRate=1,seed=4)
rfc2 = RandomForestClassifier(labelCol='STATUSIndex',featuresCol='features',numTrees=20,subsamplingRate=0.8,seed=5)
rfc3 = RandomForestClassifier(labelCol='STATUSIndex',featuresCol='features',numTrees=35,subsamplingRate=1,seed=6)

In [144]:
# Train the models
rfc1_model = rfc1.fit(train_data)
rfc2_model = rfc2.fit(train_data)
rfc3_model = rfc3.fit(train_data)

In [145]:
# Interpret the predictions
rfc1_predictions = rfc1_model.transform(test_data)
rfc2_predictions = rfc2_model.transform(test_data)
rfc3_predictions = rfc3_model.transform(test_data)

In [146]:
# Measure the  evaluation
rfc1_acc = acc_evaluator.evaluate(rfc1_predictions)
rfc2_acc = acc_evaluator.evaluate(rfc2_predictions)
rfc3_acc = acc_evaluator.evaluate(rfc3_predictions)

rfc1_f1 = f1_evaluator.evaluate(rfc1_predictions)
rfc2_f1 = f1_evaluator.evaluate(rfc2_predictions)
rfc3_f1 = f1_evaluator.evaluate(rfc3_predictions)

In [147]:
# Let's do something a bit more complex in terms of printing, just so it's formatted nicer. 
print("Here are the results!")
print('-'*40)
print('The first random forest has an accuracy of: {0:2.2f}%'.format(rfc1_acc*100))
print('-'*40)
print('The first random forest has a F1 value of: {0:2.2f}%'.format(rfc1_f1*100))
print('-'*40)
print('The second random forest has an accuracy of: {0:2.2f}%'.format(rfc2_acc*100))
print('-'*40)
print('The second random forest has a F1 value of: {0:2.2f}%'.format(rfc2_f1*100))
print('-'*40)
print('The third random forest has an accuracy of: {0:2.2f}%'.format(rfc3_acc*100))
print('-'*40)
print('The third random forest has a F1 value of: {0:2.2f}%'.format(rfc3_f1*100))

Here are the results!
----------------------------------------
The first random forest has an accuracy of: 84.13%
----------------------------------------
The first random forest has a F1 value of: 80.88%
----------------------------------------
The second random forest has an accuracy of: 84.80%
----------------------------------------
The second random forest has a F1 value of: 81.70%
----------------------------------------
The third random forest has an accuracy of: 84.55%
----------------------------------------
The third random forest has a F1 value of: 81.17%


In [168]:
# Figure out feature importances
a = rfc1_model.featureImportances
print(a)
b = rfc2_model.featureImportances
print(b)
c = rfc3_model.featureImportances
print(c)

(10,[0,1,2,3,4,5,6,7,8,9],[0.12302124650498765,0.045848350060963036,0.04909976567773711,0.008129576406415564,0.01574876481622397,0.21586583477082577,0.35071602447516526,0.12869292539899196,0.058833583573123883,0.004043928315565772])
(10,[0,1,2,3,4,5,6,7,8,9],[0.12567880813887544,0.056160576930143466,0.04298737620119368,0.009520926301476117,0.022536995622743118,0.18506613090500545,0.36215291291166596,0.10777506162631204,0.08330191745553163,0.004819293907053056])
(10,[0,1,2,3,4,5,6,7,8,9],[0.09770477974731251,0.047537020711648996,0.049981479227129595,0.008838565930629417,0.01654866910538082,0.1963237330377655,0.3763208781231503,0.12175408127488985,0.0817610306725758,0.0032297621695171258])


In [136]:
# Adjust parameters for logistic regression
log1 = LogisticRegression(featuresCol='features',labelCol='STATUSIndex',family="auto")
log2 = LogisticRegression(featuresCol='features',labelCol='STATUSIndex',family="binomial")

In [148]:
# Train the models
log1_model = log1.fit(train_data)
log2_model = log2.fit(train_data)

In [149]:
# Interpret the predictions
log1_predictions = log1_model.transform(test_data)
log2_predictions = log2_model.transform(test_data)

In [150]:
# Measure the  evaluation
log1_acc = acc_evaluator.evaluate(log1_predictions)
log2_acc = acc_evaluator.evaluate(log2_predictions)

log1_f1 = f1_evaluator.evaluate(log1_predictions)
log2_f1 = f1_evaluator.evaluate(log2_predictions)

In [151]:
# Let's do something a bit more complex in terms of printing, just so it's formatted nicer. 
print("Here are the results!")
print('-'*40)
print('The first logistic regression has an accuracy of: {0:2.2f}%'.format(log1_acc*100))
print('-'*40)
print('The first logistic regression has a F1 value of: {0:2.2f}%'.format(log1_f1*100))
print('-'*40)
print('The second logistic regression has an accuracy of: {0:2.2f}%'.format(log2_acc*100))
print('-'*40)
print('The second logistic regression has a F1 value of: {0:2.2f}%'.format(log2_f1*100))

Here are the results!
----------------------------------------
The first logistic regression has an accuracy of: 82.51%
----------------------------------------
The first logistic regression has a F1 value of: 78.76%
----------------------------------------
The second logistic regression has an accuracy of: 82.51%
----------------------------------------
The second logistic regression has a F1 value of: 78.76%


In [166]:
# Figure out feature importances
a = log1_model.featureImportances
print(a)
b = log2_model.featureImportances
print(b)

AttributeError: 'LogisticRegressionModel' object has no attribute 'featureImportances'