In [39]:
#Instantiate SparkSession
from pyspark.sql import SparkSession
spark=SparkSession \
			.builder \
			.appName('Predicting the grape variety from wine characteristics') \
			.getOrCreate()

In [40]:
spark

In [41]:
# https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data
# gsutil cp adult.data gs://dexdebra-123/datasets
# Use sparkSession to read csv file			
rawData=spark.read \
           .format('csv') \
           .option('header','false') \
           .option('ignoreLeadingWhiteSpace','true') \
           .load('gs://dexdebra-123/datasets/adult.data')

In [42]:
rawData

DataFrame[_c0: string, _c1: string, _c2: string, _c3: string, _c4: string, _c5: string, _c6: string, _c7: string, _c8: string, _c9: string, _c10: string, _c11: string, _c12: string, _c13: string, _c14: string]

In [43]:
rawData.toPandas().head()

Unnamed: 0,_c0,_c1,_c2,_c3,_c4,_c5,_c6,_c7,_c8,_c9,_c10,_c11,_c12,_c13,_c14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [44]:
#Assign meaningful Headers
dataSet=rawData.toDF("Age",
	"WorkClass",
	"FnlWgt",
	"Education",
	"EducationNum",
	"MaritalStatus",
	"Occupation",
	"Relationship",
	"Race",
	"Gender",
	"CapitalGain",
	"CapitalLoss",
	"HoursPerWeek",
	"NativeCountry",
	"Label"
					)	

In [45]:
dataSet.toPandas().head()

Unnamed: 0,Age,WorkClass,FnlWgt,Education,EducationNum,MaritalStatus,Occupation,Relationship,Race,Gender,CapitalGain,CapitalLoss,HoursPerWeek,NativeCountry,Label
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [46]:
# Drop the FnlWgt Column as not relevant
dataSet=dataSet.drop('FnlWgt')

In [47]:
# Replace All '?' value in all cells with specific value('None') in dataFrame
print('Count Before: ' , dataSet.count())
dataSet=dataSet.replace('?',None)
dataSet=dataSet.dropna(how='any')
print('Count After: ' , dataSet.count())


('Count Before: ', 32561)
('Count After: ', 30162)


In [48]:
dataSet.toPandas()

Unnamed: 0,Age,WorkClass,Education,EducationNum,MaritalStatus,Occupation,Relationship,Race,Gender,CapitalGain,CapitalLoss,HoursPerWeek,NativeCountry,Label
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
5,37,Private,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
6,49,Private,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
7,52,Self-emp-not-inc,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
8,31,Private,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
9,42,Private,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K


In [49]:
# Describe structure of dataSet
dataSet.describe()


DataFrame[summary: string, Age: string, WorkClass: string, Education: string, EducationNum: string, MaritalStatus: string, Occupation: string, Relationship: string, Race: string, Gender: string, CapitalGain: string, CapitalLoss: string, HoursPerWeek: string, NativeCountry: string, Label: string]

In [51]:
# Some of the columns Age,EducationNum,HoursPerWeek,CapitalGain,CapitalLoss ould contain numeric values
# But these numeric values are present as strings in this dataframe
# We can convert these values to numeric by using cast operation

from pyspark.sql.types  import FloatType
from pyspark.sql.functions import col

dataSet=dataSet.withColumn('Age', dataSet['Age'].cast(FloatType()))

dataSet=dataSet.withColumn('EducationNum', dataSet['EducationNum'].cast(FloatType()))

dataSet=dataSet.withColumn('CapitalGain', dataSet['CapitalGain'].cast(FloatType()))

dataSet=dataSet.withColumn('CapitalLoss', dataSet['CapitalLoss'].cast(FloatType()))

dataSet=dataSet.withColumn('HoursPerWeek', dataSet['HoursPerWeek'].cast(FloatType()))



In [52]:
dataSet.toPandas().head()

Unnamed: 0,Age,WorkClass,Education,EducationNum,MaritalStatus,Occupation,Relationship,Race,Gender,CapitalGain,CapitalLoss,HoursPerWeek,NativeCountry,Label
0,39.0,State-gov,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,<=50K
1,50.0,Self-emp-not-inc,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States,<=50K
2,38.0,Private,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,United-States,<=50K
3,53.0,Private,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,United-States,<=50K
4,28.0,Private,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,Cuba,<=50K


In [53]:
dataSet.toPandas()

Unnamed: 0,Age,WorkClass,Education,EducationNum,MaritalStatus,Occupation,Relationship,Race,Gender,CapitalGain,CapitalLoss,HoursPerWeek,NativeCountry,Label
0,39.0,State-gov,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,<=50K
1,50.0,Self-emp-not-inc,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States,<=50K
2,38.0,Private,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,United-States,<=50K
3,53.0,Private,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,United-States,<=50K
4,28.0,Private,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,Cuba,<=50K
5,37.0,Private,Masters,14.0,Married-civ-spouse,Exec-managerial,Wife,White,Female,0.0,0.0,40.0,United-States,<=50K
6,49.0,Private,9th,5.0,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0.0,0.0,16.0,Jamaica,<=50K
7,52.0,Self-emp-not-inc,HS-grad,9.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,45.0,United-States,>50K
8,31.0,Private,Masters,14.0,Never-married,Prof-specialty,Not-in-family,White,Female,14084.0,0.0,50.0,United-States,>50K
9,42.0,Private,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178.0,0.0,40.0,United-States,>50K


In [54]:
dataSet

DataFrame[Age: float, WorkClass: string, Education: string, EducationNum: float, MaritalStatus: string, Occupation: string, Relationship: string, Race: string, Gender: string, CapitalGain: float, CapitalLoss: float, HoursPerWeek: float, NativeCountry: string, Label: string]

In [55]:
# Create a List of Categorical Features which we want to convert to numerical form
categoricalFeatures= [
		'WorkClass',
		'Education',
		'MaritalStatus',
		'Occupation',
		'Relationship',
		'Race',
		'Gender',
		'NativeCountry'
		]

In [56]:
# Create StringIndexer for CategoricalFeatures using python list comprehension
from pyspark.ml.feature import StringIndexer

indexers=[ StringIndexer(inputCol=column,outputCol=column + '_index',handleInvalid='keep') for column in categoricalFeatures]

In [57]:
# Convert Numerical form of categories to One-hot-encoding using python list comprehension
from pyspark.ml.feature import OneHotEncoder
encoders=[ OneHotEncoder ( 
				inputCol=column + '_index',
				outputCol=column + '_encoded') for column  in categoricalFeatures]

In [58]:
# Split the data into 80% for trainingData and 20% for testData
(trainingData,testData)=dataSet.randomSplit([0.8,0.2])

In [59]:
# Add indexer for label To Convert Categorical Strin gRepresentation of Label to numeric form
labelIndexer= [ StringIndexer( inputCol='Label' , outputCol='Label_index')]

In [60]:
# Create a pyspark ml pipeline 
from pyspark.ml import Pipeline

# Setup the various stages of pipeline
pipeline=Pipeline(stages=indexers + encoders + labelIndexer )

# The pipeline will
# Perform all transformations on the input data and then
# Assemble the features into Dense Vector form and then
# Apply RandomForest Classifier to it.

In [61]:
pipeline

Pipeline_42369adc6c7d8b172538

In [62]:
# Pass the trainingData thru the pipeline by calling first the fit() method and then the transform() method both on the trainingData
transformedDF = pipeline.fit(trainingData).transform(trainingData)

In [63]:
transformedDF.toPandas().head()

# All Categorical Features are represented in One-Hot-Encoded form
# And Label is also numeric

Unnamed: 0,Age,WorkClass,Education,EducationNum,MaritalStatus,Occupation,Relationship,Race,Gender,CapitalGain,...,NativeCountry_index,WorkClass_encoded,Education_encoded,MaritalStatus_encoded,Occupation_encoded,Relationship_encoded,Race_encoded,Gender_encoded,NativeCountry_encoded,Label_index
0,17.0,Federal-gov,11th,7.0,Never-married,Adm-clerical,Not-in-family,Black,Female,0.0,...,0.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0, 0.0, 0.0, 0.0)","(0.0, 1.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0
1,17.0,Local-gov,11th,7.0,Never-married,Adm-clerical,Own-child,White,Female,0.0,...,0.0,"(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0
2,17.0,Local-gov,11th,7.0,Never-married,Other-service,Own-child,White,Female,0.0,...,0.0,"(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0
3,17.0,Local-gov,11th,7.0,Never-married,Prof-specialty,Own-child,White,Female,0.0,...,4.0,"(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0)","(0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...",0.0
4,17.0,Local-gov,11th,7.0,Never-married,Prof-specialty,Own-child,White,Female,0.0,...,0.0,"(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0


In [64]:
# Setup the features that we want to use train our ML Model

# We use data in numerical columns as it is
# For CategoricalVariables we will use their one-hot-encoder notation

requiredFeatures= [
        'Age',
        'EducationNum',
        'CapitalGain',
        'CapitalLoss',
        'HoursPerWeek',
        'WorkClass_encoded',
        'Education_encoded',
        'MaritalStatus_encoded',
        'Occupation_encoded',
        'Relationship_encoded',
        'Race_encoded',
        'Gender_encoded',
        'NativeCountry_encoded',
]


In [65]:
requiredFeatures

['Age',
 'EducationNum',
 'CapitalGain',
 'CapitalLoss',
 'HoursPerWeek',
 'WorkClass_encoded',
 'Education_encoded',
 'MaritalStatus_encoded',
 'Occupation_encoded',
 'Relationship_encoded',
 'Race_encoded',
 'Gender_encoded',
 'NativeCountry_encoded']

In [67]:
# InputFeatures are fed in the form of a DenseVector
# Use VectorAssember to Vectorize the features to a column called 'Features'
# Pass the columns we want to be vectorized and
# Output will be stored in a column called 'Features'


from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler( inputCols=requiredFeatures , outputCol='Features')

In [68]:
# Use the VectorAssembler to vectorize the features
# Store the result in transformedDF dataframe
transformedDF=assembler.transform(transformedDF)
transformedDF.toPandas().head()

# There is a column called 'features' which holds all of our features in a Dense Vector Representation


Unnamed: 0,Age,WorkClass,Education,EducationNum,MaritalStatus,Occupation,Relationship,Race,Gender,CapitalGain,...,WorkClass_encoded,Education_encoded,MaritalStatus_encoded,Occupation_encoded,Relationship_encoded,Race_encoded,Gender_encoded,NativeCountry_encoded,Label_index,Features
0,17.0,Federal-gov,11th,7.0,Never-married,Adm-clerical,Not-in-family,Black,Female,0.0,...,"(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0, 0.0, 0.0, 0.0)","(0.0, 1.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(17.0, 7.0, 0.0, 1602.0, 40.0, 0.0, 0.0, 0.0, ..."
1,17.0,Local-gov,11th,7.0,Never-married,Adm-clerical,Own-child,White,Female,0.0,...,"(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(17.0, 7.0, 0.0, 0.0, 12.0, 0.0, 0.0, 1.0, 0.0..."
2,17.0,Local-gov,11th,7.0,Never-married,Other-service,Own-child,White,Female,0.0,...,"(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(17.0, 7.0, 0.0, 0.0, 16.0, 0.0, 0.0, 1.0, 0.0..."
3,17.0,Local-gov,11th,7.0,Never-married,Prof-specialty,Own-child,White,Female,0.0,...,"(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0)","(0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(17.0, 7.0, 0.0, 0.0, 20.0, 0.0, 0.0, 1.0, 0.0..."
4,17.0,Local-gov,11th,7.0,Never-married,Prof-specialty,Own-child,White,Female,0.0,...,"(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(17.0, 7.0, 0.0, 0.0, 40.0, 0.0, 0.0, 1.0, 0.0..."


In [80]:
# Only select the features column to examine exactly how it looks

transformedDF.select('Label_index').toPandas()

Unnamed: 0,Label_index
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0
5,0.0
6,0.0
7,0.0
8,0.0
9,0.0


In [84]:
# We are now Ready to go ahead and instantiate the RandomForest Classifier
# ML Model that  we want to use to train on our Adult Census Data Set
from pyspark.ml.classification import RandomForestClassifier

# Specify the labels and features for this classifer and our maxDepth hyperparameter
rf=RandomForestClassifier( labelCol='Label_index', featuresCol='Features' , maxDepth=5)
# Individual Decison Tree should not have a depth of '5'


In [106]:
# Instantiate a ML pipeline for this training workflow

# This pipeline will perform all the transformations that we want on the input data
# Assemble the features into Dense Vector form
# Apply RandomForest Classifier to it

from pyspark.ml import Pipeline
pipeline = Pipeline ( 
        stages = indexers + encoders + labelIndexer + [assembler,rf]
)

In [107]:
pipeline

Pipeline_4fdaa6f10042c78ef2e0

In [109]:
# Run the training pipeline to get the ML Classifier Model
# Call pipeline.fit() on trainingData
model= pipeline.fit(trainingData)

In [110]:
# We now have trained Random Forest Model
# Let us use this model for predictions
# This model is the result of running our training pipeline stages
# Here are our predictions result in a dataframe
# We have columns at very end for rawPrediction, probablity and prediction

model = pipeline.fit(trainingData)

predictions = model.transform(testData)

predictionsDF = predictions.toPandas()

predictionsDF.head()

Unnamed: 0,Age,WorkClass,Education,EducationNum,MaritalStatus,Occupation,Relationship,Race,Gender,CapitalGain,...,Occupation_encoded,Relationship_encoded,Race_encoded,Gender_encoded,NativeCountry_encoded,Label_index,Features,rawPrediction,probability,prediction
0,17.0,Local-gov,10th,6.0,Never-married,Other-service,Own-child,White,Female,0.0,...,"(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(17.0, 6.0, 0.0, 0.0, 25.0, 0.0, 0.0, 1.0, 0.0...","[19.872854812495056, 0.1271451875049458]","[0.9936427406247527, 0.006357259375247289]",0.0
1,17.0,Local-gov,10th,6.0,Never-married,Protective-serv,Own-child,White,Female,0.0,...,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(17.0, 6.0, 0.0, 1602.0, 40.0, 0.0, 0.0, 1.0, ...","[19.730724035734475, 0.26927596426552614]","[0.9865362017867237, 0.013463798213276307]",0.0
2,17.0,Local-gov,11th,7.0,Never-married,Adm-clerical,Own-child,White,Female,0.0,...,"(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(17.0, 7.0, 0.0, 0.0, 15.0, 0.0, 0.0, 1.0, 0.0...","[19.862058359416096, 0.13794164058390387]","[0.9931029179708049, 0.006897082029195194]",0.0
3,17.0,Local-gov,11th,7.0,Never-married,Craft-repair,Own-child,White,Male,0.0,...,"(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(17.0, 7.0, 0.0, 0.0, 35.0, 0.0, 0.0, 1.0, 0.0...","[19.84988581444681, 0.15011418555318776]","[0.9924942907223405, 0.007505709277659388]",0.0
4,17.0,Local-gov,11th,7.0,Never-married,Prof-specialty,Own-child,Black,Male,0.0,...,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(0.0, 1.0, 0.0, 0.0, 0.0)","(1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(17.0, 7.0, 0.0, 0.0, 15.0, 0.0, 0.0, 1.0, 0.0...","[19.661851193042295, 0.33814880695770305]","[0.9830925596521147, 0.016907440347885154]",0.0


In [111]:
# We will create a DataFrame called predictions with actual label(Label_index) and predicted label (prediction) side by side

predictions = predictions.select ( 'Label_index' , 'prediction')

In [112]:
# Instantiate the MulticlassClassificationEvaluator to see how our model performs

# The metric we are using 'accuracy'
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator=MulticlassClassificationEvaluator(
    labelCol='Label_index',
    predictionCol='prediction',
    metricName='accuracy'
)



In [113]:
#Use the evaluator's evaluate method to evaluate the model
accuracy=evaluator.evaluate(predictions)
print('Test Accuracy=',accuracy)


('Test Accuracy=', 0.857354149548069)


In [114]:
# Take a closer look at the incorrect predictions made by the model
# So as to glean some insights from this
predictionsDF.loc[
    predictionsDF['Label_index'] != predictionsDF['prediction']
]

# Take a look at Raw probablity value , they are very close

# The model had a hard time figuring out if this individual earned > 50K or < 50K



Unnamed: 0,Age,WorkClass,Education,EducationNum,MaritalStatus,Occupation,Relationship,Race,Gender,CapitalGain,...,Occupation_encoded,Relationship_encoded,Race_encoded,Gender_encoded,NativeCountry_encoded,Label_index,Features,rawPrediction,probability,prediction
191,19.0,Private,7th-8th,4.0,Never-married,Other-service,Not-in-family,White,Male,0.0,...,"(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0,"(19.0, 4.0, 0.0, 0.0, 60.0, 1.0, 0.0, 0.0, 0.0...","[19.259932455193127, 0.7400675448068702]","[0.9629966227596565, 0.03700337724034352]",0.0
437,21.0,Private,HS-grad,9.0,Married-civ-spouse,Prof-specialty,Own-child,White,Male,0.0,...,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0)","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0,"(21.0, 9.0, 0.0, 0.0, 40.0, 1.0, 0.0, 0.0, 0.0...","[16.40920333423458, 3.590796665765418]","[0.8204601667117292, 0.17953983328827094]",0.0
593,22.0,Private,HS-grad,9.0,Married-civ-spouse,Transport-moving,Husband,White,Male,0.0,...,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0,"(22.0, 9.0, 0.0, 0.0, 50.0, 1.0, 0.0, 0.0, 0.0...","[14.976065841712877, 5.023934158287122]","[0.7488032920856439, 0.2511967079143561]",0.0
695,23.0,Private,11th,7.0,Married-civ-spouse,Transport-moving,Husband,White,Male,4508.0,...,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(23.0, 7.0, 4508.0, 0.0, 25.0, 1.0, 0.0, 0.0, ...","[9.815794345402017, 10.184205654597985]","[0.49078971727010084, 0.5092102827298992]",1.0
850,24.0,Local-gov,Some-college,10.0,Married-civ-spouse,Protective-serv,Husband,White,Male,0.0,...,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0,"(24.0, 10.0, 0.0, 0.0, 72.0, 0.0, 0.0, 1.0, 0....","[13.883711477146674, 6.116288522853324]","[0.6941855738573337, 0.3058144261426662]",0.0
900,24.0,Private,HS-grad,9.0,Divorced,Exec-managerial,Not-in-family,White,Female,0.0,...,"(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0,"(24.0, 9.0, 0.0, 0.0, 45.0, 1.0, 0.0, 0.0, 0.0...","[18.83873305145277, 1.1612669485472318]","[0.9419366525726384, 0.05806334742736158]",0.0
906,24.0,Private,HS-grad,9.0,Married-civ-spouse,Handlers-cleaners,Husband,White,Male,4386.0,...,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0,"(24.0, 9.0, 4386.0, 0.0, 40.0, 1.0, 0.0, 0.0, ...","[10.896519783550909, 9.103480216449093]","[0.5448259891775454, 0.4551740108224546]",0.0
909,24.0,Private,HS-grad,9.0,Married-civ-spouse,Other-service,Husband,White,Male,0.0,...,"(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0,"(24.0, 9.0, 0.0, 0.0, 45.0, 1.0, 0.0, 0.0, 0.0...","[16.43831006401563, 3.5616899359843672]","[0.8219155032007815, 0.17808449679921837]",0.0
947,24.0,Private,Some-college,10.0,Married-civ-spouse,Adm-clerical,Husband,White,Male,7298.0,...,"(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0,"(24.0, 10.0, 7298.0, 0.0, 40.0, 1.0, 0.0, 0.0,...","[10.99194473714528, 9.008055262854723]","[0.5495972368572639, 0.4504027631427361]",0.0
954,24.0,Private,Some-college,10.0,Married-civ-spouse,Tech-support,Husband,White,Male,0.0,...,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0,"(24.0, 10.0, 0.0, 0.0, 50.0, 1.0, 0.0, 0.0, 0....","[14.587353002797107, 5.412646997202891]","[0.7293676501398554, 0.27063234986014456]",0.0


In [105]:
# Tweek the RandomForest model just a bit
# Instead of maxDepth as '5' , we will change this to '10'
# This means individual decision trees can be much deeper
# We are now Ready to go ahead and instantiate the RandomForest Classifier
# ML Model that  we want to use to train on our Adult Census Data Set
from pyspark.ml.classification import RandomForestClassifier

# Specify the labels and features for this classifer and our maxDepth hyperparameter
rf=RandomForestClassifier( labelCol='Label_index', featuresCol='Features' , maxDepth=10)
# Individual Decison Tree should not have a depth of '10'

# Re-run all the steps again after this step