In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('adult_logReg').getOrCreate()
df = spark.read.csv('adult.csv', inferSchema = True, header=True)
df.show(3)

+---+---------+------+----------+---------------+------------------+-----------------+------------+-----+------+------------+------------+--------------+--------------+------+
|age|workclass|fnlwgt| education|educational-num|    marital-status|       occupation|relationship| race|gender|capital-gain|capital-loss|hours-per-week|native-country|income|
+---+---------+------+----------+---------------+------------------+-----------------+------------+-----+------+------------+------------+--------------+--------------+------+
| 25|  Private|226802|      11th|              7|     Never-married|Machine-op-inspct|   Own-child|Black|  Male|           0|           0|            40| United-States| <=50K|
| 38|  Private| 89814|   HS-grad|              9|Married-civ-spouse|  Farming-fishing|     Husband|White|  Male|           0|           0|            50| United-States| <=50K|
| 28|Local-gov|336951|Assoc-acdm|             12|Married-civ-spouse|  Protective-serv|     Husband|White|  Male|        

In [2]:
cols = df.columns

In [3]:
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- workclass: string (nullable = true)
 |-- fnlwgt: integer (nullable = true)
 |-- education: string (nullable = true)
 |-- educational-num: integer (nullable = true)
 |-- marital-status: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- relationship: string (nullable = true)
 |-- race: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- capital-gain: integer (nullable = true)
 |-- capital-loss: integer (nullable = true)
 |-- hours-per-week: integer (nullable = true)
 |-- native-country: string (nullable = true)
 |-- income: string (nullable = true)



In [4]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler

categoricalColumns = ["workclass", "education", "marital-status", "occupation", "relationship", "race", "gender", "native-country"]
stages = []

for categoricalCol in categoricalColumns:
    stringIndexer = StringIndexer(inputCol = categoricalCol, outputCol = categoricalCol + 'Index')
    encoder = OneHotEncoderEstimator(inputCols=[stringIndexer.getOutputCol()], outputCols=[categoricalCol + "classVec"])
    stages += [stringIndexer, encoder]

In [5]:
label_stringIdx = StringIndexer(inputCol = 'income', outputCol = 'label')
stages += [label_stringIdx]

In [6]:
numericCols = ["age", "fnlwgt", "educational-num", "capital-gain", "capital-loss", "hours-per-week"]
assemblerInputs = [c + "classVec" for c in categoricalColumns] + numericCols
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages += [assembler]

In [7]:
pipeline = Pipeline(stages=stages)
pipelineModel = pipeline.fit(df)
df = pipelineModel.transform(df)
selectedcols = ["label", "features"] + cols
df = df.select(selectedcols)
df.show(3)

+-----+--------------------+---+---------+------+----------+---------------+------------------+-----------------+------------+-----+------+------------+------------+--------------+--------------+------+
|label|            features|age|workclass|fnlwgt| education|educational-num|    marital-status|       occupation|relationship| race|gender|capital-gain|capital-loss|hours-per-week|native-country|income|
+-----+--------------------+---+---------+------+----------+---------------+------------------+-----------------+------------+-----+------+------------+------------+--------------+--------------+------+
|  0.0|(100,[0,13,24,35,...| 25|  Private|226802|      11th|              7|     Never-married|Machine-op-inspct|   Own-child|Black|  Male|           0|           0|            40| United-States| <=50K|
|  0.0|(100,[0,8,23,39,4...| 38|  Private| 89814|   HS-grad|              9|Married-civ-spouse|  Farming-fishing|     Husband|White|  Male|           0|           0|            50| United-

In [8]:
display(df)

DataFrame[label: double, features: vector, age: int, workclass: string, fnlwgt: int, education: string, educational-num: int, marital-status: string, occupation: string, relationship: string, race: string, gender: string, capital-gain: int, capital-loss: int, hours-per-week: int, native-country: string, income: string]

In [9]:
train, test = df.randomSplit([0.7, 0.3], seed=100)
print(train.count())
print(test.count())

34255
14587


### Logistic Regression

In [10]:
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(labelCol = 'label', featuresCol = 'features', maxIter=10)
lrModel = lr.fit(train)

In [11]:
predictions = lrModel.transform(test)
predictions.take(1)

[Row(label=0.0, features=SparseVector(100, {0: 1.0, 8: 1.0, 23: 1.0, 29: 1.0, 43: 1.0, 48: 1.0, 52: 1.0, 53: 1.0, 94: 26.0, 95: 58426.0, 96: 9.0, 99: 50.0}), age=26, workclass='Private', fnlwgt=58426, education='HS-grad', educational-num=9, marital-status='Married-civ-spouse', occupation='Prof-specialty', relationship='Husband', race='White', gender='Male', capital-gain=0, capital-loss=0, hours-per-week=50, native-country='United-States', income='<=50K', rawPrediction=DenseVector([0.8176, -0.8176]), probability=DenseVector([0.6937, 0.3063]), prediction=0.0)]

In [12]:
predictions.printSchema()

root
 |-- label: double (nullable = false)
 |-- features: vector (nullable = true)
 |-- age: integer (nullable = true)
 |-- workclass: string (nullable = true)
 |-- fnlwgt: integer (nullable = true)
 |-- education: string (nullable = true)
 |-- educational-num: integer (nullable = true)
 |-- marital-status: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- relationship: string (nullable = true)
 |-- race: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- capital-gain: integer (nullable = true)
 |-- capital-loss: integer (nullable = true)
 |-- hours-per-week: integer (nullable = true)
 |-- native-country: string (nullable = true)
 |-- income: string (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [13]:
selected = predictions.select("label", "prediction", "probability", "age", "occupation")
display(selected)

DataFrame[label: double, prediction: double, probability: vector, age: int, occupation: string]

In [14]:
selected.show(4)

+-----+----------+--------------------+---+--------------+
|label|prediction|         probability|age|    occupation|
+-----+----------+--------------------+---+--------------+
|  0.0|       0.0|[0.69373398473136...| 26|Prof-specialty|
|  0.0|       0.0|[0.61163146131025...| 30|Prof-specialty|
|  0.0|       0.0|[0.66067198545007...| 31|Prof-specialty|
|  0.0|       0.0|[0.66129318473475...| 32|Prof-specialty|
+-----+----------+--------------------+---+--------------+
only showing top 4 rows



In [15]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
print('Area Under ROC', evaluator.evaluate(predictions))

Area Under ROC 0.9023600168373183


In [16]:
evaluator.getMetricName()

'areaUnderROC'

In [17]:
print(lr.explainParams())

aggregationDepth: suggested depth for treeAggregate (>= 2). (default: 2)
elasticNetParam: the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty. (default: 0.0)
family: The name of family which is a description of the label distribution to be used in the model. Supported options: auto, binomial, multinomial (default: auto)
featuresCol: features column name. (default: features, current: features)
fitIntercept: whether to fit an intercept term. (default: True)
labelCol: label column name. (default: label, current: label)
lowerBoundsOnCoefficients: The lower bounds on coefficients if fitting under bound constrained optimization. The bound matrix must be compatible with the shape (1, number of features) for binomial regression, or (number of classes, number of features) for multinomial regression. (undefined)
lowerBoundsOnIntercepts: The lower bounds on intercepts if fitting under bound constrained optimization. The

In [18]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Create ParamGrid for Cross Validation
paramGrid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.01, 0.5, 2.0])
             .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
             .addGrid(lr.maxIter, [1, 5, 10])
             .build())

In [19]:
cv = CrossValidator(estimator=lr, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5)

# Run cross validations
cvModel = cv.fit(train)

In [20]:
predictions = cvModel.transform(test)
print('Area Under ROC', evaluator.evaluate(predictions))

Area Under ROC 0.9014836720704358


### Decision Trees

In [21]:
from pyspark.ml.classification import DecisionTreeClassifier

dt = DecisionTreeClassifier(featuresCol = 'features', labelCol = 'label', maxDepth = 3)
dtModel = dt.fit(train)

In [22]:
print("numNodes = ", dtModel.numNodes)
print("depth = ", dtModel.depth)

numNodes =  15
depth =  3


In [23]:
predictions = dtModel.transform(test)
predictions.printSchema()

root
 |-- label: double (nullable = false)
 |-- features: vector (nullable = true)
 |-- age: integer (nullable = true)
 |-- workclass: string (nullable = true)
 |-- fnlwgt: integer (nullable = true)
 |-- education: string (nullable = true)
 |-- educational-num: integer (nullable = true)
 |-- marital-status: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- relationship: string (nullable = true)
 |-- race: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- capital-gain: integer (nullable = true)
 |-- capital-loss: integer (nullable = true)
 |-- hours-per-week: integer (nullable = true)
 |-- native-country: string (nullable = true)
 |-- income: string (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [24]:
selected = predictions.select("label", "prediction", "probability", "age", "occupation")
display(selected)

DataFrame[label: double, prediction: double, probability: vector, age: int, occupation: string]

In [25]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator()
evaluator.evaluate(predictions)

0.6750538342550954

In [26]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
paramGrid = (ParamGridBuilder()
             .addGrid(dt.maxDepth, [1, 2, 6, 10])
             .addGrid(dt.maxBins, [20, 40, 80])
             .build())

In [28]:
# Create 5-fold CrossValidator
cv = CrossValidator(estimator=dt, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5)

# Run cross validations
cvModel = cv.fit(train)

In [29]:
print("numNodes = ", cvModel.bestModel.numNodes)
print("depth = ", cvModel.bestModel.depth)

numNodes =  561
depth =  10


In [30]:
predictions = cvModel.transform(test)
evaluator.evaluate(predictions)

0.7930449475013471

In [31]:
selected = predictions.select("label", "prediction", "probability", "age", "occupation")
display(selected)

DataFrame[label: double, prediction: double, probability: vector, age: int, occupation: string]

In [32]:
selected.show(3)

+-----+----------+--------------------+---+--------------+
|label|prediction|         probability|age|    occupation|
+-----+----------+--------------------+---+--------------+
|  0.0|       0.0|[0.91479820627802...| 26|Prof-specialty|
|  0.0|       0.0|[0.79166666666666...| 30|Prof-specialty|
|  0.0|       0.0|[0.79166666666666...| 31|Prof-specialty|
+-----+----------+--------------------+---+--------------+
only showing top 3 rows



### Random Forest

In [34]:
from pyspark.ml.classification import RandomForestClassifier

rf = RandomForestClassifier(featuresCol = 'features', labelCol = 'label')
rfModel = rf.fit(train)
predictions = rfModel.transform(test)
predictions.printSchema()

root
 |-- label: double (nullable = false)
 |-- features: vector (nullable = true)
 |-- age: integer (nullable = true)
 |-- workclass: string (nullable = true)
 |-- fnlwgt: integer (nullable = true)
 |-- education: string (nullable = true)
 |-- educational-num: integer (nullable = true)
 |-- marital-status: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- relationship: string (nullable = true)
 |-- race: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- capital-gain: integer (nullable = true)
 |-- capital-loss: integer (nullable = true)
 |-- hours-per-week: integer (nullable = true)
 |-- native-country: string (nullable = true)
 |-- income: string (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [35]:
selected = predictions.select("label", "prediction", "probability", "age", "occupation")
display(selected)

DataFrame[label: double, prediction: double, probability: vector, age: int, occupation: string]

In [36]:
selected.show(3)

+-----+----------+--------------------+---+--------------+
|label|prediction|         probability|age|    occupation|
+-----+----------+--------------------+---+--------------+
|  0.0|       0.0|[0.63089701288376...| 26|Prof-specialty|
|  0.0|       0.0|[0.58219399032986...| 30|Prof-specialty|
|  0.0|       0.0|[0.56792881180860...| 31|Prof-specialty|
+-----+----------+--------------------+---+--------------+
only showing top 3 rows



In [38]:
evaluator = BinaryClassificationEvaluator()
evaluator.evaluate(predictions)

0.8874404939215993

In [39]:
paramGrid = (ParamGridBuilder()
             .addGrid(rf.maxDepth, [2, 4, 6])
             .addGrid(rf.maxBins, [20, 60])
             .addGrid(rf.numTrees, [5, 20])
             .build())

# Create 5-fold CrossValidator
cv = CrossValidator(estimator=rf, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5)

# Run cross validations.  This can take about 6 minutes since it is training over 20 trees!
cvModel = cv.fit(train)

In [40]:
predictions = cvModel.transform(test)
evaluator.evaluate(predictions)

0.8950399552714288

In [41]:
selected = predictions.select("label", "prediction", "probability", "age", "occupation")
display(selected)

DataFrame[label: double, prediction: double, probability: vector, age: int, occupation: string]

In [42]:
selected.show(3)

+-----+----------+--------------------+---+--------------+
|label|prediction|         probability|age|    occupation|
+-----+----------+--------------------+---+--------------+
|  0.0|       0.0|[0.70195216177776...| 26|Prof-specialty|
|  0.0|       0.0|[0.64177635600230...| 30|Prof-specialty|
|  0.0|       0.0|[0.63093587802395...| 31|Prof-specialty|
+-----+----------+--------------------+---+--------------+
only showing top 3 rows



### Make Predictions

In [43]:
bestModel = cvModel.bestModel
final_predictions = bestModel.transform(df)
evaluator.evaluate(final_predictions)

0.8969236717684712