In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *

ss = SparkSession.builder.getOrCreate()
sc = ss.sparkContext

22/03/06 11:05:58 WARN Utils: Your hostname, Fans-MacBook-Air.local resolves to a loopback address: 127.0.0.1; using 10.0.0.115 instead (on interface en0)
22/03/06 11:05:58 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/03/06 11:05:59 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/03/06 11:06:00 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/03/06 11:06:00 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [2]:
def toDoubleSafe(v):
    try:
        return float(v)
    except ValueError:
        return str(v) #if it is not a float type return as a string.

## Create an RDD

In [11]:
#load and convert the data
census_raw = sc.textFile('../Data/adult.raw').map(lambda x: x.split(', '))
census_raw.first()

['39',
 'State-gov',
 '77516',
 'Bachelors',
 'Never-married',
 'Adm-clerical',
 'Not-in-family',
 'White',
 'Male',
 '2174',
 '0',
 '40',
 'United-States',
 '<=50K']

In [12]:
census_raw = census_raw.map(lambda x: [toDoubleSafe(i) for i in x])
census_raw.first()

[39.0,
 'State-gov',
 77516.0,
 'Bachelors',
 'Never-married',
 'Adm-clerical',
 'Not-in-family',
 'White',
 'Male',
 2174.0,
 0.0,
 40.0,
 'United-States',
 '<=50K']

## Convert the RDD to DataFrame.


In [13]:
from pyspark.sql.types import *
adultschema = StructType([
    StructField("age",DoubleType(),True),
    StructField("workclass",StringType(),True),
    StructField("fnlwgt",DoubleType(),True),
    StructField("education",StringType(),True),
    StructField("marital_status",StringType(),True),
    StructField("occupation",StringType(),True),
    StructField("relationship",StringType(),True),
    StructField("race",StringType(),True),
    StructField("sex",StringType(),True),
    StructField("capital_gain",DoubleType(),True),
    StructField("capital_loss",DoubleType(),True),
    StructField("hours_per_week",DoubleType(),True),
    StructField("native_country",StringType(),True),
    StructField("income",StringType(),True)
])

In [14]:
dfraw = ss.createDataFrame(census_raw, adultschema)

In [16]:
dfraw.show(5)

+----+----------------+--------+---------+------------------+-----------------+-------------+-----+------+------------+------------+--------------+--------------+------+
| age|       workclass|  fnlwgt|education|    marital_status|       occupation| relationship| race|   sex|capital_gain|capital_loss|hours_per_week|native_country|income|
+----+----------------+--------+---------+------------------+-----------------+-------------+-----+------+------------+------------+--------------+--------------+------+
|39.0|       State-gov| 77516.0|Bachelors|     Never-married|     Adm-clerical|Not-in-family|White|  Male|      2174.0|         0.0|          40.0| United-States| <=50K|
|50.0|Self-emp-not-inc| 83311.0|Bachelors|Married-civ-spouse|  Exec-managerial|      Husband|White|  Male|         0.0|         0.0|          13.0| United-States| <=50K|
|38.0|         Private|215646.0|  HS-grad|          Divorced|Handlers-cleaners|Not-in-family|White|  Male|         0.0|         0.0|          40.0| Un

## Clean the data. 

### Missing data imputation.


In [21]:
#Missing data imputation - Impute the most common row for "?".
dfrawnona = dfraw.replace(['?'],['Private'],['workclass'])\
                .replace(['?'],['Prof-specialty'],['occupation'])\
                .replace(["?"], ["United-States"], ["native_country"])
dfrawnona.show()

+----+----------------+--------+------------+--------------------+-----------------+-------------+------------------+------+------------+------------+--------------+--------------+------+
| age|       workclass|  fnlwgt|   education|      marital_status|       occupation| relationship|              race|   sex|capital_gain|capital_loss|hours_per_week|native_country|income|
+----+----------------+--------+------------+--------------------+-----------------+-------------+------------------+------+------------+------------+--------------+--------------+------+
|39.0|       State-gov| 77516.0|   Bachelors|       Never-married|     Adm-clerical|Not-in-family|             White|  Male|      2174.0|         0.0|          40.0| United-States| <=50K|
|50.0|Self-emp-not-inc| 83311.0|   Bachelors|  Married-civ-spouse|  Exec-managerial|      Husband|             White|  Male|         0.0|         0.0|          13.0| United-States| <=50K|
|38.0|         Private|215646.0|     HS-grad|            Div

### Convert strings to categorical values

In [36]:
#converting strings to numeric values
from pyspark.ml.feature import StringIndexer
categorical_col = ["income", "sex", "workclass", "education", "marital_status", "occupation", "relationship", "race", "native_country"]

def indexStringColumns(df, cols):
    newdf = df
    
    for c in cols:
        newdf = StringIndexer(inputCol=c, outputCol=c+"-num")\
                    .fit(newdf)\
                    .transform(newdf)\
                    .drop(c).withColumnRenamed(c+"-num", c)
        
        
    return newdf

In [43]:
df_numeric = indexStringColumns(dfrawnona, categorical_col)
df_numeric.show()

+----+--------+------------+------------+--------------+------+---+---------+---------+--------------+----------+------------+----+--------------+
| age|  fnlwgt|capital_gain|capital_loss|hours_per_week|income|sex|workclass|education|marital_status|occupation|relationship|race|native_country|
+----+--------+------------+------------+--------------+------+---+---------+---------+--------------+----------+------------+----+--------------+
|39.0| 77516.0|      2174.0|         0.0|          40.0|   0.0|0.0|      3.0|      2.0|           1.0|       3.0|         1.0| 0.0|           0.0|
|50.0| 83311.0|         0.0|         0.0|          13.0|   0.0|0.0|      1.0|      2.0|           0.0|       2.0|         0.0| 0.0|           0.0|
|38.0|215646.0|         0.0|         0.0|          40.0|   0.0|0.0|      0.0|      0.0|           2.0|       8.0|         1.0| 0.0|           0.0|
|53.0|234721.0|         0.0|         0.0|          40.0|   0.0|0.0|      0.0|      5.0|           0.0|       8.0|     

In [60]:
from pyspark.ml.feature import OneHotEncoder
def oneHotEncodeColumns(df, cols):
    newdf = df
    for c in cols: 
        ohe = OneHotEncoder(inputCol=c, outputCol=c+"-onehot", dropLast=False)
        ohe_model = ohe.fit(newdf)
        
        newdf = ohe_model.transform(newdf).drop(c).withColumnRenamed(c+"-onehot", c)
        
    return newdf

In [65]:
try:
    categorical_col.remove('income')
except ValueError:
    dfhot= oneHotEncodeColumns(df_numeric,categorical_col)
dfhot.show(5)

+----+--------+------------+------------+--------------+------+-------------+-------------+--------------+--------------+--------------+-------------+-------------+--------------+
| age|  fnlwgt|capital_gain|capital_loss|hours_per_week|income|          sex|    workclass|     education|marital_status|    occupation| relationship|         race|native_country|
+----+--------+------------+------------+--------------+------+-------------+-------------+--------------+--------------+--------------+-------------+-------------+--------------+
|39.0| 77516.0|      2174.0|         0.0|          40.0|   0.0|(2,[0],[1.0])|(8,[3],[1.0])|(16,[2],[1.0])| (7,[1],[1.0])|(14,[3],[1.0])|(6,[1],[1.0])|(5,[0],[1.0])|(41,[0],[1.0])|
|50.0| 83311.0|         0.0|         0.0|          13.0|   0.0|(2,[0],[1.0])|(8,[1],[1.0])|(16,[2],[1.0])| (7,[0],[1.0])|(14,[2],[1.0])|(6,[0],[1.0])|(5,[0],[1.0])|(41,[0],[1.0])|
|38.0|215646.0|         0.0|         0.0|          40.0|   0.0|(2,[0],[1.0])|(8,[0],[1.0])|(16,[0],[

### Create a feature vector

In [76]:
columns = dfhot.columns
columns.remove('income')

In [77]:
columns

['age',
 'fnlwgt',
 'capital_gain',
 'capital_loss',
 'hours_per_week',
 'sex',
 'workclass',
 'education',
 'marital_status',
 'occupation',
 'relationship',
 'race',
 'native_country']

In [79]:
# Merging the data with Vector Assembler.
from pyspark.ml.feature import VectorAssembler

def Vector_Assembler(df,y_column):
    columns = df.columns
    # remove y column
    columns.remove(y_column)
    va= VectorAssembler(inputCols=columns,outputCol='features').transform(df)
    lpoints = va.select("features", y_column).withColumnRenamed(y_column, "label")
    return lpoints

In [82]:
lpoints = Vector_Assembler(dfhot,'income')
lpoints.show(5)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|(104,[0,1,2,4,5,1...|  0.0|
|(104,[0,1,4,5,8,1...|  0.0|
|(104,[0,1,4,5,7,1...|  0.0|
|(104,[0,1,4,5,7,2...|  0.0|
|(104,[0,1,4,6,7,1...|  0.0|
+--------------------+-----+
only showing top 5 rows



## Divide the dataset into training and vaildation sets.

In [86]:
#Divide the dataset into training and vaildation sets.
splits = lpoints.randomSplit([.8,.2])
train = splits[0].cache()
val = splits[1].cache()

In [87]:
train.show(5)

[Stage 222:>                                                        (0 + 1) / 1]

+--------------------+-----+
|            features|label|
+--------------------+-----+
|(104,[0,1,2,4,5,7...|  1.0|
|(104,[0,1,2,4,5,7...|  0.0|
|(104,[0,1,2,4,5,7...|  1.0|
|(104,[0,1,2,4,5,7...|  0.0|
|(104,[0,1,2,4,5,7...|  0.0|
+--------------------+-----+
only showing top 5 rows



                                                                                

## Train the model.

In [117]:
#Train the model.
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(regParam=0.01, maxIter=1000, fitIntercept=True)
lrmodel = lr.fit(train)

## Interpret the model parameters.

In [118]:
#Interpret the model parameters
print(lrmodel.coefficients)
print(lrmodel.intercept)

[0.019048544353646533,6.025532035592802e-07,0.00014042478869093222,0.0005294247009284179,0.027058871790069616,0.27393702618167853,-0.2739370261816784,0.04101027175690962,-0.3778735225736142,0.021355289697829582,-0.14130834246190138,0.269750282079987,0.548608618665547,-0.9571939109863444,-1.3252952620750136,-0.3518872465611693,-0.024203278536703846,0.748561652828763,1.0891877637834393,0.11914515859647198,-0.9207895010737327,0.1723818673605788,-0.9583869592527567,-1.4067753509734287,1.6586423825886842,-1.1234445034569833,-0.773599454957589,1.741486007244763,-1.1680553749802483,-1.4263117239011553,-1.5591811623517093,0.834147021595898,-0.693788676066706,-0.29608201592777494,-0.3003503380595792,-0.2498361368241031,-0.2081314991427576,0.8910839754504425,0.1958758738380577,0.04047818607945754,0.6394337824883596,-0.017336455381927126,0.2032966569704506,-0.7635144236445873,-0.2659076636353434,-0.1075329391964485,-0.5891256912849959,-0.9023057153793905,0.5232383327771201,0.404380524340088,-1.16

In [119]:
#Evaluate models using test dataset.
val_pred = lrmodel.transform(val)

In [120]:
train_pred = lrmodel.transform(train)

In [121]:
val_pred.show(5)

+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|(104,[0,1,2,4,5,7...|  0.0|[0.69164559860821...|[0.66633289832113...|       0.0|
|(104,[0,1,2,4,5,7...|  1.0|[-13.381595448889...|[1.54328546968845...|       1.0|
|(104,[0,1,2,4,5,7...|  0.0|[1.08148766550806...|[0.74677540697372...|       0.0|
|(104,[0,1,2,4,5,7...|  1.0|[0.10351239447730...|[0.52585501676244...|       0.0|
|(104,[0,1,2,4,5,7...|  1.0|[-12.972225867524...|[2.32398263389271...|       1.0|
+--------------------+-----+--------------------+--------------------+----------+
only showing top 5 rows



## Output
rawPrediction : includes two values - log-odds that a sample doesn't and does belong to the category (making > 50,000).

probability : the probability that the sample is not in the category.

prediction : proability that the sample belongs to the category.

In [103]:
val_pred.select('rawPrediction').show(5)

+--------------------+
|       rawPrediction|
+--------------------+
|[0.69164559860821...|
|[-13.381595448889...|
|[1.08148766550806...|
|[0.10351239447730...|
|[-12.972225867524...|
+--------------------+
only showing top 5 rows



In [100]:
val_pred.select('probability').collect()

[Row(probability=DenseVector([0.6663, 0.3337])),
 Row(probability=DenseVector([0.0, 1.0])),
 Row(probability=DenseVector([0.7468, 0.2532])),
 Row(probability=DenseVector([0.5259, 0.4741])),
 Row(probability=DenseVector([0.0, 1.0])),
 Row(probability=DenseVector([0.6776, 0.3224])),
 Row(probability=DenseVector([0.5081, 0.4919])),
 Row(probability=DenseVector([0.2649, 0.7351])),
 Row(probability=DenseVector([0.4058, 0.5942])),
 Row(probability=DenseVector([0.4156, 0.5844])),
 Row(probability=DenseVector([0.5248, 0.4752])),
 Row(probability=DenseVector([0.3932, 0.6068])),
 Row(probability=DenseVector([0.7173, 0.2827])),
 Row(probability=DenseVector([0.5057, 0.4943])),
 Row(probability=DenseVector([0.8711, 0.1289])),
 Row(probability=DenseVector([0.4557, 0.5443])),
 Row(probability=DenseVector([0.2247, 0.7753])),
 Row(probability=DenseVector([0.2422, 0.7578])),
 Row(probability=DenseVector([0.2922, 0.7078])),
 Row(probability=DenseVector([0.2858, 0.7142])),
 Row(probability=DenseVector([0.

## Evaluate the model.

In [113]:
#Evaluate the model. default metric : Area Under ROC
from pyspark.ml.evaluation import BinaryClassificationEvaluator
bceval= BinaryClassificationEvaluator()
bceval.setMetricName('areaUnderROC')
print (bceval.getMetricName() +":" + str(bceval.evaluate(val_pred)))

areaUnderROC:0.9009564285505816


In [115]:
#Evaluate the model. metric : Area Under PR
bceval.setMetricName('areaUnderPR').evaluate(val_pred)

0.753519493342017

### n-fold validation and the results.

In [116]:
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.tuning import ParamGridBuilder


In [138]:
bceval= BinaryClassificationEvaluator()
bceval.setMetricName('areaUnderROC')

BinaryClassificationEvaluator_af4a2d279449

In [139]:
#ParamGridBuilder() – combinations of parameters and their values.
paramGrid = ParamGridBuilder()\
            .addGrid(lr.maxIter,[1000])\
            .addGrid(lr.regParam,[0.0001, 0.001])\
            .build()

cv = CrossValidator()\
        .setEstimator(lr)\
        .setEvaluator(bceval)\
        .setEstimatorParamMaps(paramGrid)\
        .setNumFolds(5)

In [140]:
cvmodel =cv.fit(train)

In [146]:
cv_val_pred = cvmodel.bestModel.transform(val)
bceval.setMetricName('areaUnderPR').evaluate(cv_val_pred)

0.7625897590222693

In [141]:
print(cvmodel.bestModel.getMaxIter())
print(cvmodel.bestModel.getRegParam())

1000
0.001


In [147]:
sc.stop()