In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *

ss = SparkSession.builder.getOrCreate()
sc = ss.sparkContext

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/03/01 02:08:52 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
def toDoubleSafe(v):
    try:
        return float(v)
    except ValueError:
        return str(v) #if it is not a float type return as a string.

## Create an RDD

In [3]:
#load and convert the data
census_raw = sc.textFile("../Data/adult.raw").map(lambda x : x.split(",")).map(lambda x : [toDoubleSafe(y) for y in x])


In [4]:
census_raw.first()

                                                                                

[39.0,
 ' State-gov',
 77516.0,
 ' Bachelors',
 ' Never-married',
 ' Adm-clerical',
 ' Not-in-family',
 ' White',
 ' Male',
 2174.0,
 0.0,
 40.0,
 ' United-States',
 ' <=50K']

## Convert the RDD to DataFrame.


In [5]:
from pyspark.sql.types import *
adultschema = StructType([
    StructField("age",DoubleType(),True),
    StructField("workclass",StringType(),True),
    StructField("fnlwgt",DoubleType(),True),
    StructField("education",StringType(),True),
    StructField("marital_status",StringType(),True),
    StructField("occupation",StringType(),True),
    StructField("relationship",StringType(),True),
    StructField("race",StringType(),True),
    StructField("sex",StringType(),True),
    StructField("capital_gain",DoubleType(),True),
    StructField("capital_loss",DoubleType(),True),
    StructField("hours_per_week",DoubleType(),True),
    StructField("native_country",StringType(),True),
    StructField("income",StringType(),True)
])


In [6]:
dfraw = ss.createDataFrame(census_raw, adultschema)

In [7]:
dfraw.show()

+----+-----------------+--------+-------------+--------------------+------------------+--------------+-------------------+-------+------------+------------+--------------+--------------+------+
| age|        workclass|  fnlwgt|    education|      marital_status|        occupation|  relationship|               race|    sex|capital_gain|capital_loss|hours_per_week|native_country|income|
+----+-----------------+--------+-------------+--------------------+------------------+--------------+-------------------+-------+------------+------------+--------------+--------------+------+
|39.0|        State-gov| 77516.0|    Bachelors|       Never-married|      Adm-clerical| Not-in-family|              White|   Male|      2174.0|         0.0|          40.0| United-States| <=50K|
|50.0| Self-emp-not-inc| 83311.0|    Bachelors|  Married-civ-spouse|   Exec-managerial|       Husband|              White|   Male|         0.0|         0.0|          13.0| United-States| <=50K|
|38.0|          Private|215646

## Clean the data. 

### Missing data imputation.


In [8]:
## Missing data imputation - Impute the most common row for "?".
# workclass : Private, occupation: Prof-specialty, native_country: United-States has missing values
dfraw.groupBy('occupation').count().orderBy('count', ascending=False).show(1) 

[Stage 2:>                                                          (0 + 2) / 2]

+---------------+-----+
|     occupation|count|
+---------------+-----+
| Prof-specialty| 6172|
+---------------+-----+
only showing top 1 row



                                                                                

In [9]:
#Missing data imputation - Impute the most common row for "?".
dfrawrp = dfraw.replace(["?"], ["Private"], ["workclass"])
dfrawrpl = dfrawrp.replace(["?"], ["Prof-specialty"], ["occupation"])
dfrawnona = dfrawrpl.replace(["?"], ["United-States"], ["native_country"])

In [10]:
dfrawnona.show()

+----+-----------------+--------+-------------+--------------------+------------------+--------------+-------------------+-------+------------+------------+--------------+--------------+------+
| age|        workclass|  fnlwgt|    education|      marital_status|        occupation|  relationship|               race|    sex|capital_gain|capital_loss|hours_per_week|native_country|income|
+----+-----------------+--------+-------------+--------------------+------------------+--------------+-------------------+-------+------------+------------+--------------+--------------+------+
|39.0|        State-gov| 77516.0|    Bachelors|       Never-married|      Adm-clerical| Not-in-family|              White|   Male|      2174.0|         0.0|          40.0| United-States| <=50K|
|50.0| Self-emp-not-inc| 83311.0|    Bachelors|  Married-civ-spouse|   Exec-managerial|       Husband|              White|   Male|         0.0|         0.0|          13.0| United-States| <=50K|
|38.0|          Private|215646

### Convert strings to categorical values

In [11]:
#converting strings to numeric values
from pyspark.ml.feature import StringIndexer

def indexStringColumns(df, cols):
    newdf = df
    
    for c in cols:
        newdf = StringIndexer(inputCol=c, outputCol=c+"-num")\
                    .fit(newdf)\
                    .transform(newdf)\
                    .drop(c).withColumnRenamed(c+"-num", c)
        
        
    return newdf

In [12]:
categorical_col = ["income", "sex", "workclass", "education", "marital_status", "occupation", "relationship", "race", "native_country"]
df_numeric = indexStringColumns(dfrawnona, categorical_col)


                                                                                

In [13]:
df_numeric.show()

+----+--------+------------+------------+--------------+------+---+---------+---------+--------------+----------+------------+----+--------------+
| age|  fnlwgt|capital_gain|capital_loss|hours_per_week|income|sex|workclass|education|marital_status|occupation|relationship|race|native_country|
+----+--------+------------+------------+--------------+------+---+---------+---------+--------------+----------+------------+----+--------------+
|39.0| 77516.0|      2174.0|         0.0|          40.0|   0.0|0.0|      4.0|      2.0|           1.0|       3.0|         1.0| 0.0|           0.0|
|50.0| 83311.0|         0.0|         0.0|          13.0|   0.0|0.0|      1.0|      2.0|           0.0|       2.0|         0.0| 0.0|           0.0|
|38.0|215646.0|         0.0|         0.0|          40.0|   0.0|0.0|      0.0|      0.0|           2.0|       9.0|         1.0| 0.0|           0.0|
|53.0|234721.0|         0.0|         0.0|          40.0|   0.0|0.0|      0.0|      5.0|           0.0|       9.0|     

In [14]:
categorical_col

['income',
 'sex',
 'workclass',
 'education',
 'marital_status',
 'occupation',
 'relationship',
 'race',
 'native_country']

In [15]:
from pyspark.ml.feature import OneHotEncoder

def oneHotEncodeColumns(df, cols):
    newdf = df
    for c in cols: 
        ohe = OneHotEncoder(inputCol=c, outputCol=c+"-onehot", dropLast=False)
        ohe_model = ohe.fit(newdf)
        
        newdf = ohe_model.transform(newdf).drop(c).withColumnRenamed(c+"-onehot", c)
        
    return newdf

categorical_col.remove("income")
dfhot = oneHotEncodeColumns(df_numeric, categorical_col)

In [16]:
dfhot.show()

+----+--------+------------+------------+--------------+------+-------------+-------------+---------------+--------------+---------------+-------------+-------------+---------------+
| age|  fnlwgt|capital_gain|capital_loss|hours_per_week|income|          sex|    workclass|      education|marital_status|     occupation| relationship|         race| native_country|
+----+--------+------------+------------+--------------+------+-------------+-------------+---------------+--------------+---------------+-------------+-------------+---------------+
|39.0| 77516.0|      2174.0|         0.0|          40.0|   0.0|(2,[0],[1.0])|(9,[4],[1.0])| (16,[2],[1.0])| (7,[1],[1.0])| (15,[3],[1.0])|(6,[1],[1.0])|(5,[0],[1.0])| (42,[0],[1.0])|
|50.0| 83311.0|         0.0|         0.0|          13.0|   0.0|(2,[0],[1.0])|(9,[1],[1.0])| (16,[2],[1.0])| (7,[0],[1.0])| (15,[2],[1.0])|(6,[0],[1.0])|(5,[0],[1.0])| (42,[0],[1.0])|
|38.0|215646.0|         0.0|         0.0|          40.0|   0.0|(2,[0],[1.0])|(9,[0],[

### Create a feature vector

In [17]:
# Merging the data with Vector Assembler.
from pyspark.ml.feature import VectorAssembler

input_cols=["age","capital_gain","capital_loss","fnlwgt","hours_per_week","sex","workclass",
            "education","marital_status","occupation","relationship","native_country","race"]

va = VectorAssembler(inputCols=input_cols, outputCol="features").transform(dfhot)

In [18]:
lpoints = va.select("features", "income").withColumnRenamed("income", "label")

In [19]:
lpoints.show()

+--------------------+-----+
|            features|label|
+--------------------+-----+
|(107,[0,1,3,4,5,1...|  0.0|
|(107,[0,3,4,5,8,1...|  0.0|
|(107,[0,3,4,5,7,1...|  0.0|
|(107,[0,3,4,5,7,2...|  0.0|
|(107,[0,3,4,6,7,1...|  0.0|
|(107,[0,3,4,6,7,1...|  0.0|
|(107,[0,3,4,6,7,2...|  0.0|
|(107,[0,3,4,5,8,1...|  1.0|
|(107,[0,1,3,4,6,7...|  1.0|
|(107,[0,1,3,4,5,7...|  1.0|
|(107,[0,3,4,5,7,1...|  1.0|
|(107,[0,3,4,5,11,...|  1.0|
|(107,[0,3,4,6,7,1...|  0.0|
|(107,[0,3,4,5,7,2...|  0.0|
|(107,[0,3,4,5,7,2...|  1.0|
|(107,[0,3,4,5,7,2...|  0.0|
|(107,[0,3,4,5,8,1...|  0.0|
|(107,[0,3,4,5,7,1...|  0.0|
|(107,[0,3,4,5,7,2...|  0.0|
|(107,[0,3,4,6,8,1...|  1.0|
+--------------------+-----+
only showing top 20 rows



22/03/01 02:09:35 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


## Divide the dataset into training and vaildation sets.

In [20]:
#Divide the dataset into training and vaildation sets.
splits = lpoints.randomSplit([0.8, 0.2])
adulttrain = splits[0].cache()
adultvalid = splits[1].cache()

## Train the model.

In [None]:
#Train the model.
from pyspark.ml.classification import LogisticRegression


## Interpret the model parameters.

In [None]:
#Interpret the model parameters
print(lrmodel.coefficients)
print(lrmodel.intercept)

In [None]:
#Evaluate models using test dataset.


## Output
rawPrediction : includes two values - log-odds that a sample doesn't and does belong to the category (making > 50,000).

probability : the probability that the sample is not in the category.

prediction : proability that the sample belongs to the category.

## Evaluate the model.

In [None]:
#Evaluate the model. default metric : Area Under ROC
from pyspark.ml.evaluation import BinaryClassificationEvaluator


In [None]:
#Evaluate the model. metric : Area Under PR


### n-fold validation and the results.

In [None]:
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.tuning import ParamGridBuilder
