In [1]:
#1. Build a Classification Model

#In this exercise, you will fit a binary logistic regression model to the baby name dataset you used in the previous exercise. This model will predict the sex of a person based on their age, name, and state they were born in. To train the model, you will use the data found in baby-names/names-classifier.

#a. Prepare the Input Features


#First, you will need to prepare each of the input features. While age is a numeric feature, state and name are not. These need to be converted into numeric vectors before you can train the model. Use a StringIndexer along with the OneHotEncoderEstimator to convert the name, state, and sex columns into numeric vectors. Use the VectorAssembler to combine the name, state, and age vectors into a single features vector. Your final dataset should contain a column called features containing the prepared vector and a column called label containing the sex of the person.

In [2]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler
from pyspark.ml.classification import LogisticRegression

In [3]:
# Obtain a Spark session for this notebook (an existing one is reused if present).
spark = (
    SparkSession.builder
    .appName('week10')
    .getOrCreate()
)

# Load the names-classifier dataset from its Parquet location.
df = spark.read.parquet(r"/FileStore/tables/namesclassifier")

In [4]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler

In [5]:
# Columns that each get a StringIndexer -> OneHotEncoderEstimator pair.
categoricalColumns = ["state", "name", "sex"]
# Numeric columns passed to the assembler as-is.
numericCols = ["age"]

# Ordered list of pipeline stages; consumed by the Pipeline in the next cell.
transform = []
for categoricalCol in categoricalColumns:
    # <col> -> <col>Index
    stringIndexer = StringIndexer(
        inputCol=categoricalCol,
        outputCol="{}Index".format(categoricalCol),
    )
    # <col>Index -> <col>classVec
    encoder = OneHotEncoderEstimator(
        inputCols=[stringIndexer.getOutputCol()],
        outputCols=["{}classVec".format(categoricalCol)],
    )
    transform.extend([stringIndexer, encoder])

# The target column: "sex" indexed into "label".
label_stringIdx = StringIndexer(inputCol="sex", outputCol="label")
transform.append(label_stringIdx)

# Only the state and name vectors plus the numeric columns go into "features";
# "sexclassVec" is produced above but deliberately not listed here.
assemblerInputs = ["stateclassVec", "nameclassVec"]
assemblerInputs.extend(numericCols)
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
transform.append(assembler)

In [6]:
# Bundle the feature-preparation stages into a single pipeline.
partialPipeline = Pipeline(stages=transform)

# Fit the stages on the dataset, then apply them to that same dataset to add
# the stage output columns (including "features" and "label").
pipelineModel = partialPipeline.fit(df)
transformedDataDF = pipelineModel.transform(df)

In [7]:
# Preview the prepared data (the defaults spelled out: first 20 rows, long values truncated).
transformedDataDF.show(n=20, truncate=True)

In [8]:
#2. Fit and Evaluate the Model

#Fit a logistic regression model with the following parameters: LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8). Provide the area under the ROC curve for the model.

In [9]:
import matplotlib.pyplot as plt

In [10]:
# Logistic regression with the hyper-parameters given in the exercise.
# featuresCol/labelCol are the estimator's defaults, written out for clarity.
lr = LogisticRegression(
    featuresCol="features",
    labelCol="label",
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8,
)

# Train on the prepared dataset.
lrModel = lr.fit(transformedDataDF)

In [11]:
# Report the fitted model parameters; sep="" keeps the output identical to
# plain string concatenation.
print("Coefficients: ", lrModel.coefficients, sep="")
print("Intercept: ", lrModel.intercept, sep="")

In [12]:
# Training summary of the fitted model; also used below to report the AUC.
trainingSummary = lrModel.summary

# Pull the ROC points into pandas for plotting (columns 'FPR' and 'TPR').
roc = trainingSummary.roc.toPandas()

# Plot TPR against FPR. The x-axis carries FPR and the y-axis TPR, so the
# labels must match that order (they were previously swapped).
fig, ax = plt.subplots()
ax.plot(roc['FPR'], roc['TPR'])
ax.set_xlabel('FPR')
ax.set_ylabel('TPR')
ax.set_title('ROC')
plt.show()

In [13]:
# Area under the ROC curve from the training summary (print applies str() itself).
print(trainingSummary.areaUnderROC)