# Final optimizations

We will run a small grid search over the GBT classifier's `maxDepth` and `maxIter` parameters and try different undersampling ratios for the negative class as well.

In [1]:
# Imports
import findspark
findspark.init()
findspark.find()
import pyspark

In [2]:
# Imports for creating spark session
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
# Build the complete configuration BEFORE the context exists: Spark copies the
# SparkConf when the context is created, so later changes to `conf` are not
# picked up by the running context. "0" lifts the driver result-size limit.
conf = (
    SparkConf()
    .setAppName('sparkify-capstone-model')
    .setMaster('local')
    .set("spark.driver.maxResultSize", "0")
)
# getOrCreate keeps this cell re-runnable: constructing a second SparkContext
# in the same kernel raises, whereas getOrCreate returns the active one.
sc = SparkContext.getOrCreate(conf=conf)
spark = SparkSession(sc)

In [3]:
# Imports for modelling, tuning and evaluation
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder

In [4]:
# Imports for visualization and output
import matplotlib.pyplot as plt
from IPython.display import HTML, display

In [5]:
# Read in dataset
# NOTE(review): `conf` was already handed to the SparkContext in an earlier cell.
# Spark documents that a SparkConf is cloned when it is passed to Spark, so this
# call presumably does not reach the running context -- confirm, and set the
# option before the context is created if the limit really has to be lifted.
conf.set("spark.driver.maxResultSize",  "0")
path = "out/features.parquet"
# Load the Parquet data stored at `path` into a Spark DataFrame.
df = spark.read.parquet(path)

In [6]:
def createSubset(df, factor, seed = None):
    """
    Return a random subset of a dataframe.

    The subset is drawn with `df.randomSplit`, so `factor` is a sampling
    weight: the returned size is only approximately `factor` times the
    input size.

    INPUT:
        df: The dataset to split
        factor: How much of the dataset to return, between 0 and 1 (inclusive)
        seed: Optional seed forwarded to `randomSplit` for reproducible subsets
    OUTPUT:
        df_subset: The split subset
    RAISES:
        ValueError: If factor is not within [0, 1]
    """
    # Fail early with a clear message instead of passing a negative weight on.
    if not 0 <= factor <= 1:
        raise ValueError("factor must be between 0 and 1, got {}".format(factor))
    df_subset, _ = df.randomSplit([factor, 1 - factor], seed = seed)
    return df_subset

In [7]:
def printConfusionMatrix(tp, fp, tn, fn):
    """ Render a 2x2 confusion matrix as an html table in the notebook output.
    INPUT:
        tp: Number of true positives
        fp: Number of false positives
        tn: Number of true negatives
        fn: Number of false negatives
    OUTPUT:
        Nothing is returned; the table is shown via IPython's display.
        Rows are the predicted class, columns the actual class.
    """
    header_row = "<table><tr><td></td><td>Act. True</td><td>False</td></tr>"
    positive_row = "<tr><td>Pred. Pos.</td><td>{}</td><td>{}</td></tr>".format(tp, fp)
    negative_row = "<tr><td>Negative</td><td>{}</td><td>{}</td></tr>".format(fn, tn)
    markup = "".join([header_row, positive_row, negative_row, "</table>"])
    display(HTML(markup))
    
def _safeRatio(numerator, denominator):
    """ Return numerator / denominator as float, or 0.0 when the denominator is zero. """
    return float(numerator) / denominator if denominator else 0.0

def showEvaluationMetrics(predictions, labelCol = "churn"):
    """ Calculate and print some evaluation metrics for the passed predictions.
    INPUT:
        predictions: Dataframe holding the label column and a "prediction" column
        labelCol: Name of the label column (1 = positive, 0 = negative)
    OUTPUT:
        Just prints the confusion matrix, F1, accuracy, recall and precision.
        Ratios whose denominator is zero (e.g. recall without any actual
        positives) are printed as 0.0 instead of raising ZeroDivisionError.
    """
    # Count all four confusion-matrix cells with a single aggregation instead of
    # one filter + count job per cell. Keys are (label, prediction); float values
    # such as 1.0 hash and compare equal to 1, rows with other values are ignored.
    cells = {(1, 1): 0, (0, 0): 0, (0, 1): 0, (1, 0): 0}
    for row in predictions.groupBy(labelCol, "prediction").count().collect():
        key = (row[labelCol], row["prediction"])
        if key in cells:
            cells[key] += row["count"]
    tp = cells[(1, 1)]
    tn = cells[(0, 0)]
    fp = cells[(0, 1)]
    fn = cells[(1, 0)]

    printConfusionMatrix(tp, fp, tn, fn)

    # Calculate and print metrics
    f1 = MulticlassClassificationEvaluator(labelCol = labelCol, metricName = "f1") \
        .evaluate(predictions)
    accuracy = _safeRatio(tp + tn, tp + tn + fp + fn)
    recall = _safeRatio(tp, tp + fn)
    precision = _safeRatio(tp, tp + fp)
    print("F1: ", f1)
    print("Accuracy: ", accuracy)
    print("Recall: ", recall)
    print("Precision: ", precision)

In [8]:
def undersampleNegatives(df, ratio, labelCol = "churn", seed = None):
    """
    Undersample the negatives (0's) in the given dataframe by ratio.

    All positives (1's) are kept. The negatives are drawn with a random split
    (`randomSplit`), so `ratio` is a sampling weight and the number of retained
    negatives is only approximately `ratio` times their original count.

    INPUT:
        df: dataframe to undersample negatives from
        ratio: Share of the negatives to keep, between 0 and 1 (inclusive)
        labelCol: Label column name in the input dataframe
        seed: Optional seed forwarded to `randomSplit` for reproducible sampling
    OUTPUT:
        A new dataframe with negatives undersampled by ratio
    RAISES:
        ValueError: If ratio is not within [0, 1]
    """
    # Fail early with a clear message instead of passing a negative weight on.
    if not 0 <= ratio <= 1:
        raise ValueError("ratio must be between 0 and 1, got {}".format(ratio))
    zeros = df.filter(df[labelCol] == 0)
    ones = df.filter(df[labelCol] == 1)
    # Keep only the first part of the split; the remainder is discarded.
    zeros_kept, _ = zeros.randomSplit([ratio, 1 - ratio], seed = seed)
    return zeros_kept.union(ones)

In [9]:
def tv_gs_GBT(df_train, df_test, labelCol = "churn", featuresCol = "features",
              maxDepths = (2, 5), maxIters = (30, 120), trainRatio = 0.8, seed = None):
    """ Grid-search a GBT classifier with a train/validation split and evaluate it.

    Fits a GBTClassifier for every combination of maxDepths x maxIters using
    TrainValidationSplit (a single split, not k-fold cross-validation), then
    prints evaluation metrics of the best model on df_test.

    INPUT:
        df_train: Dataframe used for fitting and validating the candidates
        df_test: Dataframe the best model is evaluated on
        labelCol: Label column name
        featuresCol: Features (vector) column name
        maxDepths: Values tried for the classifier's maxDepth parameter
        maxIters: Values tried for the classifier's maxIter parameter
        trainRatio: Share of df_train used for fitting; the rest validates
        seed: Optional seed applied to the classifier and the split
    OUTPUT:
        bestParams: Dict with the 'maxDepth' and 'maxIter' of the best model
        bestModel: The best fitted model
    """
    gbt = GBTClassifier(labelCol = labelCol, featuresCol = featuresCol)

    parameterGrid = ParamGridBuilder() \
            .addGrid(gbt.maxDepth, list(maxDepths)) \
            .addGrid(gbt.maxIter, list(maxIters)) \
            .build()

    tv_split = TrainValidationSplit(estimator = gbt,
                          estimatorParamMaps = parameterGrid,
                          evaluator = MulticlassClassificationEvaluator(labelCol = labelCol),
                          trainRatio = trainRatio)

    # Only touch the seeds when one was requested, so the default behaviour
    # (library-chosen seeds) stays exactly as before.
    if seed is not None:
        gbt.setSeed(seed)
        tv_split.setSeed(seed)

    model = tv_split.fit(df_train)
    bestModel = model.bestModel

    # The winning values are read from the underlying Java model object.
    bestParams = {'maxDepth':bestModel._java_obj.getMaxDepth(), \
                  'maxIter':bestModel._java_obj.getMaxIter()}

    predictions = model.transform(df_test)

    # showEvaluationMetrics is called without a label column, so the printed
    # report is computed against that function's own default label column.
    showEvaluationMetrics(predictions)

    return bestParams, bestModel

# Take a small random subset (split weight 0.1) of the data; the grid search in the
# following cells runs on it. The search validates on a single train/validation
# split (TrainValidationSplit), not on k-fold cross-validation.
df_subset = createSubset(df, .1)

In [10]:
# Undersampling ratio 0.6: keep a 0.6 share of the negative (churn == 0) rows, all positives stay
df_undersampled = undersampleNegatives(df_subset, .6)
# Random split with weights 0.9 / 0.1 into training and test data
df_train, df_test = df_undersampled.randomSplit([0.9, 0.1])
# Grid-search the GBT model on the training data; metrics are printed for the test data
gbt_bestParams, gbt_bestModel = tv_gs_GBT(df_train, df_test)
print("Best set of parameters: ", gbt_bestParams)

0,1,2
,Act. True,False
Pred. Pos.,1012,3
Negative,7,2553


F1:  0.9972011402692175
Accuracy:  0.9972027972027973
Recall:  0.9931305201177625
Precision:  0.9970443349753695
Best set of parameters:  {'maxDepth': 5, 'maxIter': 120}


In [11]:
# Undersampling ratio 0.5: keep a 0.5 share of the negative (churn == 0) rows, all positives stay
df_undersampled = undersampleNegatives(df_subset, .5)
# Random split with weights 0.9 / 0.1 into training and test data
df_train, df_test = df_undersampled.randomSplit([0.9, 0.1])
# Grid-search the GBT model on the training data; metrics are printed for the test data
gbt_bestParams, gbt_bestModel = tv_gs_GBT(df_train, df_test)
print("Best set of parameters: ", gbt_bestParams)

0,1,2
,Act. True,False
Pred. Pos.,953,1
Negative,5,2043


F1:  0.998000218654461
Accuracy:  0.9980013324450366
Recall:  0.9947807933194155
Precision:  0.9989517819706499
Best set of parameters:  {'maxDepth': 5, 'maxIter': 120}


In [12]:
# Output the notebook to an html file
import sys
from subprocess import call
# Use the interpreter running this kernel instead of whatever 'python' resolves
# to on PATH (which may be a different environment or missing entirely), and
# request the html format explicitly. The cell shows nbconvert's exit code.
call([sys.executable, '-m', 'nbconvert', '--to', 'html', 'optimize.ipynb'])

0