# Final model

Train the best model on the bigger dataset and evaluate once more.

In [1]:
# Imports
import findspark
findspark.init()
findspark.find()
import pyspark

In [2]:
# Imports for creating spark session
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
# Build the complete configuration before the context exists: Spark clones the
# conf when the SparkContext is created, so settings added afterwards are not
# picked up. "0" for spark.driver.maxResultSize means "no limit".
conf = (
    pyspark.SparkConf()
    .setAppName('sparkify-capstone-model')
    .setMaster('local')
    .set("spark.driver.maxResultSize", "0")
)
sc = pyspark.SparkContext(conf=conf)
spark = SparkSession(sc)

In [3]:
# Imports for modelling, tuning and evaluation
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder

In [4]:
# Imports for visualization and output
import matplotlib.pyplot as plt
from IPython.display import HTML, display

In [5]:
# Read in the prepared feature dataset (parquet) that the model is trained on
# NOTE(review): the SparkContext was already created from `conf` above, and per
# the SparkConf documentation a conf is cloned once it is passed to Spark, so
# this late set() presumably has no effect on the running driver - confirm, and
# apply the setting where the conf is built if the limit really must be lifted.
conf.set("spark.driver.maxResultSize",  "0")
path = "out/features.parquet"
df = spark.read.parquet(path)

In [6]:
def createSubset(df, factor):
    """ Return a random part of the passed dataset.
    INPUT:
        df: The dataset to take the subset from
        factor: Weight of the part to keep; the remainder gets weight 1 - factor
    OUTPUT:
        The first of the two parts produced by df.randomSplit
    """
    weights = [factor, 1 - factor]
    # Only the first part is of interest, the remainder is thrown away
    kept, _ = df.randomSplit(weights)
    return kept

In [7]:
def printConfusionMatrix(tp, fp, tn, fn):
    """ Show the four confusion matrix counts as an html table.
    INPUT:
        tp: Number of true positives
        fp: Number of false positives
        tn: Number of true negatives
        fn: Number of false negatives
    OUTPUT:
        Nothing is returned, the table is rendered with IPython's display.
    """
    # Columns are the actual classes, rows are the predicted classes
    rows = [
        "<table><tr><td></td><td>Act. True</td><td>False</td></tr>",
        "<tr><td>Pred. Pos.</td><td>{}</td><td>{}</td></tr>".format(tp, fp),
        "<tr><td>Negative</td><td>{}</td><td>{}</td></tr>".format(fn, tn),
        "</table>",
    ]
    display(HTML("".join(rows)))
    
def _safeRatio(numerator, denominator):
    """ Divide two counts, yielding 0.0 instead of raising when the denominator is 0. """
    if not denominator:
        return 0.0
    return float(numerator / denominator)


def showEvaluationMetrics(predictions):
    """ Calculate and print some evaluation metrics for the passed predictions.
    INPUT:
        predictions: Dataframe with the true label in column "churn" and the
                     predicted label in column "prediction"
    OUTPUT:
        Nothing is returned. Shows the confusion matrix and prints F1, accuracy,
        recall and precision. A metric whose denominator is zero (e.g. precision
        when nothing was predicted positive) is reported as 0.0.
    """
    # Count true/false positives/negatives; each count() runs its own Spark job
    tp = predictions[(predictions.churn == 1) & (predictions.prediction == 1)].count()
    tn = predictions[(predictions.churn == 0) & (predictions.prediction == 0)].count()
    fp = predictions[(predictions.churn == 0) & (predictions.prediction == 1)].count()
    fn = predictions[(predictions.churn == 1) & (predictions.prediction == 0)].count()

    printConfusionMatrix(tp, fp, tn, fn)

    # Calculate and print metrics
    f1 = MulticlassClassificationEvaluator(labelCol = "churn", metricName = "f1") \
        .evaluate(predictions)
    # The ratios are guarded so an empty class does not abort the evaluation
    # with a ZeroDivisionError after the (long) training has already finished.
    accuracy = _safeRatio(tp + tn, tp + tn + fp + fn)
    recall = _safeRatio(tp, tp + fn)
    precision = _safeRatio(tp, tp + fp)
    print("F1: ", f1)
    print("Accuracy: ", accuracy)
    print("Recall: ", recall)
    print("Precision: ", precision)
    
def printAUC(predictions, labelCol = "churn"):
    """ Print the area under curve for the predictions.
    INPUT:
        predictions: The predictions to get and print the AUC for
        labelCol: Name of the column holding the true label, "churn" by default
    OUTPUT:
        Nothing is returned, the value is printed. The metric is whatever
        BinaryClassificationEvaluator computes with its default settings.
    """
    evaluator = BinaryClassificationEvaluator(labelCol = labelCol)
    auc = evaluator.evaluate(predictions)
    print("Area under curve: ", auc)

In [8]:
def undersampleNegatives(df, ratio, labelCol = "churn"):
    """
    Undersample the negatives (0's) in the given dataframe by ratio.

    The negatives to keep are picked by createSubset, i.e. by a random split
    with weights ratio / 1 - ratio. All positives (1's) are kept.

    INPUT:
        df: Dataframe to undersample negatives from
        ratio: Undersampling ratio, the weight of the negatives that are kept
        labelCol: Label column name in the input dataframe
    OUTPUT:
        A new dataframe: the kept negatives followed (union) by all positives
    """
    label = df[labelCol]
    negatives = df.filter(label == 0)
    positives = df.filter(label == 1)
    kept_negatives = createSubset(negatives, ratio)
    return kept_negatives.union(positives)

In [9]:
def gbtPredictions(df_train, df_test, maxIter = 10, labelCol = "churn", featuresCol = "features"):
    """ Fit a GBTClassifier on the training data and predict the test data.
    INPUT:
        df_train: The training data set.
        df_test: The testing data set.
        maxIter: Maximum number of iterations of the gradient boost.
        labelCol: The label column name, "churn" by default.
        featuresCol: The features column name, "features" by default.
    OUTPUT:
        predictions: df_test transformed by the fitted model
    """
    # Configure the classifier, then fit it to get the actual model
    classifier = GBTClassifier(labelCol = labelCol, featuresCol = featuresCol, maxIter = maxIter)
    model = classifier.fit(df_train)
    predictions = model.transform(df_test)
    return predictions

In [None]:
# Split the data with weights 0.9 / 0.1 into a training and a test part
df_train, df_test = df.randomSplit([0.9, 0.1])

# Fit the final model on the training part with its negatives undersampled (ratio .7)
df_train_undersampled = undersampleNegatives(df_train, .7)
classifier = GBTClassifier(labelCol = "churn", featuresCol = "features", maxIter = 120, maxDepth = 5)
gbt = classifier.fit(df_train_undersampled)

# Predict the test part, which was not undersampled, and report the metrics
predictions = gbt.transform(df_test)
showEvaluationMetrics(predictions)
printAUC(predictions)

In [None]:
# Persist the fitted model. A plain save() fails when the target path already
# exists, so replace a model left behind by an earlier run of this notebook
# instead of crashing after the training has finished.
gbt.write().overwrite().save("out/model")

In [None]:
# Output the notebook to an html file
import sys
from subprocess import call

# Run nbconvert with the interpreter of this kernel so it is resolved from the
# same environment, and name the output format explicitly instead of relying
# on nbconvert's default.
exit_code = call([sys.executable, '-m', 'nbconvert', '--to', 'html', 'final-model.ipynb'])
if exit_code != 0:
    # Make a failed export visible instead of silently ignoring it
    print("nbconvert failed with exit code", exit_code)