# Web predictions

The purpose of this notebook is to experiment with making predictions from "raw" accumulated user values,
such as values a user might submit through a web form.

In [1]:
import findspark
# Must run before `import pyspark` below: per the findspark README, init() is
# what makes the local Spark installation's pyspark package importable.
findspark.init()
# NOTE(review): the return value of find() is discarded here (it is not the last
# expression of the cell), so this call looks removable — confirm.
findspark.find()

import pyspark
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
# Build (or reuse) the local Spark session for this notebook.
# getOrCreate() keeps this cell safe to re-run in the same kernel: constructing
# SparkContext(conf=conf) directly raises "Cannot run multiple SparkContexts at
# once" on a second execution.
conf = SparkConf().setAppName('sparkify-capstone-web').setMaster('local')
spark = SparkSession.builder.config(conf=conf).getOrCreate()
# Keep `sc` available for any cell that uses the underlying SparkContext.
sc = spark.sparkContext

from pyspark.ml.classification import GBTClassifier
from pyspark.ml.classification import GBTClassificationModel
from pyspark.ml.feature import VectorAssembler

In [2]:
# Input locations, relative to the notebook's working directory.
transformedPath = "out/transformed.parquet"
predictionsPath = "out/predictions.parquet"

# Read both parquet datasets, in the same order as the paths above.
df_transformed, df_predictions = (
    spark.read.parquet(parquet_path)
    for parquet_path in (transformedPath, predictionsPath)
)

# Load the saved gradient-boosted-tree classification model used for scoring below.
model = GBTClassificationModel.load("out/model")

In [3]:
# Split the stored predictions by predicted class and count each side.
zeros = df_predictions.filter(df_predictions["prediction"] == 0)
ones = df_predictions.filter(df_predictions["prediction"] == 1)
zerosCount = zeros.count()
onesCount = ones.count()
print("Ones: {}, Zeros: {}".format(onesCount, zerosCount))

# Ones expressed as a percentage of zeros (NOT as a share of all users).
# Guard the division: a prediction set without any zeros would otherwise
# abort the cell with ZeroDivisionError.
if zerosCount:
    print(onesCount / zerosCount * 100)
else:
    print("n/a (no users with prediction == 0)")

Ones: 93, Zeros: 355
26.197183098591548


In [3]:
# Collect a small sample (at most five rows) of users whose stored prediction is 1.
churn_flagged = df_predictions.where(df_predictions["prediction"] == 1)
usersPredictedToChurn = churn_flagged.limit(5).collect()

In [4]:
# Print one userId per line; the value is cast to int so it prints without a decimal part.
for churn_row in usersPredictedToChurn:
    user_id = int(churn_row["userId"])
    print(user_id)

85
296
100003
200021
100042


In [5]:
# Preview the transformed feature table (first 20 rows, long cell values truncated).
# NOTE(review): the recorded output repeats the identical userId 93 row several
# times — check whether the upstream transform produces duplicate rows.
df_transformed.show(n=20, truncate=True)

+------+-----+-----------+------------+-------------+---------------+------------+-------------+-----------------+---------------+
|userId|churn|level_index|gender_index|thumbs_up_sum|thumbs_down_sum|nextsong_sum|downgrade_sum|       length_sum|sessionId_count|
+------+-----+-----------+------------+-------------+---------------+------------+-------------+-----------------+---------------+
|    93|    0|        0.0|         0.0|           90|             12|        1628|           16|412840.6919200001|             16|
|    93|    0|        0.0|         0.0|           90|             12|        1628|           16|412840.6919200001|             16|
|    93|    0|        0.0|         0.0|           90|             12|        1628|           16|412840.6919200001|             16|
|    93|    0|        0.0|         0.0|           90|             12|        1628|           16|412840.6919200001|             16|
|    93|    0|        0.0|         0.0|           90|             12|        1628| 

In [6]:
# Preview the stored predictions (first 20 rows).
# In the recorded output both columns are rendered as floating-point values (e.g. 148.0).
df_predictions.show(n=20, truncate=True)

+----------+--------+
|prediction|  userId|
+----------+--------+
|       0.0|   148.0|
|       0.0|200049.0|
|       0.0|300040.0|
|       1.0|    85.0|
|       0.0|   137.0|
|       0.0|   251.0|
|       0.0|200031.0|
|       0.0|300044.0|
|       0.0|    65.0|
|       0.0|200001.0|
|       0.0|    53.0|
|       0.0|   255.0|
|       0.0|   133.0|
|       1.0|   296.0|
|       1.0|100003.0|
|       1.0|200021.0|
|       0.0|    78.0|
|       0.0|100007.0|
|       1.0|100042.0|
|       0.0|100035.0|
+----------+--------+
only showing top 20 rows



In [7]:
# Scratch notes kept from the original cell, presumably "<prediction> <userId>"
# pairs from an earlier run: 1 300044, 0 251.
# NOTE(review): the df_predictions output recorded in this notebook lists
# userId 300044 with prediction 0.0 — confirm which is current.


def lookup_prediction(predictions_df, user_id):
    """Return the stored prediction for ``user_id``, or ``None`` if there is no row.

    Args:
        predictions_df: DataFrame with at least ``userId`` and ``prediction`` columns.
        user_id: Value compared against the ``userId`` column.

    Returns:
        The ``prediction`` value of the first matching row, or ``None`` when no
        row matches (instead of failing with a bare IndexError).
    """
    # first() fetches a single row rather than collecting every match.
    match = (
        predictions_df
        .filter(predictions_df["userId"] == user_id)
        .select("prediction")
        .first()
    )
    return None if match is None else match["prediction"]


# Select the prediction of a user as value
pred = lookup_prediction(df_predictions, 78)

In [8]:
# Display the prediction value selected in the previous cell.
pred

0.0

In [9]:
# From a query that could be entered in a web form, create a prediction

# Ordered (name, converter) spec for the comma-separated query: the value at
# position i is parsed with the i-th converter and stored under the i-th name.
FEATURE_FIELDS = (
    ("level_index", float),
    ("gender_index", float),
    ("thumbs_up_sum", int),
    ("thumbs_down_sum", int),
    ("nextsong_sum", int),
    ("downgrade_sum", int),
    ("length_sum", float),
    ("sessionId_count", int),
)


def parse_feature_query(query, fields=FEATURE_FIELDS):
    """Parse a comma-separated feature query into a ``{name: value}`` dict.

    Args:
        query: String of comma-separated values, one per entry in ``fields``.
        fields: Ordered ``(name, converter)`` pairs describing each position.

    Returns:
        Dict mapping each field name to its converted value, in ``fields`` order.

    Raises:
        ValueError: If the number of values does not match ``fields`` (too few
            would otherwise surface as an opaque IndexError, too many would be
            silently ignored), or if a value cannot be converted.
    """
    raw_values = query.split(",")
    if len(raw_values) != len(fields):
        raise ValueError(
            "Expected {} comma-separated values, got {}".format(len(fields), len(raw_values))
        )

    parsed = {}
    for (name, convert), raw in zip(fields, raw_values):
        try:
            parsed[name] = convert(raw)
        except ValueError as exc:
            # Re-raise with the offending field so a web caller can report it.
            raise ValueError(
                "Invalid value {!r} for {}: expected {}".format(raw, name, convert.__name__)
            ) from exc
    return parsed


# Query from web
query = "1.0,0.0,10,4,307,0,76200,10"

# Split to values
values = query.split(",")

# Prepare dictionary for feature dataframe from web form values
# (a one-element list: one dict per row of the DataFrame created from it).
features_dict = [parse_feature_query(query)]

# Wrap the parsed form values in a one-row DataFrame so VectorAssembler can consume them.
df_user_row = spark.createDataFrame(features_dict)

# The order of these columns defines the layout of the assembled feature vector.
# NOTE(review): presumably this must match the order used when the model was trained — confirm.
feature_columns = [
    "level_index",
    "gender_index",
    "thumbs_up_sum",
    "thumbs_down_sum",
    "nextsong_sum",
    "downgrade_sum",
    "length_sum",
    "sessionId_count",
]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

# Assemble the feature vector and keep only the "features" column.
df_features = assembler.transform(df_user_row).select("features")

# Score the single row with the loaded model.
prediction = model.transform(df_features)



In [10]:
# Show the scored row; long cell values (features, rawPrediction, probability)
# are truncated in the printed table.
prediction.show(n=20, truncate=True)

+--------------------+--------------------+--------------------+----------+
|            features|       rawPrediction|         probability|prediction|
+--------------------+--------------------+--------------------+----------+
|[1.0,0.0,10.0,4.0...|[1.59711984191447...|[0.96061692995759...|       0.0|
+--------------------+--------------------+--------------------+----------+



In [11]:
# Pull the predicted label out of the first row as a plain Python value.
predicted_rows = prediction.select("prediction").collect()
predicted_rows[0]["prediction"]

0.0

In [13]:
# Output the notebook to an html file.
# nbconvert is given the notebook's file name, so it converts the copy on disk:
# save the notebook before running this cell.
import sys
from subprocess import call

# Run nbconvert with the kernel's own interpreter: a bare 'python' is resolved via
# PATH and may point at a different environment that does not have nbconvert.
# The cell's value is the process return code (0 on success).
call([sys.executable, '-m', 'nbconvert', '--to', 'html', 'web_pred.ipynb'])

0