In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
! pip install pyspark --quiet
! pip install plotly --quiet
! pip install chart_studio --quiet
! pip install spark_sklearn --quiet
! pip install flake8_nb --quiet

[K     |████████████████████████████████| 281.4 MB 30 kB/s 
[K     |████████████████████████████████| 198 kB 56.6 MB/s 
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 64 kB 2.3 MB/s 
[?25h  Building wheel for retrying (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 4.9 MB 5.4 MB/s 
[?25h  Building wheel for spark-sklearn (setup.py) ... [?25l[?25hdone
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
yellowbrick 1.4 requires scikit-learn>=1.0.0, but you have scikit-learn 0.19.2 which is incompatible.
imbalanced-learn 0.8.1 requires scikit-learn>=0.24, but you have scikit-learn 0.19.2 which is incompatible.[0m
[K     |████████████████████████████████| 793 kB 6.9 MB/s 
[K     |████████████████████████████████| 64 kB 1.4 MB/s 
[K     |█████████████████████████████

In [3]:
import pandas as pd
import numpy as np
import os
import sys
import random
import time
import ast
import pickle
import flake8_nb

from google.cloud import bigquery
from google.colab import auth, files
from typing import Union
from spark_sklearn import Converter

import pyspark
from pyspark.sql.window import Window
from pyspark.sql.functions import *
from pyspark.sql import SparkSession
from pyspark.ml import *
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.classification import *
from pyspark.ml.feature import *
from pyspark.ml.evaluation import BinaryClassificationEvaluator

from sklearn.dummy import DummyClassifier
from sklearn.metrics import precision_score, recall_score, roc_auc_score, f1_score
from sklearn.feature_selection import mutual_info_classif

import chart_studio.plotly as py
import plotly.graph_objs as go
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, positive=False):
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  max_n_alphas=1000, n_jobs=1, eps=np.finfo(np.float).eps,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  max_n_alphas=1

In [None]:
def pyspark_column_prep(dataset):
  """Returns column names with underscores for PySpark ready format.
     dataset: pd.DataFrame"""
  for col in dataset.columns:
    if '.' in col:
      repl = col.replace('.','_').replace(' ','_')
      dataset.rename(columns={col: repl},inplace=True)

  return dataset
  
def get_mutual_info_features(train_df, target_df, thres:float):
  MI = mutual_info_classif(train_df, target_df)
  mi_results = sorted(list(zip(MI, train_df.columns)), key=lambda x: x[0], reverse=True)

  mi_features = []
  for idx,item in enumerate(mi_results):
    val, col = item
    if val > thres:
      mi_features.append(col)
  mi_features.append('label')

  return mi_features,mi_results

#### Mutual Information (Information Gain)

For binary classification, the sklearn library provides a method to get the mutual information score with the feature and the target variable. This paper explains how mutual information could improve ROC Curve results. Source: [National Library of Medicine](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7597205/)

In [None]:
RANDOM_SEED = 655

a1_b2_dataset = pd.read_csv(
    '/content/drive/Shareddrives/SIADS - 694-695 Team Drive/datasets/model_files/A1_B2_data.csv'
)

spark_ready_X = pyspark_column_prep(a1_b2_dataset)

train = spark_ready_X[spark_ready_X.columns[~spark_ready_X.columns.isin([
                                                                         'Monetary',
                                                                         'buyers',
                                                                         'Frequency',
                                                                         'repurchasers',
                                                                         'Recency',
                                                                         'totals_transactions',
                                                                         'date',
                                                                         'fullVisitorId'
                                                                         ])]].fillna(0.0)
target = a1_b2_dataset[['totals_transactions']]
target = target.totals_transactions.fillna(0.0).apply(lambda trans: trans if trans == 0.0 else 1.0)
target = pd.Categorical(target)
target = pd.Series(target)

filter1,mi1 = get_mutual_info_features(train, target, 0.0)
filter2,_ = get_mutual_info_features(train, target, .001)
filter3,_ = get_mutual_info_features(train, target, .05)

mi1 = pd.DataFrame(mi1, columns=['mutual_info_score', 'column_name'])


## PySpark Cluster

1. Create a spark session using the local compute.
2. If you have not exited the last Spark session, it will remain open. To forcible start a new session, with new environment variables, run the newSession() method.

In [None]:
spark = SparkSession.builder \
    .master("local[*]") \
    .appName("Binary Buyer Prediction") \
    .getOrCreate()

In [None]:
spark.newSession()

### Methodology:
Transformations:
- PCA=2 was the conscensus among the group. Added to the pipeline for binary classification probability prediction for likelihood to buy. (Did not incorporate at this time.)
- Remove columns from Mutual Information step earlier
- Under sampling - using the same number of customers / visits labels
- Train(,val), test: Overall, the dataset is split into 90% train and 10% test. The train dataset will be split into a 75% training and 25% validation data because this is the behavior of the Cross Validator.

*Ethical consideration*: continued to exclude fullvisitorid because it was not useful for understanding if a customer was likely to return or not. It is better to understand what patterns, or behaviors, arise from those who are interacting with the site in order to better predict if the random visitor is a buyer and subsequently someone to target with marketing.

In [None]:
def downsampling(data):
  df = data.withColumn('rand_col', rand())
  balanced_data = df.withColumn(
      "row_num",row_number().over(Window.partitionBy("label")\
                                  .orderBy("rand_col"))).filter(col("row_num")<=label_balance.head(2)[1][1])\
                                  .drop("rand_col", "row_num")
  return balanced_data

def take_random_sample(data):
  return data.sample(True, .1, RANDOM_SEED)

In [None]:
train_spark = (spark.createDataFrame(spark_ready_X)
                    .withColumn('label', col('totals_transactions')))

In [None]:
unbalanced_df = train_spark.withColumn('label', when(col('label')==0.0, 0).otherwise(1.0))
label_balance = unbalanced_df.groupBy('label').count().withColumnRenamed('count','unbalanced_count')

downsample_df = downsampling(unbalanced_df)
random_df = take_random_sample(unbalanced_df)

dlabel_df = downsample_df.select('label').groupBy('label').count().withColumnRenamed('count','downsampling_count')
rlabel_df = random_df.select('label').groupBy('label').count().withColumnRenamed('count','randsamp_count')

(label_balance.join(dlabel_df,how='inner',on='label')
              .join(rlabel_df,how='inner',on='label')).show()

+-----+----------------+------------------+--------------+
|label|unbalanced_count|downsampling_count|randsamp_count|
+-----+----------------+------------------+--------------+
|  0.0|           49516|              5865|          4828|
|  1.0|            5865|              5865|           600|
+-----+----------------+------------------+--------------+



Models:
- Logistic Regression (baseline model since pyspark doesn't have a dummy classifier - could still compare, but it's ok for now.)
- Decision Tree (more robust model)
- Random Forest Classifier (most robust model for classification)


Metrics: Precision, Recall, AUC, and F1-Score (Harmonious Combo of Precision and Recall)

MLFlow tracking was tested in this environment, but it was not a successful strategy for the charting and continued components in the pipeline. Removed and would be considered if more time or resources avalaible.

In [None]:
X_train_unbalanced, X_test_unbalanced = unbalanced_df.randomSplit([0.9, 0.1], seed=RANDOM_SEED)
X_train_dsample, X_test_dsample = downsample_df.randomSplit([0.9, 0.1], seed=RANDOM_SEED)
X_train_random, X_test_random  = random_df.randomSplit([0.9, 0.1], seed=RANDOM_SEED)

In [6]:
def cv_model(model, train, test, param_grid=None):
  if not param_grid: param_grid = ParamGridBuilder().build()
  cols = [col for col in train.columns if col not in set(['totals_transactions','label','fullVisitorId'])]
  vec = VectorAssembler(inputCols=cols, outputCol='features', handleInvalid='skip')
  pipeline = Pipeline(stages=[vec, model])

  evaluator = BinaryClassificationEvaluator()

  crossval = CrossValidator(estimator=pipeline,
                            estimatorParamMaps=param_grid,
                            evaluator=evaluator,
                            numFolds=4) 
  # increased the number of folds from 4 to 10 after initial testing as 10, time increased drastically
  
  crossval = crossval.fit(train)
  best_model = crossval.bestModel.stages[-1]
  validation_res = crossval.transform(train)
  prediction_res = crossval.transform(test)
  
  train_eval = crossval.getEvaluator().evaluate(validation_res)
  test_eval = crossval.getEvaluator().evaluate(prediction_res)
  return best_model, train_eval, test_eval, validation_res, prediction_res
  
def save_model(model, file_loc, model_name):
  filename = file_loc + '/' + f'a1_{model_name}_model.sav'
  try:
    model.save(filename)
  except:
    model.write().overwrite().save(filename)
  return

def convert_model(spark_model):
  conv = Converter(spark)
  sklearn_model = conv.toSKLearn(spark_model)
  return sklearn_model


In [7]:
def get_clf_metrics(data:pyspark.sql.dataframe.DataFrame, true:str, pred:str):
  pred_df = data.select(true,pred).toPandas()
  recall = recall_score(pred_df['label'], pred_df['prediction'])
  f1Score = f1_score(pred_df['label'], pred_df['prediction'])
  precision = precision_score(pred_df['label'], pred_df['prediction'])
  return recall, f1Score, precision

def train_spark_models(spark_datasets, spark_models, param_grids, filters, models):

  trained_models = []
  df = pd.DataFrame()
  datasets = []

  for model in models:
    for names, data in spark_datasets.items():
      train_name, test_name = names
      X_train, X_test = data

      for filter in filters:

        if model in spark_models.keys():

          start_time = time.time()
          trained_model, train_eval, test_eval, validation, predictions = \
          cv_model(spark_models[model],X_train[filter], X_test[filter], param_grid=param_grids[model])

          try:
            best_params = trained_model._java_obj.extractParamMap()
          except:
            best_params = trained_model._java_obj.parent().extractParamMap()
          end_time = time.time()

          datasets.append((validation, predictions))
          trained_models.append(trained_model)

          train_recall, train_f1Score, train_precision = get_clf_metrics(validation,'label','prediction')
          test_recall, test_f1Score, test_precision = get_clf_metrics(predictions,'label','prediction')

          df2 = pd.DataFrame([[model, train_name, filter, len(filter), train_eval, train_recall, 
                               train_precision, train_f1Score, best_params, start_time, end_time]],
                      columns=['Model', 'Dataset', 'Column_Filter', 'Col_Num',
                               'AUC', 'Recall', 'Precision', 'F1-Score', 'Best_Params',
                               'Start_Time', 'End_Time'])
          df3 = pd.DataFrame([[model, test_name, filter, len(filter), test_eval, test_recall, 
                               test_precision, test_f1Score, best_params, start_time, end_time]],
                      columns=['Model', 'Dataset','Column_Filter', 'Col_Num',
                               'AUC', 'Recall', 'Precision', 'F1-Score', 'Best_Params',
                               'Start_Time', 'End_Time'])
          if len(df) == 0:
            df = pd.concat([df2, df3], ignore_index=True)
          else:
            df = pd.concat([df, df2, df3], ignore_index=True)

  return trained_models, df, datasets

def post_processing(df):
  for idx,param_grid in enumerate(df.Best_Params.apply(lambda x: str(x)[3:-2])):
    for param_str in (param_grid.split(',\n\t')):
      param = param_str.split(':')
      param[0] = param[0].split('-')[-1]

      if param[0] not in df.columns:
        df[param[0]] = np.nan
      if param[0] in df.columns and len(param) > 1: 
        df[param[0]].iloc[idx] = param[1]
  return df

def save_models(df, models, force:bool=False):

  for idx,model in enumerate(models):
    try:
      dataset = df.iloc[idx]['Test_Dataset'].split("_")[-1]
      col_num = df.iloc[idx]['Col_Num']
      f_name = df.iloc[idx]['Model'] + f'run{idx}_{dataset}_{col_num}_col'

      save_model(model, '../content/drive/Shareddrives/SIADS - 694-695 Team Drive/models/spark_models',
                      f_name)
    except Exception as e:
      print(f'Failed because {e};', model)
  return

#### Technical finding
CrossValidator.bestModel.stages[-1]._java_obj.extractParamMap() creates a Java Gateway member like `<py4j.java_gateway.JavaMember at 0x7f11e2b55210>`. It provides the params for the best trained PySpark model; alternatively, this could return all params (or results with a small tweak) for all the iteratively trained models. This gives the same functionality as if it were an independently trained estimator.

In [None]:
spark_datasets = {
    ('X_train_unbalanced', 'X_test_unbalanced'): (X_train_unbalanced, X_test_unbalanced),
    ('X_train_dsample', 'X_test_dsample'): (X_train_dsample, X_test_dsample),
    ('X_train_random', 'X_test_random'): (X_train_random, X_test_random)
  }

spark_models ={
    'spark_lr': LogisticRegression(maxIter=100000, featuresCol='features',labelCol='label'),
    'spark_dt': classification.DecisionTreeClassifier(featuresCol="features", labelCol="label"),
    'spark_rf': RandomForestClassifier(featuresCol="features", labelCol="label")
}

param_grids ={
    'spark_lr': ParamGridBuilder()\
    .addGrid(spark_models['spark_lr'].regParam, [1.0, 2.0]) \
    .addGrid(spark_models['spark_lr'].elasticNetParam, [0.0, 0.5, 1]) \
    .build(),
    'spark_dt': ParamGridBuilder()\
    .addGrid(spark_models['spark_dt'].maxDepth, [1, 2, 3]) \
    .addGrid(spark_models['spark_dt'].minInfoGain, [0.0, .005, .01, .1]) \
    .build(),
    'spark_rf': ParamGridBuilder()\
    .addGrid(spark_models['spark_rf'].maxDepth, [1, 2, 3])\
    .addGrid(spark_models['spark_rf'].numTrees, [10, 20, 40]) \
    .build()
}

models, df, datasets = train_spark_models(spark_datasets, spark_models, param_grids,
                                          [filter1, filter2, filter3], ['spark_lr', 'spark_dt', 'spark_rf'])


F-score is ill-defined and being set to 0.0 due to no predicted samples.


Precision is ill-defined and being set to 0.0 due to no predicted samples.



In [None]:
save_models(df, models, force=False)

Unnamed: 0,index,Model,Train_Dataset,Test_Dataset,Column_Filter,Col_Num,AUC,Recall,Precision,F1-Score,...,maxDepth,maxMemoryInMB,minInfoGain,minInstancesPerNode,minWeightFractionPerNode,seed,bootstrap,featureSubsetStrategy,numTrees,subsamplingRate
25,25,spark_dt,X_train_dsample,X_test_dsample,"['totals_pageviews', 'totals_hits', 'totals_ti...",35,0.96763,0.982206,0.945205,0.963351,...,3.0,256.0,0.005,1.0,0.0,7.882335e+18,,,,
27,27,spark_dt,X_train_dsample,X_test_dsample,"['totals_pageviews', 'totals_hits', 'totals_ti...",28,0.96763,0.982206,0.945205,0.963351,...,3.0,256.0,0.005,1.0,0.0,7.882335e+18,,,,
29,29,spark_dt,X_train_dsample,X_test_dsample,"['totals_pageviews', 'totals_hits', 'totals_ti...",7,0.969265,0.980427,0.945111,0.962445,...,3.0,256.0,0.0,1.0,0.0,7.882335e+18,,,,
47,47,spark_rf,X_train_dsample,X_test_dsample,"['totals_pageviews', 'totals_hits', 'totals_ti...",7,0.982658,0.983986,0.940476,0.961739,...,3.0,256.0,0.0,1.0,0.0,3.577578e+18,True,auto,20.0,1.0
43,43,spark_rf,X_train_dsample,X_test_dsample,"['totals_pageviews', 'totals_hits', 'totals_ti...",35,0.983552,0.985765,0.935811,0.960139,...,3.0,256.0,0.0,1.0,0.0,3.577578e+18,True,auto,20.0,1.0
45,45,spark_rf,X_train_dsample,X_test_dsample,"['totals_pageviews', 'totals_hits', 'totals_ti...",28,0.985829,0.983986,0.935702,0.959237,...,3.0,256.0,0.0,1.0,0.0,3.577578e+18,True,auto,40.0,1.0
24,24,spark_dt,X_train_dsample,X_test_dsample,"['totals_pageviews', 'totals_hits', 'totals_ti...",35,0.956112,0.978117,0.929378,0.953125,...,3.0,256.0,0.005,1.0,0.0,7.882335e+18,,,,
26,26,spark_dt,X_train_dsample,X_test_dsample,"['totals_pageviews', 'totals_hits', 'totals_ti...",28,0.956112,0.978117,0.929378,0.953125,...,3.0,256.0,0.005,1.0,0.0,7.882335e+18,,,,
28,28,spark_dt,X_train_dsample,X_test_dsample,"['totals_pageviews', 'totals_hits', 'totals_ti...",7,0.957459,0.975099,0.929175,0.951583,...,3.0,256.0,0.0,1.0,0.0,7.882335e+18,,,,
46,46,spark_rf,X_train_dsample,X_test_dsample,"['totals_pageviews', 'totals_hits', 'totals_ti...",7,0.979051,0.976608,0.918559,0.946695,...,3.0,256.0,0.0,1.0,0.0,3.577578e+18,True,auto,20.0,1.0


In [None]:
df = post_processing(df)
df = df.reset_index()
df.sort_values(['F1-Score','AUC'],ascending=False).to_csv('/content/drive/Shareddrives/SIADS - 694-695 Team Drive/results/a1_final_results.csv')

### To Load Models:
Use the calls below to select an individual model to run predicitions on it now.

In [None]:
classification.RandomForestClassificationModel.load("/content/drive/Shareddrives/SIADS - 694-695 Team Drive/models/a1_spark_rf_dsample_model.sav")
classification.DecisionTreeClassificationModel.load("/content/drive/Shareddrives/SIADS - 694-695 Team Drive/models/a1_spark_dt_dsample_model.sav")
classification.LogisticRegressionModel.load("/content/drive/Shareddrives/SIADS - 694-695 Team Drive/models/a1_spark_lr_dsample_model.sav")

### Plotting results
- Multi plot Scatter Chart with F1-Score vs AUC Score per dataset type
- 3d Scatter Plot with F1-Score, regParam, elasticNetParam
- Bar Chart: F1-Score
- Bar Chart: Recall
- Bar Chart: Precision

In [18]:
df = pd.read_csv('/content/drive/Shareddrives/SIADS - 694-695 Team Drive/results/a1_final_results.csv',index_col=0)
df = post_processing(df)
df['Dataset_Category'] = df['Dataset'].apply(lambda x: x.split('_')[-1])
df['Dataset_Type'] = df['Dataset'].apply(lambda x: x.split('_')[1])
df = df.reset_index()
print(df.columns)
df.head(2)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Index(['index', 'Model', 'Dataset', 'Column_Filter', 'Col_Num', 'AUC',
       'Recall', 'Precision', 'F1-Score', 'Best_Params', 'Start_Time',
       'End_Time', 'cacheNodeIds', 'checkpointInterval', 'featuresCol',
       'impurity', 'labelCol', 'leafCol', 'maxBins', 'maxDepth',
       'maxMemoryInMB', 'minInfoGain', 'minInstancesPerNode',
       'minWeightFractionPerNode', 'predictionCol', 'probabilityCol',
       'rawPredictionCol', 'seed', 'bootstrap', 'featureSubsetStrategy',
       'numTrees', 'subsamplingRate', 'aggregationDepth', 'elasticNetParam',
       'family', 'fitIntercept', 'maxBlockSizeInMB', 'maxIter', 'regParam',
       'standardization', 'threshold', 'tol', 'Dataset_Category',
       'Dataset_Type'],
      dtype='object')


Unnamed: 0,index,Model,Dataset,Column_Filter,Col_Num,AUC,Recall,Precision,F1-Score,Best_Params,...,family,fitIntercept,maxBlockSizeInMB,maxIter,regParam,standardization,threshold,tol,Dataset_Category,Dataset_Type
0,24,spark_dt,X_train_dsample,"['totals_pageviews', 'totals_hits', 'totals_ti...",31,0.958938,0.971009,0.933406,0.951836,{\n\tDecisionTreeClassifier_41fca20da5b7-cache...,...,,,,,,,,,dsample,train
1,26,spark_dt,X_train_dsample,"['totals_pageviews', 'totals_hits', 'totals_ti...",26,0.958938,0.971009,0.933406,0.951836,{\n\tDecisionTreeClassifier_41fca20da5b7-cache...,...,,,,,,,,,dsample,train


In [19]:
fig1 = px.scatter(df, x="AUC", y="F1-Score", facet_col="Col_Num", facet_row="Model", 
                 title="Likelihood to Buy Binary Classification", color='Dataset'
                 )
fig1.show() 

In [10]:
df[df['Model']=='spark_rf'].groupby('Model')['AUC', 'F1-Score'].mean()


Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



Unnamed: 0_level_0,AUC,F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1
spark_rf,0.979983,0.843942


In [11]:
df[(df['Model']=='spark_lr') & ~('dsample' in df['Dataset'])].groupby('Model')['AUC', 'F1-Score'].mean()


Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



Unnamed: 0_level_0,AUC,F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1
spark_lr,0.97469,0.318247


In [20]:
fig2 = px.scatter_3d(df[df['Model']=='spark_lr'], x="regParam", y="elasticNetParam", z="F1-Score",
              title="Logistic Regression Params vs F1-Score in Likelihood to Buy Binary Classification", 
              color='Dataset', symbol='Col_Num'
          )
fig2.show() 

In [23]:
fig3 = px.bar(df[df['Model']=='spark_lr'], x="index", y="F1-Score", color="Dataset_Type", text_auto='.2',
              title="Logistic Regression F1-Score in Likelihood to Buy Binary Classification", 
              facet_col="Dataset_Category", 
          )
fig3.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))
fig3.show()

In [24]:
fig4 = px.bar(df[df['Model']=='spark_lr'], x="index", y="Recall", color="Dataset_Type", text_auto='.2',
              title="Logistic Regression Recall Scores in Likelihood to Buy Binary Classification",  
              facet_col="Dataset_Category", 
          )
fig4.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))
fig4.show()

In [25]:
fig5 = px.bar(df[df['Model']=='spark_lr'], x="index", y="Precision", color="Dataset_Type", text_auto='.2',
              title="Logistic Regression Precision Scores in Likelihood to Buy Binary Classification",  
              facet_col="Dataset_Category", 
          )
fig5.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))
fig5.show()

In [35]:
!pip install kaleido

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [38]:
import kaleido
titles = ["Likelihood to Buy Binary Classification",
          "Logistic Regression Params vs F1-Score in Likelihood to Buy Binary Classification",
          "Logistic Regression F1-Score in Likelihood to Buy Binary Classification",
          "Logistic Regression Recall Scores in Likelihood to Buy Binary Classification",
          "Logistic Regression Precision Scores in Likelihood to Buy Binary Classification"]

for idx,fig in enumerate([fig1, fig2, fig3, fig4, fig5]):
    file_name = '/content/drive/Shareddrives/SIADS - 694-695 Team Drive/visualizations/' + titles[idx]
    fig.write_html(file_name + '.html')


ValueError: ignored

In [None]:
spark.sparkContext.stop()

### Conclusion
Though it was our benchmark model, the logistic regression performed worse than the other models. It was not robust, and was likely overfit for the unbalanced classes.