# PySpark Logistic Regression

Our full dataset file has around 9 million samples. The notes below record the environment used when trying to run feature_generator on it.

Machine:
* 2018 Mac Mini - 6 core

Docker Configuration:
* 9 CPUs
* 24 GB RAM
* 3 GB swap


# References:

* https://spark.apache.org/docs/2.2.0/mllib-feature-extraction.html
* https://towardsdatascience.com/countvectorizer-hashingtf-e66f169e2d4e
* https://medium.com/@dhiraj.p.rai/logistic-regression-in-spark-ml-8a95b5f5434c
* packaging spark job - https://developerzen.com/best-practices-writing-production-grade-pyspark-jobs-cb688ac4d20f



In [1]:
import sys
sys.path.append("../..")

import pyspark
from pyspark import SparkContext, SparkFiles
from pyspark.sql import SparkSession, DataFrameReader, SQLContext
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import NGram, CountVectorizer, VectorAssembler, Tokenizer, IDF
from pyspark.ml import Pipeline
from pyspark.sql.functions import when, lit, col
from pyspark.sql.types import NumericType


from sklearn.utils.class_weight import compute_class_weight

import os
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from pprint import pprint
import logging

import util.pyspark_util as pyu
import util.model_wrapper as mw



# Module-level logger; basicConfig sets the root logger level to INFO.
log = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
# Apply seaborn's default plot theme.
sns.set()
# IPython magic: render matplotlib figures inline in the notebook.
%matplotlib inline

# Allow DEBUG to be switched on from the command line through the IPYNB_DEBUG
# environment variable. Environment values are always strings, so bool() would
# treat "0" or "False" as True; compare against explicit truthy spellings.
DEBUG = os.environ.get("IPYNB_DEBUG", "").strip().lower() in {"1", "true", "yes", "y", "on"}

N_CLASSES = 5
FEATURE_COLUMN = "review_body"

# DEBUG selects the 1k-row test file and tags the report file name with "-test".
if DEBUG:
    MIN_DF = 1
    DF_PERCENTAGE = 0.001
    DATA_FILE = "/home/jupyter/dataset/amazon_reviews/amazon_reviews_us_Wireless_v1_00-test1k-preprocessed.csv"
    TEST_STR="-test"
else:
    DF_PERCENTAGE = 0.001
    DATA_FILE = "/home/jupyter/dataset/amazon_reviews/amazon_reviews_us_Wireless_v1_00-all-preprocessed.csv"
    TEST_STR=""

REPORT_FILE = f"../../reports/201911-pyspark-report{TEST_STR}.csv"


# Spark session running in local mode on all available cores ("local[*]").
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Pyspark Wrapper Test (local)")
    .getOrCreate()
)




In [2]:



class Timer(object):
    """Wall-clock stopwatch that reports elapsed time in whole minutes.

    The clock starts when the instance is created. Call ``stop()`` to record
    the end time and print the duration.
    """

    def __init__(self, description: str):
        """Start timing; ``description`` is the label used in the printed line."""
        self.start_time = datetime.now()
        self.description = description
        # Defined up front so the accessors below never raise AttributeError
        # when they are used before stop().
        self.end_time = None
        self.duration = None

    def stop(self):
        """Record the end time and print the elapsed minutes."""
        self.end_time = datetime.now()
        self.print_duration_min()

    def print_duration_min(self):
        """Compute, store and print the duration in whole (truncated) minutes.

        If the timer has not been stopped yet, the time elapsed so far is
        reported instead of failing.
        """
        end_time = self.end_time if self.end_time is not None else datetime.now()
        self.duration = int((end_time - self.start_time).total_seconds() / 60)
        print(f"{self.description} duration: {self.duration} minutes")

    def get_duration_min(self):
        """Return the last computed duration in minutes, or None if none yet."""
        return self.duration

    def get_duraction_min(self):
        """Misspelled original name, kept so existing callers keep working."""
        return self.get_duration_min()






In [3]:
# Time how long it takes to read the CSV (header row, inferred schema).
file_timer = Timer("file load time")
df = spark.read.csv(SparkFiles.get(DATA_FILE),
                    header=True,
                    inferSchema=True)
# Force the read with an action that returns only a number. collect() would
# copy every row into the driver process just to throw the list away, which
# risks exhausting driver memory on the full dataset.
df.count()
file_timer.stop()

file load time duration: 3 minutes


In [4]:
def build_ngrams(inputCol, min_df, n=3):
    """Build an unfitted TF-IDF pipeline over word n-grams of size 1..n.

    Stages, in order: Tokenizer -> one NGram per size -> one CountVectorizer
    per size -> VectorAssembler -> IDF.

    Args:
        inputCol: name of the text column to tokenize.
        min_df: ``minDF`` handed to every CountVectorizer.
        n: largest n-gram size; sizes 1 through n are generated.

    Returns:
        A ``Pipeline`` whose final output column is "features".
    """
    log.info(f'Creating TFIDF using min_df: {min_df}')

    sizes = range(1, n + 1)

    stages = [Tokenizer(inputCol=inputCol, outputCol="words")]
    stages.extend(
        NGram(n=size, inputCol="words", outputCol=f"{size}_grams")
        for size in sizes
    )
    stages.extend(
        CountVectorizer(minDF=min_df,
                        inputCol=f"{size}_grams",
                        outputCol=f"{size}_counts")
        for size in sizes
    )
    # Concatenate the per-size count vectors into a single vector column.
    stages.append(
        VectorAssembler(inputCols=[f"{size}_counts" for size in sizes],
                        outputCol="raw_features")
    )
    # IDF keeps its default minDocFreq; min_df is applied only by the
    # CountVectorizers above.
    stages.append(IDF(inputCol="raw_features", outputCol="features"))

    return Pipeline(stages=stages)



pipeline_timer = Timer("pipeline time")
# min_df is DF_PERCENTAGE of the row count, truncated to an int, never below 1.
row_count = df.count()
min_df = max(int(row_count * DF_PERCENTAGE), 1)

pipeline = build_ngrams(FEATURE_COLUMN, min_df)
# Fit on the full frame, then rebind df to the transformed frame.
fitted_pipeline = pipeline.fit(df)
df = fitted_pipeline.transform(df)
pipeline_timer.stop()


INFO:__main__:Creating TFIDF using min_df: 8960


pipeline time duration: 31 minutes


In [5]:
# Print the schema after feature generation, then preview the label next to
# its feature vector (untruncated) using the project helper pyu.show_df.
df.printSchema()
pyu.show_df(df, ["star_rating", "features"], truncate=False)

root
 |-- star_rating: integer (nullable = true)
 |-- helpful_votes: integer (nullable = true)
 |-- total_votes: integer (nullable = true)
 |-- review_headline: string (nullable = true)
 |-- review_body: string (nullable = true)
 |-- review_date: timestamp (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- 1_grams: array (nullable = true)
 |    |-- element: string (containsNull = false)
 |-- 2_grams: array (nullable = true)
 |    |-- element: string (containsNull = false)
 |-- 3_grams: array (nullable = true)
 |    |-- element: string (containsNull = false)
 |-- 1_counts: vector (nullable = true)
 |-- 2_counts: vector (nullable = true)
 |-- 3_counts: vector (nullable = true)
 |-- raw_features: vector (nullable = true)
 |-- features: vector (nullable = true)

+-----------+------------------------------------------------------------------------------------------------------------------------------------------------------------------

# Split training and test data

In [6]:
# 90/10 train/test split with a fixed seed.
split_timer = Timer("train test split")
splits = df.randomSplit([0.9, 0.1], seed=1)
train, test = splits
split_timer.stop()


# Row counts of each side of the split.
train_size, test_size = train.count(), test.count()

print(f'Training size: {train_size} Test size: {test_size}')

train test split duration: 0 minutes
Training size: 8064337 Test size: 896465


# get number of features

In [7]:
# Fetch the first row's feature vector once; its size is the number of
# features. The original ran the same Spark job twice, and the first result
# (a bare type(...) expression) was discarded.
first_features = train.select("features").limit(1).toPandas().features.values[0]
first_features.size

4068

# Assign class weights to handle imbalanced classes

In [8]:


# Balanced class weights via scikit-learn. This pulls the whole label column
# to the driver with toPandas(), so it is only quick for small files; the
# recorded run on the full dataset took 15 minutes.
cw_timer = Timer("skelarn class weight")
labels = train.select("star_rating").toPandas().astype({"star_rating": np.int8})
# classes and y are passed by keyword: newer scikit-learn releases make them
# keyword-only, and older releases accept the same keyword names.
class_weights = compute_class_weight('balanced',
                                     classes=np.arange(1, N_CLASSES + 1),
                                     y=labels.star_rating.tolist())
print(f'sklearn class weights: {class_weights}')
cw_timer.stop()
    


sklearn class weights: [1.42458562 3.01028284 2.21370441 1.19766284 0.37325191]
skelarn class weight duration: 15 minutes


In [9]:

# Attach a per-row "class_weights" column: class_weights[i] is the weight for
# star_rating i + 1. The expression is built from class_weights itself, so it
# follows the number of classes instead of five copy-pasted statements.
# Rows whose rating matches none of the classes keep a weight of 0.
weight_col = lit(0.0)
for rating, weight in enumerate(class_weights, start=1):
    weight_col = when(col("star_rating") == rating, float(weight)).otherwise(weight_col)
train = train.withColumn("class_weights", weight_col)


pyu.show_df(train, ["star_rating", "class_weights", "features"], 20, sample=True, truncate=True)


INFO:util.pyspark_util:sampling percentage: 2.480055086983592e-06


+-----------+-------------------+--------------------+
|star_rating|      class_weights|            features|
+-----------+-------------------+--------------------+
|          5|0.37325190696955396|(4068,[3,105,2293...|
|          4| 1.1976628431868321|(4068,[0,13,15,27...|
|          5|0.37325190696955396|(4068,[3,29,69,70...|
|          2| 3.0102828368042465|(4068,[0,9,10,11,...|
|          3|  2.213704409792707|(4068,[0,1,3,8,9,...|
|          4| 1.1976628431868321|(4068,[4,550,2672...|
|          5|0.37325190696955396|(4068,[1,2,3,30,5...|
|          4| 1.1976628431868321|(4068,[135],[3.47...|
|          5|0.37325190696955396|(4068,[3,44,2382]...|
|          2| 3.0102828368042465|(4068,[1,8,13,19,...|
|          4| 1.1976628431868321|(4068,[3,210,1159...|
|          4| 1.1976628431868321|(4068,[77,629],[3...|
|          4| 1.1976628431868321|(4068,[10,169,344...|
|          5|0.37325190696955396|(4068,[0,1,3,5,7,...|
|          5|0.37325190696955396|(4068,[2,5,11,13,...|
|         

# Train our Model

In [10]:


import importlib
import util.constants as c
# Reload the project helper modules so edits made to them are picked up.
importlib.reload(pyu)
importlib.reload(mw)
importlib.reload(c)

# Size of the first row's feature vector; used in the run description.
feature_count = train.select("features").limit(1).toPandas().features.values[0].size

train_timer = Timer("traing time")
lr = LogisticRegression(labelCol="star_rating",
                        featuresCol="features",
                        weightCol="class_weights",
                        maxIter=100)


# n_classes uses the shared N_CLASSES constant rather than a second literal 5,
# so the class count is defined in one place.
model = pyu.PysparkModel(model=lr,
                         train_df=train,
                         test_df=test,
                         label_column="star_rating",
                         feature_column="features",
                         n_classes=N_CLASSES,
                         pipeline=pipeline,
                         file=DATA_FILE,
                         description=f'review_body-tfidf-df_none-ngram13-{df.count()}-{feature_count}-nolda-sampling_none{TEST_STR}',
                         model_dir="../../models")

predict_test = model.run()
train_timer.stop()

# NOTE(review): the recorded output of this cell ends with
# "AttributeError: 'tuple' object has no attribute 'select'", so model.run()
# presumably returns a tuple rather than a DataFrame. Confirm against
# PysparkModel.run and unpack the predictions before displaying them.
pyu.show_df(predict_test, ["star_rating", "prediction", "rawPrediction", "probability"])



INFO:util.model_wrapper:derived name: LogisticRegression
INFO:util.model_wrapper:########################################
INFO:util.model_wrapper:Running model: name: LogisticRegression
	with file: /home/jupyter/dataset/amazon_reviews/amazon_reviews_us_Wireless_v1_00-all-preprocessed.csv
	with description: review_body-tfidf-df_none-ngram13-8960802-4068-nolda-sampling_none-LogisticRegression-star_rating
	status: new
INFO:util.model_wrapper:########################################
INFO:util.time_util:Start timer for: train_time_min
INFO:util.time_util:End timer for: train_time_min
INFO:util.time_util:Total time for train_time_min: 41.56
INFO:util.time_util:Start timer for: model_save_time_min
INFO:util.pyspark_util:Saving model to file: ../../models/review_body-tfidf-df_none-ngram13-8960802-4068-nolda-sampling_none-LogisticRegression-star_rating.pyspark
INFO:util.pyspark_util:Saving pipeline to file: ../../models/review_body-tfidf-df_none-ngram13-8960802-4068-nolda-sampling_none-Logistic

root
 |-- star_rating: integer (nullable = true)
 |-- helpful_votes: integer (nullable = true)
 |-- total_votes: integer (nullable = true)
 |-- review_headline: string (nullable = true)
 |-- review_body: string (nullable = true)
 |-- review_date: timestamp (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- 1_grams: array (nullable = true)
 |    |-- element: string (containsNull = false)
 |-- 2_grams: array (nullable = true)
 |    |-- element: string (containsNull = false)
 |-- 3_grams: array (nullable = true)
 |    |-- element: string (containsNull = false)
 |-- 1_counts: vector (nullable = true)
 |-- 2_counts: vector (nullable = true)
 |-- 3_counts: vector (nullable = true)
 |-- raw_features: vector (nullable = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



INFO:util.time_util:End timer for: cr_time_min
INFO:util.time_util:Total time for cr_time_min: 60.04
INFO:util.model_wrapper:calculating confusion matrix...
INFO:util.time_util:Start timer for: cm_time_min
INFO:util.time_util:End timer for: cm_time_min
INFO:util.time_util:Total time for cm_time_min: 38.96


traing time duration: 154 minutes


AttributeError: 'tuple' object has no attribute 'select'

In [14]:
# Show the true label next to the predicted class and the model's
# rawPrediction / probability vectors.
pyu.show_df(predict_test, ["star_rating", "prediction", "rawPrediction", "probability"])


+-----------+----------+--------------------+--------------------+
|star_rating|prediction|       rawPrediction|         probability|
+-----------+----------+--------------------+--------------------+
|          1|       1.0|[-11.911528394402...|[2.40657649980915...|
|          1|       1.0|[-11.911630236476...|[6.25234982570877...|
|          1|       3.0|[-11.911529591040...|[9.79777082933501...|
|          1|       1.0|[-11.912113942829...|[4.90982145991758...|
|          1|       2.0|[-11.911882201006...|[4.00645859391851...|
|          1|       5.0|[-11.911413004707...|[1.23653160101094...|
|          1|       2.0|[-11.912172086948...|[5.94024127241781...|
|          1|       2.0|[-11.912560679208...|[8.27626456917321...|
|          1|       1.0|[-11.911379559440...|[6.85885007366470...|
|          1|       2.0|[-11.911479474188...|[3.85604110751264...|
+-----------+----------+--------------------+--------------------+
only showing top 10 rows



# Evaluate our Model

Reference:
* https://spark.apache.org/docs/2.1.0/mllib-evaluation-metrics.html#multiclass-classification

In [15]:
# Fetch the run report from the model wrapper; the cells below read its
# "classification_report" and "confusion_matrix" entries.
report_dict = model.get_report_dict()

INFO:util.model_wrapper:calculating classification report...
INFO:util.time_util:Start timer for: cr_time_min
INFO:util.time_util:End timer for: cr_time_min
INFO:util.time_util:Total time for cr_time_min: 40.23
INFO:util.model_wrapper:calculating confusion matrix...
INFO:util.time_util:Start timer for: cm_time_min
INFO:util.time_util:End timer for: cm_time_min
INFO:util.time_util:Total time for cm_time_min: 35.15


In [16]:
# The classification report is stored as a JSON string; decode it before
# pretty-printing.
pprint(json.loads(report_dict["classification_report"]))

{'1': {'f1-score': 0.7142086847687906,
       'precision': 0.7071366778635949,
       'recall': 0.7214235735640226,
       'support': 125768},
 '2': {'f1-score': 0.34460360128143763,
       'precision': 0.3071832779696178,
       'recall': 0.39240548790714835,
       'support': 59622},
 '3': {'f1-score': 0.37600681661066543,
       'precision': 0.3317509551008057,
       'recall': 0.4338878238822584,
       'support': 80855},
 '4': {'f1-score': 0.4230261471417585,
       'precision': 0.40563510108769124,
       'recall': 0.4419752259792434,
       'support': 149350},
 '5': {'f1-score': 0.8069815399415072,
       'precision': 0.8616247358417057,
       'recall': 0.7588558238193275,
       'support': 480870},
 'accuracy': 0.6471306743710017,
 'macro avg': {'f1-score': 0.5329653579488319,
               'precision': 0.5226661495726831,
               'recall': 0.5497095870304001,
               'support': 896465},
 'weighted avg': {'f1-score': 0.6603768726341271,
                  'precis

In [17]:
# The confusion matrix is stored as a JSON string; decode it and wrap it in a
# NumPy array so it prints as an aligned grid.
print(np.array(json.loads(report_dict["confusion_matrix"])))

[[ 90732  22945   7919   1712   2460]
 [ 18253  23396  13248   2760   1965]
 [  7940  16604  35082  14942   6287]
 [  3428   6050  25971  66009  47892]
 [  7956   7168  23528  77307 364911]]


In [18]:
import json

def calculate_score(cr: dict):
    """Return the harmonic mean of five per-class metrics.

    The metrics are the recall of classes "1" through "4" and the precision
    of class "5", read from a classification-report dict.

    Args:
        cr: classification report keyed by class label ("1".."5"), each value
            a dict with at least "recall" and "precision".

    Returns:
        The harmonic mean of the five metrics, or 0 when any metric is 0.
    """
    metrics = [cr[str(star)]["recall"] for star in range(1, 5)]
    metrics.append(cr["5"]["precision"])

    reciprocal_total = 0
    for metric in metrics:
        # A single zero metric makes the harmonic mean zero.
        if metric == 0:
            return 0
        reciprocal_total += 1 / metric

    if reciprocal_total > 0:
        return len(metrics) / reciprocal_total
    return reciprocal_total




# Decode the JSON classification report and reduce it to the single score
# computed by calculate_score.
score = calculate_score(json.loads(report_dict['classification_report']))
print(f'Overall score: {score}')

Overall score: 0.5174674207704685


# Save Report

In [19]:
# Add this run's report to the cumulative CSV, starting from an empty frame
# the first time the file does not exist yet.
if os.path.exists(REPORT_FILE):
    report_df = pd.read_csv(REPORT_FILE, quotechar="'")
else:
    report_df = pd.DataFrame()
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# with a one-row frame is the supported way to add a row.
report_df = pd.concat([report_df, pd.DataFrame([report_dict])],
                      ignore_index=True, sort=False)
report_df.to_csv(REPORT_FILE, index=False, quotechar="'")
report_df

Unnamed: 0,classification_report,cm_time_min,confusion_matrix,cr_time_min,description,file,library,model_file,model_name,model_save_time_min,pipeline_file,predict_time_min,status,status_date,test_examples,test_features,total_time_min,train_examples,train_features,train_time_min
0,"{""1"": {""precision"": 0.6127167630057804, ""recal...",0.28,"[[424, 173, 80, 31, 27], [90, 97, 73, 35, 19],...",0.3,review_body-tfidf-df_none-ngram13-49784-4254-n...,/home/jupyter/dataset/amazon_reviews/amazon_re...,pyspark,../../models/review_body-tfidf-df_none-ngram13...,LogisticRegression,0.04,../../models/review_body-tfidf-df_none-ngram13...,0.0,success,2019-11-22 18:23:18,5058,4254,0.89,44726,4254,0.27
1,"{""1"": {""precision"": 0.6406469760900141, ""recal...",0.37,"[[911, 280, 132, 46, 36], [221, 214, 150, 61, ...",0.41,review_body-tfidf-df_none-ngram13-99567-4159-n...,/home/jupyter/dataset/amazon_reviews/amazon_re...,pyspark,../../models/review_body-tfidf-df_none-ngram13...,LogisticRegression,0.04,../../models/review_body-tfidf-df_none-ngram13...,0.0,success,2019-11-22 18:26:20,10023,4159,1.14,89544,4159,0.32
2,"{""1"": {""precision"": 0.6762513312034079, ""recal...",0.55,"[[1905, 566, 257, 48, 64], [408, 465, 326, 82,...",0.59,review_body-tfidf-df_none-ngram13-199134-4082-...,/home/jupyter/dataset/amazon_reviews/amazon_re...,pyspark,../../models/review_body-tfidf-df_none-ngram13...,LogisticRegression,0.04,../../models/review_body-tfidf-df_none-ngram13...,0.0,success,2019-11-22 18:30:24,19966,4082,1.7,179168,4082,0.52
3,"{""1"": {""precision"": 0.6981843575418994, ""recal...",1.36,"[[4999, 1352, 474, 114, 142], [1023, 1170, 774...",1.41,review_body-tfidf-df_none-ngram13-497835-4107-...,/home/jupyter/dataset/amazon_reviews/amazon_re...,pyspark,../../models/review_body-tfidf-df_none-ngram13...,LogisticRegression,0.04,../../models/review_body-tfidf-df_none-ngram13...,0.0,success,2019-11-22 18:38:59,49846,4107,3.86,447989,4107,1.05
4,"{""1"": {""precision"": 0.7026024905554779, ""recal...",2.63,"[[10043, 2635, 928, 224, 272], [1991, 2504, 13...",2.6,review_body-tfidf-df_none-ngram13-995688-4073-...,/home/jupyter/dataset/amazon_reviews/amazon_re...,pyspark,../../models/review_body-tfidf-df_none-ngram13...,LogisticRegression,0.04,../../models/review_body-tfidf-df_none-ngram13...,0.0,success,2019-11-22 18:55:13,99684,4073,7.22,896004,4073,1.95
5,"{""1"": {""precision"": 0.7038692554826093, ""recal...",5.29,"[[20156, 5132, 1794, 428, 592], [4081, 5217, 2...",5.26,review_body-tfidf-df_none-ngram13-2000000-4068...,/home/jupyter/dataset/amazon_reviews/amazon_re...,pyspark,../../models/review_body-tfidf-df_none-ngram13...,LogisticRegression,0.04,../../models/review_body-tfidf-df_none-ngram13...,0.0,success,2019-11-22 19:27:12,200264,4068,13.93,1799736,4068,3.34
6,"{""1"": {""precision"": 0.7077948197764914, ""recal...",17.02,"[[40471, 10286, 3602, 790, 1134], [8058, 10282...",22.4,review_body-tfidf-df_none-ngram13-4000000-4062...,/home/jupyter/dataset/amazon_reviews/amazon_re...,pyspark,../../models/review_body-tfidf-df_none-ngram13...,LogisticRegression,0.06,../../models/review_body-tfidf-df_none-ngram13...,0.0,success,2019-11-22 20:32:13,400567,4062,48.72,3599433,4062,9.24
7,"{""1"": {""precision"": 0.7071366778635949, ""recal...",35.15,"[[90732, 22945, 7919, 1712, 2460], [18253, 233...",40.23,review_body-tfidf-df_none-ngram13-8960802-4068...,/home/jupyter/dataset/amazon_reviews/amazon_re...,pyspark,../../models/review_body-tfidf-df_none-ngram13...,LogisticRegression,0.12,../../models/review_body-tfidf-df_none-ngram13...,0.0,success,2019-11-25 20:27:57,896465,4068,117.06,8064337,4068,41.56
