# PySpark Logistic Regression

Our full dataset file has around 9 million samples. The environment below is the one used when trying to run feature_generator.

Machine:
* 2018 Mac Mini - 6 core

Docker Configuration:
* 9 CPUs
* 24 GB Ram
* 3 GB swap


# References:

* https://spark.apache.org/docs/2.2.0/mllib-feature-extraction.html
* https://towardsdatascience.com/countvectorizer-hashingtf-e66f169e2d4e
* https://medium.com/@dhiraj.p.rai/logistic-regression-in-spark-ml-8a95b5f5434c
* packaging spark job - https://developerzen.com/best-practices-writing-production-grade-pyspark-jobs-cb688ac4d20f



In [1]:
import sys
sys.path.append("../..")

import pyspark
from pyspark import SparkContext, SparkFiles
from pyspark.sql import SparkSession, DataFrameReader, SQLContext
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import NGram, CountVectorizer, VectorAssembler, Tokenizer, IDF
from pyspark.ml import Pipeline
from pyspark.sql.functions import when, lit, col
from pyspark.sql.types import NumericType


from sklearn.utils.class_weight import compute_class_weight

import os
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from pprint import pprint
import logging

import util.pyspark_util as pyu
import util.model_wrapper as mw



log = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
sns.set()
%matplotlib inline

# allow DEBUG to be set by command line
DEBUG = bool(os.environ.get("IPYNB_DEBUG", False))

N_CLASSES = 5
FEATURE_COLUMN = "review_body"


if DEBUG:
    MIN_DF = 1
    DF_PERCENTAGE = 0.001
    DATA_FILE = "/home/jupyter/dataset/amazon_reviews/amazon_reviews_us_Wireless_v1_00-test1k-preprocessed.csv"
    TEST_STR="-test"
else:
    DF_PERCENTAGE = 0.001
    DATA_FILE = "/home/jupyter/dataset/amazon_reviews/amazon_reviews_us_Wireless_v1_00-50k-preprocessed.csv"
    TEST_STR=""

REPORT_FILE = f"../../reports/201911-pyspark-report{TEST_STR}.csv"


spark = SparkSession.builder \
            .master("local[*]") \
            .appName("Pyspark Wrapper Test (local)") \
            .getOrCreate()




In [2]:



class Timer(object):
    """Wall-clock stopwatch that reports elapsed time in whole minutes.

    The clock starts when the instance is created. ``stop()`` freezes the
    end time and prints the duration. Durations are truncated (not rounded)
    to whole minutes, so anything under 60 seconds reports as 0.
    """

    def __init__(self, description: str):
        """Start timing.

        Args:
            description: label used as the prefix of the printed duration.
        """
        self.start_time = datetime.now()
        self.description = description
        # Defined up front so the accessors below work before stop() is called.
        self.end_time = None
        self.duration = None

    def stop(self):
        """Record the end time and print the elapsed minutes."""
        self.end_time = datetime.now()
        self.print_duration_min()

    def _elapsed_min(self) -> int:
        """Return whole minutes elapsed; uses "now" if the timer is still running."""
        end_time = self.end_time if self.end_time is not None else datetime.now()
        return int((end_time - self.start_time).total_seconds() / 60)

    def print_duration_min(self):
        """Compute the duration in minutes, store it and print it."""
        self.duration = self._elapsed_min()
        print(f"{self.description} duration: {self.duration} minutes")

    def get_duration_min(self):
        """Return the duration in whole minutes.

        Before ``stop()`` this is the time elapsed so far; afterwards it is
        the frozen start-to-stop duration.
        """
        self.duration = self._elapsed_min()
        return self.duration

    # Original (misspelled) public name, kept so existing callers keep working.
    get_duraction_min = get_duration_min






In [3]:
file_timer = Timer("file load time")
# DATA_FILE is an absolute path that was never registered with
# SparkContext.addFile, so it is handed to the reader directly instead of
# going through SparkFiles.get (which resolves files shipped via addFile).
df = spark.read.csv(DATA_FILE,
                    header=True,
                    inferSchema=True)
# Force a scan of the file so the timer covers the read. count() avoids what
# collect() did here: copying every row into driver memory only to discard it
# (the full dataset is around 9 million rows).
df.count()
file_timer.stop()

file load time duration: 0 minutes


In [4]:
def build_ngrams(inputCol, min_df, n=3):
    """Build an (unfitted) Pipeline that turns text into TF-IDF n-gram features.

    Stages, in order: Tokenizer -> NGram for each order 1..n -> CountVectorizer
    for each order -> VectorAssembler -> IDF.

    Args:
        inputCol: name of the text column to tokenize.
        min_df: minDF passed to every CountVectorizer.
        n: highest n-gram order; orders 1 through n are all generated.

    Returns:
        A Pipeline whose final output column is "features"
        (intermediate columns: "words", "<k>_grams", "<k>_counts", "raw_features").
    """
    log.info(f'Creating TFIDF using min_df: {min_df}')

    orders = range(1, n + 1)
    gram_cols = ["{0}_grams".format(k) for k in orders]
    count_cols = ["{0}_counts".format(k) for k in orders]

    stages = [Tokenizer(inputCol=inputCol, outputCol="words")]

    # one NGram stage per order, all reading the tokenized words
    stages.extend(
        NGram(n=k, inputCol="words", outputCol=gram_col)
        for k, gram_col in zip(orders, gram_cols)
    )

    # one term-count vector per n-gram order
    stages.extend(
        CountVectorizer(minDF=min_df, inputCol=gram_col, outputCol=count_col)
        for gram_col, count_col in zip(gram_cols, count_cols)
    )

    # concatenate the per-order count vectors into a single vector
    stages.append(VectorAssembler(inputCols=count_cols, outputCol="raw_features"))

    # IDF is created with its defaults; min_df is applied only by the CountVectorizers
    stages.append(IDF().setInputCol("raw_features").setOutputCol("features"))

    return Pipeline(stages=stages)



pipeline_timer = Timer("pipeline time")

# Minimum document frequency: DF_PERCENTAGE of the row count, never below 1.
row_count = df.count()
min_df = max(int(row_count * DF_PERCENTAGE), 1)

pipeline = build_ngrams(FEATURE_COLUMN, min_df)

# NOTE(review): the pipeline is fitted on the full df, before the later
# train/test split, so test rows take part in fitting the feature stages.
fitted_pipeline = pipeline.fit(df)
df = fitted_pipeline.transform(df)

pipeline_timer.stop()


INFO:__main__:Creating TFIDF using min_df: 49


pipeline time duration: 0 minutes


In [5]:
# Inspect the transformed frame: the full schema, then the label next to its feature vector.
df.printSchema()
preview_columns = ["star_rating", "features"]
pyu.show_df(df, preview_columns, truncate=False)

root
 |-- star_rating: integer (nullable = true)
 |-- helpful_votes: integer (nullable = true)
 |-- total_votes: integer (nullable = true)
 |-- review_headline: string (nullable = true)
 |-- review_body: string (nullable = true)
 |-- review_date: timestamp (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- 1_grams: array (nullable = true)
 |    |-- element: string (containsNull = false)
 |-- 2_grams: array (nullable = true)
 |    |-- element: string (containsNull = false)
 |-- 3_grams: array (nullable = true)
 |    |-- element: string (containsNull = false)
 |-- 1_counts: vector (nullable = true)
 |-- 2_counts: vector (nullable = true)
 |-- 3_counts: vector (nullable = true)
 |-- raw_features: vector (nullable = true)
 |-- features: vector (nullable = true)

+-----------+------------------------------------------------------------------------------------------------------------------------------------------------------------------

# Split training and test data

In [6]:
split_timer = Timer("train test split")
# 90% train / 10% test, with a fixed seed so the split is repeatable
splits = df.randomSplit([0.9, 0.1], seed=1)
train, test = splits
split_timer.stop()


# Row counts are taken after the timer has stopped, so they are not part of the timed section.
train_size, test_size = train.count(), test.count()

print(f'Training size: {train_size} Test size: {test_size}')

train test split duration: 0 minutes
Training size: 44726 Test size: 5058


# get number of features

In [7]:
# The length of the first training row's feature vector is the feature count.
# Fetch that row once: the original ran the same Spark job twice, and the
# result of its bare type(...) expression was discarded because a notebook
# cell only displays its last expression.
first_features = train.select("features").first()["features"]
first_features.size

4254

# Assign class weights to handle imbalanced classes

In [8]:


# Compute "balanced" class weights for star ratings 1..N_CLASSES with scikit-learn.
# NOTE: toPandas() brings every training label to the driver, which is fine for
# small files but takes too long for large datasets (a custom computation is
# planned for those).
cw_timer = Timer("skelarn class weight")
labels = train.select("star_rating").toPandas().astype({"star_rating": np.int8})
# Arguments are passed by keyword: newer scikit-learn releases make them
# keyword-only and reject the positional form.
class_weights = compute_class_weight(class_weight='balanced',
                                     classes=np.arange(1, N_CLASSES+1),
                                     y=labels.star_rating.tolist())
print(f'sklearn class weights: {class_weights}')
cw_timer.stop()
    


sklearn class weights: [1.44184397 3.07184066 2.12273374 1.17917216 0.37569089]
skelarn class weight duration: 0 minutes


In [9]:

# Build one nested "when" expression that maps each star rating (1..5) to its
# class weight; ratings that match none of the conditions keep the default 0.
weight_expr = lit(0)
for rating, weight in enumerate(class_weights[:5], start=1):
    weight_expr = when(train["star_rating"] == rating, weight).otherwise(weight_expr)

train = train.withColumn("class_weights", weight_expr)


pyu.show_df(train, ["star_rating", "class_weights", "features"], 20, sample=True, truncate=True)


INFO:util.pyspark_util:sampling percentage: 0.00044716719581451504


+-----------+-------------------+--------------------+
|star_rating|      class_weights|            features|
+-----------+-------------------+--------------------+
|          1| 1.4418439716312057|(4254,[0,8,30,32,...|
|          3|  2.122733744660655|(4254,[0,13,15,94...|
|          3|  2.122733744660655|(4254,[8,10,2268]...|
|          3|  2.122733744660655|(4254,[0,1,5,6,8,...|
|          4| 1.1791721592407065|(4254,[10,26,142,...|
|          4| 1.1791721592407065|(4254,[0,1,2,4,8,...|
|          4| 1.1791721592407065|(4254,[1,6,7,13,1...|
|          5|0.37569088618227636|(4254,[0,1,2,14,1...|
|          5|0.37569088618227636|(4254,[1143],[5.7...|
|          5|0.37569088618227636|(4254,[132,193,16...|
|          5|0.37569088618227636|(4254,[0,16,21,26...|
|          5|0.37569088618227636|(4254,[4,8,20,32,...|
|          2| 3.0718406593406593|(4254,[0,3,6,17,2...|
|          4| 1.1791721592407065|(4254,[0,2,3,4,5,...|
|          4| 1.1791721592407065|(4254,[3,5,48,50,...|
|         

# Train our Model

In [10]:


import importlib
import util.constants as c

# Reload the project helpers so edits made to them are picked up without
# restarting the kernel.
importlib.reload(pyu)
importlib.reload(mw)
importlib.reload(c)

# Length of the first training row's feature vector; used only in the description below.
feature_count = train.select("features").limit(1).toPandas().features.values[0].size

train_timer = Timer("traing time")
# Logistic regression weighted per row by the class_weights column to
# compensate for the imbalanced star ratings.
lr = LogisticRegression(labelCol="star_rating",
                        featuresCol="features",
                        weightCol="class_weights",
                        maxIter=100)


# n_classes uses the notebook-level N_CLASSES constant instead of repeating
# the literal 5, so the class count is defined in one place.
model = pyu.PysparkModel(model=lr,
                         train_df=train,
                         test_df=test,
                         label_column="star_rating",
                         feature_column="features",
                         n_classes=N_CLASSES,
                         pipeline=pipeline,
                         file=DATA_FILE,
                         description=f'review_body-tfidf-df_none-ngram13-{df.count()}-{feature_count}-nolda-sampling_none{TEST_STR}',
                         model_dir="../../models")

# run() returns the report dictionary and the predictions on the test set
report_dict, predict_test = model.run()
train_timer.stop()

pyu.show_df(predict_test, ["star_rating", "prediction", "rawPrediction", "probability"])



INFO:util.model_wrapper:derived name: LogisticRegression
INFO:util.model_wrapper:########################################
INFO:util.model_wrapper:Running model: name: LogisticRegression
	with file: /home/jupyter/dataset/amazon_reviews/amazon_reviews_us_Wireless_v1_00-50k-preprocessed.csv
	with description: review_body-tfidf-df_none-ngram13-49784-4254-nolda-sampling_none-LogisticRegression-star_rating
	status: new
INFO:util.model_wrapper:########################################
INFO:util.time_util:Start timer for: train_time_min
INFO:util.time_util:End timer for: train_time_min
INFO:util.time_util:Total time for train_time_min: 0.27
INFO:util.time_util:Start timer for: model_save_time_min
INFO:util.pyspark_util:Saving model to file: ../../models/review_body-tfidf-df_none-ngram13-49784-4254-nolda-sampling_none-LogisticRegression-star_rating.pyspark
INFO:util.pyspark_util:Saving pipeline to file: ../../models/review_body-tfidf-df_none-ngram13-49784-4254-nolda-sampling_none-LogisticRegress

root
 |-- star_rating: integer (nullable = true)
 |-- helpful_votes: integer (nullable = true)
 |-- total_votes: integer (nullable = true)
 |-- review_headline: string (nullable = true)
 |-- review_body: string (nullable = true)
 |-- review_date: timestamp (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- 1_grams: array (nullable = true)
 |    |-- element: string (containsNull = false)
 |-- 2_grams: array (nullable = true)
 |    |-- element: string (containsNull = false)
 |-- 3_grams: array (nullable = true)
 |    |-- element: string (containsNull = false)
 |-- 1_counts: vector (nullable = true)
 |-- 2_counts: vector (nullable = true)
 |-- 3_counts: vector (nullable = true)
 |-- raw_features: vector (nullable = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



INFO:util.time_util:End timer for: cr_time_min
INFO:util.time_util:Total time for cr_time_min: 0.3
INFO:util.model_wrapper:calculating confusion matrix...
INFO:util.time_util:Start timer for: cm_time_min
INFO:util.time_util:End timer for: cm_time_min
INFO:util.time_util:Total time for cm_time_min: 0.28


traing time duration: 1 minutes
+-----------+----------+--------------------+--------------------+
|star_rating|prediction|       rawPrediction|         probability|
+-----------+----------+--------------------+--------------------+
|          1|       1.0|[-7.6362799614976...|[8.34402339204206...|
|          1|       1.0|[-7.6256963701932...|[1.04760082690356...|
|          1|       4.0|[-7.6501462506985...|[8.03360525043608...|
|          1|       1.0|[-7.7807623610401...|[2.01165931554026...|
|          1|       1.0|[-7.6921277793741...|[1.33531683134521...|
|          1|       1.0|[-7.6112744642495...|[6.81083623904764...|
|          1|       2.0|[-7.6178056389892...|[4.67529538542913...|
|          1|       1.0|[-7.6414397495410...|[1.30420401297447...|
|          1|       2.0|[-7.6609078996627...|[5.17876044339633...|
|          1|       2.0|[-7.6698479662183...|[1.65296690962550...|
+-----------+----------+--------------------+--------------------+
only showing top 10 rows



In [11]:
# Show the columns of the prediction frame returned by model.run()
predict_test.printSchema()

root
 |-- star_rating: integer (nullable = true)
 |-- helpful_votes: integer (nullable = true)
 |-- total_votes: integer (nullable = true)
 |-- review_headline: string (nullable = true)
 |-- review_body: string (nullable = true)
 |-- review_date: timestamp (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- 1_grams: array (nullable = true)
 |    |-- element: string (containsNull = false)
 |-- 2_grams: array (nullable = true)
 |    |-- element: string (containsNull = false)
 |-- 3_grams: array (nullable = true)
 |    |-- element: string (containsNull = false)
 |-- 1_counts: vector (nullable = true)
 |-- 2_counts: vector (nullable = true)
 |-- 3_counts: vector (nullable = true)
 |-- raw_features: vector (nullable = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



# Evaluate our Model

Reference:
* https://spark.apache.org/docs/2.1.0/mllib-evaluation-metrics.html#multiclass-classification

In [12]:
# Display the report dictionary returned by model.run()
report_dict

{'model_name': 'LogisticRegression',
 'description': 'review_body-tfidf-df_none-ngram13-49784-4254-nolda-sampling_none-LogisticRegression-star_rating',
 'library': 'pyspark',
 'file': '/home/jupyter/dataset/amazon_reviews/amazon_reviews_us_Wireless_v1_00-50k-preprocessed.csv',
 'model_file': '../../models/review_body-tfidf-df_none-ngram13-49784-4254-nolda-sampling_none-LogisticRegression-star_rating.pyspark',
 'pipeline_file': '../../models/review_body-tfidf-df_none-ngram13-49784-4254-nolda-sampling_none-LogisticRegression-star_rating.pipeline',
 'status': 'success',
 'status_date': '2019-11-22 18:23:18',
 'classification_report': '{"1": {"precision": 0.6127167630057804, "recall": 0.5768707482993197, "f1-score": 0.5942536790469518, "support": 735}, "2": {"precision": 0.19477911646586346, "recall": 0.3089171974522293, "f1-score": 0.23891625615763545, "support": 314}, "3": {"precision": 0.2491638795986622, "recall": 0.3274725274725275, "f1-score": 0.28300094966761635, "support": 455}, "4

In [13]:
# The classification report is stored as a JSON string; decode it before pretty-printing.
classification_report = json.loads(report_dict["classification_report"])
pprint(classification_report)

{'1': {'f1-score': 0.5942536790469518,
       'precision': 0.6127167630057804,
       'recall': 0.5768707482993197,
       'support': 735},
 '2': {'f1-score': 0.23891625615763545,
       'precision': 0.19477911646586346,
       'recall': 0.3089171974522293,
       'support': 314},
 '3': {'f1-score': 0.28300094966761635,
       'precision': 0.2491638795986622,
       'recall': 0.3274725274725275,
       'support': 455},
 '4': {'f1-score': 0.3414899401848831,
       'precision': 0.30935960591133005,
       'recall': 0.38106796116504854,
       'support': 824},
 '5': {'f1-score': 0.7562688064192579,
       'precision': 0.835920177383592,
       'recall': 0.6904761904761905,
       'support': 2730},
 'accuracy': 0.5672202451561882,
 'macro avg': {'f1-score': 0.442785926295269,
               'precision': 0.44038790847304565,
               'recall': 0.45696092497306307,
               'support': 5058},
 'weighted avg': {'f1-score': 0.590463254817851,
                  'precision': 0.625118

In [14]:
# The confusion matrix is stored as a JSON string; decode it and print it as an array.
confusion_matrix = np.array(json.loads(report_dict["confusion_matrix"]))
print(confusion_matrix)

[[ 424  173   80   31   27]
 [  90   97   73   35   19]
 [  61   85  149  109   51]
 [  32   61  144  314  273]
 [  85   82  152  526 1885]]


In [15]:
import json

def calculate_score(cr: dict):
    """Return the harmonic mean of five per-class metrics from a classification report.

    The metrics are the recall of classes "1" through "4" and the precision
    of class "5".

    Args:
        cr: classification report as a dict keyed by class label, each value
            a dict holding at least "recall" (classes 1-4) or "precision" (class 5).

    Returns:
        The harmonic mean of the five metrics, or 0 as soon as any metric is 0.
    """
    metrics = [cr[label]["recall"] for label in ("1", "2", "3", "4")]
    metrics.append(cr["5"]["precision"])

    reciprocal_total = 0
    for metric in metrics:
        # a zero metric zeroes the whole score (and would divide by zero below)
        if metric == 0:
            return 0
        reciprocal_total += 1 / metric

    if reciprocal_total > 0:
        return len(metrics) / reciprocal_total
    return reciprocal_total




# Decode the stored classification report and reduce it to a single score.
decoded_report = json.loads(report_dict['classification_report'])
score = calculate_score(decoded_report)
print(f'Overall score: {score}')

Overall score: 0.4221266805001182


# Save Report

In [16]:
# Append this run's report to the cumulative CSV, creating the file on first use.
# DataFrame.append was removed in pandas 2.0, so the new row is built as a
# one-row frame and joined with pd.concat instead.
new_report_row = pd.DataFrame([report_dict])
if os.path.exists(REPORT_FILE):
    # the file is read and written with the same single-quote quotechar
    report_df = pd.read_csv(REPORT_FILE, quotechar="'")
    report_df = pd.concat([report_df, new_report_row], ignore_index=True, sort=False)
else:
    report_df = new_report_row
report_df.to_csv(REPORT_FILE, index=False, quotechar="'")
report_df

Unnamed: 0,classification_report,cm_time_min,confusion_matrix,cr_time_min,description,file,library,model_file,model_name,model_save_time_min,pipeline_file,predict_time_min,status,status_date,test_examples,test_features,total_time_min,train_examples,train_features,train_time_min
0,152,526.0,1885]]',0.35,review_body-tfidf-df_none-ngram13-49784-4254-n...,/home/jupyter/dataset/amazon_reviews/amazon_re...,pyspark,../../models/review_body-tfidf-df_none-ngram13...,LogisticRegression,0.06,../../models/review_body-tfidf-df_none-ngram13...,0.0,success,2019-11-22 07:50:22,5058.0,4254.0,1.0,44726.0,4254.0,0.3
1,"{""1"": {""precision"": 0.6406469760900141, ""recal...",0.32,"[[911, 280, 132, 46, 36], [221, 214, 150, 61, ...",0.45,review_body-tfidf-df_none-ngram13-99567-4159-n...,/home/jupyter/dataset/amazon_reviews/amazon_re...,pyspark,../../models/review_body-tfidf-df_none-ngram13...,LogisticRegression,0.05,../../models/review_body-tfidf-df_none-ngram13...,0.0,success,2019-11-22 07:53:26,10023.0,4159.0,1.15,89544.0,4159.0,0.33
2,"{""1"": {""precision"": 0.6127167630057804, ""recal...",0.28,"[[424, 173, 80, 31, 27], [90, 97, 73, 35, 19],...",0.3,review_body-tfidf-df_none-ngram13-49784-4254-n...,/home/jupyter/dataset/amazon_reviews/amazon_re...,pyspark,../../models/review_body-tfidf-df_none-ngram13...,LogisticRegression,0.04,../../models/review_body-tfidf-df_none-ngram13...,0.0,success,2019-11-22 18:23:18,5058.0,4254.0,0.89,44726.0,4254.0,0.27
