# 'Sentiment Analysis (Binary Classification) on Skytrax Airline Review Data'

## PART 1 : Load the data, Data preprocessing

###  set up the work environment

In [3]:
#download the data ; need to run only for the first time 
#check the message 'Saving to: ' for filename
!wget https://raw.githubusercontent.com/quankiquanki/skytrax-reviews-dataset/master/data/airline.csv

--2018-04-25 14:59:52--  https://raw.githubusercontent.com/quankiquanki/skytrax-reviews-dataset/master/data/airline.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.48.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.48.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 34752262 (33M) [text/plain]
Saving to: ‘airline.csv’


2018-04-25 14:59:53 (52.1 MB/s) - ‘airline.csv’ saved [34752262/34752262]



In [155]:
# this bit is to check and manage the current working directory
import os
os.getcwd()
#%cd '/gpfs/global_fs01/sym_shared/YPProdSpark/user/s381-cb1adb3c648083-0c7922794fdd/notebook/work'

#remove duplicated file
#os.remove('airline.csv')

'/gpfs/global_fs01/sym_shared/YPProdSpark/user/s381-cb1adb3c648083-0c7922794fdd/notebook/work'

In [156]:
#show contents in the directory : to check the data is loaded successfully
%ls 

airline.csv  [0m[01;34mCity-Data-Science[0m/  [01;34mttl_RDD_full.pkl[0m/


In [71]:
#load the necessary package

from pyspark.ml import *
from pyspark.ml.classification import *
from pyspark.ml.feature import *
from pyspark.ml.param import *
from pyspark.ml.tuning import *
from pyspark.ml.evaluation import *
from pyspark.sql import *
from pyspark.ml.linalg import Vector

from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import SQLContext

import pixiedust

spark = SparkSession.builder.getOrCreate()
sqlContext = SQLContext(sc)


### load the data, and perform initial observation of dataset

In [72]:
#load in data
sentiment_df = spark.read.load("airline.csv", format="csv", sep=",", inferSchema="true", header="true")

#print the schema
#sentiment_df.printSchema()

#change the datatype where necessary
sentiment_df = sentiment_df.withColumn("overall_rating", sentiment_df["overall_rating"].cast(FloatType()))
sentiment_df.printSchema()

# save for later just in case
#"seat_comfort_rating","cabin_staff_rating","food_beverages_rating","inflight_entertainment_rating","ground_service_rating","wifi_connectivity_rating","value_money_rating"


#check total number of samples
print('the number of data sample is:', sentiment_df.count())

#display the first five samples
sentiment_df.show(5)

#because the whole dataframe is too long in width to display properly fit into the spark interface, select key columns to check the data load properly
sentiment_df.select("airline_name","date","content","cabin_flown","overall_rating").show(5)


root
 |-- airline_name: string (nullable = true)
 |-- link: string (nullable = true)
 |-- title: string (nullable = true)
 |-- author: string (nullable = true)
 |-- author_country: string (nullable = true)
 |-- date: string (nullable = true)
 |-- content: string (nullable = true)
 |-- aircraft: string (nullable = true)
 |-- type_traveller: string (nullable = true)
 |-- cabin_flown: string (nullable = true)
 |-- route: string (nullable = true)
 |-- overall_rating: float (nullable = true)
 |-- seat_comfort_rating: string (nullable = true)
 |-- cabin_staff_rating: string (nullable = true)
 |-- food_beverages_rating: string (nullable = true)
 |-- inflight_entertainment_rating: string (nullable = true)
 |-- ground_service_rating: string (nullable = true)
 |-- wifi_connectivity_rating: string (nullable = true)
 |-- value_money_rating: string (nullable = true)
 |-- recommended: string (nullable = true)

the number of data sample is: 41455
+-------------+--------------------+------------------

### Data Preprocessing (prior to pipeline)<br>
1) Label Transformation : transfrom the 'overall_rating' (customer rating) to binary sentiment label <br>
2) Remove punctuation : remove punctuation as a text pre-processing step<br>
3) Split Dataset : split test and train dataset / subsampling of train set varying the size<br>
##### 1) Label Transformation : transfrom the 'overall_rating' (customer rating) to binary sentiment label 

In [73]:
# Create the sentiment class label

# to make the sentiment class label, refined the dataframe with rows with overall_rating (without rows with 'null' values in overall_rating)
sentiment_df_refined = sentiment_df.where(sentiment_df["overall_rating"].isNotNull())
print('the number of refined data sample is:', sentiment_df_refined.count())

# define the function to label the data : for reviews with rating <= 3.0 are labelled as negative and rating >= 7.0 as negative
# keep the remaining (rating between 4 ~ 6) as neutral for further analysis later if needed

# user-defined function
udfSentimentLabel = udf((lambda x : 1.0 if x >= 7.0     # positive = 1.0
                         else(0.0 if x <= 3.0          # negative = 0.0
                              else 0.5)), FloatType()) # neutral = 0.5

# apply the function and add created label as a new column 'label'
sentiment_df_refined = sentiment_df_refined.withColumn("label", udfSentimentLabel(sentiment_df_refined.overall_rating))
sentiment_df_refined.select("airline_name","date","content","cabin_flown","overall_rating","label").show(5)
sentiment_df_refined.cache() #cache() used here to solve Spark2.1 ver error (error when apply filtering after withColumn using UDF) & save time


# refine dataset for binary classification task and check
sentiment_df_binary = sentiment_df_refined.filter(col('label') != 0.5) #exclude neutral sentiment
print('dataset for binary classification (display ONLY IMPORTANT columns due to limited space')
sentiment_df_binary.select("airline_name","date","content","cabin_flown","overall_rating","label").show(5)
sentiment_df_binary.cache() 

# check the label proportion (positive:negative)
label_count = sentiment_df_binary.groupBy('label').count().show()

# we can see the proportion is approximately positive : negative = 2 : 1 

the number of refined data sample is: 36627
+-------------+----------+--------------------+--------------+--------------+-----+
| airline_name|      date|             content|   cabin_flown|overall_rating|label|
+-------------+----------+--------------------+--------------+--------------+-----+
|adria-airways|2015-04-10|Outbound flight F...|       Economy|           7.0|  1.0|
|adria-airways|2015-01-05|Two short hops ZR...|Business Class|          10.0|  1.0|
|adria-airways|2014-09-14|Flew Zurich-Ljubl...|       Economy|           9.0|  1.0|
|adria-airways|2014-09-06|Adria serves this...|Business Class|           8.0|  1.0|
|adria-airways|2014-06-16|"WAW-SKJ Economy....|       Economy|           4.0|  0.5|
+-------------+----------+--------------------+--------------+--------------+-----+
only showing top 5 rows

dataset for binary classification (display ONLY IMPORTANT columns due to limited space
+-------------+----------+--------------------+--------------+--------------+-----+
| ai

##### 2) remove punctuation : remove punctuation as a text pre-processing step

In [5]:
# because remove punctuation is not a part of spark feature extractor, we carry out this step prior to pipeline as part of data preprocessing

# user-defined function to remove punctuation (we does not remove '-' because this is generally used to indicate the flight route)
import re
udfremovepunc = udf((lambda x : re.sub('[()\[\],.?!";_]','',x)), StringType())

# apply the function and add as a new column 
sentiment_df_binary_rp = sentiment_df_binary.withColumn("content_removepunc", udfremovepunc(sentiment_df_binary.content))
sentiment_df_binary_rp.select("content","content_removepunc").show(40)


+--------------------+--------------------+
|             content|  content_removepunc|
+--------------------+--------------------+
|Outbound flight F...|Outbound flight F...|
|Two short hops ZR...|Two short hops ZR...|
|Flew Zurich-Ljubl...|Flew Zurich-Ljubl...|
|Adria serves this...|Adria serves this...|
|Sarajevo-Frankfur...|Sarajevo-Frankfur...|
|LJU to FRA and ba...|LJU to FRA and ba...|
|On my Ljubljana -...|On my Ljubljana -...|
|Flights from LJU ...|Flights from LJU ...|
|I was very satisf...|I was very satisf...|
|I was on JP650 th...|I was on JP650 th...|
|VIE-LJU LJU-MUC A...|VIE-LJU LJU-MUC A...|
|If I have to fly ...|If I have to fly ...|
|Istanbul-Ljubljan...|Istanbul-Ljubljan...|
|BEG-LJU-BEG with ...|BEG-LJU-BEG with ...|
|Ljubljana to Brus...|Ljubljana to Brus...|
|LJU-FRA and FRA-L...|LJU-FRA and FRA-L...|
|Smart and efficie...|Smart and efficie...|
|FRA-VIE in Busine...|FRA-VIE in Busine...|
|London Gatwick to...|London Gatwick to...|
|ZRH to LJU as a c...|ZRH to LJU

##### 3) split the dataset & subsampling trainset
##### 3-1) split the test and train dataset

In [74]:
# [NOTE] to balance the positive/negative samples in both training and test set, we follow these steps

# 1) we first devide the dataset (sentiment_df_binary_rp) into positive / negative sets
sentiment_df_binary_rp_posi = sentiment_df_binary_rp.filter(col('label') == 1.0)
sentiment_df_binary_rp_nega = sentiment_df_binary_rp.filter(col('label') == 0.0)


# 2) then do randomSplit for each positive / negative sets (ratio - train : test = 0.8 : 0.2)
trainingset_ratio = 0.8
testset_ratio = 0.2
posi_train, posi_test = sentiment_df_binary_rp_posi.randomSplit([trainingset_ratio, testset_ratio])
nega_train, nega_test = sentiment_df_binary_rp_nega.randomSplit([trainingset_ratio, testset_ratio])

print('the number of positive_train =', posi_train.count())
print('the number of positive_test =',posi_test.count())
print('the number of negative_test =',nega_train.count())
print('the number of negative_test =',nega_test.count())

# 3) merge each train and test set (from positive / negative sets) into final one sets

final_trainset = posi_train.union(nega_train)
final_testset = posi_test.union(nega_test)

print('the number of final training set =',final_trainset.count())
print('the number of final test set =',final_testset.count())

# The test and tratining sets will be used multiple times (for different ML algorithms, different models with varying parameters)
# Therefore caching is used to speed things up.

final_trainset.cache()
final_testset.cache()

the number of positive_train = 15931
the number of positive_test = 3951
the number of negative_test = 8498
the number of negative_test = 2108
the number of final training set = 24429
the number of final test set = 6059


DataFrame[airline_name: string, link: string, title: string, author: string, author_country: string, date: string, content: string, aircraft: string, type_traveller: string, cabin_flown: string, route: string, overall_rating: float, seat_comfort_rating: string, cabin_staff_rating: string, food_beverages_rating: string, inflight_entertainment_rating: string, ground_service_rating: string, wifi_connectivity_rating: string, value_money_rating: string, recommended: string, label: float, content_removepunc: string]

##### 3-2) here we do subset sampling (downsampling) training dataset to experiment with different training-set sizes as parameter tunning, using sample()



In [7]:
# we'll use an exponential size grid (e.g. 10,100,1000,10000, ...)

# get the proportion of the number to sample
size_final_trainset = final_trainset.count()
size_10 = 10/size_final_trainset
size_100 = 100/size_final_trainset
size_1000 = 1000/size_final_trainset
size_10000 = 10000/size_final_trainset

size = [size_10,size_100,size_1000,size_10000]

# feed the proportion to sample() and generate the training set
trainset_10 = final_trainset.sample(False, size_10)
trainset_100 = final_trainset.sample(False, size_100)
trainset_1000 = final_trainset.sample(False, size_1000)
trainset_10000 = final_trainset.sample(False, size_10000)


# PART 2 : Construct the Spark ML Pipeline

### Structure of pipeline

#####  1. Text Feature Processing : feature extractors/transformers<br>
1) Tokenisation <br>
2) Removing stopwords<br>
3) N-gram : use individual n gram here, will experiment with combined n-gram in PART 3.<br>
4) TF (term frequency) : create two pipeline 4-1) using count vectorizer 4-2) using hashingTF<br>
5) IDF (inverse document frequency)<br>

#####  2. Machine Learning Algorithm : Logistic Regression<br>

In [8]:
# this cell is to manage/check all the package necessary to build the pipeline are properly imported
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, NaiveBayes
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import CountVectorizer, HashingTF,StopWordsRemover,IDF,Tokenizer,NGram, VectorAssembler
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder


#### 1. text feature processing feature extractors/transformers : let's start with default parameter setting!

In [75]:
# 1) tokenisation 
# split each sentence into words using Tokenizer. Tokenizer also converts the all words in lowercase
tokenizer = Tokenizer().setInputCol("content_removepunc").setOutputCol("token_words")


# 2) removing stopwords
# get rid of common words, such as "and", "the", "a", "an", "this", "I", "we",
remover= StopWordsRemover().setInputCol("token_words").setOutputCol("stopword_filtered").setCaseSensitive(False)


#3) n-gram
# create/transform vector with different Ngram
# we can play with n to keep as a parameter : 1-gram, 2-grams, 3-grams collocations (or higher n)
# later in part 3, we'll experiment with combined n-gram as well
# here we start with default setting n=2 (biigram)
ngram = NGram(inputCol="stopword_filtered", outputCol="nGrams")


#4) TF : Term Frequency 
# for term-frequency(TF) method, here we select CountVectorizer and Hashing TF and compare these two by building two pipelines using two respectively

# we think it's worthwhile to compare the two because both has its own advantages and disadvantages
# hashing is useful for processing big data, as it uses less memory and speeds up the computation through converting word to vector. 
# However, misclassification may occur due to collision and original input cannot be restored. 
# Count vectorizer uses more memory as it stores vocabulary list thus may slow down computation. 
# However, the performance is more controllable as analyst can specify the threshold for minimum term frequency and minimum number of documents a term 
# must appear to be included in the vocabulary. Also, it is partially reversible to original inputs.


# 4-1) using count vectorizer

# we can play with vocabSize to keep, minTF and minDF as a parameter
# here we start with default setting = (minTF=1.0, minDF=1.0, vocabSize=262144)
countvec = CountVectorizer(inputCol='nGrams', outputCol='rawFeatures')

# 4-2) using hashing
# we can play with setNumFeatures as a parameter
# here we start with default setting numFeatures=262144
hashing = HashingTF().setInputCol("nGrams").setOutputCol("rawFeatures")


#5) IDF : Inverse Document Frequency - to measure weigths (importance) of each term
idf = IDF().setInputCol("rawFeatures").setOutputCol("features")


# BEFORE we construct the pipeline with ML algorithm, let's observe how these 5 feature extractors behave : WITH the smallest training set (size of 10)
# 1st pipeline uses count vectorizer as TF method, 2nd pipeline use hashing as TF method.
example_pipe_cv = Pipeline(stages=[tokenizer,remover,ngram,countvec,idf])
example_pipe_hash = Pipeline(stages=[tokenizer,remover,ngram,hashing,idf])
example_result_cv = example_pipe_cv.fit(trainset_10)
example_result_hash = example_pipe_hash.fit(trainset_10)

print('pipe with CountVectorizer')
example_result_cv.transform(trainset_10).select("content_removepunc","token_words","stopword_filtered","nGrams","rawFeatures","features").show()


print('pipe with HashingTF')
example_result_hash.transform(trainset_10).select("content_removepunc","token_words","stopword_filtered","nGrams","rawFeatures","features").show()

pipe with CountVectorizer
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|  content_removepunc|         token_words|   stopword_filtered|              nGrams|         rawFeatures|            features|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|VIE-LJU LJU-MUC A...|[vie-lju, lju-muc...|[vie-lju, lju-muc...|[vie-lju lju-muc,...|(828,[1,17,40,50,...|(828,[1,17,40,50,...|
|Recently flew wit...|[recently, flew, ...|[recently, flew, ...|[recently flew, f...|(828,[10,11,18,20...|(828,[10,11,18,20...|
|PEK-PVG On time q...|[pek-pvg, on, tim...|[pek-pvg, time, q...|[pek-pvg time, ti...|(828,[89,96,105,1...|(828,[89,96,105,1...|
|Beijing to Los An...|[beijing, to, los...|[beijing, los, an...|[beijing los, los...|(828,[0,8,12,45,5...|(828,[0,8,12,45,5...|
|Birmingham-Newark...|[birmingham-newar...|[birmingham-newar...|[birmingham-ne

#### 2. Machine learning algorithm : Logistic Regression

In [76]:
# let's start with default parameter setting!
LR = LogisticRegression()

# Check the parameter we can adjust/experiment with
print ("Logistic Regression",LR.explainParams())

Logistic Regression aggregationDepth: suggested depth for treeAggregate (>= 2). (default: 2)
elasticNetParam: the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty. (default: 0.0)
family: The name of family which is a description of the label distribution to be used in the model. Supported options: auto, binomial, multinomial (default: auto)
featuresCol: features column name. (default: features)
fitIntercept: whether to fit an intercept term. (default: True)
labelCol: label column name. (default: label)
maxIter: max number of iterations (>= 0). (default: 100)
predictionCol: prediction column name. (default: prediction)
probabilityCol: Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities. (default: probability)
rawPredictionCol: raw prediction (a.k.a. confidence

#### 3. Construct the pipeline (default setting)

In [22]:
# now we can build the whole pipeline
pipeline_LR_cv = Pipeline(stages=[tokenizer,remover,ngram,countvec,idf,LR]) # 1st pipeline uses count vectorizer as TF method
pipeline_LR_hash = Pipeline(stages=[tokenizer,remover,ngram,hashing,idf,LR]) # 2nd pipeline use hashing as TF method

# PART 3 : train and test the pipeline (with default set of parameters)

### Train/test with Default set of parameters
#### Evaluation metrics for different model's performance on both training/test sets
1) classification perfomance : accuracy rate, Area Under the ROC curve<br>
2) computatioanl efficiency : computation time<br>

##### we now we train and test the pipeline

In [23]:
# along with measuring the time consumed for training and testing

# fit the ML pipeline (train the model)
%time model_LR_cv = pipeline_LR_cv.fit(final_trainset)
%time model_LR_hash = pipeline_LR_hash.fit(final_trainset)

# apply the trained model to test set and obtain the classification result
%time train_predictions_LR_cv = model_LR_cv.transform(final_trainset)
%time train_predictions_LR_hash = model_LR_hash.transform(final_trainset)
%time test_predictions_LR_cv = model_LR_cv.transform(final_testset)
%time test_predictions_LR_hash = model_LR_hash.transform(final_testset)

CPU times: user 277 ms, sys: 70.4 ms, total: 347 ms
Wall time: 1min 51s
CPU times: user 169 ms, sys: 40.4 ms, total: 209 ms
Wall time: 45 s
CPU times: user 16.1 ms, sys: 3.05 ms, total: 19.2 ms
Wall time: 100 ms
CPU times: user 11.9 ms, sys: 5.89 ms, total: 17.8 ms
Wall time: 99.2 ms
CPU times: user 16.7 ms, sys: 54 µs, total: 16.7 ms
Wall time: 116 ms
CPU times: user 16 ms, sys: 958 µs, total: 16.9 ms
Wall time: 97.8 ms


##### Result of 1st/2nd ML pipeline (default parameter setting): Logistic regression with CountVectorizer / HashingTF

In [25]:
#------- Result of 1st ML pipeline : Logistic regression with CountVectorizer - basic model (default parameter setting)
print (">> Result of 1st ML pipeline : Logistic regression with CountVectorizer - basic model with default parameter setting")


# display the predicted labels along with true labels
test_predictions_LR_cv.select("airline_name","date","content","cabin_flown","overall_rating","label","prediction").show(5)

# we selecte two evaluation metrics : accuracy rate, AUC : Area Under ROC Curve

# count the number of data
num_train = final_trainset.count()
num_test = final_testset.count()

# count the correct predictions
correct_train = train_predictions_LR_cv.filter(col('label') == col('prediction')).count() 
correct_test = test_predictions_LR_cv.filter(col('label') == col('prediction')).count() 

# calculate the accuracy rate
accuracy_train = correct_train/num_train
accuracy_test = correct_test/num_test 
print('Training Accuracy {:.2%} (data items: {}, correct: {})'.format(accuracy_train,num_train, correct_train))
print('Test Accuracy {:.2%} (data items: {}, correct: {})'.format(accuracy_test,num_test, correct_test))

# calculate the AUR using BinaryClassificationEvaluator()
evaluator = BinaryClassificationEvaluator().setMetricName("areaUnderROC") # set evaluator
print ("Area under ROC curve - train:",evaluator.evaluate(train_predictions_LR_cv))
print ("Area under ROC curve - test:",evaluator.evaluate(test_predictions_LR_cv))

# -------- Result of 2nd ML pipeline : Logistic regression with hashingTF - basic model (default parameter setting)"------------
print (">> Result of 2nd ML pipeline : Logistic regression with hashingTF - basic model with default parameter setting")

# display the predicted labels along with true labels
test_predictions_LR_hash.select("airline_name","date","content","cabin_flown","overall_rating","label","prediction").show(5)

# check evaluation metrics (accuracy rate, AUC : Area Under ROC Curve)

# count the correct predictions
correct_train = train_predictions_LR_hash.filter(col('label') == col('prediction')).count() 
correct_test = test_predictions_LR_hash.filter(col('label') == col('prediction')).count() 

# calculate the accuracy rate
accuracy_train = correct_train/num_train
accuracy_test = correct_test/num_test 
print('Training Accuracy {:.2%} (data items: {}, correct: {})'.format(accuracy_train,num_train, correct_train))
print('Test Accuracy {:.2%} (data items: {}, correct: {})'.format(accuracy_test,num_test, correct_test))

# calculate the AUR using BinaryClassificationEvaluator()
evaluator = BinaryClassificationEvaluator().setMetricName("areaUnderROC")
print ("Area under ROC curve - train:",evaluator.evaluate(train_predictions_LR_hash))
print ("Area under ROC curve - test:",evaluator.evaluate(test_predictions_LR_hash))


>> Result of 1st ML pipeline : Logistic regression with CountVectorizer - basic model with default parameter setting
+---------------+----------+--------------------+--------------+--------------+-----+----------+
|   airline_name|      date|             content|   cabin_flown|overall_rating|label|prediction|
+---------------+----------+--------------------+--------------+--------------+-----+----------+
|  adria-airways|2011-09-20|Smart and efficie...|       Economy|           9.0|  1.0|       1.0|
|  adria-airways|2013-08-19|If I have to fly ...|Business Class|           7.0|  1.0|       1.0|
|  adria-airways|2014-01-13|On my Ljubljana -...|Business Class|           8.0|  1.0|       1.0|
|aegean-airlines|2011-09-15|LHR-ATH-LHR. Supe...|       Economy|          10.0|  1.0|       1.0|
|aegean-airlines|2014-01-17|Athens-Dusseldorf...|       Economy|           8.0|  1.0|       1.0|
+---------------+----------+--------------------+--------------+--------------+-----+----------+
only showi

# PART 4 : Parameter tunning (optimisation) & Evaluation

### 1. Optimise the parameters of pipeline/model (using ParamGridBuilder and TrainValidationSplit)
These are the parameters we select to adjust and optimize  
#### - Feature processing
 - Common (both 1st, 2nd pipeline): N-gram / 1-gram(unigram), 2-grams, 3-grams collocations <br>
 - 1st pipeline with CountVectorizer : vocabSize (words size to keep) and minDF (the minimum number of different documents a term must appear in to be included in the vocabulary)<br>
 - 2nd pipeline with hashingTF : setNumFeatures (the size of number for hashing)<br>
   
#### - ML parameters 
   : regularization parameter, elastic net regularization (ratio of L1 and L2 regularizer)

### 2.Additional experiment with optimised parameters

##### - Combined N-gram 
   : extend the experiment to (uni+bi) gram, (uni+bi+tri) gram<br>
##### - Different Training set size 
   : varying the train sets created in PART 2 (trainset_10,trainset_100, trainset_1000, trainset_10000, original whole training set)<br>


### 1-1) Construct the range of parameter to adjust/optimise : ParamGridBuilder

In [26]:
# 1) Varying Feature processing and ML parameters using ParamGridBuilder
# comment for each line of .addGrid() 

# ParamGridBuilder for 1st pipeline
# BELOW CODE is IN ORDER - a) feature processing - ngram - n b) feature processing - countVectorizer - vocabSize c) feature processing - countVectorizer - minDF 
# d) logistic regression parameter - regularization  e) logistic regression parameter - elastic net regularization
# for ngram, we'll extend the experiment to (uni+bi) gram, (uni+bi+tri) gram after this step (as this requires build up separate pipeline)
paramGrid_cv = ParamGridBuilder().addGrid(ngram.n,[1,2,3]) \
.addGrid(countvec.vocabSize,[10,20,50,100,300]) \
.addGrid(countvec.minDF,[1 ,2, 3]) \
.addGrid(LR.regParam,[0.001,0.01,0.05,0.1,0.2]) \
.addGrid(LR.elasticNetParam,[0,0.25,0.5,0.75,1]).build()


# ParamGridBuilder for 2nd pipeline
# BELOW CODE is IN ORDER - a) feature processing - ngram - n b) feature processing - hashing - number of features 
# c) logistic regression parameter - regularization  d) logistic regression parameter - elastic net regularization
# for ngram, we'll extend the experiment to (uni+bi) gram, (uni+bi+tri) gram after this step (as this requires build up separate pipeline)

paramGrid_hash = ParamGridBuilder().addGrid(ngram.n,[1,2,3]) \
.addGrid(hashing.setNumFeatures,[100,1000, 10000, 100000]) \
.addGrid(LR.regParam,[0.001,0.01,0.05,0.1,0.2]) \
.addGrid(LR.elasticNetParam,[0,0.25,0.5,0.75,1]).build()


### 1-2) Construct Validator to perform grid-search

In [None]:
# 2) construct validator, using TrainValidationSplit
validator_LR_cv = TrainValidationSplit().setEstimator(pipeline_LR_cv).setEvaluator(evaluator).setEstimatorParamMaps(paramGrid_cv) #for 1st pipeline(cv)
validator_LR_hash = TrainValidationSplit().setEstimator(pipeline_LR_hash).setEvaluator(evaluator).setEstimatorParamMaps(paramGrid_hash) #for 2nd pipieline(hash)

### 1-3) Perform grid-search, Find the best parameter combination, Evaluate the result for both 1st, 2nd pipeline

##### Perform grid-search (1st pipeline with Countvectorizer)

[NOTE] this step takes long computation time : approximately 4hours for each pipeline. 
        Please adjust the ParamGridBuilder above to smaller set of parameters for the purpose of checking whether the code works or not

In [39]:
# -------- Grid Search for Parameter Optimisation [1st ML pipeline]  Logistic regression with CountVectorizer "------------
print('--------------Grid Search Result of [1st ML pipeline] Logistic regression with CountVectorizer ------------')

# 3-1) perform grid search and fit the best model using validator
print('computation time consumed for parameter tunning') # think this is not that meaningful (2nd line has less parameter so it's natural to take less time)
%time tunedModel_cv = validator_LR_cv.fit(final_trainset) # use the whole dataset, using TrainValidationSplit

--------------Grid Search Result of [1st ML pipeline] Logistic regression with CountVectorizer ------------
computation time consumed for parameter tunning
CPU times: user 2min 21s, sys: 40.9 s, total: 3min 2s
Wall time: 4h 16min 1s


##### Grid-search result (1st pipeline with Countvectorizer) and Best parameters combination

In [None]:
# 3-2) let's see the performance grid of each parameter and which one is the best

performance_cv = tunedModel_cv.validationMetrics  # get list of validation metrics
grid_cv = paramGrid_cv  # get list of parameter grid we built

# based on the performance grid, let's find the best parameter combination and its performance through loop

def GetBestParam(performance_input, grid_input):
    performancegrid = zip(grid_input,performance_input)
    bestMetric = 0 # evaluator set to AUC thus 1 is the best metric : start with 0
    bestGrid = 0
    # now iterate through all grid and find the parameters yield the highest AUC
    for grid, perf in performancegrid:    
        if perf > bestMetric:   # if the perf(AUC) is higher than previously stored 'bestMetric'(AUC),
            bestMetric = perf   # store(replace) the perf to bestMetrics 
            bestGrid = grid     #and its parameter grid to 'bestGrid'
    return bestGrid, bestMetric # and return the best paramters grid and AUC

print('-------this is the best optimised paramter combinations - for 1st pipeline [LR model with CV]----------')
print(GetBestParam(performance_cv, grid_cv))

-------this is the best optimised paramter combinations - for 1st pipeline [LR model with CV]----------
({Param(parent='NGram_4f12808ff4b209a00b60', name='n', doc='number of elements per n-gram (>=1)'): 1, Param(parent='CountVectorizer_4d12b27f02308ac1b31c', name='vocabSize', doc='max size of the vocabulary. Default 1 << 18.'): 300, Param(parent='CountVectorizer_4d12b27f02308ac1b31c', name='minDF', doc='Specifies the minimum number of different documents a term must appear in to be included in the vocabulary. If this is an integer >= 1, this specifies the number of documents the term must appear in; if this is a double in [0,1), then this specifies the fraction of documents. Default 1.0'): 3, Param(parent='LogisticRegression_4619a127e07332239fee', name='regParam', doc='regularization parameter (>= 0).'): 0.01, Param(parent='LogisticRegression_4619a127e07332239fee', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty

In [112]:
# this shows the all grid-search results by constructing dataframe with parameter grids and performances (AUC)
import pandas as pd
pd.set_option('display.max_colwidth', -1) # to display full length of parameter settings
performance_grid_cv = pd.DataFrame({'grid': grid_cv, 'performance(AUC)': performance_cv}) # create dataframe
display(performance_grid_cv)


grid,performance(AUC)
"{Param(parent='NGram_4f12808ff4b209a00b60', name='n', doc='number of elements per n-gram (>=1)'): 1, Param(parent='CountVectorizer_4d12b27f02308ac1b31c', name='vocabSize', doc='max size of the vocabulary. Default 1 << 18.'): 10, Param(parent='CountVectorizer_4d12b27f02308ac1b31c', name='minDF', doc='Specifies the minimum number of different documents a term must appear in to be included in the vocabulary. If this is an integer >= 1, this specifies the number of documents the term must appear in; if this is a double in [0,1), then this specifies the fraction of documents. Default 1.0'): 1, Param(parent='LogisticRegression_4619a127e07332239fee', name='regParam', doc='regularization parameter (>= 0).'): 0.001, Param(parent='LogisticRegression_4619a127e07332239fee', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0}",0.7609200538040434
"{Param(parent='NGram_4f12808ff4b209a00b60', name='n', doc='number of elements per n-gram (>=1)'): 1, Param(parent='CountVectorizer_4d12b27f02308ac1b31c', name='vocabSize', doc='max size of the vocabulary. Default 1 << 18.'): 10, Param(parent='CountVectorizer_4d12b27f02308ac1b31c', name='minDF', doc='Specifies the minimum number of different documents a term must appear in to be included in the vocabulary. If this is an integer >= 1, this specifies the number of documents the term must appear in; if this is a double in [0,1), then this specifies the fraction of documents. Default 1.0'): 1, Param(parent='LogisticRegression_4619a127e07332239fee', name='regParam', doc='regularization parameter (>= 0).'): 0.001, Param(parent='LogisticRegression_4619a127e07332239fee', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0.25}",0.7608713428056946
"{Param(parent='NGram_4f12808ff4b209a00b60', name='n', doc='number of elements per n-gram (>=1)'): 1, Param(parent='CountVectorizer_4d12b27f02308ac1b31c', name='vocabSize', doc='max size of the vocabulary. Default 1 << 18.'): 10, Param(parent='CountVectorizer_4d12b27f02308ac1b31c', name='minDF', doc='Specifies the minimum number of different documents a term must appear in to be included in the vocabulary. If this is an integer >= 1, this specifies the number of documents the term must appear in; if this is a double in [0,1), then this specifies the fraction of documents. Default 1.0'): 1, Param(parent='LogisticRegression_4619a127e07332239fee', name='regParam', doc='regularization parameter (>= 0).'): 0.001, Param(parent='LogisticRegression_4619a127e07332239fee', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0.5}",0.7608269538863375
"{Param(parent='NGram_4f12808ff4b209a00b60', name='n', doc='number of elements per n-gram (>=1)'): 1, Param(parent='CountVectorizer_4d12b27f02308ac1b31c', name='vocabSize', doc='max size of the vocabulary. Default 1 << 18.'): 10, Param(parent='CountVectorizer_4d12b27f02308ac1b31c', name='minDF', doc='Specifies the minimum number of different documents a term must appear in to be included in the vocabulary. If this is an integer >= 1, this specifies the number of documents the term must appear in; if this is a double in [0,1), then this specifies the fraction of documents. Default 1.0'): 1, Param(parent='LogisticRegression_4619a127e07332239fee', name='regParam', doc='regularization parameter (>= 0).'): 0.001, Param(parent='LogisticRegression_4619a127e07332239fee', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0.75}",0.7607920268155797
"{Param(parent='NGram_4f12808ff4b209a00b60', name='n', doc='number of elements per n-gram (>=1)'): 1, Param(parent='CountVectorizer_4d12b27f02308ac1b31c', name='vocabSize', doc='max size of the vocabulary. Default 1 << 18.'): 10, Param(parent='CountVectorizer_4d12b27f02308ac1b31c', name='minDF', doc='Specifies the minimum number of different documents a term must appear in to be included in the vocabulary. If this is an integer >= 1, this specifies the number of documents the term must appear in; if this is a double in [0,1), then this specifies the fraction of documents. Default 1.0'): 1, Param(parent='LogisticRegression_4619a127e07332239fee', name='regParam', doc='regularization parameter (>= 0).'): 0.001, Param(parent='LogisticRegression_4619a127e07332239fee', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 1}",0.7606757395281569
"{Param(parent='NGram_4f12808ff4b209a00b60', name='n', doc='number of elements per n-gram (>=1)'): 1, Param(parent='CountVectorizer_4d12b27f02308ac1b31c', name='vocabSize', doc='max size of the vocabulary. Default 1 << 18.'): 10, Param(parent='CountVectorizer_4d12b27f02308ac1b31c', name='minDF', doc='Specifies the minimum number of different documents a term must appear in to be included in the vocabulary. If this is an integer >= 1, this specifies the number of documents the term must appear in; if this is a double in [0,1), then this specifies the fraction of documents. Default 1.0'): 1, Param(parent='LogisticRegression_4619a127e07332239fee', name='regParam', doc='regularization parameter (>= 0).'): 0.01, Param(parent='LogisticRegression_4619a127e07332239fee', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0}",0.7607328026521204
"{Param(parent='NGram_4f12808ff4b209a00b60', name='n', doc='number of elements per n-gram (>=1)'): 1, Param(parent='CountVectorizer_4d12b27f02308ac1b31c', name='vocabSize', doc='max size of the vocabulary. Default 1 << 18.'): 10, Param(parent='CountVectorizer_4d12b27f02308ac1b31c', name='minDF', doc='Specifies the minimum number of different documents a term must appear in to be included in the vocabulary. If this is an integer >= 1, this specifies the number of documents the term must appear in; if this is a double in [0,1), then this specifies the fraction of documents. Default 1.0'): 1, Param(parent='LogisticRegression_4619a127e07332239fee', name='regParam', doc='regularization parameter (>= 0).'): 0.01, Param(parent='LogisticRegression_4619a127e07332239fee', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0.25}",0.7602568483049563
"{Param(parent='NGram_4f12808ff4b209a00b60', name='n', doc='number of elements per n-gram (>=1)'): 1, Param(parent='CountVectorizer_4d12b27f02308ac1b31c', name='vocabSize', doc='max size of the vocabulary. Default 1 << 18.'): 10, Param(parent='CountVectorizer_4d12b27f02308ac1b31c', name='minDF', doc='Specifies the minimum number of different documents a term must appear in to be included in the vocabulary. If this is an integer >= 1, this specifies the number of documents the term must appear in; if this is a double in [0,1), then this specifies the fraction of documents. Default 1.0'): 1, Param(parent='LogisticRegression_4619a127e07332239fee', name='regParam', doc='regularization parameter (>= 0).'): 0.01, Param(parent='LogisticRegression_4619a127e07332239fee', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0.5}",0.7593544682995947
"{Param(parent='NGram_4f12808ff4b209a00b60', name='n', doc='number of elements per n-gram (>=1)'): 1, Param(parent='CountVectorizer_4d12b27f02308ac1b31c', name='vocabSize', doc='max size of the vocabulary. Default 1 << 18.'): 10, Param(parent='CountVectorizer_4d12b27f02308ac1b31c', name='minDF', doc='Specifies the minimum number of different documents a term must appear in to be included in the vocabulary. If this is an integer >= 1, this specifies the number of documents the term must appear in; if this is a double in [0,1), then this specifies the fraction of documents. Default 1.0'): 1, Param(parent='LogisticRegression_4619a127e07332239fee', name='regParam', doc='regularization parameter (>= 0).'): 0.01, Param(parent='LogisticRegression_4619a127e07332239fee', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0.75}",0.7580592463579179
"{Param(parent='NGram_4f12808ff4b209a00b60', name='n', doc='number of elements per n-gram (>=1)'): 1, Param(parent='CountVectorizer_4d12b27f02308ac1b31c', name='vocabSize', doc='max size of the vocabulary. Default 1 << 18.'): 10, Param(parent='CountVectorizer_4d12b27f02308ac1b31c', name='minDF', doc='Specifies the minimum number of different documents a term must appear in to be included in the vocabulary. If this is an integer >= 1, this specifies the number of documents the term must appear in; if this is a double in [0,1), then this specifies the fraction of documents. Default 1.0'): 1, Param(parent='LogisticRegression_4619a127e07332239fee', name='regParam', doc='regularization parameter (>= 0).'): 0.01, Param(parent='LogisticRegression_4619a127e07332239fee', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 1}",0.7567761145561194


##### Performance of pipeline with optimised set of parameters (1st pipeline with Countvectorizer) 

In [66]:
# 3-3) here is the performance of model tunned with best combinations of parameter !

# to measure the training time apart from parameter tunning we build new pipeline with feeding best parameter found above in
# detach the best model from the TVSplit validator with Parammap

ngram_op = NGram(n=1, inputCol="stopword_filtered", outputCol="nGrams")
countvec_op = CountVectorizer(inputCol="nGrams", outputCol="rawFeatures", minDF=3, vocabSize=300)        
LR_op = LogisticRegression().setRegParam(0.01).setElasticNetParam(0)
op_pipe_cv = Pipeline(stages=[tokenizer,remover,ngram_op,countvec_op,idf,LR_op])

% time opModel_cv = op_pipe_cv.fit(final_trainset) 
% time train_predictions_opModel_cv = opModel_cv.transform(final_trainset) 
% time test_predictions_opModel_cv = opModel_cv.transform(final_testset)

# count the number of data
num_train = final_trainset.count()
num_test = final_testset.count()

# count the correct predictions
correct_train_cv = train_predictions_opModel_cv.filter(col('label') == col('prediction')).count() 
correct_test_cv = test_predictions_opModel_cv.filter(col('label') == col('prediction')).count() 

# calculate the accuracy rate
accuracy_train_cv = correct_train_cv/num_train
accuracy_test_cv = correct_test_cv/num_test 

print('[Best Model Performance]')
print('Accuracy {:.2%} (data items: {}, correct: {})'.format(accuracy_train_cv,num_train, correct_train_cv))
print('Accuracy {:.2%} (data items: {}, correct: {})'.format(accuracy_test_cv,num_test, correct_test_cv))

# calculate the AUR using BinaryClassificationEvaluator()
print ("Area under ROC curve - Besttrain:",evaluator.evaluate(train_predictions_opModel_cv))
print ("Area under ROC curve - Besttest:",evaluator.evaluate(test_predictions_opModel_cv))


CPU times: user 88.7 ms, sys: 25.8 ms, total: 114 ms
Wall time: 9.62 s
CPU times: user 10.4 ms, sys: 5.28 ms, total: 15.7 ms
Wall time: 81.7 ms
CPU times: user 13.3 ms, sys: 1.93 ms, total: 15.2 ms
Wall time: 80.9 ms
[Best Model Performance]
Accuracy 90.29% (data items: 24456, correct: 22081)
Accuracy 91.05% (data items: 6032, correct: 5492)
Area under ROC curve - Besttrain: 0.9587133802363366
Area under ROC curve - Besttest: 0.9640506845266875


##### Perform grid-search (2nd pipeline with HashingTF) 

[NOTE] this step takes long computation time : approximately 4hours for each pipeline. 
        Please adjust the ParamGridBuilder above to smaller set of parameters for the purpose of checking whether the code works or not

In [33]:
# -------- Grid Search for Parameter Optimisation [2nd ML pipeline]  Logistic regression with hashingTF "------------

print('--------------Grid Search Result of [2nd ML pipeline] Logistic regression with hashingTF ------------')


# 3-1) perform grid search and fit the best model using validator
print('computation time consumed for parameter tunning') # think this is not that meaningful (2nd line has less parameter so it's natural to take less time)
%time tunedModel_hash = validator_LR_hash.fit(final_trainset) # here use train set size = 1000


--------------Grid Search Result of [2nd ML pipeline] Logistic regression with hashingTF ------------
computation time consumed for parameter tunning
CPU times: user 1min 10s, sys: 19.6 s, total: 1min 30s
Wall time: 4h 8min 54s


##### Grid-search result (2nd pipeline with HashingTF) and Best parameters combination

In [97]:
# 3-2) let's see the performance grid of each parameter and which one is the best

performance_hash = tunedModel_hash.validationMetrics  # get list of validation metrics
grid_hash = paramGrid_hash  # get list of parameter grid we built

#use previosuly defined GetBestParam() function to get best parameter grid and performance
print('-------this is the best optimised paramter combinations - for 2nd pipeline [LR model with hashing]----------')
print(GetBestParam(performance_hash, grid_hash)) 

-------this is the best optimised paramter combinations - for 2nd pipeline [LR model with hashing]----------
({Param(parent='NGram_45f6befd836579ae0c9d', name='n', doc='number of elements per n-gram (>=1)'): 1, <bound method HasNumFeatures.setNumFeatures of HashingTF_4796a88b4bdde00e2979>: 1000, Param(parent='LogisticRegression_4534b2d39f6e9817de1e', name='regParam', doc='regularization parameter (>= 0).'): 0.01, Param(parent='LogisticRegression_4534b2d39f6e9817de1e', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0.25}, 0.9731414338488917)


In [79]:
# this shows the all grid-search results
performance_grid_hash = pd.DataFrame({'grid': grid_hash, 'performance(AUC)': performance_hash}) # create dataframe
display(performance_grid_hash)

grid,performance(AUC)
"{Param(parent='NGram_45f6befd836579ae0c9d', name='n', doc='number of elements per n-gram (>=1)'): 1, : 100, Param(parent='LogisticRegression_4534b2d39f6e9817de1e', name='regParam', doc='regularization parameter (>= 0).'): 0.001, Param(parent='LogisticRegression_4534b2d39f6e9817de1e', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0}",0.9588394384028952
"{Param(parent='NGram_45f6befd836579ae0c9d', name='n', doc='number of elements per n-gram (>=1)'): 1, : 1000, Param(parent='LogisticRegression_4534b2d39f6e9817de1e', name='regParam', doc='regularization parameter (>= 0).'): 0.001, Param(parent='LogisticRegression_4534b2d39f6e9817de1e', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0}",0.9588394384028852
"{Param(parent='NGram_45f6befd836579ae0c9d', name='n', doc='number of elements per n-gram (>=1)'): 1, : 10000, Param(parent='LogisticRegression_4534b2d39f6e9817de1e', name='regParam', doc='regularization parameter (>= 0).'): 0.001, Param(parent='LogisticRegression_4534b2d39f6e9817de1e', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0}",0.958839438402893
"{Param(parent='NGram_45f6befd836579ae0c9d', name='n', doc='number of elements per n-gram (>=1)'): 1, : 100000, Param(parent='LogisticRegression_4534b2d39f6e9817de1e', name='regParam', doc='regularization parameter (>= 0).'): 0.001, Param(parent='LogisticRegression_4534b2d39f6e9817de1e', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0}",0.958839438402886
"{Param(parent='NGram_45f6befd836579ae0c9d', name='n', doc='number of elements per n-gram (>=1)'): 1, : 100, Param(parent='LogisticRegression_4534b2d39f6e9817de1e', name='regParam', doc='regularization parameter (>= 0).'): 0.01, Param(parent='LogisticRegression_4534b2d39f6e9817de1e', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0}",0.9627848612541534
"{Param(parent='NGram_45f6befd836579ae0c9d', name='n', doc='number of elements per n-gram (>=1)'): 1, : 1000, Param(parent='LogisticRegression_4534b2d39f6e9817de1e', name='regParam', doc='regularization parameter (>= 0).'): 0.01, Param(parent='LogisticRegression_4534b2d39f6e9817de1e', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0}",0.9627848612541524
"{Param(parent='NGram_45f6befd836579ae0c9d', name='n', doc='number of elements per n-gram (>=1)'): 1, : 10000, Param(parent='LogisticRegression_4534b2d39f6e9817de1e', name='regParam', doc='regularization parameter (>= 0).'): 0.01, Param(parent='LogisticRegression_4534b2d39f6e9817de1e', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0}",0.9627848612541444
"{Param(parent='NGram_45f6befd836579ae0c9d', name='n', doc='number of elements per n-gram (>=1)'): 1, : 100000, Param(parent='LogisticRegression_4534b2d39f6e9817de1e', name='regParam', doc='regularization parameter (>= 0).'): 0.01, Param(parent='LogisticRegression_4534b2d39f6e9817de1e', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0}",0.9627848612541434
"{Param(parent='NGram_45f6befd836579ae0c9d', name='n', doc='number of elements per n-gram (>=1)'): 1, : 100, Param(parent='LogisticRegression_4534b2d39f6e9817de1e', name='regParam', doc='regularization parameter (>= 0).'): 0.05, Param(parent='LogisticRegression_4534b2d39f6e9817de1e', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0}",0.9649019519845854
"{Param(parent='NGram_45f6befd836579ae0c9d', name='n', doc='number of elements per n-gram (>=1)'): 1, : 1000, Param(parent='LogisticRegression_4534b2d39f6e9817de1e', name='regParam', doc='regularization parameter (>= 0).'): 0.05, Param(parent='LogisticRegression_4534b2d39f6e9817de1e', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0}",0.9649019519845814


##### Performance of pipeline with optimised set of parameters (2nd pipeline with HashingTF) 

In [65]:
# 3-3) here is the performance of model tunned with best combinations of parameter !

# to measure the computation time apart from parameter tunning we train the model with the best parameter found above

ngram_op = NGram(n=1, inputCol="stopword_filtered", outputCol="nGrams")
hashing_op = HashingTF().setNumFeatures(1000).setInputCol("nGrams").setOutputCol("rawFeatures")        
LR_op2 = LogisticRegression().setRegParam(0.01).setElasticNetParam(0.25)
op_pipe_hash = Pipeline(stages=[tokenizer,remover,ngram_op,hashing_op,idf,LR_op2])


% time opModel_hash = op_pipe_hash.fit(final_trainset) 
% time train_predictions_opModel_hash = opModel_hash.transform(final_trainset) 
% time test_predictions_opModel_hash = opModel_hash.transform(final_testset)


# count the number of data
num_train = final_trainset.count()
num_test = final_testset.count()

# count the correct predictions
correct_train_hash = train_predictions_opModel_hash.filter(col('label') == col('prediction')).count() 
correct_test_hash = test_predictions_opModel_hash.filter(col('label') == col('prediction')).count() 

# calculate the accuracy rate
accuracy_train_hash = correct_train_hash/num_train
accuracy_test_hash = correct_test_hash/num_test 
print('[Best Model Performance]')
print('Accuracy {:.2%} (data items: {}, correct: {})'.format(accuracy_train_hash,num_train, correct_train_hash))
print('Accuracy {:.2%} (data items: {}, correct: {})'.format(accuracy_test_hash,num_test, correct_test_hash))

# calculate the AUR using BinaryClassificationEvaluator()
print ("Area under ROC curve - Besttrain:",evaluator.evaluate(train_predictions_opModel_hash))
print ("Area under ROC curve - Besttest:",evaluator.evaluate(test_predictions_opModel_hash))

CPU times: user 175 ms, sys: 62.8 ms, total: 238 ms
Wall time: 11.4 s
CPU times: user 13.3 ms, sys: 4.11 ms, total: 17.4 ms
Wall time: 90 ms
CPU times: user 14.3 ms, sys: 2.76 ms, total: 17 ms
Wall time: 81.7 ms
[Best Model Performance]
Accuracy 90.28% (data items: 24456, correct: 22079)
Accuracy 90.29% (data items: 6032, correct: 5446)
Area under ROC curve - Besttrain: 0.9611205701181036
Area under ROC curve - Besttest: 0.9622651137735343


#### We measured the evaluation metrics 3-times and averaged (please see poster) 

•It turned out that for our dataset and task, pipeline using countvectorizer as TF method (1st pipeline) outperform the one using hashing (2nd pipeline)<br>
from the both aspect of performance and computational efficiency<br>
   
•Therefore, we continue experiment with the pipeline using CountVectorizer as TF method & optimised set of parameters from the step above<br>

### 2-1) Combined N-gram : extend the experiment to (uni+bi) gram, (uni+bi+tri) gram using VectorAssembler

•	Using these optimised parameters, we continue to experiment with build up pipeline and compare using same parameters but with combined N-gram model

##### build up n-gram and countvectorizer and merge resulting vector using 'VectorAssembler'

In [86]:
# 4-1) build up n-gram and countvectorizer for each ngram separately and then merge resulting vector using 'VectorAssembler'

tokenizer = Tokenizer().setInputCol("content_removepunc").setOutputCol("token_words")
remover= StopWordsRemover().setInputCol("token_words").setOutputCol("stopword_filtered").setCaseSensitive(False)

# generate uni, bi, trigrams
ngrams_uni = NGram(n=1, inputCol="stopword_filtered", outputCol="ngrams_uni")
ngrams_bi = NGram(n=2, inputCol="stopword_filtered", outputCol="ngrams_bi")
ngrams_tri = NGram(n=3, inputCol="stopword_filtered", outputCol="ngrams_tri")

# build countvectorizer for each uni, bi, trigrams
countvec_uni = CountVectorizer(inputCol="ngrams_uni", outputCol="rawFeatures_uni", minDF=3, vocabSize=300)
countvec_bi = CountVectorizer(inputCol="ngrams_bi", outputCol="rawFeatures_bi", minDF=3, vocabSize=300)  
countvec_tri = CountVectorizer(inputCol="ngrams_tri", outputCol="rawFeatures_tri", minDF=3, vocabSize=300)

# merge/combine the resulting vector from countvectorizer using 'VectorAssembler' into uni+bi , uni+bi+tri gram
assembler_unibi = VectorAssembler(inputCols=["rawFeatures_uni","rawFeatures_bi"],outputCol="rawFeatures_combined")
assembler_unibitri = VectorAssembler(inputCols=["rawFeatures_uni","rawFeatures_bi","rawFeatures_tri"],outputCol="rawFeatures_combined")
idf_assem = IDF().setInputCol("rawFeatures_combined").setOutputCol("features").setMinDocFreq(0)           
LR = LogisticRegression().setRegParam(0.01).setElasticNetParam(0)

# now we can build the pipeline, each for uni+bi gram and uni+bi+tri gram
pipeline_unibi = Pipeline(stages=[tokenizer, remover, ngrams_uni, ngrams_bi, countvec_uni, countvec_bi, assembler_unibi, idf_assem, LR])
pipeline_unibitri = Pipeline(stages=[tokenizer, remover, ngrams_uni, ngrams_bi, ngrams_tri, countvec_uni, countvec_bi, countvec_tri, assembler_unibitri, idf_assem, LR])


##### train model with pipeline and measure computation time : uni+bi gram and uni+bi+tri gram

In [87]:
print('Training the best model [uni+bi] gram')
print('>>> Result : Time consumed for training')
%time unibi_bestModel = pipeline_unibi.fit(final_trainset)

print('Training the best model [uni+bi+tri] gram')
print('>>> Result : Time consumed for training')
%time unibitri_bestModel = pipeline_unibitri.fit(final_trainset)


Training the best model [uni+bi] gram
>>> Result : Time consumed for training
CPU times: user 119 ms, sys: 30.7 ms, total: 150 ms
Wall time: 12.3 s
Training the best model [uni+bi+tri] gram
>>> Result : Time consumed for training
CPU times: user 133 ms, sys: 34.3 ms, total: 168 ms
Wall time: 20.5 s


##### fit / evaluate trained model : optimised parameters + countvectorizer + unibi gram

In [88]:
# 4-2) fit / evaluate trained model : optimised parameters + countvectorizer + unibi gram
print('>>>> Result : Best Model with uni+bi gram Performance')
print('>> Result : Time consumed for fitting training set')
%time train_predictions_unibi = unibi_bestModel.transform(final_trainset)
print('>> Result : Time consumed for fitting test set')
%time test_predictions_unibi = unibi_bestModel.transform(final_testset)

# here codes for evaluation metrics are already repeatedly used in previous section ; so comments are omitted
num_train = final_trainset.count()
num_test = final_testset.count()
train_correct_unibi = train_predictions_unibi.filter(col('label') == col('prediction')).count()  
test_correct_unibi = test_predictions_unibi.filter(col('label') == col('prediction')).count() 
train_accuracy_unibi = train_correct_unibi/num_train
test_accuracy_unibi = test_correct_unibi/num_test
   
print('Train Accuracy {:.2%} (data items: {}, correct: {})'.format(train_accuracy_unibi,num_train, train_correct_unibi))
print('Test Accuracy {:.2%} (data items: {}, correct: {})'.format(test_accuracy_unibi,num_test, test_correct_unibi))
print("Area under ROC curve - train:",evaluator.evaluate(train_predictions_unibi))
print("Area under ROC curve - test:",evaluator.evaluate(test_predictions_unibi))


# 4-3)fit / evaluate trained model : optimised parameters + countvectorizer + unibitri gram
print('>>>> Result : Best Model with uni+bi+tri gram Performance')
print('>> Result : Time consumed for fitting training set')
%time train_predictions_unibitri = unibitri_bestModel.transform(final_trainset)
print('>> Result : Time consumed for fitting test set')
%time test_predictions_unibitri = unibitri_bestModel.transform(final_testset)

# here codes for evaluation metrics are already repeatedly used in previous section ; so comments are omitted
num_train = final_trainset.count()
num_test = final_testset.count()
train_correct_unibitri = train_predictions_unibitri.filter(col('label') == col('prediction')).count()  
test_correct_unibitri = test_predictions_unibitri.filter(col('label') == col('prediction')).count() 
train_accuracy_unibitri = train_correct_unibitri/num_train
test_accuracy_unibitri = test_correct_unibitri/num_test
   
print('Train Accuracy {:.2%} (data items: {}, correct: {})'.format(train_accuracy_unibitri,num_train, train_correct_unibitri))
print('Test Accuracy {:.2%} (data items: {}, correct: {})'.format(test_accuracy_unibitri,num_test, test_correct_unibitri))
print("Area under ROC curve - train:",evaluator.evaluate(train_predictions_unibitri))
print("Area under ROC curve - test:",evaluator.evaluate(test_predictions_unibitri))



>>>> Result : Best Model with uni+bi gram Performance
>> Result : Time consumed for fitting training set
CPU times: user 16.1 ms, sys: 9.1 ms, total: 25.2 ms
Wall time: 197 ms
>> Result : Time consumed for fitting test set
CPU times: user 17.8 ms, sys: 2.6 ms, total: 20.4 ms
Wall time: 202 ms
Train Accuracy 91.53% (data items: 24429, correct: 22359)
Test Accuracy 90.56% (data items: 6059, correct: 5487)
Area under ROC curve - train: 0.9669624546868165
Area under ROC curve - test: 0.9613549904739198
>>>> Result : Best Model with uni+bi+tri gram Performance
>> Result : Time consumed for fitting training set
CPU times: user 19.7 ms, sys: 4.7 ms, total: 24.4 ms
Wall time: 228 ms
>> Result : Time consumed for fitting test set
CPU times: user 18.8 ms, sys: 4.92 ms, total: 23.7 ms
Wall time: 217 ms
Train Accuracy 91.73% (data items: 24429, correct: 22409)
Test Accuracy 90.28% (data items: 6059, correct: 5470)
Area under ROC curve - train: 0.969789448108186
Area under ROC curve - test: 0.96072

#### It turned out that uni+bi gram yields best result compared to unigram and uni+bi+tri gram
#### Finally, the best optimised pipeline are defined

•	 N gram using uni+bi gram / Countvectorizer as TF method (VocabSize = 300, MinDF = 3) / Logistic Regression Regularization Parameters (regParam = 0.01, elasticNetParam = 0)

### 2-2) Different Training set size : varying the train sets created in PART 2 (trainset_10,trainset_100, trainset_1000, trainset_10000, original whole training set)

#### 	Using these best optimised pipeline, let's train the the different size of training set and see how it affects the performance with the best tuned pipeline

In [89]:
# 5) Varying the training set size  : with the best pipeline

# [NOTE] trainset size of 10 cannot be trained with the optimised parameter minDF = 3 (countvectorize) : this is too big for document of 10
# it turned out that with minDF = 2, the model is also not trainable. alternatively we use minDF = 1 only for the train set size of 10

# thus this bit is only for train set size = 10 : the variant of best pipeline with minDF = 0
countvec_uni_forsize10 = CountVectorizer(inputCol="ngrams_uni", outputCol="rawFeatures_uni", minDF=1, vocabSize=300)
countvec_bi_forsize10 = CountVectorizer(inputCol="ngrams_bi", outputCol="rawFeatures_bi", minDF=1, vocabSize=300)  
pipeline_forsize10 = Pipeline(stages=[tokenizer, remover, ngrams_uni, ngrams_bi, countvec_uni_forsize10, countvec_bi_forsize10, assembler_unibi, idf_assem, LR])
 
# list of Train data set (for loop)
list_Traindata = [trainset_10, trainset_100, trainset_1000, trainset_10000, final_trainset]
name_Traindata = ['train set size = 10','train set size = 100', 'train set size = 1,000', 'train set size = 10,000', 'train set size = full train set (24,545)']

# list to store the result
train_accuracy_bestModel = []
test_accuracy_bestModel = []
train_AUR_bestModel = []
test_AUR_bestModel = []


# We loop through the each train dataset, to train the model and fit the model to train / test set
i = -1    #this is for printing name of dataset currently running

for sets in list_Traindata :  
    # train model
    print('Training the best model [with optimised Parameter Combinations & CountVectorizer as TF method & Uni+Bi gram]')
    i = i + 1
    print(name_Traindata[i])
    print('>>> Result : Time consumed for training')
    if i > 1 : 
        %time bestModel = pipeline_unibi.fit(sets) # if trainset > 10, train model with the best pipeline
    else : 
        %time bestModel = pipeline_forsize10.fit(sets) # if trainset = 10, train model with the variant of best pipeline with minDF = 0

    # fit trained model
    print('>> Result : Time consumed for fitting training set')
    %time train_predictions = bestModel.transform(sets)
    print('>> Result : Time consumed for fitting test set')
    %time test_predictions = bestModel.transform(final_testset)
    
    # evaluation metrics : here codes for evaluation metrics are already repeatedly used in previous section ; so comments are omitted
    num_train = sets.count()
    num_test = final_testset.count()
    train_correct = train_predictions.filter(col('label') == col('prediction')).count()  
    test_correct = test_predictions.filter(col('label') == col('prediction')).count() 
    train_accuracy = train_correct/num_train
    test_accuracy = test_correct/num_test
    
    #store the result to the previously created list
    train_accuracy_bestModel.append(train_accuracy)  
    test_accuracy_bestModel.append(test_accuracy)
    train_AUR_bestModel.append(evaluator.evaluate(train_predictions))
    test_AUR_bestModel.append(evaluator.evaluate(test_predictions))    
    print('>> Result : Model Performance')
    print('Train Accuracy {:.2%} (data items: {}, correct: {})'.format(train_accuracy,num_train, train_correct))
    print('Test Accuracy {:.2%} (data items: {}, correct: {})'.format(test_accuracy,num_test, test_correct))
    print("Area under ROC curve - train:",evaluator.evaluate(train_predictions))
    print("Area under ROC curve - test:",evaluator.evaluate(test_predictions))

#print the result list : accuracy rate and AUC for training and testing
print(train_accuracy_bestModel)
print(test_accuracy_bestModel)
print(train_AUR_bestModel)
print(test_AUR_bestModel)               

Training the best model [with optimised Parameter Combinations & CountVectorizer as TF method & Uni+Bi gram]
train set size = 10
>>> Result : Time consumed for training
CPU times: user 72.3 ms, sys: 24.1 ms, total: 96.4 ms
Wall time: 2.03 s
>> Result : Time consumed for fitting training set
CPU times: user 16.3 ms, sys: 4.94 ms, total: 21.2 ms
Wall time: 206 ms
>> Result : Time consumed for fitting test set
CPU times: user 23.2 ms, sys: 903 µs, total: 24.1 ms
Wall time: 187 ms
>> Result : Model Performance
Train Accuracy 100.00% (data items: 16, correct: 16)
Test Accuracy 65.19% (data items: 6059, correct: 3950)
Area under ROC curve - train: 1.0
Area under ROC curve - test: 0.723937854466743
Training the best model [with optimised Parameter Combinations & CountVectorizer as TF method & Uni+Bi gram]
train set size = 100
>>> Result : Time consumed for training
CPU times: user 89.8 ms, sys: 45.5 ms, total: 135 ms
Wall time: 3.04 s
>> Result : Time consumed for fitting training set
CPU tim

## Appendix : further analysis

In [90]:
# countVectorizer provide 'vocabulary' as a result of fitting the model, which is 'An array of terms in the vocabulary'
# we can match these lists of vocabulary used in TF to weights coefficient of Logistic Regression model 
# (we can't do with hasingTF : can't reverse back as it does not store vocabulary)
# in this way, we can see which words has been assigned the high weights (by our model) to be classfied as positive/negative sentiments

# let's look at the weight of best model
vocabulary_list_uni = unibi_bestModel.stages[4].vocabulary # extract vocabulary list of unigram
vocabulary_list_bi = unibi_bestModel.stages[5].vocabulary # extract vocabulary list of bigram
vocabulary_list = vocabulary_list_uni+vocabulary_list_bi # combine vocabulary list of both unigram and bigram
LR_coefficient = unibi_bestModel.stages[-1].coefficients.toArray() # coefficients of Logistic Regression
vocab_and_weights = pd.DataFrame({'vocabulary': vocabulary_list, 'coefficient(weight)': LR_coefficient}) # create dataframe

print("--------- 1. This is the top 25 positive weighted words of best LR model--------------")
print(vocab_and_weights.sort_values('coefficient(weight)', ascending=False).head(30)) #ascending (top 25 positive weighted words)
print("--------- 2. This is the top 25 negative weighted words of best LR model--------------")
print(vocab_and_weights.sort_values('coefficient(weight)', ascending=True).head(30)) #descending (top 25 negative weighted words)


--------- 1. This is the top 25 positive weighted words of best LR model--------------
     coefficient(weight)            vocabulary
34              0.587424             excellent
29              0.501479           comfortable
2               0.474888                  good
31              0.469102                 great
376             0.438182  pleasantly surprised
28              0.431902              friendly
360             0.388668        definitely fly
132             0.351612             efficient
111             0.346142              pleasant
130             0.331957             attentive
102             0.322356               helpful
172             0.308712          professional
504             0.306539        ahead schedule
234             0.301976                smooth
47              0.298341                  nice
263             0.297834                  easy
310             0.283976          flights time
115             0.274600                   bit
67              0.27