In [1]:
from itertools import chain
from json import loads
import os
import pickle
import warnings

warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from pyspark.ml.feature import NGram
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import col,from_json,udf,split,explode,lit,array,lower
from pyspark.sql.types import StructType,StructField, StringType, IntegerType,MapType,FloatType,ArrayType
from sklearn.metrics import classification_report

In [2]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import CountVectorizer
from pyspark.ml import Pipeline
from pyspark.ml.feature import IDF
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from sklearn.metrics import classification_report,accuracy_score,precision_score,recall_score,f1_score
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [3]:
# Create (or reuse) the Spark session that every later cell relies on.
# It targets the standalone master at spark-master:7077 and asks for
# 2048 MB of memory per executor.
spark = (
    SparkSession.builder
    .appName("ml")
    .master("spark://spark-master:7077")
    .config("spark.executor.memory", "2048m")
    .getOrCreate()
)

23/01/19 11:09:31 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [4]:
class SentimentModel:
    """Train and evaluate TF-IDF based sentiment classifiers with Spark ML.

    Constructing an instance loads the train/test parquet datasets, removes
    rare words from ``clean_content``, tokenizes the text and fits a
    CountVectorizer + IDF pipeline on the training set. Afterwards
    ``model_logistic`` / ``model_rf`` run a 5-fold cross-validated grid search
    and store the fitted ``CrossValidatorModel`` in ``self.model`` under the
    keys ``lr_yes`` / ``lr_no`` / ``rf_yes`` / ``rf_no`` (with / without class
    weights).

    The class relies on the module-level ``spark`` session.

    NOTE(review): models are fitted against the ``label`` column while
    ``evaluate`` scores predictions against ``true_label`` -- confirm that this
    difference is intended.
    """

    # Defaults reproduce the values that used to be hard-coded in the methods.
    TRAIN_PATH = 'hdfs://namenode:9000/ml/train_data'
    TEST_PATH = 'hdfs://namenode:9000/ml/test_data'
    MODEL_DIR = 'hdfs://namenode:9000/save_model'
    STOP_WORD_PATH = 'data/dict_stop_word.pkl'
    MIN_WORD_COUNT = 10   # words seen fewer times than this in train are dropped
    NUM_FOLDS = 5         # folds used by every CrossValidator

    def __init__(self, train_path=TRAIN_PATH, test_path=TEST_PATH,
                 min_word_count=MIN_WORD_COUNT):
        """Load the datasets and build the TF-IDF features.

        Args:
            train_path: Parquet location of the training data.
            test_path: Parquet location of the test data.
            min_word_count: Words occurring fewer than this many times in the
                training set are removed from both datasets.
        """
        self.model = {}
        self.df_test = spark.read.parquet(test_path)
        self.df_train = spark.read.parquet(train_path)
        self.clean_data(min_word_count)
        self.split_content()
        self.convert_feature()

    def getNGram(self,df,n):
        """Count the n-grams of the ``comment_term`` array column of ``df``.

        Returns:
            A DataFrame with one row per distinct n-gram: ``word``, ``count``.
        """
        ngram = NGram(n=n, inputCol="comment_term", outputCol="nGrams")
        df_nGram = ngram.transform(df)
        return (
            df_nGram
            .withColumn('word', explode(df_nGram.nGrams))
            .groupBy('word')
            .count()
        )

    def clean_data(self, min_word_count=MIN_WORD_COUNT):
        """Remove rare words from ``clean_content`` in both datasets.

        The rare-word list is derived from the training set only and stored in
        ``self.dict_stop_word`` (word -> 1). As side effects this registers the
        temp views ``result_nGram``, ``df_test`` and ``df_train`` and the SQL
        UDF ``remove_stop_word`` on the shared Spark session.
        """
        df = self.df_train.withColumn(
            'comment_term', split(self.df_train.clean_content, ' ', -1))

        result_nGram = self.getNGram(df, 1)
        result_nGram.createOrReplaceTempView('result_nGram')

        rare_rows = spark.sql(f"""
            select word from result_nGram
            where count < {int(min_word_count)}
        """).collect()

        dict_stop_word = {row['word']: 1 for row in rare_rows}
        self.dict_stop_word = dict_stop_word
        self.df_test.createOrReplaceTempView('df_test')
        self.df_train.createOrReplaceTempView('df_train')

        # Keep this a closure over the plain dict only: referencing ``self`` or
        # the class here would make Spark try to serialize objects that hold
        # the SparkSession when shipping the UDF to the executors.
        def remove_stop_word(txt):
            if txt is None:
                # A null comment used to crash the UDF; treat it as empty text.
                return ''
            kept = [word for word in txt.split() if word not in dict_stop_word]
            return ' '.join(kept)

        spark.udf.register("remove_stop_word", remove_stop_word, StringType())

        self.df_test = spark.sql("""
            select remove_stop_word(clean_content) clean_content,rating,sentiment,true_label,label 
            from df_test
        """)

        self.df_train = spark.sql("""
            select remove_stop_word(clean_content) clean_content,rating,sentiment,label 
            from df_train
        """)

    def set_weight(self, w_a = 5,w_b = 5, w_c = 1):
        """Add a ``weight`` column to ``self.train_idf`` based on ``label``.

        Args:
            w_a: Weight for rows with label 0.
            w_b: Weight for rows with label 1.
            w_c: Weight for rows with label 2.
        """
        class_weights_spark = {0:w_a,1:w_b,2:w_c}
        # Flatten {label: weight} into lit(k1), lit(v1), lit(k2), ... for create_map.
        mapping_expr = F.create_map([F.lit(x) for x in chain(*class_weights_spark.items())])
        self.train_idf = self.train_idf.withColumn("weight", mapping_expr.getItem(F.col("label")))

    def split_content(self):
        """Tokenize ``clean_content`` on single spaces into ``cmt_token``."""
        self.train_set = self.df_train.select(
            split(self.df_train.clean_content, ' ').alias('cmt_token'),
            'clean_content', 'rating', 'label')
        self.test_set = self.df_test.select(
            split(self.df_test.clean_content, ' ').alias('cmt_token'),
            'clean_content', 'rating', 'label', 'true_label')

    def convert_feature(self):
        """Fit CountVectorizer + IDF on the training set and transform both sets.

        The fitted pipeline is kept in ``self.model_tfidf``; the transformed
        data in ``self.train_idf`` / ``self.test_idf`` (column ``featuresTFIDF``).
        """
        count = CountVectorizer(inputCol="cmt_token", outputCol="rawFeatures")
        idf = IDF(inputCol="rawFeatures", outputCol="featuresTFIDF")
        pipeline = Pipeline(stages=[count, idf])
        self.model_tfidf = pipeline.fit(self.train_set)
        self.train_idf = self.model_tfidf.transform(self.train_set)
        self.test_idf = self.model_tfidf.transform(self.test_set)

    def _require_weight_column(self):
        """Fail early, with a clear message, if ``set_weight`` was not called."""
        if "weight" not in self.train_idf.columns:
            raise ValueError(
                "train_idf has no 'weight' column; call set_weight() before "
                "training with weight=True")

    def _cross_validate(self, estimator, param_grid, model_key, seed=None):
        """Grid-search ``estimator`` with k-fold CV and predict on the test set.

        The fitted CrossValidatorModel is stored in ``self.model[model_key]``.
        Model selection uses accuracy on the ``label`` column.
        """
        evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
        crossval = CrossValidator(estimator=estimator,
                                  estimatorParamMaps=param_grid,
                                  evaluator=evaluator,
                                  numFolds=self.NUM_FOLDS)
        if seed is not None:
            # Fixes the random fold assignment so runs are comparable.
            crossval.setSeed(seed)
        fitted = crossval.fit(self.train_idf)
        self.model[model_key] = fitted
        return fitted.transform(self.test_idf)

    def model_logistic(self,weight, seed=None):
        """Cross-validate a logistic regression and predict on the test set.

        Args:
            weight: If truthy, train with the ``weight`` column (requires a
                prior ``set_weight`` call) and store the model as ``lr_yes``;
                otherwise train unweighted and store it as ``lr_no``.
            seed: Optional seed for the cross-validation fold split.

        Returns:
            The test-set predictions DataFrame of the best model.

        Raises:
            ValueError: ``weight`` is truthy but no ``weight`` column exists.
        """
        lr_kwargs = dict(maxIter=20, featuresCol="featuresTFIDF", tol=1E-6,
                         regParam=0.3, elasticNetParam=0)
        if weight:
            self._require_weight_column()
            lr_kwargs["weightCol"] = "weight"
        lr = LogisticRegression(**lr_kwargs)

        # The grid overrides maxIter / regParam / elasticNetParam set above.
        paramGrid = ParamGridBuilder()\
                    .addGrid(lr.maxIter, [10, 20, 50])\
                    .addGrid(lr.regParam, [0.1,0.3,0.5])\
                    .addGrid(lr.elasticNetParam,  [0.0, 0.1, 0.2])\
                    .build()

        return self._cross_validate(
            lr, paramGrid, 'lr_yes' if weight else 'lr_no', seed)

    def model_rf(self,weight, seed=None):
        """Cross-validate a random forest and predict on the test set.

        Args:
            weight: If truthy, train with the ``weight`` column (requires a
                prior ``set_weight`` call) and store the model as ``rf_yes``;
                otherwise train unweighted and store it as ``rf_no``.
            seed: Optional seed applied to both the forest and the fold split.

        Returns:
            The test-set predictions DataFrame of the best model.

        Raises:
            ValueError: ``weight`` is truthy but no ``weight`` column exists.
        """
        rf_kwargs = dict(featuresCol="featuresTFIDF")
        if weight:
            self._require_weight_column()
            rf_kwargs["weightCol"] = "weight"
        trainer = RandomForestClassifier(**rf_kwargs)
        if seed is not None:
            trainer.setSeed(seed)

        paramGrid = ParamGridBuilder()\
                    .addGrid(trainer.numTrees, [10,20,50])\
                    .addGrid(trainer.maxDepth, [2,6,8])\
                    .addGrid(trainer.minInstancesPerNode, [1,3,5])\
                    .build()

        return self._cross_validate(
            trainer, paramGrid, 'rf_yes' if weight else 'rf_no', seed)

    def evaluate(self,predictions):
        """Print test metrics comparing ``prediction`` against ``true_label``.

        Prints accuracy, weighted precision / recall / F1 and the sklearn
        per-class classification report. The predictions are collected to the
        driver as a pandas DataFrame.
        """
        result = predictions.select('true_label', 'prediction').toPandas()
        y_true, y_pred = result.true_label, result.prediction

        print('accuracy_score: ', accuracy_score(y_true, y_pred))
        # This line used to be labelled "prediction", which hid that it is the precision.
        print('precision_score: ', precision_score(y_true, y_pred, average='weighted'))
        print('recall_score: ', recall_score(y_true, y_pred, average='weighted'))
        print('f1_score: ', f1_score(y_true, y_pred, average='weighted'))
        print(classification_report(y_true, y_pred))

    def save_model(self, model_dir=MODEL_DIR, stop_word_path=STOP_WORD_PATH):
        """Persist the trained models, the TF-IDF pipeline and the stop words.

        Only models that were actually trained (present in ``self.model``) are
        written, so a partial experiment can be saved without a ``KeyError``.

        Args:
            model_dir: Base location for the Spark ML writers; each model goes
                to ``<model_dir>/<key>`` and the pipeline to
                ``<model_dir>/model_tfidf``. Existing data is overwritten.
            stop_word_path: Local file that receives the pickled stop-word dict.
        """
        for model_name, fitted in self.model.items():
            fitted.write().overwrite().save(f'{model_dir}/{model_name}')

        self.model_tfidf.write().overwrite().save(f'{model_dir}/model_tfidf')

        # Make sure the target directory exists so the final step cannot fail
        # after all the models have already been written.
        parent_dir = os.path.dirname(stop_word_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        with open(stop_word_path, 'wb') as f:
            pickle.dump(self.dict_stop_word, f)

In [5]:
# Instantiating runs the whole preparation chain in __init__: read the parquet
# datasets, strip rare words, tokenize, and fit/apply the TF-IDF pipeline.
model = SentimentModel()

                                                                                

# Class weighting for imbalanced labels

In [6]:
# Count the training rows per label with a single aggregation instead of
# three separate filter + count jobs over the same DataFrame.
label_counts = {
    row['label']: row['count']
    for row in model.train_set.groupBy('label').count().collect()
}
# A label that does not occur counts as 0, exactly as a filtered count would.
lb0_cnt = label_counts.get(0, 0)
lb1_cnt = label_counts.get(1, 0)
lb2_cnt = label_counts.get(2, 0)

In [7]:
# Weight labels 0 and 1 by how many times label 2 outnumbers them (whole
# number ratio). The floor of 1 keeps a class from getting weight 0 -- and
# thereby being ignored in training -- when it is larger than class 2.
w_a = max(1, lb2_cnt // lb0_cnt)
w_b = max(1, lb2_cnt // lb1_cnt)

In [8]:
# Adds the "weight" column to model.train_idf: w_a for label 0, w_b for label 1,
# and the default weight 1 for label 2.
model.set_weight(w_a,w_b)

# Logistic Regression

## Weight balance

In [None]:
# 5-fold cross-validated grid search for logistic regression using the
# "weight" column; returns the best model's predictions on the test set.
predictions_wb = model.model_logistic(weight=True)

In [10]:
# Test-set metrics for the class-weighted logistic regression.
model.evaluate(predictions_wb)

23/01/18 17:01:05 WARN DAGScheduler: Broadcasting large task binary with size 1167.3 KiB


accuracy_score:  0.8245931283905967
prediction:  0.8410244847882596
recall_score:  0.8245931283905967
f1_score:  0.8317050234923122
              precision    recall  f1-score   support

           0       0.72      0.76      0.74       805
           1       0.40      0.50      0.44       538
           2       0.93      0.89      0.91      3634

    accuracy                           0.82      4977
   macro avg       0.68      0.71      0.70      4977
weighted avg       0.84      0.82      0.83      4977



## No balance

In [None]:
# Same grid search as the weighted run, but trained without the "weight" column.
predictions_no_wb = model.model_logistic(weight=False)

In [12]:
# Test-set metrics for the unweighted logistic regression.
model.evaluate(predictions_no_wb)

23/01/18 17:45:37 WARN DAGScheduler: Broadcasting large task binary with size 1167.3 KiB


accuracy_score:  0.8111312035362668
prediction:  0.778714701546379
recall_score:  0.8111312035362668
f1_score:  0.7603358919740008
              precision    recall  f1-score   support

           0       0.87      0.52      0.65       805
           1       0.45      0.03      0.05       538
           2       0.81      0.99      0.89      3634

    accuracy                           0.81      4977
   macro avg       0.71      0.51      0.53      4977
weighted avg       0.78      0.81      0.76      4977



# RandomForestClassifier

## Weight balance

In [None]:
# 5-fold cross-validated grid search for a random forest using the "weight"
# column; returns the best model's predictions on the test set.
rf_predictions_wb = model.model_rf(weight=True)

In [14]:
# Test-set metrics for the class-weighted random forest.
model.evaluate(rf_predictions_wb)

23/01/19 03:29:54 WARN DAGScheduler: Broadcasting large task binary with size 1390.0 KiB


accuracy_score:  0.7862165963431786
prediction:  0.7924247681816011
recall_score:  0.7862165963431786
f1_score:  0.7775392457146962
              precision    recall  f1-score   support

           0       0.54      0.82      0.65       805
           1       0.51      0.21      0.30       538
           2       0.89      0.86      0.88      3634

    accuracy                           0.79      4977
   macro avg       0.64      0.63      0.61      4977
weighted avg       0.79      0.79      0.78      4977



## No balance

In [None]:
# Same random-forest grid search, but trained without the "weight" column.
rf_predictions_no_wb = model.model_rf(weight=False)

In [10]:
# Test-set metrics for the unweighted random forest.
model.evaluate(rf_predictions_no_wb)

23/01/19 20:58:07 WARN DAGScheduler: Broadcasting large task binary with size 1065.9 KiB


accuracy_score:  0.7301587301587301
prediction:  0.5331317712270093
recall_score:  0.7301587301587301
f1_score:  0.6162807630697539
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       805
           1       0.00      0.00      0.00       538
           2       0.73      1.00      0.84      3634

    accuracy                           0.73      4977
   macro avg       0.24      0.33      0.28      4977
weighted avg       0.53      0.73      0.62      4977



# Analysis result

## Save model 

In [None]:
# Persist the trained models, the fitted TF-IDF pipeline and the stop-word dictionary.
model.save_model()

## Best parameters: Logistic Regression

In [16]:
# Parameter combination with the highest mean cross-validation accuracy
# (class-weighted logistic regression).
model.model['lr_yes'].getEstimatorParamMaps()[ np.argmax(model.model['lr_yes'].avgMetrics) ]

{Param(parent='LogisticRegression_8a1de7e06574', name='maxIter', doc='max number of iterations (>= 0).'): 10,
 Param(parent='LogisticRegression_8a1de7e06574', name='regParam', doc='regularization parameter (>= 0).'): 0.5,
 Param(parent='LogisticRegression_8a1de7e06574', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0.0}

In [17]:
# Parameter combination with the highest mean cross-validation accuracy
# (unweighted logistic regression).
model.model['lr_no'].getEstimatorParamMaps()[ np.argmax(model.model['lr_no'].avgMetrics) ]

{Param(parent='LogisticRegression_6cd7ff919380', name='maxIter', doc='max number of iterations (>= 0).'): 20,
 Param(parent='LogisticRegression_6cd7ff919380', name='regParam', doc='regularization parameter (>= 0).'): 0.1,
 Param(parent='LogisticRegression_6cd7ff919380', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0.0}

## Best parameters: Random Forest

In [18]:
# Parameter combination with the highest mean cross-validation accuracy
# (class-weighted random forest).
model.model['rf_yes'].getEstimatorParamMaps()[ np.argmax(model.model['rf_yes'].avgMetrics) ]

{Param(parent='RandomForestClassifier_a43184117d6e', name='numTrees', doc='Number of trees to train (>= 1).'): 50,
 Param(parent='RandomForestClassifier_a43184117d6e', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.'): 6,
 Param(parent='RandomForestClassifier_a43184117d6e', name='minInstancesPerNode', doc='Minimum number of instances each child must have after split. If a split causes the left or right child to have fewer than minInstancesPerNode, the split will be discarded as invalid. Should be >= 1.'): 1}

In [11]:
# Parameter combination with the highest mean cross-validation accuracy
# (unweighted random forest).
model.model['rf_no'].getEstimatorParamMaps()[ np.argmax(model.model['rf_no'].avgMetrics) ]

{Param(parent='RandomForestClassifier_e24d6640e138', name='numTrees', doc='Number of trees to train (>= 1).'): 10,
 Param(parent='RandomForestClassifier_e24d6640e138', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.'): 8,
 Param(parent='RandomForestClassifier_e24d6640e138', name='minInstancesPerNode', doc='Minimum number of instances each child must have after split. If a split causes the left or right child to have fewer than minInstancesPerNode, the split will be discarded as invalid. Should be >= 1.'): 5}

## Average cross-validation accuracy per parameter combination

In [19]:
# Mean cross-validation accuracy of each grid point, class-weighted logistic regression.
model.model['lr_yes'].avgMetrics

[0.7764573125965952,
 0.7456340038571283,
 0.694868138187162,
 0.7766417513383568,
 0.6603483030996044,
 0.5699171978957835,
 0.7776969799924368,
 0.5873499331291506,
 0.4123602067998112,
 0.774367323242944,
 0.7359863382765413,
 0.6904136621058743,
 0.7723524748217678,
 0.6607537375301171,
 0.5708691275277076,
 0.7726548724826142,
 0.5872946996660414,
 0.4123602067998112,
 0.773169535841704,
 0.7351117228190561,
 0.6904737189325507,
 0.772075860998275,
 0.6607463578471462,
 0.5708691275277076,
 0.7725931431128752,
 0.5872928558127568,
 0.4123602067998112]

In [20]:
# Mean cross-validation accuracy of each grid point, unweighted logistic regression.
model.model['lr_no'].avgMetrics

[0.8427689582216817,
 0.813631009358676,
 0.8008990360347747,
 0.8270214764026717,
 0.7934305370142724,
 0.7923292488967033,
 0.8175054003187974,
 0.792344920769156,
 0.7923329303378015,
 0.8438214947877416,
 0.8138266244500139,
 0.8007748666813412,
 0.8276427198694821,
 0.7934508287094659,
 0.7923292488967033,
 0.8178805210158259,
 0.792344920769156,
 0.7923329303378015,
 0.8436574566525537,
 0.8138431540096609,
 0.8007260000977876,
 0.8275809224566997,
 0.7934425264113083,
 0.7923292488967033,
 0.8178712927022566,
 0.792344920769156,
 0.7923329303378015]

In [21]:
# Mean cross-validation accuracy of each grid point, class-weighted random forest.
model.model['rf_yes'].avgMetrics

[0.7138675397647222,
 0.7138675397647222,
 0.7138675397647222,
 0.7465267035308213,
 0.7199575766075282,
 0.7470936372750179,
 0.7416897688375652,
 0.7295085577071947,
 0.7299894681347712,
 0.718955712961225,
 0.718955712961225,
 0.718955712961225,
 0.7361737424047989,
 0.7616374644691588,
 0.7269110658394248,
 0.742663707901961,
 0.7503309425088778,
 0.7469322706227179,
 0.7640029393007719,
 0.7640029393007719,
 0.7640020204364881,
 0.7810081686184898,
 0.7769529385892355,
 0.7786959499214202,
 0.7775047554105232,
 0.776793100003014,
 0.7803226995833168]

23/01/19 06:48:56 WARN DAGScheduler: Broadcasting large task binary with size 1327.9 KiB
[Stage 11764:>                                                      (0 + 2) / 3]

In [12]:
# Mean cross-validation accuracy of each grid point, unweighted random forest.
model.model['rf_no'].avgMetrics

[0.792333505755006,
 0.792333505755006,
 0.792333505755006,
 0.7923371923971239,
 0.7923353520849636,
 0.7923390293951313,
 0.7923998968037614,
 0.7923869661639318,
 0.7924026191211583,
 0.792333505755006,
 0.792333505755006,
 0.792333505755006,
 0.7923344267264467,
 0.792334427528683,
 0.792333505755006,
 0.792340873581691,
 0.792344566588896,
 0.7923390355207991,
 0.792333505755006,
 0.792333505755006,
 0.792333505755006,
 0.792333505755006,
 0.792333505755006,
 0.792333505755006,
 0.7923399478496358,
 0.7923418072594139,
 0.7923399601272201]