In [1]:
from json import loads
from pyspark.sql import SparkSession
import warnings
import pandas as pd
warnings.filterwarnings("ignore")
from pyspark.sql.functions import col,from_json,udf,split,explode,lit,array,lower
from pyspark.ml.feature import NGram
from pyspark.sql.types import StructType,StructField, StringType, IntegerType,MapType,FloatType,ArrayType
import numpy as np
from pyspark.sql import functions as F
from itertools import chain
from sklearn.metrics import classification_report

In [2]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import CountVectorizer
from pyspark.ml import Pipeline
from pyspark.ml.feature import IDF
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from sklearn.metrics import classification_report,accuracy_score,precision_score,recall_score,f1_score
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [3]:
# Connect to the standalone Spark master (or reuse an already-running session)
# with 1 GiB of memory per executor.
spark = (
    SparkSession.builder
    .appName("ml")
    .master("spark://spark-master:7077")
    .config("spark.executor.memory", "1024m")
    .getOrCreate()
)

23/01/05 16:26:32 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [4]:
class SentimentModel:
    """Train and compare sentiment classifiers on TF-IDF features with Spark ML.

    On construction the train/test parquet data sets are read, ``clean_content``
    is tokenised on single spaces, and a CountVectorizer + IDF pipeline is
    fitted on the training set and applied to both sets.

    Fitted cross-validation models are kept in ``self.model`` under the keys
    ``lr_yes`` / ``lr_no`` (logistic regression with / without class weights)
    and ``rf_yes`` / ``rf_no`` (random forest with / without class weights).

    The class relies on a module-level ``spark`` session being available.
    """

    def __init__(self,
                 train_path='hdfs://namenode:9000/ml/train_data',
                 test_path='hdfs://namenode:9000/ml/test_data'):
        """Load the data and build the TF-IDF features.

        Args:
            train_path: Parquet location of the training set; must provide the
                columns ``clean_content``, ``rating`` and ``label``.
            test_path: Parquet location of the test set; must additionally
                provide ``true_label``.
        """
        # Create the registry first so the instance is usable even if a later
        # step is re-run manually.
        self.model = {}
        self.df_test = spark.read.parquet(test_path)
        self.df_train = spark.read.parquet(train_path)
        self.split_content()
        self.convert_feature()

    def set_weight(self, w_a = 5,w_b = 5, w_c = 1):
        """Add a per-row ``weight`` column to the training features.

        Args:
            w_a: Weight for rows whose ``label`` is 0.
            w_b: Weight for rows whose ``label`` is 1.
            w_c: Weight for rows whose ``label`` is 2.
        """
        class_weights = {0: w_a, 1: w_b, 2: w_c}
        # create_map expects a flat key, value, key, value, ... sequence.
        weight_by_label = F.create_map([F.lit(x) for x in chain(*class_weights.items())])
        self.train_idf = self.train_idf.withColumn("weight", weight_by_label.getItem(F.col("label")))

    def split_content(self):
        """Tokenise ``clean_content`` on single spaces into ``cmt_token``."""
        self.train_set = self.df_train.select(
            split(self.df_train.clean_content, ' ').alias('cmt_token'),
            'clean_content', 'rating', 'label')
        self.test_set = self.df_test.select(
            split(self.df_test.clean_content, ' ').alias('cmt_token'),
            'clean_content', 'rating', 'label', 'true_label')

    def convert_feature(self):
        """Fit CountVectorizer + IDF on the training set and transform both sets."""
        count = CountVectorizer(inputCol="cmt_token", outputCol="rawFeatures")
        idf = IDF(inputCol="rawFeatures", outputCol="featuresTFIDF", minDocFreq=2)
        pipeline = Pipeline(stages=[count, idf])
        self.model_tfidf = pipeline.fit(self.train_set)
        self.train_idf = self.model_tfidf.transform(self.train_set)
        self.test_idf = self.model_tfidf.transform(self.test_set)

    def _require_weight_column(self):
        """Fail early with a clear message when class weights were never attached."""
        if "weight" not in self.train_idf.columns:
            raise ValueError(
                "weight=True requires a 'weight' column on the training data; "
                "call set_weight() first")

    def _cross_validate(self, estimator, param_grid, model_key):
        """Grid-search ``estimator`` with 5-fold CV, store the result, predict on test.

        Model selection uses plain accuracy on the ``label`` column.
        """
        evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
        crossval = CrossValidator(estimator=estimator,
                                  estimatorParamMaps=param_grid,
                                  evaluator=evaluator,
                                  numFolds=5)
        fitted = crossval.fit(self.train_idf)
        self.model[model_key] = fitted
        return fitted.transform(self.test_idf)

    def model_logistic(self,weight):
        """Cross-validate a logistic regression and return test-set predictions.

        Args:
            weight: When truthy, train with the ``weight`` column produced by
                ``set_weight`` and store the model as ``lr_yes``; otherwise
                train unweighted and store it as ``lr_no``.

        Returns:
            The test DataFrame transformed by the best cross-validated model.

        Raises:
            ValueError: If ``weight`` is truthy but no ``weight`` column exists.
        """
        if weight:
            self._require_weight_column()

        # maxIter / regParam / elasticNetParam are starting values only; the
        # parameter grid below overrides them for every candidate.
        lr_kwargs = dict(maxIter=20, featuresCol="featuresTFIDF", tol=1E-6,
                         regParam=0.3, elasticNetParam=0)
        if weight:
            lr_kwargs["weightCol"] = "weight"
        lr = LogisticRegression(**lr_kwargs)

        param_grid = (ParamGridBuilder()
                      .addGrid(lr.maxIter, [10, 20, 50])
                      .addGrid(lr.regParam, [0.1, 0.3, 0.5])
                      .addGrid(lr.elasticNetParam, [0.0, 0.1, 0.2])
                      .build())
        return self._cross_validate(lr, param_grid, 'lr_yes' if weight else 'lr_no')

    def model_rf(self,weight):
        """Cross-validate a random forest and return test-set predictions.

        Args:
            weight: When truthy, train with the ``weight`` column produced by
                ``set_weight`` and store the model as ``rf_yes``; otherwise
                train unweighted and store it as ``rf_no``.

        Returns:
            The test DataFrame transformed by the best cross-validated model.

        Raises:
            ValueError: If ``weight`` is truthy but no ``weight`` column exists.
        """
        if weight:
            self._require_weight_column()

        rf_kwargs = dict(featuresCol="featuresTFIDF")
        if weight:
            rf_kwargs["weightCol"] = "weight"
        trainer = RandomForestClassifier(**rf_kwargs)

        param_grid = (ParamGridBuilder()
                      .addGrid(trainer.numTrees, [10, 20, 50])
                      .addGrid(trainer.maxDepth, [2, 6, 8])
                      .addGrid(trainer.minInstancesPerNode, [1, 3, 5])
                      .build())
        return self._cross_validate(trainer, param_grid, 'rf_yes' if weight else 'rf_no')

    def evaluate(self,predictions):
        """Print test metrics for ``predictions`` measured against ``true_label``.

        Prints accuracy, weighted precision / recall / F1 and the per-class
        classification report. The two needed columns are collected to the
        driver as a pandas DataFrame.
        """
        result = predictions.select('true_label', 'prediction').toPandas()
        y_true, y_pred = result.true_label, result.prediction

        print('accuracy_score: ', accuracy_score(y_true, y_pred))
        # This line was previously labelled "prediction: " although it reports precision.
        print('precision_score: ', precision_score(y_true, y_pred, average='weighted'))
        print('recall_score: ', recall_score(y_true, y_pred, average='weighted'))
        print('f1_score: ', f1_score(y_true, y_pred, average='weighted'))
        print(classification_report(y_true, y_pred))

    def save_model(self, base_path='hdfs://namenode:9000/save_model', overwrite=False):
        """Persist every trained model plus the fitted TF-IDF pipeline.

        Only models that have actually been trained are written, so this can be
        called before all four variants exist.

        Args:
            base_path: Directory under which each model is saved by its key
                (``lr_yes``, ``rf_no``, ...) and the pipeline as ``model_tfidf``.
            overwrite: When True, replace models already present at the target
                paths; when False (default) an existing path makes the save fail.
        """
        def _write(fitted, name):
            writer = fitted.write()
            if overwrite:
                writer = writer.overwrite()
            writer.save(f'{base_path}/{name}')

        for model_name, fitted in self.model.items():
            _write(fitted, model_name)

        _write(self.model_tfidf, 'model_tfidf')

In [5]:
# Reads the train/test parquet data and fits the CountVectorizer + IDF pipeline
# (all done inside SentimentModel.__init__).
model = SentimentModel()

                                                                                

# Class weights (used in place of oversampling)

In [6]:
# Count rows per label in a single aggregation instead of three separate
# filter-and-count scans over the training set.
label_counts = {
    row['label']: row['count']
    for row in model.train_set.groupBy('label').count().collect()
}
# A label that never occurs has no group, so default to 0 as filter().count() would.
lb0_cnt = label_counts.get(0, 0)
lb1_cnt = label_counts.get(1, 0)
lb2_cnt = label_counts.get(2, 0)

In [7]:
# Whole-number ratio of the label-2 count to each other class's count.
# Floored at 1 so a class that outnumbers label 2 is never given weight 0,
# and guarded so an absent class does not raise ZeroDivisionError.
w_a = max(1, lb2_cnt // lb0_cnt) if lb0_cnt else 1
w_b = max(1, lb2_cnt // lb1_cnt) if lb1_cnt else 1

In [8]:
# Labels 0 and 1 get the ratios computed above; label 2 keeps set_weight's default w_c=1.
model.set_weight(w_a,w_b)

# LogisticRegression

## Weight balance

In [None]:
# 3 x 3 x 3 parameter grid with 5-fold cross-validation, trained with the `weight` column.
# The fitted CrossValidatorModel is stored in model.model['lr_yes'].
predictions_wb = model.model_logistic(weight=True)

In [10]:
# Test-set metrics for the weighted logistic regression, measured against `true_label`.
model.evaluate(predictions_wb)

23/01/05 16:59:45 WARN DAGScheduler: Broadcasting large task binary with size 1402.7 KiB


accuracy_score:  0.8071127185051236
prediction:  0.8380121130560214
recall_score:  0.8071127185051236
f1_score:  0.8191898941593666
              precision    recall  f1-score   support

           0       0.78      0.65      0.71       805
           1       0.33      0.51      0.40       538
           2       0.93      0.89      0.91      3634

    accuracy                           0.81      4977
   macro avg       0.68      0.68      0.67      4977
weighted avg       0.84      0.81      0.82      4977



## No balance

In [None]:
# Same grid search without a weight column; the fitted model is stored in model.model['lr_no'].
predictions_no_wb = model.model_logistic(weight=False)

In [12]:
# Test-set metrics for the unweighted logistic regression, measured against `true_label`.
model.evaluate(predictions_no_wb)

23/01/05 17:32:05 WARN DAGScheduler: Broadcasting large task binary with size 1402.8 KiB


accuracy_score:  0.7996785211975085
prediction:  0.7822861675903823
recall_score:  0.7996785211975085
f1_score:  0.7437892326495242
              precision    recall  f1-score   support

           0       0.89      0.44      0.59       805
           1       0.55      0.02      0.04       538
           2       0.79      0.99      0.88      3634

    accuracy                           0.80      4977
   macro avg       0.74      0.48      0.50      4977
weighted avg       0.78      0.80      0.74      4977



# RandomForestClassifier

## Weight balance

In [None]:
# 3 x 3 x 3 parameter grid (numTrees, maxDepth, minInstancesPerNode) with 5-fold
# cross-validation, trained with the `weight` column; stored in model.model['rf_yes'].
rf_predictions_wb = model.model_rf(weight=True)

In [14]:
# Test-set metrics for the weighted random forest, measured against `true_label`.
model.evaluate(rf_predictions_wb)

23/01/06 04:43:29 WARN DAGScheduler: Broadcasting large task binary with size 1423.6 KiB


accuracy_score:  0.6978099256580269
prediction:  0.8142869090693283
recall_score:  0.6978099256580269
f1_score:  0.7217519529565698
              precision    recall  f1-score   support

           0       0.85      0.26      0.40       805
           1       0.20      0.53      0.29       538
           2       0.90      0.82      0.86      3634

    accuracy                           0.70      4977
   macro avg       0.65      0.54      0.52      4977
weighted avg       0.81      0.70      0.72      4977



## No balance

In [None]:
# Same grid search without a weight column; the fitted model is stored in model.model['rf_no'].
rf_predictions_no_wb = model.model_rf(weight=False)

In [16]:
# Test-set metrics for the unweighted random forest, measured against `true_label`.
model.evaluate(rf_predictions_no_wb)

accuracy_score:  0.7301587301587301
prediction:  0.5331317712270093
recall_score:  0.7301587301587301
f1_score:  0.6162807630697539
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       805
           1       0.00      0.00      0.00       538
           2       0.73      1.00      0.84      3634

    accuracy                           0.73      4977
   macro avg       0.24      0.33      0.28      4977
weighted avg       0.53      0.73      0.62      4977



# Analysis result

## Save model 

In [27]:
# Writes the trained models and the fitted TF-IDF pipeline under hdfs://namenode:9000/save_model/.
model.save_model()

## Best param LR

In [18]:
# Parameter combination with the highest mean cross-validation metric (weighted LR).
lr_yes_cv = model.model['lr_yes']
lr_yes_best_idx = np.argmax(lr_yes_cv.avgMetrics)
lr_yes_cv.getEstimatorParamMaps()[lr_yes_best_idx]

{Param(parent='LogisticRegression_ee405948cf7c', name='maxIter', doc='max number of iterations (>= 0).'): 10,
 Param(parent='LogisticRegression_ee405948cf7c', name='regParam', doc='regularization parameter (>= 0).'): 0.1,
 Param(parent='LogisticRegression_ee405948cf7c', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0.0}

In [19]:
# Parameter combination with the highest mean cross-validation metric (unweighted LR).
lr_no_cv = model.model['lr_no']
lr_no_best_idx = np.argmax(lr_no_cv.avgMetrics)
lr_no_cv.getEstimatorParamMaps()[lr_no_best_idx]

{Param(parent='LogisticRegression_36f49e359ee4', name='maxIter', doc='max number of iterations (>= 0).'): 20,
 Param(parent='LogisticRegression_36f49e359ee4', name='regParam', doc='regularization parameter (>= 0).'): 0.1,
 Param(parent='LogisticRegression_36f49e359ee4', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0.0}

## Best param RF

In [20]:
# Parameter combination with the highest mean cross-validation metric (weighted RF).
rf_yes_cv = model.model['rf_yes']
rf_yes_best_idx = np.argmax(rf_yes_cv.avgMetrics)
rf_yes_cv.getEstimatorParamMaps()[rf_yes_best_idx]

{Param(parent='RandomForestClassifier_19268b04e044', name='numTrees', doc='Number of trees to train (>= 1).'): 50,
 Param(parent='RandomForestClassifier_19268b04e044', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.'): 8,
 Param(parent='RandomForestClassifier_19268b04e044', name='minInstancesPerNode', doc='Minimum number of instances each child must have after split. If a split causes the left or right child to have fewer than minInstancesPerNode, the split will be discarded as invalid. Should be >= 1.'): 5}

In [21]:
# Parameter combination with the highest mean cross-validation metric (unweighted RF).
rf_no_cv = model.model['rf_no']
rf_no_best_idx = np.argmax(rf_no_cv.avgMetrics)
rf_no_cv.getEstimatorParamMaps()[rf_no_best_idx]

{Param(parent='RandomForestClassifier_9f7ae89f4864', name='numTrees', doc='Number of trees to train (>= 1).'): 10,
 Param(parent='RandomForestClassifier_9f7ae89f4864', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.'): 8,
 Param(parent='RandomForestClassifier_9f7ae89f4864', name='minInstancesPerNode', doc='Minimum number of instances each child must have after split. If a split causes the left or right child to have fewer than minInstancesPerNode, the split will be discarded as invalid. Should be >= 1.'): 1}

## Average cross-validation accuracy per parameter combination

In [22]:
# Mean 5-fold accuracy for each of the 27 grid combinations, in getEstimatorParamMaps() order.
model.model['lr_yes'].avgMetrics

[0.8035241254071251,
 0.7683423426889808,
 0.6972582962713498,
 0.8006572715530759,
 0.6635146579721681,
 0.578808415620367,
 0.7982104886786365,
 0.5971425076992609,
 0.399380499326747,
 0.7985474107130952,
 0.7459278516865715,
 0.6992864664188447,
 0.7888302059647893,
 0.6658135391330271,
 0.5682901734075071,
 0.7840285206222166,
 0.6074315623156624,
 0.399380499326747,
 0.7962262538072673,
 0.7453606400791458,
 0.6992929867733968,
 0.7886265545327096,
 0.6655905387900842,
 0.5680833126628535,
 0.7841037490813189,
 0.6076102459215886,
 0.399380499326747]

In [23]:
# Mean 5-fold accuracy for each of the 27 grid combinations, in getEstimatorParamMaps() order.
model.model['lr_no'].avgMetrics

[0.8723771621025318,
 0.8455521329967675,
 0.8320283526800031,
 0.8580375138735298,
 0.8248658656545842,
 0.8233104207854742,
 0.8490610612224072,
 0.8233292651201483,
 0.8233104207854742,
 0.873914207825562,
 0.8461983352577356,
 0.8329911299234869,
 0.85907582829636,
 0.8248878321839845,
 0.8233104207854742,
 0.8499297000781764,
 0.8233292651201483,
 0.8233104207854742,
 0.8737103421528526,
 0.8461699137588943,
 0.8330040698441363,
 0.8587810412418978,
 0.8248878178899388,
 0.8233104207854742,
 0.8497666934215311,
 0.8233292651201483,
 0.8233104207854742]

In [24]:
# Mean 5-fold accuracy for each of the 27 grid combinations, in getEstimatorParamMaps() order.
model.model['rf_yes'].avgMetrics

[0.4770127269181251,
 0.47700020053802666,
 0.4770064973707198,
 0.5830843515623017,
 0.5768870095241461,
 0.5638718870016706,
 0.60932291180402,
 0.6114060284105972,
 0.5794341983061156,
 0.5578252973982392,
 0.5578190537394551,
 0.5578221755688471,
 0.6443740427015484,
 0.6423338060847188,
 0.6379585422644761,
 0.6711655208604128,
 0.6628006066318488,
 0.6593658725019277,
 0.6307324738008341,
 0.6307293710921875,
 0.6307388668756267,
 0.6845211527667706,
 0.6819409392275879,
 0.6861957896677083,
 0.6947141017833175,
 0.696715828040569,
 0.7022719143131806]

In [25]:
# Mean 5-fold accuracy for each of the 27 grid combinations, in getEstimatorParamMaps() order.
model.model['rf_no'].avgMetrics

[0.8233104207854742,
 0.8233104207854742,
 0.8233104207854742,
 0.8233261136955721,
 0.8233198255188479,
 0.8233104207854742,
 0.8233889328814579,
 0.823319811224802,
 0.8233386166796128,
 0.8233104207854742,
 0.8233104207854742,
 0.8233104207854742,
 0.8233104207854742,
 0.8233135692018207,
 0.8233104207854742,
 0.8233104207854742,
 0.8233292915244357,
 0.8233167176181675,
 0.8233104207854742,
 0.8233104207854742,
 0.8233104207854742,
 0.8233104207854742,
 0.8233104207854742,
 0.8233104207854742,
 0.8233104207854742,
 0.8233104207854742,
 0.8233104207854742]