In [1]:
from json import loads
from pyspark.sql import SparkSession
import warnings
import pandas as pd
warnings.filterwarnings("ignore")
from pyspark.sql.functions import col,from_json,udf,split,explode,lit,array,lower
from pyspark.ml.feature import NGram
from pyspark.sql.types import StructType,StructField, StringType, IntegerType,MapType,FloatType,ArrayType
import numpy as np
from pyspark.sql import functions as F
from itertools import chain
from sklearn.metrics import classification_report

In [38]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import CountVectorizer
from pyspark.ml import Pipeline
from pyspark.ml.feature import IDF
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from sklearn.metrics import classification_report,accuracy_score,precision_score,recall_score,f1_score
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [3]:
spark = SparkSession.\
        builder.\
        appName("ml").\
        master("spark://spark-master:7077").\
        config("spark.executor.memory", "1024m").\
        getOrCreate()

22/12/09 17:28:34 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [94]:
class SentimentModel:
    
    def __init__(self):
        self.df_test = spark.read.parquet('hdfs://namenode:9000/ml/test_data')
        self.df_train = spark.read.parquet('hdfs://namenode:9000/ml/train_data')
        self.split_content()
        self.convert_feature()
        self.model = {}
        
    def set_weight(self, w_a = 5,w_b = 5, w_c = 1):
        class_weights_spark = {0:w_a,1:w_b,2:w_c}
        mapping_expr = F.create_map([F.lit(x) for x in chain(*class_weights_spark.items())])
        self.train_idf = self.train_idf.withColumn("weight", mapping_expr.getItem(F.col("label")))
        
    def split_content(self):
        self.train_set = self.df_train.select(split(self.df_train.clean_content, ' ').alias('cmt_token'),'clean_content','rating', 'label')
        self.test_set = self.df_test.select(split(self.df_test.clean_content, ' ').alias('cmt_token'),'clean_content','rating', 'label','true_label')
    
    def convert_feature(self):
        count = CountVectorizer(inputCol="cmt_token", outputCol="rawFeatures")
        idf = IDF(inputCol="rawFeatures", outputCol="featuresTFIDF")
        pipeline = Pipeline(stages=[count, idf])
        self.model_tfidf = pipeline.fit(self.train_set)
        self.train_idf = self.model_tfidf.transform(self.train_set)
        self.test_idf = self.model_tfidf.transform(self.test_set)
    
    def model_logistic(self,weight):
        if weight == True:
            lr = LogisticRegression(maxIter=20,featuresCol = "featuresTFIDF", tol=1E-6,regParam=0.3, elasticNetParam=0,weightCol="weight")
        else:
            lr = LogisticRegression(maxIter=20,featuresCol = "featuresTFIDF", tol=1E-6,regParam=0.3, elasticNetParam=0)
        
        paramGrid = ParamGridBuilder()\
                    .addGrid(lr.maxIter, [10, 20, 50])\
                    .addGrid(lr.regParam, [0.1,0.3,0.5])\
                    .addGrid(lr.elasticNetParam,  [0.0, 0.1, 0.2])\
                    .build()
        evaluator = MulticlassClassificationEvaluator(metricName="accuracy")

        crossval = CrossValidator(estimator=lr,
                                  estimatorParamMaps=paramGrid,
                                  evaluator=evaluator,
                                  numFolds=5) 
        model = crossval.fit(self.train_idf)
        if weight == True:
            self.model['weight_balance'] = model
        else:
            self.model['no_weight_balance'] = model
        predictions = model.transform(self.test_idf)
        return predictions

In [107]:
model = SentimentModel()

                                                                                

# Oversampling

In [108]:
lb0_cnt = model.train_set.filter(col('label') == 0).count()
lb1_cnt = model.train_set.filter(col('label') == 1).count()
lb2_cnt = model.train_set.filter(col('label') == 2).count()

In [109]:
w_a = int(lb2_cnt/lb0_cnt)
w_b = int(lb2_cnt/lb1_cnt)

In [110]:
model.set_weight(w_a,w_b)

## Weight balance

In [None]:
predictions_wb = model.model_logistic(weight=True)

In [112]:
# model.model['weight_balance'].getEstimatorParamMaps()[ np.argmax(model.model['weight_balance'].avgMetrics) ]

In [113]:
# model.model['weight_balance'].avgMetrics

In [114]:
result_wb = predictions_wb.select('true_label', 'prediction')
result_wb = result_wb[['true_label','prediction']].toPandas()

22/12/10 07:13:43 WARN DAGScheduler: Broadcasting large task binary with size 1643.4 KiB


In [115]:
print(f'accuracy_score: ',accuracy_score(result_wb.true_label, result_wb.prediction))
print(f'prediction: ',precision_score(result_wb.true_label, result_wb.prediction, average='weighted'))
print(f'recall_score: ',recall_score(result_wb.true_label, result_wb.prediction, average='weighted'))
print(f'f1_score: ',f1_score(result_wb.true_label, result_wb.prediction, average='weighted'))

accuracy_score:  0.8065099457504521
prediction:  0.8367066733814391
recall_score:  0.8065099457504521
f1_score:  0.8183025153512631


In [116]:
print(classification_report(result_wb.true_label, result_wb.prediction))

              precision    recall  f1-score   support

           0       0.79      0.65      0.71       805
           1       0.33      0.50      0.40       538
           2       0.92      0.89      0.90      3634

    accuracy                           0.81      4977
   macro avg       0.68      0.68      0.67      4977
weighted avg       0.84      0.81      0.82      4977



# No balance

In [None]:
predictions_no_wb = model.model_logistic(weight=False)

In [118]:
result_nwb = predictions_no_wb.select('true_label', 'prediction')
result_nwb = result_nwb[['true_label','prediction']].toPandas()

22/12/10 07:40:43 WARN DAGScheduler: Broadcasting large task binary with size 1643.5 KiB


In [119]:
print(f'accuracy_score: ',accuracy_score(result_nwb.true_label, result_nwb.prediction))
print(f'prediction: ',precision_score(result_nwb.true_label, result_nwb.prediction, average='weighted'))
print(f'recall_score: ',recall_score(result_nwb.true_label, result_nwb.prediction, average='weighted'))
print(f'f1_score: ',f1_score(result_nwb.true_label, result_nwb.prediction, average='weighted'))

accuracy_score:  0.7980711271850512
prediction:  0.784149411368234
recall_score:  0.7980711271850512
f1_score:  0.7414852595580866


In [120]:
print(classification_report(result_nwb.true_label, result_nwb.prediction))

              precision    recall  f1-score   support

           0       0.88      0.43      0.58       805
           1       0.59      0.02      0.04       538
           2       0.79      0.99      0.88      3634

    accuracy                           0.80      4977
   macro avg       0.75      0.48      0.50      4977
weighted avg       0.78      0.80      0.74      4977



# Analysis result

## Save model 

In [125]:
model.model['weight_balance'].save('hdfs://namenode:9000/save_model/weight_balance')
model.model['no_weight_balance'].save('hdfs://namenode:9000/save_model/no_weight_balance')

In [126]:
model.model['weight_balance'].getEstimatorParamMaps()[ np.argmax(model.model['weight_balance'].avgMetrics) ]

{Param(parent='LogisticRegression_9a1c83f8064e', name='maxIter', doc='max number of iterations (>= 0).'): 10,
 Param(parent='LogisticRegression_9a1c83f8064e', name='regParam', doc='regularization parameter (>= 0).'): 0.1,
 Param(parent='LogisticRegression_9a1c83f8064e', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0.0}

In [127]:
model.model['no_weight_balance'].getEstimatorParamMaps()[ np.argmax(model.model['weight_balance'].avgMetrics) ]

{Param(parent='LogisticRegression_aac6323f5e82', name='maxIter', doc='max number of iterations (>= 0).'): 10,
 Param(parent='LogisticRegression_aac6323f5e82', name='regParam', doc='regularization parameter (>= 0).'): 0.1,
 Param(parent='LogisticRegression_aac6323f5e82', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0.0}

In [128]:
model.model['weight_balance'].avgMetrics

[0.8035300288830529,
 0.7698339339286016,
 0.698711878893849,
 0.8001721040312308,
 0.6614500521615891,
 0.5856481907129361,
 0.798380588940723,
 0.6052631292869556,
 0.3993787689234043,
 0.7980602241801211,
 0.7449175079517412,
 0.6989599970521885,
 0.7894914084436643,
 0.6657093028837562,
 0.5674217497526841,
 0.7845162449972907,
 0.6073500159029633,
 0.3993787689234043,
 0.7963729143768001,
 0.7453964639314838,
 0.6994437558777391,
 0.7889888313346793,
 0.6654169621294412,
 0.5675347893341465,
 0.7845349973469451,
 0.6074916152295076,
 0.3993787689234043]

In [129]:
model.model['no_weight_balance'].avgMetrics

[0.8721661133880132,
 0.8456654007614444,
 0.8319067080232899,
 0.8577680083462371,
 0.8248937199570823,
 0.8233131186857944,
 0.8487448789619101,
 0.8233288055950398,
 0.8233099763044979,
 0.8736308318282516,
 0.8462833560879324,
 0.8329286918888472,
 0.8587997635941289,
 0.8248937114057653,
 0.8233131186857944,
 0.8496324185096447,
 0.8233288055950398,
 0.8233099763044979,
 0.8735146780953845,
 0.846255015013958,
 0.8329759342804166,
 0.8585583378775808,
 0.8248905851845851,
 0.8233131186857944,
 0.8494253401798622,
 0.8233288055950398,
 0.8233099763044979]