In [1]:
from json import loads
from pyspark.sql import SparkSession
import warnings
import pandas as pd
warnings.filterwarnings("ignore")
from pyspark.sql.functions import col,from_json,udf,split,explode,lit,array,lower
from pyspark.ml.feature import NGram
from pyspark.sql.types import StructType,StructField, StringType, IntegerType,MapType,FloatType,ArrayType
import numpy as np
import pickle 

from pyspark.sql import functions as F
from itertools import chain
from sklearn.metrics import classification_report

In [2]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import CountVectorizer
from pyspark.ml import Pipeline,PipelineModel
from pyspark.ml.feature import IDF
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder,CrossValidatorModel
from sklearn.metrics import classification_report,accuracy_score,precision_score,recall_score,f1_score
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from sklearn.metrics import classification_report, confusion_matrix

In [3]:
# Connect to the standalone Spark master, reusing an existing session when there is one.
spark = (
    SparkSession.builder
    .appName("ml")
    .master("spark://spark-master:7077")
    .config("spark.executor.memory", "1024m")
    .getOrCreate()
)

23/02/14 13:53:13 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [4]:
def strip_stop_words(txt, stop_words):
    """Drop every whitespace-separated token of ``txt`` that is in ``stop_words``.

    Kept at module level (not on the class) so the Spark UDF closure only has
    to ship this small function to the executors.

    Args:
        txt: Text to filter, or ``None``.
        stop_words: Any container supporting ``in`` (here: the rare-word dict).

    Returns:
        The surviving tokens joined by single spaces, or ``None`` when ``txt``
        is ``None`` so a NULL column value stays NULL instead of failing the job.
    """
    if txt is None:
        return None
    # str.split() without arguments already ignores leading/trailing/repeated whitespace.
    return ' '.join(word for word in txt.split() if word not in stop_words)


class SentimentModel:
    """Prepare train/test features and load the saved classifiers from HDFS.

    Uses the notebook-level ``spark`` session. After construction:
      * ``df_train`` / ``df_test``   - parquet data with rare words removed,
      * ``train_set`` / ``test_set`` - the same rows plus a ``cmt_token`` array,
      * ``train_idf`` / ``test_idf`` - output of the saved TF-IDF pipeline,
      * ``model``                    - dict of the four loaded CrossValidatorModels.
    """

    def __init__(self):
        self.df_test = spark.read.parquet('hdfs://namenode:9000/ml/test_data')
        self.df_train = spark.read.parquet('hdfs://namenode:9000/ml/train_data')
        self.clean_data()
        self.model = {}
        self.load_model()
        self.split_content()
        self.convert_feature()

    def load_model(self):
        """Load the four cross-validated classifiers and the TF-IDF pipeline from HDFS."""
        list_model = ['lr_yes', 'lr_no', 'rf_yes', 'rf_no']
        for model_name in list_model:
            self.model[model_name] = CrossValidatorModel.load(f'hdfs://namenode:9000/save_model/{model_name}')

        self.model_tfidf = PipelineModel.load('hdfs://namenode:9000/save_model/model_tfidf')

    def getNGram(self, df, n):
        """Count the n-grams built from the ``comment_term`` array column of ``df``.

        Args:
            df: DataFrame with a ``comment_term`` array-of-strings column.
            n: n-gram length.

        Returns:
            DataFrame with one row per distinct n-gram: ``word`` and ``count``.
        """
        ngram = NGram(n=n)
        ngram.setInputCol("comment_term")
        ngram.setOutputCol("nGrams")
        df_nGram = ngram.transform(df)
        result_nGram = df_nGram.withColumn('word', explode(df_nGram.nGrams))\
            .groupBy(['word'])\
            .count()
        return result_nGram

    def clean_data(self):
        """Remove rare words from ``clean_content`` in both the train and test data.

        A word is treated as a stop word when it occurs fewer than 10 times in
        the training data. Side effects: registers the temp views
        ``result_nGram``, ``df_test`` and ``df_train`` and the SQL function
        ``remove_stop_word``, stores the rare words in ``self.dict_stop_word``
        and replaces ``self.df_test`` / ``self.df_train``.
        """
        df = self.df_train.withColumn('comment_term', split(self.df_train.clean_content, ' ', -1))

        result_nGram = self.getNGram(df, 1)
        result_nGram.createOrReplaceTempView('result_nGram')

        stop_word = spark.sql("""
            select word from result_nGram
            where count < 10
        """).toPandas()
        stop_word = stop_word['word'].to_list()

        dict_stop_word = {x: 1 for x in stop_word}
        self.dict_stop_word = dict_stop_word
        self.df_test.createOrReplaceTempView('df_test')
        self.df_train.createOrReplaceTempView('df_train')

        def remove_stop_word(txt):
            # Closes over the local dict (not ``self``) so only the dict is serialised with the UDF.
            return strip_stop_words(txt, dict_stop_word)
        spark.udf.register("remove_stop_word", remove_stop_word, StringType())

        self.df_test = spark.sql("""
            select remove_stop_word(clean_content) clean_content,rating,sentiment,true_label,label 
            from df_test
        """)

        self.df_train = spark.sql("""
            select remove_stop_word(clean_content) clean_content,rating,sentiment,label 
            from df_train
        """)

    def set_weight(self, w_a=5, w_b=5, w_c=1):
        """Add a ``weight`` column to ``self.train_idf`` looked up from ``label``.

        Args:
            w_a: Weight for rows with label 0.
            w_b: Weight for rows with label 1.
            w_c: Weight for rows with label 2.
        """
        class_weights_spark = {0: w_a, 1: w_b, 2: w_c}
        # create_map expects a flat key, value, key, value, ... sequence of columns.
        mapping_expr = F.create_map([F.lit(x) for x in chain(*class_weights_spark.items())])
        self.train_idf = self.train_idf.withColumn("weight", mapping_expr.getItem(F.col("label")))

    def split_content(self):
        """Tokenise ``clean_content`` on single spaces into a ``cmt_token`` column."""
        self.train_set = self.df_train.select(split(self.df_train.clean_content, ' ').alias('cmt_token'), 'clean_content', 'rating', 'label')
        self.test_set = self.df_test.select(split(self.df_test.clean_content, ' ').alias('cmt_token'), 'clean_content', 'rating', 'label', 'true_label')

    def convert_feature(self):
        """Run the loaded TF-IDF pipeline over the tokenised train and test sets."""
        self.train_idf = self.model_tfidf.transform(self.train_set)
        self.test_idf = self.model_tfidf.transform(self.test_set)

    def evaluate(self, predictions):
        """Print accuracy, weighted precision/recall/F1 and the full classification report.

        Args:
            predictions: Spark DataFrame with ``true_label`` and ``prediction``
                columns; both are collected to the driver.
        """
        result = predictions.select('true_label', 'prediction').toPandas()

        print('accuracy_score: ', accuracy_score(result.true_label, result.prediction))
        # This line used to be labelled 'prediction: ' although it reports precision.
        print('precision_score: ', precision_score(result.true_label, result.prediction, average='weighted'))
        print('recall_score: ', recall_score(result.true_label, result.prediction, average='weighted'))
        print('f1_score: ', f1_score(result.true_label, result.prediction, average='weighted'))
        print(classification_report(result.true_label, result.prediction))

In [5]:
# Build the wrapper: the constructor reads the train/test parquet data, strips rare
# words, loads the saved models from HDFS and computes the TF-IDF features.
model = SentimentModel()

                                                                                

In [6]:
# Score every saved model on the test set and print its report and confusion matrix.
list_model = ['lr_yes','lr_no','rf_yes','rf_no']
for model_name in list_model:
    res = model.model[model_name].transform(model.test_idf).select(['true_label','prediction'])
    # Bring both columns to the driver in ONE action: two separate collect() calls
    # run the pipeline twice and only line up if both runs return rows in the same order.
    scores = res.toPandas()
    y_true = scores['true_label']
    y_pred = scores['prediction']
    print('='*30)
    print('Model:',model_name)
    print(classification_report(y_true, y_pred))
    print(confusion_matrix(y_true, y_pred))

23/02/14 13:53:50 WARN DAGScheduler: Broadcasting large task binary with size 1100.2 KiB
                                                                                

Model: lr_yes
              precision    recall  f1-score   support

           0       0.78      0.65      0.71       805
           1       0.36      0.54      0.43       538
           2       0.93      0.90      0.91      3634

    accuracy                           0.82      4977
   macro avg       0.69      0.70      0.68      4977
weighted avg       0.85      0.82      0.83      4977

[[ 520  239   46]
 [  52  293  193]
 [  91  289 3254]]


23/02/14 13:53:52 WARN DAGScheduler: Broadcasting large task binary with size 1100.2 KiB


Model: lr_no
              precision    recall  f1-score   support

           0       0.88      0.52      0.65       805
           1       0.41      0.02      0.04       538
           2       0.81      0.99      0.89      3634

    accuracy                           0.81      4977
   macro avg       0.70      0.51      0.53      4977
weighted avg       0.78      0.81      0.76      4977

[[ 415   13  377]
 [  38   12  488]
 [  21    4 3609]]


23/02/14 13:53:53 WARN DAGScheduler: Broadcasting large task binary with size 1003.4 KiB


Model: rf_yes
              precision    recall  f1-score   support

           0       1.00      0.03      0.06       805
           1       0.27      0.21      0.24       538
           2       0.77      0.96      0.86      3634

    accuracy                           0.73      4977
   macro avg       0.68      0.40      0.38      4977
weighted avg       0.75      0.73      0.66      4977

[[  23  162  620]
 [   0  111  427]
 [   0  132 3502]]


23/02/14 13:53:54 WARN DAGScheduler: Broadcasting large task binary with size 1025.3 KiB


Model: rf_no
              precision    recall  f1-score   support

           0       1.00      0.00      0.00       805
           1       0.00      0.00      0.00       538
           2       0.73      1.00      0.84      3634

    accuracy                           0.73      4977
   macro avg       0.58      0.33      0.28      4977
weighted avg       0.69      0.73      0.62      4977

[[   1    0  804]
 [   0    0  538]
 [   0    0 3634]]


In [7]:
# Average cross-validation metric of every parameter combination tried for 'rf_yes', in percent.
rf_yes_avg_metrics = model.model['rf_yes'].avgMetrics
for metric in rf_yes_avg_metrics:
    print(round(metric * 100, 7))

70.6762339
70.6762339
70.6762339
71.9676083
72.4317665
71.5733107
72.3602181
71.1134318
70.7826626
74.3836415
74.3836415
74.3839491
73.8092061
74.3962157
73.4733337
72.9754401
73.9171712
73.0888359
76.8888341
76.888834
76.8887316
74.6391311
74.202973
75.3568291
74.2218713
74.5431967
74.9455338


In [8]:
# Average cross-validation metric of every parameter combination tried for 'rf_no', in percent.
rf_no_avg_metrics = model.model['rf_no'].avgMetrics
for metric in rf_no_avg_metrics:
    print(round(metric * 100, 2))

79.05
79.05
79.05
79.06
79.06
79.06
79.07
79.07
79.06
79.05
79.05
79.05
79.05
79.05
79.05
79.06
79.06
79.06
79.05
79.05
79.05
79.05
79.05
79.05
79.06
79.06
79.06


In [9]:
# Index of the parameter combination with the highest average cross-validation metric.
np.argmax(model.model['rf_yes'].avgMetrics)

18

In [10]:
# Look up the parameter map stored at the index of the best average metric.
model.model['rf_yes'].getEstimatorParamMaps()[ np.argmax(model.model['rf_yes'].avgMetrics) ]

{Param(parent='RandomForestClassifier_5eee6cd5b48a', name='weightCol', doc='weight column name. If this is not set or empty, we treat all instance weights as 1.0.'): 'weight',
 Param(parent='RandomForestClassifier_5eee6cd5b48a', name='minInstancesPerNode', doc='Minimum number of instances each child must have after split. If a split causes the left or right child to have fewer than minInstancesPerNode, the split will be discarded as invalid. Should be >= 1.'): 1,
 Param(parent='RandomForestClassifier_5eee6cd5b48a', name='numTrees', doc='Number of trees to train (>= 1).'): 50,
 Param(parent='RandomForestClassifier_5eee6cd5b48a', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.'): 2}

In [11]:
# class SentimentModel:
    
#     def __init__(self):
#         self.df_test = spark.read.parquet('hdfs://namenode:9000/ml/test_data')
#         self.df_train = spark.read.parquet('hdfs://namenode:9000/ml/train_data')
#         self.clean_data()
#         self.split_content()
#         self.convert_feature()
#         self.model = {}
        
#     def getNGram(self,df,n):
#         ngram = NGram(n=n)
#         ngram.setInputCol("comment_term")
#         ngram.setOutputCol("nGrams")
#         df_nGram = ngram.transform(df)
#         result_nGram = df_nGram.withColumn('word',explode(df_nGram.nGrams))\
#             .groupBy(['word'])\
#             .count()
#         return result_nGram
        
#     def clean_data(self):
    
#         df = self.df_train.withColumn('comment_term',split(self.df_train.clean_content, ' ', -1))

#         result_nGram = self.getNGram(df,1)
#         result_nGram.createOrReplaceTempView('result_nGram')
        
#         stop_word = spark.sql("""
#             select word from result_nGram
#             where count < 10
#         """).toPandas()
#         stop_word = stop_word['word'].to_list()
        
#         dict_stop_word = {x:1 for x in stop_word}
#         self.dict_stop_word = dict_stop_word
#         self.df_test.createOrReplaceTempView('df_test')
#         self.df_train.createOrReplaceTempView('df_train')
        
        
#         def remove_stop_word(txt):
#             txt = txt.strip()
#             ls_words = txt.split()
#             ls_new_words = []
#             for word in ls_words:
#                 if dict_stop_word.get(word) == None:
#                     ls_new_words.append(word)
#             return ' '.join(ls_new_words)
#         spark.udf.register("remove_stop_word", remove_stop_word,StringType())
        
#         self.df_test = spark.sql("""
#             select remove_stop_word(clean_content) clean_content,rating,sentiment,true_label,label 
#             from df_test
#         """)
        
#         self.df_train = spark.sql("""
#             select remove_stop_word(clean_content) clean_content,rating,sentiment,label 
#             from df_train
#         """)
        

    
#     def set_weight(self, w_a = 5,w_b = 5, w_c = 1):
#         class_weights_spark = {0:w_a,1:w_b,2:w_c}
#         mapping_expr = F.create_map([F.lit(x) for x in chain(*class_weights_spark.items())])
#         tmp_a = self.train_idf.filter(col('label') == 0)
#         tmp_b = self.train_idf.filter(col('label') == 1)
#         for i in range(w_a):
#             self.train_idf = self.train_idf.unionAll(tmp_a)
#         for i in range(w_b):
#             self.train_idf = self.train_idf.unionAll(tmp_b)
# #         self.train_idf = self.train_idf.withColumn("weight", mapping_expr.getItem(F.col("label")))
        
#     def split_content(self):
#         self.train_set = self.df_train.select(split(self.df_train.clean_content, ' ').alias('cmt_token'),'clean_content','rating', 'label')
#         self.test_set = self.df_test.select(split(self.df_test.clean_content, ' ').alias('cmt_token'),'clean_content','rating', 'label','true_label')
    
#     def convert_feature(self):
#         count = CountVectorizer(inputCol="cmt_token", outputCol="rawFeatures")
#         idf = IDF(inputCol="rawFeatures", outputCol="featuresTFIDF")
#         pipeline = Pipeline(stages=[count, idf])
#         self.model_tfidf = pipeline.fit(self.train_set)
#         self.train_idf = self.model_tfidf.transform(self.train_set)
#         self.test_idf = self.model_tfidf.transform(self.test_set)
    
#     def model_logistic(self,weight):
#         lr = LogisticRegression(featuresCol = "featuresTFIDF")

#         if weight == True:
#             paramGrid = ParamGridBuilder()\
#                         .addGrid(lr.maxIter, [10, 20, 50])\
#                         .addGrid(lr.regParam, [0.1,0.3,0.5])\
#                         .addGrid(lr.weightCol,  ["weight"])\
#                         .build()
            
#         else:
#             paramGrid = ParamGridBuilder()\
#                         .addGrid(lr.maxIter, [10, 20, 50])\
#                         .addGrid(lr.regParam, [0.1,0.3,0.5])\
#                         .build()

#         evaluator = MulticlassClassificationEvaluator(metricName="accuracy")

#         crossval = CrossValidator(estimator=lr,
#                                   estimatorParamMaps=paramGrid,
#                                   evaluator=evaluator,
#                                   numFolds=5) 
        
#         model = crossval.fit(self.train_idf)
#         if weight == True:
#             self.model['lr_yes'] = model
#         else:
#             self.model['lr_no'] = model
#         predictions = model.transform(self.test_idf)
#         return predictions
    
#     def model_rf(self,weight):
#         trainer = RandomForestClassifier(featuresCol = "featuresTFIDF",numTrees=50,minInstancesPerNode=3,maxDepth=10)
#         if weight == True:
#             paramGrid = ParamGridBuilder()\
#                         .addGrid(trainer.numTrees, [10,20,50])\
#                         .addGrid(trainer.maxDepth, [2,6,8])\
#                         .addGrid(trainer.minInstancesPerNode, [1,3,5])\
#                         .addGrid(trainer.weightCol,  ["weight"])\
#                         .build()
            
#         else:
#             paramGrid = ParamGridBuilder()\
#                         .addGrid(trainer.numTrees, [10,20,50])\
#                         .addGrid(trainer.maxDepth, [2,6,8])\
#                         .addGrid(trainer.minInstancesPerNode, [1,3,5])\
#                         .build()
        
#         evaluator = MulticlassClassificationEvaluator(metricName="accuracy")

#         crossval = CrossValidator(estimator=trainer,
#                                   estimatorParamMaps=paramGrid,
#                                   evaluator=evaluator,
#                                   numFolds=5) 
#         model = trainer.fit(self.train_idf)
#         if weight == True:
#             self.model['rf_yes'] = model
#         else:
#             self.model['rf_no'] = model
#         predictions = model.transform(self.test_idf)
#         return predictions
    
#     def evaluate(self,predictions):
#         result = predictions.select('true_label', 'prediction')
#         result = result[['true_label','prediction']].toPandas()
        
#         print(f'accuracy_score: ',accuracy_score(result.true_label, result.prediction))
#         print(f'prediction: ',precision_score(result.true_label, result.prediction, average='weighted'))
#         print(f'recall_score: ',recall_score(result.true_label, result.prediction, average='weighted'))
#         print(f'f1_score: ',f1_score(result.true_label, result.prediction, average='weighted'))
#         print(classification_report(result.true_label, result.prediction))
        
#     def save_model(self):
#         list_model = ['lr_yes','lr_no','rf_yes','rf_no']
#         for model_name in list_model:
#             self.model[model_name].write().overwrite().save(f'hdfs://namenode:9000/save_model/{model_name}')
        
#         self.model_tfidf.write().overwrite().save(f'hdfs://namenode:9000/save_model/model_tfidf')
        
#         with open('data/dict_stop_word.pkl', 'wb') as f:
#             pickle.dump(self.dict_stop_word, f)

In [12]:
# model = SentimentModel()

In [13]:
# lb0_cnt = model.train_set.filter(col('label') == 0).count()
# lb1_cnt = model.train_set.filter(col('label') == 1).count()
# lb2_cnt = model.train_set.filter(col('label') == 2).count()

In [14]:
# w_a = int(lb2_cnt/lb0_cnt)
# w_b = int(lb2_cnt/lb1_cnt)

In [15]:
# model.set_weight(w_a,w_b)

In [None]:
# rf_predictions_wb = model.model_rf(weight=True)

In [None]:
# model.evaluate(rf_predictions_wb)

# Analysis of wrong predictions (rule-based)

In [33]:
# Read the train/test parquet data straight from HDFS, i.e. without the rare-word
# filtering that SentimentModel applies to its own copies.
df_test = spark.read.parquet('hdfs://namenode:9000/ml/test_data')
df_train = spark.read.parquet('hdfs://namenode:9000/ml/train_data')

In [None]:
# Collect the whole test set to the driver as a pandas DataFrame.
pd_test = df_test.toPandas()

In [56]:
# Write the test rows that each class-weighted model misclassified to one CSV per model.
list_model = ['lr_yes','rf_yes']
for model_name in list_model:
    res = model.model[model_name].transform(model.test_idf).select(['true_label','prediction'])
    res = res.toPandas()
    # pd_test and res come from separate Spark jobs and are matched purely by row
    # position, so at least make sure they describe the same number of rows.
    assert len(res) == len(pd_test), 'prediction rows do not line up with pd_test'
    wrong = res.loc[res['true_label'] != res['prediction'], 'prediction'].dropna().astype(int)
    # Join on the positional index instead of writing a column into pd_test: the
    # shared frame stays untouched and re-running the cell gives the same files.
    result = pd_test[['true_label','clean_content']].join(wrong, how='inner')
    result = result[['true_label','prediction','clean_content']]
    result.to_csv(f'wrong_prediction/{model_name}.csv',index=False)

23/02/14 14:07:18 WARN DAGScheduler: Broadcasting large task binary with size 1101.2 KiB
23/02/14 14:07:18 WARN DAGScheduler: Broadcasting large task binary with size 1004.4 KiB


In [17]:
def _load_lexicon(path, exclude=None):
    """Read one entry per line from ``path`` into a ``{word: 1}`` lookup dict.

    Args:
        path: Text file with one lexicon entry per line.
        exclude: Optional container of entries to leave out.

    Returns:
        Dict whose keys are the stripped, non-empty entries not in ``exclude``.
    """
    skip = exclude if exclude is not None else {}
    words = {}
    # Pin the encoding so decoding never depends on the machine's locale
    # ('utf-8-sig' also tolerates a leading BOM).
    # NOTE(review): assumes the lexicon files are UTF-8 - confirm.
    with open(path, 'r', encoding='utf-8-sig') as f:
        for line in f:
            # strip() also removes '\r' and padding that replace('\n', '') left behind.
            word = line.strip()
            # A blank line would register '' as a sentiment word, and '' is exactly
            # the token that str.split(' ') produces for consecutive spaces.
            if word and word not in skip:
                words[word] = 1
    return words


pst_word = _load_lexicon('vi_sentiment/positive_words_vi.txt')
# Entries that also appear in the positive list are kept out of the negative one.
ngt_word = _load_lexicon('vi_sentiment/negative_words_vi.txt', exclude=pst_word)

In [18]:
def prediction(sentent):
    """Rule-based sentiment label from lexicon hits.

    Splits ``sentent`` on single spaces and counts the tokens found in
    ``pst_word`` (positive) and ``ngt_word`` (negative); a token present in
    both counts as positive only.

    Args:
        sentent: Space-separated text, or ``None``.

    Returns:
        2 when positive hits outnumber negative ones, 0 when negative hits
        outnumber positive ones, 1 on a tie (including no hits at all), and
        ``None`` for ``None`` input so a NULL column value yields NULL instead
        of failing the UDF.
    """
    if sentent is None:
        return None
    pos = 0
    neg = 0
    for token in sentent.split(' '):
        if token in pst_word:
            pos += 1
        elif token in ngt_word:
            neg += 1
    score = pos - neg
    if score > 0:
        return 2
    if score < 0:
        return 0
    return 1

In [19]:
# Make the rule-based classifier callable from Spark SQL as prediction(...), returning an integer.
spark.udf.register("prediction", prediction,IntegerType())

<function __main__.prediction(sentent)>

In [77]:
def token_anl(sentent):
    """List the lexicon tokens that drive the rule-based prediction.

    Args:
        sentent: Space-separated text, or ``None``.

    Returns:
        A two-element list ``[positive, negative]``: the tokens found in
        ``pst_word`` joined with ``','`` and the tokens found in ``ngt_word``
        joined with ``'|'`` (a token in both lists is reported as positive
        only). Returns ``None`` for ``None`` input so a NULL column value
        yields NULL instead of failing the UDF.
    """
    if sentent is None:
        return None
    pos_token = []
    neg_token = []
    for token in sentent.split(' '):
        if token in pst_word:
            pos_token.append(token)
        elif token in ngt_word:
            neg_token.append(token)
    # NOTE(review): the two lists use different separators (',' vs '|'); left as-is
    # because the exported CSV may already be consumed in this format - confirm intent.
    return [','.join(pos_token), '|'.join(neg_token)]

In [78]:
# Make token_anl callable from Spark SQL; it returns an array of two strings.
spark.udf.register("token_anl", token_anl,ArrayType(StringType()))

23/02/14 14:29:34 WARN SimpleFunctionRegistry: The function token_anl replaced a previously registered function.


<function __main__.token_anl(sentent)>

In [79]:
# Read the test split from HDFS again (same path as the earlier df_test).
df_test = spark.read.parquet('hdfs://namenode:9000/ml/test_data')

In [80]:
# Expose the test split to Spark SQL under the name test_data.
df_test.createOrReplaceTempView("test_data")

In [81]:
# Test rows where the rule-based prediction differs from true_label, together with
# the lexicon tokens (token_anl) found in the text.
result_test = spark.sql("""
    select true_label,prediction(clean_content) prediction,clean_content,token_anl(clean_content) token
    from test_data
    where prediction(clean_content) != true_label
""")

In [82]:
# Collect the misclassified rows to the driver and save them as CSV (no index column).
result_test.toPandas().to_csv('wrong_prediction/rule_base.csv',index=False)

In [84]:
# Rule-based prediction and matched lexicon tokens for every row of test_data (no filter).
res = spark.sql("""
    select true_label,prediction(clean_content) prediction,clean_content,token_anl(clean_content) token
    from test_data
""")

In [85]:
# Evaluate the rule-based classifier on the whole test set.
# Collect both columns in a single action so labels and predictions are row-aligned
# (two separate collect() calls run the query twice and rely on identical row order).
rule_base_scores = res.select('true_label', 'prediction').toPandas()
y_true = rule_base_scores['true_label']
y_pred = rule_base_scores['prediction']
print('='*30)
# This used to print the leftover loop variable `model_name` ('rf_yes'),
# which mislabelled the rule-based report.
print('Model:', 'rule_base')
print(classification_report(y_true, y_pred))
print(confusion_matrix(y_true, y_pred))

Model: rf_yes
              precision    recall  f1-score   support

           0       0.50      0.43      0.46       805
           1       0.15      0.13      0.14       538
           2       0.84      0.87      0.86      3634

    accuracy                           0.72      4977
   macro avg       0.49      0.48      0.49      4977
weighted avg       0.71      0.72      0.71      4977

[[ 346  177  282]
 [ 134   72  332]
 [ 214  246 3174]]
