In [26]:
from json import loads
from pyspark.sql import SparkSession
import warnings
import pandas as pd
warnings.filterwarnings("ignore")
from pyspark.sql.functions import col,from_json,udf,split,explode,lit,array,lower
from pyspark.ml.feature import NGram
from pyspark.sql.types import StructType,StructField, StringType, IntegerType,MapType,FloatType,ArrayType
import numpy as np
from pyspark.sql import functions as F
import re
from itertools import chain
from sklearn.metrics import classification_report

In [16]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import CountVectorizer
from pyspark.ml import Pipeline,PipelineModel
from pyspark.ml.feature import IDF
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder,CrossValidatorModel
from sklearn.metrics import classification_report,accuracy_score,precision_score,recall_score,f1_score
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [2]:
# Create (or reuse) the Spark session for this notebook: app name "ml",
# standalone master at spark-master:7077, 1024m of memory per executor.
spark = (
    SparkSession.builder
    .appName("ml")
    .master("spark://spark-master:7077")
    .config("spark.executor.memory", "1024m")
    .getOrCreate()
)

23/01/07 14:46:46 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/01/07 14:46:47 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [19]:
# Load the saved 'lr_yes' cross-validator model from HDFS on its own.
# NOTE(review): no visible cell reads `crossval`, and Prediction.load_model()
# loads the same path again - this cell looks redundant; confirm before removing.
crossval = CrossValidatorModel.load('hdfs://namenode:9000/save_model/lr_yes')

                                                                                

In [128]:
class Prediction:
    """Predict the sentiment class (0, 1 or 2) of a piece of text.

    Two kinds of predictor are reachable through :meth:`predict`:

    * four saved ``CrossValidatorModel`` objects kept in ``self.model`` under
      the keys ``'lr_yes'``, ``'lr_no'``, ``'rf_yes'`` and ``'rf_no'``, fed by
      the saved ``model_tfidf`` pipeline (what the yes/no suffix stands for is
      not documented in this notebook);
    * a lexicon rule that compares counts of positive and negative words.

    Constructing an instance loads every model from HDFS and both word lists
    from disk.
    """

    # Compiled once instead of on every cleanText() call.
    _HTML_TAG = re.compile('<.*?>')
    _MODEL_NAMES = ('lr', 'rf', 'rule_based')

    def __init__(self):
        self.model = {}
        self.load_model()
        self.load_dictionary()

    def load_model(self):
        """Load the four classifiers and the TF-IDF pipeline from HDFS."""
        list_model = ['lr_yes', 'lr_no', 'rf_yes', 'rf_no']
        for model_name in list_model:
            self.model[model_name] = CrossValidatorModel.load(f'hdfs://namenode:9000/save_model/{model_name}')

        self.model_tfidf = PipelineModel.load('hdfs://namenode:9000/save_model/model_tfidf')

    @staticmethod
    def _read_word_list(path):
        """Return the non-empty lines of ``path`` as a ``{word: 1}`` dict."""
        words = {}
        # Presumably Vietnamese word lists (file names end in `_vi`); pin the
        # encoding so the result does not depend on the platform default.
        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                word = line.rstrip('\r\n')
                # A blank line would add '' as a "word", which then matches
                # the empty tokens produced by splitting on single spaces.
                if word:
                    words[word] = 1
        return words

    def load_dictionary(self,
                        positive_path='vi_sentiment/positive_words_vi.txt',
                        negative_path='vi_sentiment/negative_words_vi.txt'):
        """Load the positive and negative word lists (one word per line).

        Args:
            positive_path: text file of positive words.
            negative_path: text file of negative words.

        Sets ``self.pst_word`` and ``self.ngt_word``, both ``{word: 1}`` dicts.
        A word present in both files is kept in both dicts;
        ``rule_lexicon_based`` checks the positive list first, so such a word
        still counts as positive.
        """
        self.pst_word = self._read_word_list(positive_path)
        self.ngt_word = self._read_word_list(negative_path)

    def cleanText(self, str_raw):
        """Return ``str_raw`` lower-cased with tags, punctuation and digits removed.

        Leading/trailing spaces are collapsed to a single space but not
        stripped, so the result may start or end with one space.
        """
        # remove tags html
        str_raw = self._HTML_TAG.sub(' ', str_raw)

        # remove special character (raw string: '\W' is not a valid str escape)
        str_raw = re.sub(r'\W+', ' ', str_raw)

        # remove number
        str_raw = re.sub("[0-9]+", "", str_raw)

        # remove space
        cleantext = re.sub(" +", " ", str_raw)
        return cleantext.lower()

    def rule_lexicon_based(self, sentent):
        """Score a cleaned sentence against the word lists.

        Args:
            sentent: text whose tokens are separated by single spaces.

        Returns:
            2 if positive tokens outnumber negative ones, 1 if the counts are
            equal, 0 otherwise.
        """
        pos = 0
        neg = 0
        for token in sentent.split(' '):
            if token in self.pst_word:
                pos += 1
            elif token in self.ngt_word:
                neg += 1
        score = pos - neg
        if score > 0:
            return 2
        elif score == 0:
            return 1
        else:
            return 0

    def predict(self, txt, model_name='lr', mode='yes'):
        """Classify ``txt`` and print the input together with the predicted label.

        Args:
            txt: raw input text.
            model_name: ``'lr'``, ``'rf'`` or ``'rule_based'``.
            mode: ``'yes'`` selects the ``*_yes`` model; any other value
                selects the ``*_no`` model. Ignored for ``'rule_based'``.

        Raises:
            ValueError: if ``model_name`` is not one of the supported names.

        Returns:
            None. The result is printed.
        """
        if model_name not in self._MODEL_NAMES:
            raise ValueError(
                f"Unknown model_name {model_name!r}; expected one of {self._MODEL_NAMES}"
            )

        clean_text = self.cleanText(txt)

        if model_name == 'rule_based':
            # The lexicon rule needs no Spark job at all.
            result = self.rule_lexicon_based(clean_text)
        else:
            schema = StructType([
                StructField("clean_text", StringType(), True),
            ])
            # `spark` is the notebook-level SparkSession.
            input_data = spark.createDataFrame(data=[(clean_text,)], schema=schema)
            input_data = input_data.select(split(input_data.clean_text, ' ').alias('cmt_token'))
            # Use this instance's pipeline rather than a global named `model`.
            input_idf = self.model_tfidf.transform(input_data)

            suffix = 'yes' if mode == 'yes' else 'no'
            classifier = self.model[f'{model_name}_{suffix}']
            rows = classifier.transform(input_idf).select('prediction').take(1)
            result = rows[0].prediction

        print('Input data:', txt)
        print('='*40)
        print('Prediction: ')
        if result == 2:
            print('Positive')
        elif result == 1:
            print('Neutral')
        else:
            print('Negative')

In [129]:
# Build the predictor; the constructor loads the saved models and both word lists.
model = Prediction()

In [136]:
# Sample Vietnamese input reused by the prediction cells below
# (roughly "pretty but too short").
txt = 'đẹp nhưng ngắn quá'

In [137]:
# Default arguments: model_name='lr', mode='yes' (uses the 'lr_yes' model).
model.predict(txt)

Input data: đẹp nhưng ngắn quá
Prediction: 
Neural


23/01/07 15:44:43 WARN DAGScheduler: Broadcasting large task binary with size 1401.0 KiB
23/01/07 15:44:43 WARN DAGScheduler: Broadcasting large task binary with size 1401.0 KiB


In [138]:
# Same input through the 'lr_no' model.
model.predict(txt,'lr','no')

Input data: đẹp nhưng ngắn quá
Prediction: 
Positive


23/01/07 15:44:45 WARN DAGScheduler: Broadcasting large task binary with size 1401.0 KiB
23/01/07 15:44:45 WARN DAGScheduler: Broadcasting large task binary with size 1401.0 KiB


In [139]:
# Same input through the 'rf_yes' model.
model.predict(txt,'rf','yes')

Input data: đẹp nhưng ngắn quá
Prediction: 
Neural


23/01/07 15:44:46 WARN DAGScheduler: Broadcasting large task binary with size 1430.4 KiB
23/01/07 15:44:46 WARN DAGScheduler: Broadcasting large task binary with size 1430.4 KiB


In [140]:
# Same input through the 'rf_no' model.
model.predict(txt,'rf','no')

Input data: đẹp nhưng ngắn quá
Prediction: 
Positive


In [141]:
# Lexicon rule instead of a trained model; the mode argument is not consulted on this path.
model.predict(txt,'rule_based','no')

Input data: đẹp nhưng ngắn quá
Prediction: 
Positive
