In [8]:
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import pandas as pd

In [None]:
from pyspark import SparkContext, SparkConf

# Executor memory is set as a system property before the configuration object
# is built and before any context is requested.
SparkContext.setSystemProperty('spark.executor.memory', '3g')

# Configuration for the standalone cluster master used by this notebook.
conf = (
    SparkConf()
    .setMaster("spark://25.15.27.228:7077")
    .setAppName("Process Comment")
)

# Reuse an already-running context if there is one, otherwise start a new one.
sc = SparkContext.getOrCreate(conf=conf)

In [2]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
# Obtain the SparkSession for this notebook (an existing session is reused).
spark = (
    SparkSession.builder
    .appName('Process Comment')
    .getOrCreate()
)

In [13]:
class Model:
    """Binary rating classifiers over comment text, built on Spark ML.

    Pipeline: read the rating parquet files from HDFS, collapse ``rating_star``
    to a binary label (1 when the rating is above 3, otherwise 0), embed each
    comment with Word2Vec, split into train/test sets, then fit and score a
    classifier (``MPClassifier`` or ``OneRest``).

    The train/test splits are built lazily: the first classifier call runs
    ``processData()`` if it has not been run yet, so the class works on a
    fresh kernel without relying on a previously executed cell.
    """

    # HDFS directory that the relative paths in ``self.url`` are resolved against.
    HDFS_ROOT = 'hdfs://cris:9000/ProcessShopee/Comment'
    # Word2Vec embedding width; must equal the perceptron's input layer size.
    VECTOR_SIZE = 100

    def __init__(self, sc, spark):
        """Store the Spark handles and the list of parquet files to load.

        Args:
            sc: SparkContext handle (only stored; not used by the methods below).
            spark: SparkSession used to read parquet files and build DataFrames.
        """
        self.sc = sc
        # NOTE(review): this stores the *builtin* ``type`` (there is no ``type``
        # parameter). Kept so existing attribute access keeps working - confirm
        # whether it can be dropped.
        self.type = type
        self.spark = spark
        # One parquet part file per star rating, relative to HDFS_ROOT.
        self.url = ['rate2_1/part-00000-2729185b-e28a-4984-b39e-86518549ec39-c000.snappy.parquet',
                    'rate2_2/part-00000-f47244d2-fc4d-4f67-a011-0a496be62822-c000.snappy.parquet',
                    'rate2_3/part-00000-dca5814a-b5ca-4ad4-b96c-b8b268c259a3-c000.snappy.parquet',
                    'rate2_4/part-00000-2de369cd-334e-4736-8e34-bd132d4b13ce-c000.snappy.parquet',
                    'rate2_5/part-00000-67e9e9c5-96e1-4619-8ae6-a4232c83df79-c000.snappy.parquet']

    def processData(self):
        """Load all parquet files, binarise the rating and build the splits.

        Each file is read with Spark, collected to pandas and concatenated.
        ``rating_star`` becomes 1 for ratings above 3 and 0 otherwise, then the
        frame is handed back to Spark and passed to ``trainWord2vec``, which
        sets ``self.train_set`` and ``self.test_set``.
        """
        frames = [
            self.spark.read.parquet(f'{self.HDFS_ROOT}/{url}').toPandas()
            for url in self.url
        ]
        data = pd.concat(frames, axis=0, ignore_index=True)

        # Single vectorised threshold instead of two in-place .loc passes.
        stars = data['rating_star'].astype(int)
        data['rating_star'] = (stars > 3).astype(int)

        data = self.spark.createDataFrame(data)
        self.trainWord2vec(data)

    def trainWord2vec(self, data):
        """Embed comments with Word2Vec and create the train/test splits.

        Args:
            data: Spark DataFrame with a ``comment`` text column and a
                ``rating_star`` label column.

        Side effects:
            Sets ``self.train_set`` (60%) and ``self.test_set`` (40%), each with
            ``features`` and ``label`` columns.
        """
        from pyspark.ml.feature import Word2Vec
        from pyspark.sql.functions import lower, col, split

        # Lower-case, then tokenise on single spaces.
        dataset = data.select(lower(col('comment')).alias('comment'), 'rating_star')
        dataset = dataset.select(split(dataset.comment, ' ').alias('comment'), 'rating_star')

        word2Vec = Word2Vec(vectorSize=self.VECTOR_SIZE, seed=42, maxIter=5,
                            inputCol="comment", outputCol="features")
        model = word2Vec.fit(dataset)

        res = model.transform(dataset)
        data = res.select('features', 'rating_star')
        data = data.withColumnRenamed('rating_star', 'label')
        # Fixed seed so the split is repeatable across runs.
        splits = data.randomSplit([0.6, 0.4], 1234)

        self.train_set = splits[0]
        self.test_set = splits[1]

    def _ensure_data(self):
        """Build the train/test splits on first use if they do not exist yet."""
        if not (hasattr(self, 'train_set') and hasattr(self, 'test_set')):
            self.processData()

    def MPClassifier(self):
        """Fit a multilayer perceptron and print accuracy plus per-class metrics."""
        self._ensure_data()

        # Input width follows the Word2Vec vector size; two output classes.
        layers = [self.VECTOR_SIZE, 120, 60, 2]
        trainer = MultilayerPerceptronClassifier(maxIter=500, layers=layers, blockSize=128, seed=1234)
        mpModel = trainer.fit(self.train_set)
        result = mpModel.transform(self.test_set)
        predictionAndLabels = result.select("prediction", "label")
        evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
        print("Test set accuracy = " + str(evaluator.evaluate(predictionAndLabels)))
        self.evaluate(predictionAndLabels, 'MultilayerPerceptronClassifier')

    def OneRest(self):
        """Fit one-vs-rest logistic regression and print test error plus per-class metrics."""
        from pyspark.ml.classification import LogisticRegression, OneVsRest
        from pyspark.ml.evaluation import MulticlassClassificationEvaluator

        self._ensure_data()

        lr = LogisticRegression(maxIter=10, tol=1E-6, fitIntercept=True)
        ovr = OneVsRest(classifier=lr)
        ovrModel = ovr.fit(self.train_set)
        predictions = ovrModel.transform(self.test_set)
        evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
        accuracy = evaluator.evaluate(predictions)
        print("Test Error = %g" % (1.0 - accuracy))
        self.evaluate(predictions, 'LogisticRegression')

    def evaluate(self, predictionAndLabels, model):
        """Print per-class F1, recall and precision for a prediction frame.

        Args:
            predictionAndLabels: Spark DataFrame containing at least the
                ``prediction`` and ``label`` columns.
            model: Display name used in the printed lines.
        """
        from sklearn.metrics import f1_score, precision_score, recall_score

        # Collect only the two needed columns; the frame passed by OneRest also
        # carries vector columns that are expensive to bring to the driver.
        res = predictionAndLabels.select('prediction', 'label').toPandas()
        predicted = res.prediction.astype(int)

        # average=None yields one score per class instead of a single aggregate.
        f1 = f1_score(res.label, predicted, average=None)
        recall = recall_score(res.label, predicted, average=None)
        precision = precision_score(res.label, predicted, average=None)
        print(f'F1 score {model}: {f1}')
        print(f'Recall score {model}: {recall}')
        print(f'Precision score {model}: {precision}')

In [10]:
# Pass the session's real SparkContext rather than the placeholder string 'sc',
# so Model.sc holds a usable context object.
model = Model(spark.sparkContext, spark)

In [8]:
# Fit the multilayer perceptron on the training split and print test-set
# accuracy plus per-class F1 / recall / precision. Uses the train/test splits
# prepared by Model.processData().
model.MPClassifier()

Test set accuracy = 0.8192122567961494
F1 score MultilayerPerceptronClassifier: [0.71083449 0.86849839]
Recall score MultilayerPerceptronClassifier: [0.68114376 0.8860627 ]
Precision score MultilayerPerceptronClassifier: [0.7432316 0.8516169]


In [None]:
# Fit the one-vs-rest logistic regression on the training split and print the
# test error plus per-class F1 / recall / precision. Uses the train/test splits
# prepared by Model.processData().
model.OneRest()