In [1]:
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import pandas as pd

In [2]:
from pyspark import SparkContext, SparkConf

SparkContext.setSystemProperty('spark.executor.memory', '4g')

conf = SparkConf().setAppName("Process Comment").setMaster("spark://25.15.27.228:7077")
sc = SparkContext.getOrCreate(conf=conf)

In [3]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('Process Comment').getOrCreate()

In [4]:
class Model:
    def __init__(self,sc,spark,load=False):
        self.sc = sc
        self.type = type
        self.spark = spark
        self.url = ['rate2_1/part-00000-2729185b-e28a-4984-b39e-86518549ec39-c000.snappy.parquet',
        'rate2_2/part-00000-f47244d2-fc4d-4f67-a011-0a496be62822-c000.snappy.parquet',
        'rate2_3/part-00000-dca5814a-b5ca-4ad4-b96c-b8b268c259a3-c000.snappy.parquet',
        'rate2_4/part-00000-2de369cd-334e-4736-8e34-bd132d4b13ce-c000.snappy.parquet',
        'rate2_5/part-00000-67e9e9c5-96e1-4619-8ae6-a4232c83df79-c000.snappy.parquet']
        self.processData(load=load)
        
        
    def processData(self,load=False):
        df = []
        for url in self.url:
            tmp = self.spark.read.parquet(f'hdfs://cris:9000/ProcessShopee/Comment/{url}')
            df.append(tmp.toPandas())

        data = pd.concat(df,axis = 0)
        data = data.reset_index(drop=True)
        data.loc[data[data['rating_star'].astype(int) < 4].index, 'rating_star'] = 0
        data.loc[data[data['rating_star'].astype(int) > 3].index, 'rating_star'] = 1
        data.rating_star.value_counts()
        data = self.spark.createDataFrame(data)
        self.rawData = data
        self.trainWord2vec(data,load=load)

    def trainWord2vec(self,data,load=False):
        from pyspark.ml.feature import Word2Vec
        from pyspark.ml.feature import Word2VecModel
        from pyspark.sql.functions import lower, col, split

        dataset = data.select(lower(col('comment')).alias('comment'), 'rating_star')
        dataset = dataset.select(split(dataset.comment, ' ').alias('comment'), 'rating_star')
        self.splitData = dataset

        if load:
            model = Word2VecModel.load(f'hdfs://cris:9000/ProcessShopee/word2vec/')
        else:
            word2Vec = Word2Vec(vectorSize=100, seed=42, inputCol="comment", outputCol="features")
            word2Vec.setMaxIter(5)
            model = word2Vec.fit(dataset)

        self.w2v = model

        res = model.transform(dataset)
        data = res.select('features', 'rating_star')
        self.data = data.withColumnRenamed('rating_star', 'label')
        self.featureData = self.data
        splits = self.data.randomSplit([0.6, 0.4], 1234)

        self.train_set = splits[0]
        self.test_set = splits[1]
    
    def MPClassifier(self):
        layers = [100, 120, 60, 2]
        trainer = MultilayerPerceptronClassifier(maxIter=500, layers=layers, blockSize=128, seed=1234)
        mpModel = trainer.fit(self.train_set)
        result = mpModel.transform(self.test_set)
        predictionAndLabels = result.select("prediction", "label")
        evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
        print("Test set accuracy = " + str(evaluator.evaluate(predictionAndLabels)))
        self.evaluate(predictionAndLabels,'MultilayerPerceptronClassifier')
    
    def OneRest(self):
        from pyspark.ml.classification import LogisticRegression, OneVsRest
        from pyspark.ml.evaluation import MulticlassClassificationEvaluator


        lr = LogisticRegression(maxIter=10, tol=1E-6, fitIntercept=True)
        ovr = OneVsRest(classifier=lr)
        ovrModel = ovr.fit(self.train_set)
        predictions = ovrModel.transform(self.test_set)
        evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
        accuracy = evaluator.evaluate(predictions)
        print("Test Error = %g" % (1.0 - accuracy))
        self.evaluate(predictions,'LogisticRegression')


    def svm(self):
        from pyspark.ml import Pipeline
        from pyspark.ml.classification import LinearSVC
        from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer
        from pyspark.ml.evaluation import MulticlassClassificationEvaluator

        labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(self.data)

        featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(self.data)

        (trainingData, testData) = self.data.randomSplit([0.7, 0.3])

        svm = LinearSVC(labelCol="indexedLabel", featuresCol="indexedFeatures", maxIter=500)

        labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel", labels=labelIndexer.labels)

        pipeline = Pipeline(stages=[labelIndexer, featureIndexer, svm, labelConverter])

        model = pipeline.fit(trainingData)
        predictions = model.transform(testData)

        evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
        accuracy = evaluator.evaluate(predictions)
        print("Test accuracy = %g" % (accuracy))

        predictions = predictions.select('label', 'predictedLabel')
        predictions = predictions.withColumnRenamed('predictedLabel', 'prediction')

        self.evaluate(predictions, 'SVM')


    def RandForest(self):
        from pyspark.ml import Pipeline
        from pyspark.ml.classification import RandomForestClassifier
        from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer
        from pyspark.ml.evaluation import MulticlassClassificationEvaluator

        labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(self.data)

        featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(self.data)

        (trainingData, testData) = self.data.randomSplit([0.7, 0.3])

        rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", numTrees=50)

        labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel", labels=labelIndexer.labels)

        pipeline = Pipeline(stages=[labelIndexer, featureIndexer, rf, labelConverter])

        model = pipeline.fit(trainingData)
        predictions = model.transform(testData)

        evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
        accuracy = evaluator.evaluate(predictions)
        print("Test accuracy = %g" % (accuracy))

        predictions = predictions.select('label', 'predictedLabel')
        predictions = predictions.withColumnRenamed('predictedLabel', 'prediction')

        self.evaluate(predictions, 'random forest')


    def evaluate(self,predictionAndLabels,model):
        from sklearn.metrics import f1_score
        from sklearn.metrics import precision_score
        from sklearn.metrics import recall_score

        res = predictionAndLabels.toPandas()
        F1_score = f1_score(res.label, res.prediction.astype(int), average=None)
        re_score = recall_score(res.label, res.prediction.astype(int), average=None)
        pre_score = precision_score(res.label, res.prediction.astype(int), average=None)
        print(f'F1 score {model}: {F1_score}')
        print(f'Recall score {model}: {re_score}')
        print(f'Precision score {model}: {pre_score}')

In [5]:
model = Model('sc',spark,load=True)

In [27]:
model.rawData.take(1)

[Row(comment='Thực_sự thất_vọng về sản_phẩm . Giao sai hàng , sai màu . Sp thì size M 40-50 kg , tôi 46kg mặc chỉ kéo qua đầu_gối . Vì ct16 nên k đi gơit trả hàng dc đành để làm dẻ lau nhà hoặc khăn lau cho chó nó tắm . Tức gì đâu', rating_star=0)]

In [28]:
model.splitData.show()

+--------------------+-----------+
|             comment|rating_star|
+--------------------+-----------+
|[thực_sự, thất_vọ...|          0|
|[hàng, nhận, đc, ...|          0|
|[đặt, 1, đùi, và,...|          0|
|[chất_liệu, quá, ...|          0|
|[giao, vải, bị, l...|          0|
|[mỏng, te, ko, nh...|          0|
|[shop, cố_tình, g...|          0|
|[shop, ngoài, cái...|          0|
|[sản_phẩm, kém, c...|          0|
|[mua, áo, áp_dụng...|          0|
|[vải, xấu, đau_đớ...|          0|
|    [vải, quá, mỏng]|          0|
|[thất_vọng, tôi, ...|          0|
|[mua, 2, tấm, lướ...|          0|
|[quần, quá, xấu, ...|          0|
|   [chất_lượng, kém]|          0|
|[không, tốt, như,...|          0|
|[em, đặt, 1, bộ, ...|          0|
|[sản_phẩm, màu, k...|          0|
|[rõ_ràng, không, ...|          0|
+--------------------+-----------+
only showing top 20 rows



In [33]:
model.featureData.take(1)

[Row(features=DenseVector([-0.0174, 0.0119, -0.0421, 0.0937, -0.0235, 0.0302, -0.0108, 0.1426, -0.0875, -0.0354, 0.0298, 0.0618, -0.0105, 0.0405, 0.074, -0.1662, -0.0, -0.0401, -0.0495, -0.0587, 0.0345, 0.0278, 0.0095, -0.0296, -0.0126, -0.0084, 0.0205, -0.104, 0.0515, -0.01, 0.1015, 0.0005, -0.0456, -0.0379, 0.0126, -0.0638, 0.0156, 0.0262, 0.0282, -0.0609, 0.0096, -0.0718, 0.0583, 0.0148, -0.0804, -0.0509, -0.0327, 0.0548, 0.0534, -0.108, -0.0039, 0.0445, -0.0154, 0.1341, 0.0898, 0.0087, 0.0435, 0.0969, -0.0207, -0.0313, -0.0331, 0.1383, 0.0197, 0.2204, -0.0663, 0.0579, 0.0899, -0.0049, -0.0502, -0.0252, -0.1318, -0.0085, -0.0929, -0.0255, 0.0261, -0.0186, -0.0556, 0.0023, 0.1293, 0.0985, -0.0403, -0.0169, -0.0192, -0.0478, 0.0098, -0.0043, 0.0091, -0.0087, 0.075, -0.0176, 0.1849, -0.0018, 0.0272, -0.078, -0.1213, -0.0142, -0.0635, -0.0377, 0.0709, 0.0107]), label=0)]

In [6]:
# model.w2v.save(f'hdfs://cris:9000/ProcessShopee/word2vec/')

In [6]:
model.MPClassifier()

Test set accuracy = 0.9311043031913815
F1 score MultilayerPerceptronClassifier: [0.58501772 0.96243376]
Recall score MultilayerPerceptronClassifier: [0.49723711 0.97806418]
Precision score MultilayerPerceptronClassifier: [0.71043571 0.94729507]


In [8]:
model.OneRest()

Test Error = 0.0771612
F1 score LogisticRegression: [0.49589099 0.95822202]
Recall score LogisticRegression: [0.38859113 0.98066338]
Precision score LogisticRegression: [0.6850509  0.93678477]


In [6]:
model.svm()

Test accuracy = 0.922033
F1 score SVM: [0.43766361 0.95811276]
Recall score SVM: [0.31239891 0.98761062]
Precision score SVM: [0.73062875 0.93032588]


In [8]:
model.RandForest()

Test accuracy = 0.902453
F1 score random forest: [0.        0.9487259]
Recall score random forest: [0. 1.]
Precision score random forest: [0.         0.90245342]


  _warn_prf(average, modifier, msg_start, len(result))


In [14]:
model.data.createOrReplaceTempView("data")
spark.sql("select label, count(label) as number_samples from data group by label order by count(label)").show()

+-----+--------------+
|label|number_samples|
+-----+--------------+
|    0|         59930|
|    1|        556311|
+-----+--------------+



In [15]:
model.train_set.createOrReplaceTempView("train")
spark.sql("select label, count(label) as number_samples from train group by label order by count(label)").show()

+-----+--------------+
|label|number_samples|
+-----+--------------+
|    0|         35861|
|    1|        333935|
+-----+--------------+



In [17]:
model.test_set.createOrReplaceTempView("test")
spark.sql("select label, count(label) as number_samples from test group by label order by count(label)").show()

+-----+--------------+
|label|number_samples|
+-----+--------------+
|    0|         24069|
|    1|        222376|
+-----+--------------+

