In [11]:
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [None]:
import pandas as pd
from pyspark import SparkContext, SparkConf

# Ask for 3 GB per executor through a JVM system property, set ahead of
# context creation.
SparkContext.setSystemProperty('spark.executor.memory', '3g')

# Application name and standalone-cluster master the context attaches to.
conf = (
    SparkConf()
    .setAppName("Process Comment")
    .setMaster("spark://25.15.27.228:7077")
)

# Hand back the already-running context if there is one, else start one.
sc = SparkContext.getOrCreate(conf=conf)

In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
# Session used for all DataFrame work below; an existing session is reused.
spark = (
    SparkSession.builder
    .appName('Process Comment')
    .getOrCreate()
)

In [8]:
class Model:
    """Binary sentiment pipeline over a parquet file of rated comments.

    Construction reads the parquet file, turns ``rating_star`` into a 0/1
    label, embeds the ``comment`` text with Word2Vec and stores a 60/40
    train/test split on ``self.train_set`` / ``self.test_set``.  The
    ``MPClassifier`` and ``OneRest`` methods then fit a classifier on the
    training split and print metrics for the test split.
    """

    # Parquet file read when no explicit ``url`` is passed to the constructor.
    DEFAULT_URL = 'hdfs://cris:9000/ProcessShopee/Comment/rate_sum/part-00000-2bb5d99d-0093-4c2e-bc46-0b8487d2c0db-c000.snappy.parquet'

    # Word2Vec output dimension; also the size of the MLP input layer, so the
    # two can never drift apart.
    VECTOR_SIZE = 100

    def __init__(self, sc, spark, url=None):
        """Store the Spark handles and immediately build the train/test split.

        Args:
            sc: Spark context.  Stored on ``self.sc``; none of the methods in
                this class read it.
            spark: ``SparkSession`` used to read the parquet file.
            url: Location of the parquet file.  Defaults to ``DEFAULT_URL``.
        """
        self.sc = sc
        # NOTE(review): this stores the *builtin* ``type`` (there is no such
        # parameter).  Nothing in this class reads it; kept only so existing
        # attribute access keeps working.
        self.type = type
        self.spark = spark
        self.url = self.DEFAULT_URL if url is None else url
        self.processData()

    def processData(self):
        """Load the comments and binarise ``rating_star`` (> 3 -> 1, else 0).

        The relabelling is done with Spark column expressions, so the data
        never has to be collected onto the driver.  Rows whose rating is null
        or cannot be cast to an integer are dropped rather than silently
        labelled.  The result is handed to ``trainWord2vec``.
        """
        from pyspark.sql.functions import col, when

        data = self.spark.read.parquet(self.url)
        rating = col('rating_star').cast('int')
        data = (
            data
            .where(rating.isNotNull())
            .withColumn('rating_star', when(rating > 3, 1).otherwise(0).cast('long'))
        )
        self.trainWord2vec(data)

    def trainWord2vec(self, data):
        """Embed the comments and store the train/test split.

        Args:
            data: Spark DataFrame with a text column ``comment`` and a label
                column ``rating_star``.

        Side effects:
            Sets ``self.train_set`` and ``self.test_set`` to a 60/40 random
            split (seed 1234) of a frame with columns ``features``/``label``.
        """
        from pyspark.ml.feature import Word2Vec
        from pyspark.sql.functions import lower, col, split

        # Lower-case, then tokenise on a single space.
        dataset = data.select(lower(col('comment')).alias('comment'), 'rating_star')
        dataset = dataset.select(split(dataset.comment, ' ').alias('comment'), 'rating_star')

        word2Vec = Word2Vec(vectorSize=self.VECTOR_SIZE, seed=42, inputCol="comment", outputCol="features")
        word2Vec.setMaxIter(5)
        model = word2Vec.fit(dataset)

        res = model.transform(dataset)
        data = res.select('features', 'rating_star')
        data = data.withColumnRenamed('rating_star', 'label')
        splits = data.randomSplit([0.6, 0.4], 1234)

        self.train_set = splits[0]
        self.test_set = splits[1]

    def MPClassifier(self):
        """Fit a multilayer perceptron on the training split and print metrics.

        Prints the test-set accuracy, then the per-class F1 / recall /
        precision via ``evaluate``.
        """
        # Imported locally, like every other method here, so the class does
        # not depend on which notebook cell happened to run first.
        from pyspark.ml.classification import MultilayerPerceptronClassifier
        from pyspark.ml.evaluation import MulticlassClassificationEvaluator

        # Input layer matches the Word2Vec vector size; two output classes.
        layers = [self.VECTOR_SIZE, 120, 60, 2]
        trainer = MultilayerPerceptronClassifier(maxIter=500, layers=layers, blockSize=128, seed=1234)
        mpModel = trainer.fit(self.train_set)
        result = mpModel.transform(self.test_set)
        predictionAndLabels = result.select("prediction", "label")
        evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
        print("Test set accuracy = " + str(evaluator.evaluate(predictionAndLabels)))
        self.evaluate(predictionAndLabels, 'MultilayerPerceptronClassifier')

    def OneRest(self):
        """Fit one-vs-rest logistic regression on the training split and print metrics.

        Prints the test error (1 - accuracy), then the per-class F1 / recall /
        precision via ``evaluate``.
        """
        from pyspark.ml.classification import LogisticRegression, OneVsRest
        from pyspark.ml.evaluation import MulticlassClassificationEvaluator

        lr = LogisticRegression(maxIter=10, tol=1E-6, fitIntercept=True)
        ovr = OneVsRest(classifier=lr)
        # The splits live on train_set / test_set (see trainWord2vec).
        ovrModel = ovr.fit(self.train_set)
        predictions = ovrModel.transform(self.test_set)
        # Only these two columns are needed downstream; dropping the feature
        # vectors keeps the later toPandas() in evaluate() small.
        predictionAndLabels = predictions.select("prediction", "label")
        evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
        accuracy = evaluator.evaluate(predictionAndLabels)
        print("Test Error = %g" % (1.0 - accuracy))
        self.evaluate(predictionAndLabels, 'LogisticRegression')

    def evaluate(self, predictionAndLabels, model):
        """Print per-class F1, recall and precision.

        Args:
            predictionAndLabels: Spark DataFrame with ``prediction`` and
                ``label`` columns; it is collected to pandas here.
            model: Display name used in the printed lines.
        """
        from sklearn.metrics import f1_score
        from sklearn.metrics import precision_score
        from sklearn.metrics import recall_score

        res = predictionAndLabels.toPandas()
        y_true = res.label
        y_pred = res.prediction.astype(int)
        # average=None -> one score per class instead of a single aggregate.
        f1 = f1_score(y_true, y_pred, average=None)
        re_score = recall_score(y_true, y_pred, average=None)
        pre_score = precision_score(y_true, y_pred, average=None)
        print(f'F1 score {model}: {f1}')
        print(f'Recall score {model}: {re_score}')
        print(f'Precision score {model}: {pre_score}')

In [9]:
# Pass the session's real SparkContext rather than the literal string 'sc'.
model = Model(spark.sparkContext, spark)

In [12]:
# Fit the multilayer perceptron on the training split and print its
# test-set accuracy plus per-class F1 / recall / precision.
model.MPClassifier()