### **Word2Vec**

In [None]:
import pandas as pd
from pyspark import SparkContext, SparkConf

# Request 3g of memory per executor; this is set before the context is
# obtained further down in this cell.
SparkContext.setSystemProperty('spark.executor.memory', '3g')

# Describe the application and the master URL it should attach to.
conf = SparkConf()
conf.setAppName("Process Comment")
conf.setMaster("spark://25.15.27.228:7077")

# Reuse an already running SparkContext if there is one, otherwise create it
# from `conf`.
sc = SparkContext.getOrCreate(conf=conf)

In [None]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
# Obtain the SparkSession used by the rest of the notebook; getOrCreate()
# returns the existing session when one is already available.
session_builder = SparkSession.builder.appName('Process Comment')
spark = session_builder.getOrCreate()

In [None]:
# Parquet part file holding the comment data on HDFS.
url = 'hdfs://cris:9000/ProcessShopee/Comment/rate_sum/part-00000-2bb5d99d-0093-4c2e-bc46-0b8487d2c0db-c000.snappy.parquet'

# Read it with Spark, then collect the whole result to the driver as a
# pandas DataFrame.
data = spark.read.parquet(url).toPandas()

In [None]:
# Collapse the star rating into a binary target: below 4 stars -> 0,
# everything else -> 1.
# NOTE: this rewrites `rating_star` in place. Re-running the cell on data that
# is already 0/1 maps every row to 0, because both values are below 4.
is_low_rating = data['rating_star'].astype(int) < 4
data.loc[is_low_rating, 'rating_star'] = 0
data.loc[~is_low_rating, 'rating_star'] = 1

# Show how many rows ended up in each class.
data['rating_star'].value_counts()

In [None]:
# Convert the pandas frame (with the binarised ratings) back into a Spark
# DataFrame, rebinding `data`, so the pyspark.ml stages below can consume it.
data = spark.createDataFrame(data)

In [None]:
from pyspark.ml.feature import Word2Vec
from pyspark.sql.functions import lower, col, split

# Lower-case every comment, then break it into tokens on single spaces.
# The rating column is carried along unchanged.
lowered = data.select(lower(col('comment')).alias('comment'), 'rating_star')
dataset = lowered.select(split(lowered.comment, ' ').alias('comment'), 'rating_star')

# Word2Vec estimator: 100-dimensional vectors, 5 iterations, fixed seed.
word2Vec = Word2Vec(
    vectorSize=100,
    maxIter=5,
    seed=42,
    inputCol="comment",
    outputCol="features",
)
model = word2Vec.fit(dataset)

# Add the "features" vector column to every row and preview the result.
res = model.transform(dataset)
res.show()

In [None]:
# Keep only the embedding and the target, exposing the target under the
# column name "label" that the classifiers below read.
data = res.select('features', 'rating_star').withColumnRenamed('rating_star', 'label')

### **MultilayerPerceptronClassifier**

In [None]:
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Hold out 40% of the rows for testing; the seed keeps the split repeatable.
train, test = data.randomSplit([0.6, 0.4], seed=1234)

# Layer sizes from input to output: 100 (the Word2Vec vector size),
# hidden layers of 120 and 60 units, and 2 output classes.
layers = [100, 120, 60, 2]

# Configure the perceptron trainer.
trainer = MultilayerPerceptronClassifier(
    layers=layers,
    maxIter=500,
    blockSize=128,
    seed=1234,
)

# Fit on the training split (this rebinds `model`).
model = trainer.fit(train)

In [None]:
# Score the held-out rows and keep just the predicted and true classes.
result = model.transform(test)
predictionAndLabels = result.select("prediction", "label")

# Report plain accuracy on the test split.
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
test_accuracy = evaluator.evaluate(predictionAndLabels)
print("Test set accuracy = " + str(test_accuracy))

In [None]:
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

# Bring the (prediction, label) pairs to the driver for scikit-learn.
res_pred = predictionAndLabels.toPandas()

# Compute the score inputs once instead of re-casting for every metric.
y_true = res_pred.label
y_pred = res_pred.prediction.astype(int)

# average=None yields one score per class. Pinning labels=[0, 1] guarantees a
# two-element result whose positions match the printed header, even when one
# of the classes is missing from the test split or from the predictions.
class_labels = [0, 1]
f1 = f1_score(y_true, y_pred, labels=class_labels, average=None)
precision = precision_score(y_true, y_pred, labels=class_labels, average=None)
recall = recall_score(y_true, y_pred, labels=class_labels, average=None)

print('Label: \t \t    0   \t    1')
print('F1 score: \t {:.5f}  \t {:.5f}'.format(f1[0], f1[1]))
print('Precision: \t {:.5f}  \t {:.5f}'.format(precision[0], precision[1]))
print('Recall: \t {:.5f}  \t {:.5f}'.format(recall[0], recall[1]))

### **Random Forest**

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [None]:
# Index the labels. The indexer is fitted on the full dataset so that every
# label value is known to it, whichever split a row lands in.
labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(data)

# Index the feature vector; maxCategories=4 is the threshold VectorIndexer
# uses to decide which vector components are treated as categorical.
featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(data)

# Split the data into training and test sets (30% held out for testing).
# Seeded with the same value the MLP section uses, so the split is repeatable
# across kernel restarts.
(trainingData, testData) = data.randomSplit([0.7, 0.3], seed=1234)

# Train a RandomForest model; the explicit seed makes the forest reproducible.
rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", numTrees=50, seed=1234)

# Convert indexed labels back to original labels.
labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel", labels=labelIndexer.labels)

# Chain indexers and forest in a Pipeline
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, rf, labelConverter])

# Train model.  This also runs the indexers.
model = pipeline.fit(trainingData)

# Make predictions.
predictions = model.transform(testData)

# Compare the predicted index with the indexed true label and report the
# test error (1 - accuracy).
evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))

In [None]:
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

# Collect the predictions, already mapped back to the original label values
# by the pipeline's IndexToString stage, together with the true labels.
res_pred = predictions.select("predictedLabel", "label").toPandas()

y_true = res_pred.label
# The selected frame has no `prediction` column, so score `predictedLabel`.
# It holds the label values as strings; going through float first accepts
# both "1" and "1.0" spellings before the final cast to int.
y_pred = res_pred.predictedLabel.astype(float).astype(int)

# average=None yields one score per class. Pinning labels=[0, 1] guarantees a
# two-element result whose positions match the printed header, even when one
# of the classes is missing from the test split or from the predictions.
class_labels = [0, 1]
f1 = f1_score(y_true, y_pred, labels=class_labels, average=None)
precision = precision_score(y_true, y_pred, labels=class_labels, average=None)
recall = recall_score(y_true, y_pred, labels=class_labels, average=None)

print('Label: \t \t    0   \t    1')
print('F1 score: \t {:.5f}  \t {:.5f}'.format(f1[0], f1[1]))
print('Precision: \t {:.5f}  \t {:.5f}'.format(precision[0], precision[1]))
print('Recall: \t {:.5f}  \t {:.5f}'.format(recall[0], recall[1]))