In [None]:
from json import loads
from pyspark.sql import SparkSession
import warnings
import pandas as pd
warnings.filterwarnings("ignore")
from pyspark.sql.functions import col,from_json,udf,split,explode
from pyspark.ml.feature import NGram
from pyspark.sql.types import StructType,StructField, StringType, IntegerType,MapType,FloatType,ArrayType

In [None]:
# Attach to the standalone Spark master; every executor is limited to 1024 MiB.
spark = (
    SparkSession.builder
    .appName("process-data")
    .master("spark://spark-master:7077")
    .config("spark.executor.memory", "1024m")
    .getOrCreate()
)

In [None]:
# Load the comment dataset from HDFS (Parquet) as a Spark DataFrame.
data = spark.read.parquet('hdfs://namenode:9000/TikiCleaned/Comment')

22/12/07 16:11:50 WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
22/12/07 16:12:05 WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources


In [4]:
# Register the DataFrame under the name 'data' so it can be referenced from spark.sql queries.
data.createOrReplaceTempView('data')

In [6]:
# Build the labelled set: one row per distinct (clean_content, rating, sentiment) with
#   label 2 -> rating >= 4, label 1 -> rating = 3, label 0 -> everything else.
# Blank comments are removed with trim(), so strings made only of spaces are rejected
# whatever their length (the earlier filter only caught '' and ' ').
# NOTE(review): a NULL rating fails both `when` tests and therefore lands in `else 0`;
# confirm that ratings are never NULL in the source, or filter them out explicitly.
df = spark.sql("""
    select distinct clean_content,rating,sentiment,
    case
        when rating >= 4 then 2
        when rating = 3 then 1
        else 0
    end as label
    from data
    where clean_content is not null and trim(clean_content) <> ''
""")

In [25]:
# Show how many rows fall into each label to inspect the class balance.
df.groupby('label').count().show()

                                                                                

+-----+------+
|label| count|
+-----+------+
|    1| 17646|
|    2|266607|
|    0| 39571|
+-----+------+



In [None]:
# Convert the whole labelled DataFrame to pandas and write it to data/data.csv (no index column).
df.toPandas().to_csv('data/data.csv',index=False)

In [15]:
# 80 / 10 / 10 random partition; the fixed seed makes the split repeatable.
train, valid, test = df.randomSplit(weights=[0.8, 0.1, 0.1], seed=1234)

In [20]:
# Write each split to its own CSV file, without the pandas index column.
for split_df, csv_path in [
    (train, 'data/train.csv'),
    (valid, 'data/valid.csv'),
    (test, 'data/test.csv'),
]:
    split_df.toPandas().to_csv(csv_path, index=False)

                                                                                

In [28]:
# Tokenise every comment on single spaces; limit -1 keeps all resulting pieces.
df = df.withColumn('comment_term', split(col('clean_content'), ' ', -1))

In [74]:
def getNGram(n, data=None):
    """Count how often each n-gram occurs within each label.

    Args:
        n: Number of consecutive tokens per n-gram (1 yields single tokens).
        data: Optional Spark DataFrame to analyse. It must provide a ``label``
            column and a ``comment_term`` column holding the token array.
            When omitted, the notebook-level ``df`` is used, so existing
            ``getNGram(n)`` calls keep working unchanged.

    Returns:
        A Spark DataFrame with the columns ``label``, ``word`` (one n-gram
        per row) and ``count``.
    """
    # Fall back to the notebook's global frame only when no frame is passed in.
    source = df if data is None else data
    ngram = NGram(n=n, inputCol="comment_term", outputCol="nGrams")
    # One output row per n-gram so occurrences can be counted per (label, word).
    exploded = ngram.transform(source).withColumn('word', explode(col('nGrams')))
    return exploded.groupBy('label', 'word').count()

In [75]:
# n = 1: per-label frequency of individual tokens.
result_nGram = getNGram(1)

In [76]:
# Preview the (label, word, count) rows.
result_nGram.show()



+-----+-------+-----+
|label|   word|count|
+-----+-------+-----+
|    2|   nhìn|11151|
|    2|   trái|  779|
|    2|   quan| 3069|
|    2|    bền| 7049|
|    1|   giặt|  145|
|    2|    tập| 3121|
|    2|   muỗi|  413|
|    0|    khó|  992|
|    2|    mùa| 1818|
|    2|   đuợc|   35|
|    2|    béo|  437|
|    2|    lừa|  161|
|    2|     vẽ|  606|
|    0|    nối|  925|
|    1|    hợp|  726|
|    2|nghiêng|  265|
|    0|  thùng|  333|
|    0|    đối|  213|
|    1|    anh|  106|
|    2|   sườn|   60|
+-----+-------+-----+
only showing top 20 rows



                                                                                

In [81]:
# For each of the three labels, save its n-grams sorted from most to least frequent.
for i in range(3):
    label_rows = result_nGram.filter(col("label") == i)
    ranked = label_rows.orderBy(col("count").desc())
    ranked.toPandas().to_csv(f'sample/{i}.csv', index=False)

                                                                                

In [82]:
def _load_lexicon(path, exclude=()):
    """Read a one-word-per-line lexicon file into a dict used as a lookup set.

    Args:
        path: Text file with one entry per line.
        exclude: Container of words that must not be added to the result.

    Returns:
        Dict mapping each kept word to 1.
    """
    words = {}
    # Explicit UTF-8: the platform default encoding may not decode Vietnamese text.
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            # strip() also removes '\r' and padding that replace('\n', '') left behind.
            word = line.strip()
            # Skip blank lines: an empty-string entry would match the empty tokens
            # that str.split(' ') produces for consecutive spaces.
            if word and word not in exclude:
                words[word] = 1
    return words


pst_word = _load_lexicon('vi_sentiment/positive_words_vi.txt')
# A word present in both files is kept as positive only.
ngt_word = _load_lexicon('vi_sentiment/negative_words_vi.txt', exclude=pst_word)

In [84]:
def check(sentent):
    """Classify a comment by lexicon vote.

    The text is split on single spaces; each token found in ``pst_word`` adds
    one to the score and each token found in ``ngt_word`` subtracts one.

    Returns:
        2 when the score is positive, 0 when it is negative, 1 when it is zero.
    """
    score = 0
    for word in sentent.split(' '):
        if word in pst_word:
            score += 1
        elif word in ngt_word:
            score -= 1
    if score > 0:
        return 2
    if score < 0:
        return 0
    return 1

In [124]:
def value_check(sentent):
    """Return a lexicon polarity score for a comment, rounded to 3 decimals.

    Tokens (split on single spaces) found in ``pst_word`` count as positive;
    the remaining tokens found in ``ngt_word`` count as negative. The score is
    (positive - negative) / (positive + negative + 1e-6); the small constant
    keeps the division defined when no lexicon word occurs.
    """
    tokens = sentent.split(' ')
    positive = sum(1 for tok in tokens if tok in pst_word)
    negative = sum(1 for tok in tokens if tok not in pst_word and tok in ngt_word)
    return round((positive - negative) / (positive + negative + 1e-6), 3)

In [125]:
# Make the Python scoring functions callable from Spark SQL as "check" (integer result)
# and "value_check" (float result).
spark.udf.register("check", check,IntegerType())
spark.udf.register("value_check", value_check,FloatType())

22/12/04 17:51:46 WARN SimpleFunctionRegistry: The function check replaced a previously registered function.
22/12/04 17:51:46 WARN SimpleFunctionRegistry: The function value_check replaced a previously registered function.


<function __main__.value_check(sentent)>

In [126]:
# Register the current df under the name "tmp" for the SQL query that applies the UDFs.
df.createOrReplaceTempView("tmp")

In [127]:
# Add two columns to every row: `prediction` (class from check) and `val` (score from value_check).
res = spark.sql("""
    select *,check(clean_content) prediction,value_check(clean_content) val from tmp
""")

In [128]:
# Bring only the columns needed for evaluation into pandas.
x = res.select('label', 'prediction', 'val').toPandas()

                                                                                

In [131]:
# Keep the rows whose true label is 1.
tmp = x.loc[x['label'] == 1]

In [132]:
# Display the current contents of `tmp`.
tmp

Unnamed: 0,label,prediction,val
5,1,2,0.333
23,1,2,1.000
41,1,0,-0.333
50,1,0,-0.333
58,1,2,1.000
...,...,...,...
323726,1,2,0.143
323730,1,2,1.000
323795,1,2,0.250
323796,1,2,1.000


In [139]:
# Threshold applied to the `val` score in the filter that follows.
val = 0.4

In [140]:
# Rows of `tmp` whose score lies strictly between -val and +val.
tmp[tmp['val'].abs() < val]

Unnamed: 0,label,prediction,val
5,1,2,0.333
41,1,0,-0.333
50,1,0,-0.333
70,1,1,0.000
78,1,0,-0.333
...,...,...,...
323658,1,2,0.200
323713,1,0,-0.333
323715,1,2,0.250
323726,1,2,0.143


In [142]:
# Keep the rows whose score is above 0.4.
tmp = x[x['val']> 0.4]

In [143]:
# Fraction of the rows in `tmp` where the prediction equals the label.
len(tmp[tmp['label'] == tmp['prediction']]) / len(tmp)

0.9478712501519915

In [150]:
# Rows whose score is below -0.99.
x[x['val'] < -0.99]

Unnamed: 0,label,prediction,val
11,2,0,-1.0
57,0,0,-1.0
63,2,0,-1.0
69,0,0,-1.0
74,0,0,-1.0
...,...,...,...
323594,1,0,-1.0
323611,2,0,-1.0
323725,2,0,-1.0
323754,0,0,-1.0


In [154]:
# Rows whose true label is 0.
x[x['label'] == 0]

Unnamed: 0,label,prediction,val
1,0,0,-0.200
10,0,0,-0.333
20,0,2,0.250
33,0,2,0.333
48,0,2,0.200
...,...,...,...
323772,0,0,-0.600
323784,0,1,0.000
323786,0,2,0.200
323797,0,0,-0.333


In [152]:
# Among rows scored exactly -1, the fraction where the prediction equals the label.
tmp = x.loc[x['val'] == -1]
len(tmp[tmp['label'] == tmp['prediction']]) / len(tmp)

0.39766023397660233

In [None]:
# Flag rows whose score exceeds 0.4. The previous right-hand side was the filtered
# DataFrame itself, which cannot be stored in a single column.
# NOTE(review): a boolean flag is assumed to be the intent; confirm.
x['prd'] = x['val'] > 0.4

In [106]:
# Rows whose score is exactly 0.
x[x['val']==0]

Unnamed: 0,label,prediction,val
27,2,1,0
42,2,1,0
52,0,1,0
53,2,1,0
56,2,1,0
...,...,...,...
323785,2,1,0
323801,2,1,0
323803,2,1,0
323815,2,1,0


In [105]:
# Rows whose true label is 1.
x[x['label']==1]

Unnamed: 0,label,prediction,val
5,1,2,1
23,1,2,5
41,1,0,-1
50,1,0,-1
58,1,2,4
...,...,...,...
323726,1,2,2
323730,1,2,3
323795,1,2,2
323796,1,2,3


In [112]:
# Overall fraction of rows where the prediction equals the label.
len(x[x['label'] == x['prediction']]) / len(x)

0.7450436039330006

In [98]:
# Rows whose true label is 1.
x[x['label']== 1]

Unnamed: 0,label,prediction
18,1,2
106,1,0
173,1,0
177,1,2
195,1,1
...,...,...
323726,1,2
323730,1,2
323795,1,2
323796,1,2


In [119]:
# Label selected for the per-label accuracy cells that read `value`.
value = 2

In [115]:
# Among rows whose label equals `value`, the fraction predicted correctly.
e = x.loc[x['label'] == value]
len(e[e['label'] == e['prediction']]) / len(e)

0.192904907627791

In [118]:
# Among rows whose label equals `value`, the fraction predicted correctly.
e = x.loc[x['label'] == value]
len(e[e['label'] == e['prediction']]) / len(e)

0.4451239544110586

In [120]:
# Among rows whose label equals `value`, the fraction predicted correctly.
e = x.loc[x['label'] == value]
len(e[e['label'] == e['prediction']]) / len(e)

0.8261035906784143

In [111]:
# Among rows whose label equals `value`, the fraction predicted correctly.
# Both conditions are combined into one mask over `x`; the earlier chained form
# indexed an already-filtered frame with a mask built on the full frame, which
# only works through implicit index realignment.
in_class = x['label'] == value
len(x[in_class & (x['label'] == x['prediction'])]) / len(x[in_class])

0.4451239544110586

In [97]:
# Among rows whose label is 1, the fraction predicted correctly.
# Both conditions are combined into one mask over `x`; the earlier chained form
# indexed an already-filtered frame with a mask built on the full frame, which
# only works through implicit index realignment.
is_label_1 = x['label'] == 1
len(x[is_label_1 & (x['label'] == x['prediction'])]) / len(x[is_label_1])

0.192904907627791

In [90]:
# (rows, columns) of the evaluation frame.
x.shape

(323824, 2)

In [91]:
# Number of rows where the prediction equals the label.
len(x[x['label'] == x['prediction']])

241263