In [1]:
# Mount Google Drive so the /content/drive/MyDrive/... paths used later in this notebook resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Setup

In [7]:
# Install OpenJDK 8 (headless); -qq and the redirect to /dev/null keep the cell output quiet.
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# Point JAVA_HOME at the JDK 8 install so that PySpark can run in this Colab environment.
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
# Make the JDK 8 binary the system default `java`, then print the active version as a sanity check.
!update-alternatives --set java /usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java
!java -version

openjdk version "1.8.0_312"
OpenJDK Runtime Environment (build 1.8.0_312-8u312-b07-0ubuntu1~18.04-b07)
OpenJDK 64-Bit Server VM (build 25.312-b07, mixed mode)


In [8]:
# Install BigDL pinned to the 0.13.1.dev1 pre-release build.
# Installing BigDL from pip will automatically install all BigDL modules and their dependencies.
# NOTE(review): a later cell also installs bigdl-orca (2.0.0 per its recorded output) and the
# notebook imports from both `bigdl.nn.*` and `bigdl.dllib.nn.*` — confirm the two installs are compatible.
!pip install BigDL==0.13.1.dev1



In [9]:
!pip install -Uq emoji \
                 optuna \
                 flashtext \
                 underthesea \
                 scikit-learn \
                 vncorenlp \

In [27]:
!pip install bigdl-orca

Collecting bigdl-orca
  Downloading bigdl_orca-2.0.0-py3-none-manylinux1_x86_64.whl (23.9 MB)
[K     |████████████████████████████████| 23.9 MB 1.5 MB/s 
[?25hCollecting bigdl-dllib==2.0.0
  Downloading bigdl_dllib-2.0.0-py3-none-manylinux1_x86_64.whl (101.0 MB)
[K     |████████████████████████████████| 101.0 MB 37 kB/s 
Collecting bigdl-math==2.0.0
  Downloading bigdl_math-2.0.0-py3-none-manylinux2010_x86_64.whl (35.4 MB)
[K     |████████████████████████████████| 35.4 MB 488 kB/s 
[?25hCollecting bigdl-tf==2.0.0
  Downloading bigdl_tf-2.0.0-py3-none-manylinux2010_x86_64.whl (71.0 MB)
[K     |████████████████████████████████| 71.0 MB 145 kB/s 
Collecting conda-pack==0.3.1
  Downloading conda_pack-0.3.1-py2.py3-none-any.whl (27 kB)
Installing collected packages: conda-pack, bigdl-tf, bigdl-math, bigdl-dllib, bigdl-orca
Successfully installed bigdl-dllib-2.0.0 bigdl-math-2.0.0 bigdl-orca-2.0.0 bigdl-tf-2.0.0 conda-pack-0.3.1


# Library

In [10]:
import matplotlib
matplotlib.use('Agg')
%pylab inline

import pandas
import datetime as dt

from bigdl.nn.layer import *
from bigdl.nn.criterion import *
from bigdl.optim.optimizer import *
from bigdl.util.common import *
from bigdl.dataset.transformer import *
from bigdl.dataset import mnist

from matplotlib.pyplot import imshow
import matplotlib.pyplot as plt
from __future__ import print_function
import os
import argparse



Populating the interactive namespace from numpy and matplotlib


In [11]:
# Import the necessary Spark entry point and get (or create) the SparkContext.
# create_spark_conf() is not defined in this notebook — presumably it comes from the BigDL
# star-imports above; the returned conf is extended with a 16g driver-memory setting.
from pyspark import SparkContext
sc=SparkContext.getOrCreate(conf=create_spark_conf().set("spark.driver.memory","16g"))

In [12]:
from pyspark.sql import SparkSession
# NOTE(review): a SparkContext was already created in the previous cell, so getOrCreate()
# presumably reuses it and the "local[1]" master requested here may have no effect — confirm.
spark = SparkSession.builder.master("local[1]") \
                    .getOrCreate()
# init_engine() is not defined in this notebook — presumably the BigDL engine initialiser
# from the star-imports above; TODO confirm.
init_engine()

In [13]:
import glob
import pandas as pd
from pyspark.sql.functions import udf
from pyspark.sql.types import *
from pyspark.sql import functions as f
from bigdl.nn.layer import *
from bigdl.nn.criterion import *
from bigdl.util.common import *

from bigdl.nn.layer import *
from bigdl.nn.criterion import *
from bigdl.util.common import *

from bigdl.nn.layer import *
from bigdl.nn.criterion import *
from bigdl.util.common import *
from bigdl.dlframes.dl_classifier import *
from pyspark.sql.types import StringType, DoubleType
from pyspark.sql.types import *
from bigdl.nn.layer import *
from bigdl.nn.criterion import *
from bigdl.util.common import *
from bigdl.dlframes.dl_classifier import *
from pyspark.sql.types import *
from pyspark.sql.functions import col,length,trim
import re
from emoji import get_emoji_regexp
import unicodedata
from underthesea import word_tokenize
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.feature import StopWordsRemover, RegexTokenizer
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, CountVectorizer, Word2Vec
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier, RandomForestClassifier

# Load dataset
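The data is read from the Excel workbooks under `/content/drive/MyDrive/DoAn/data/100k/` (see `load_data_spark` below). The CSV path `/content/drive/MyDrive/DoAn/data/dataset.csv` is not read by any cell in this notebook.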

In [14]:
def load_data_spark(url = '/content/drive/MyDrive/DoAn/data/100k/'):
  """Load every ``*.xlsx`` workbook under the known sub-folders of ``url`` into one Spark DataFrame.

  The lower-cased header of the first workbook is applied to every workbook, so all
  workbooks are expected to have the same number of columns in the same order.
  Every value is converted to ``str`` before the Spark DataFrame is created, so
  missing cells arrive as the literal string ``'nan'``.

  Args:
    url: Root folder containing the ``VA``, ``BLong``, ``Phuong`` and ``TH`` sub-folders.
      A trailing path separator is optional.

  Returns:
    A Spark DataFrame with all rows of all workbooks, every column typed as string.

  Raises:
    FileNotFoundError: If no ``*.xlsx`` file is found under any of the sub-folders.
  """
  subfolders = ['VA', 'BLong', 'Phuong', 'TH']
  workbook_paths = []
  for subfolder in subfolders:
    # os.path.join works with or without a trailing separator on `url`;
    # sorting makes the row order independent of the filesystem listing order.
    workbook_paths += sorted(glob.glob(os.path.join(url, subfolder, '*.xlsx')))
  if not workbook_paths:
    raise FileNotFoundError(
        "No .xlsx files found under %r in sub-folders %s" % (url, subfolders))

  shared_columns = None
  frames = []
  for path in workbook_paths:
    frame = pd.read_excel(path)
    if shared_columns is None:
      # Header of the first workbook becomes the header of every workbook.
      shared_columns = frame.columns.str.lower()
    frame.columns = shared_columns
    frames.append(frame)

  return spark.createDataFrame(pd.concat(frames).astype(str))

In [15]:
# Load all workbooks from the default Drive folder into a single Spark DataFrame.
dataDF = load_data_spark()

In [16]:
# Preview the first 3 rows; 80 is the second (truncate) argument of DataFrame.show.
dataDF.show(3,80)

+------+--------------------------------------------------------------------------------+-------+-----+-------+-------+---------+--------+----------+
|    id|                                                                             cmt|general|price|quality|service|stylefood|location|background|
+------+--------------------------------------------------------------------------------+-------+-----+-------+-------+---------+--------+----------+
|6001.0|

Thích bánh mì ở đây, đặt ruột , lúc nào cũng nóng giòn, chấn mắn thôi cũng ...|    1.0|  nan|    1.0|    nan|      nan|     nan|       nan|
|6002.0|Món gà nướng hôm nay có bánh bao ngon lắm nha các bạn. Ngồi đây mát lắm lun. ...|    nan|  nan|    1.0|    nan|      nan|     nan|       1.0|
|6003.0|

- Đậu hũ, có nấm bông cải xanh cà chua: ngon tuyệt 👍👍 đậu hũ non như bánh...|    nan|  nan|    1.0|    nan|      nan|     nan|       1.0|
+------+--------------------------------------------------------------------------------+-------+-----

# Processing

In [17]:
class Preporcessing:
  """Text cleaning, label encoding and feature-pipeline construction for the comment data.

  Attributes are set in ``__init__``:
    basic_prepro: When true, only the basic cleaning steps of ``clean_text`` run.
    embedding_type: One of ``"wordcount"``, ``"tfidf"`` or ``"word2vec"``; read by ``Embedding``.
    dict_special: Replacement table built by ``special_case`` (empty when no workbook is given).
  """

  # Label columns, in the order they are packed into the `label` array by clean_df.
  LABEL_COLUMNS = ("general", "price", "quality", "service", "stylefood", "location", "background")

  # Replacement text -> list of spellings that are rewritten to it (applied in this order).
  _SPELLING_FIXES = {"òa ":["oà "], "óa ":["oá "], "ỏa ":["oả "], "õa ":["oã "], "ọa ":["oạ "],
                  "òe":["oè"], "óe":["oé"], "ỏe":["oẻ"], "õe":["oẽ"], "ọe":["oẹ"],
                  "ùy":["uỳ"], "úy":["uý"], "ủy":["uỷ"], "ũy":["uỹ"], "ụy":["uỵ"],
                  "ùa":["uà"], "úa ":["uá "], "ủa":["uả"], "ũa":["uã"], "ụa":["uạ"],
                  "xảy":["xẩy"], "bảy":["bẩy"], "gãy":["gẫy"],"nhân viên ":["nvien"],"quay":['qay']}

  # A number (digit groups separated by single ',' or '.') followed by a currency unit.
  # Matches the same strings as the former `(?:\d+[,\.]?)+` form, but without the nested
  # quantifier that made long digit runs backtrack exponentially.
  _PRICE_PATTERN = r'\d+(?:[,.]\d+)*[,.]? ?(?:nghìn đồng|đồng|k|vnd|d|đ)'

  # Punctuation and control characters that are replaced by a space.
  _SPECIAL_CHARS = r"[\"#$%&'()*+,\-\/\\:;<=>@[\]^_`{|}~\n\r\t]"

  _EMBEDDING_TYPES = ("wordcount", "tfidf", "word2vec")

  def __init__(self, basic_preprocessing = False, embedding_type = "tfidf", path_acronyms = None):
    """Store the options and load the optional replacement workbook.

    Args:
      basic_preprocessing: If true, skip hashtag/price tagging, spelling replacement
        and word tokenization in ``clean_text``.
      embedding_type: Feature type used by ``Embedding``.
      path_acronyms: Optional path to an Excel workbook read by ``special_case``.
    """
    self.basic_prepro = basic_preprocessing
    self.embedding_type = embedding_type
    # Always defined, so clean_df / clean_sentenceDF also work without a workbook.
    self.dict_special = self.special_case(path_acronyms) if path_acronyms else {}

  def special_case(self, path_acronyms):
    """Build a replacement table from an Excel workbook.

    For each row, the string cells with at least 3 characters are collected in column
    order; when exactly two remain, the table maps the second one to a one-element
    list holding the first one (the form ``clean_text`` expects: replacement -> variants).
    Non-string cells (e.g. empty cells read as NaN) are ignored.

    Args:
      path_acronyms: Path of the workbook, passed to ``pandas.read_excel``.

    Returns:
      dict mapping replacement text to ``[variant]``.
    """
    rows = pd.read_excel(path_acronyms).to_dict('index')
    replacements = {}
    for cells in rows.values():
      words = [cell for cell in cells.values()
               if isinstance(cell, str) and len(cell) >= 3]
      if len(words) == 2:
        replacements[words[1]] = [words[0]]
    return replacements

  def clean_text(self, text, special_w=None):
    """Normalize one comment string.

    Always: NFC-normalize, lower-case, strip emoji, replace special characters by spaces.
    Unless ``basic_prepro`` is set, additionally: replace ``#...`` tokens by ``HASTAG``,
    replace price expressions by ``PRICE``, apply the replacement tables, and run
    ``word_tokenize(..., format="text")`` on the result.

    Args:
      text: The raw comment.
      special_w: Optional extra replacement table (replacement -> list of variants);
        ``None`` means no extra replacements.

    Returns:
      The cleaned string.
    """
    # Unicode normalize
    text = unicodedata.normalize('NFC', text)

    # Lower
    text = text.lower()

    # Remove all emoji
    text = re.sub(get_emoji_regexp(), "", text)

    advanced = not self.basic_prepro
    if advanced:
      # Change #string to HASTAG
      text = re.sub(r'#\S+', "HASTAG", text)

      # Find all price tags and change them to PRICE
      text = re.sub(self._PRICE_PATTERN, "PRICE", text)

      # Replace some special words; entries of special_w keep their position,
      # but a key also present in _SPELLING_FIXES takes the latter's variants.
      merged = {**(special_w or {}), **self._SPELLING_FIXES}
      for replacement, variants in merged.items():
        if isinstance(variants, list):
          for variant in variants:
            text = text.replace(variant, replacement)
      text = text.replace('ìnhh','ình')

    # Remove all special char
    text = re.sub(self._SPECIAL_CHARS, " ", text)

    if advanced:
      text = word_tokenize(text, format="text")

    return text

  @staticmethod
  def _encode_label(raw):
    """Map a raw label cell to -1.0, 0.0 or 1.0, and anything else to -2.0.

    Accepts any text that parses to one of the three values (``'1.0'`` as well as ``'1'``);
    unparseable text, ``'nan'`` and ``None`` all become -2.0.
    """
    try:
      value = float(raw)
    except (TypeError, ValueError):
      return -2.0
    return value if value in (-1.0, 0.0, 1.0) else -2.0

  def clean_df(self, sparkDF):
    """Clean the ``cmt`` column and encode the label columns of a labelled DataFrame.

    Args:
      sparkDF: Spark DataFrame with a ``cmt`` column and every column in ``LABEL_COLUMNS``.

    Returns:
      DataFrame with the cleaned ``cmt``, one float column per label, and a ``label``
      column holding those floats as an array in ``LABEL_COLUMNS`` order.
    """
    clean_udf = udf(lambda x: self.clean_text(x, self.dict_special), StringType())
    encode_udf = udf(self._encode_label, FloatType())
    label_columns = list(self.LABEL_COLUMNS)
    cleaned = sparkDF.select(
        clean_udf('cmt').alias("cmt"),
        *[encode_udf(name).alias(name) for name in label_columns])
    return cleaned.withColumn("label", f.array(*label_columns).cast(ArrayType(FloatType())))

  def clean_sentenceDF(self, sentenceDF):
    """Clean the ``cmt`` column of an unlabelled DataFrame and return only that column."""
    clean_udf = udf(lambda x: self.clean_text(x, self.dict_special), StringType())
    return sentenceDF.select(clean_udf('cmt').alias("cmt"))

  def split_data(self, sparkDF, train_ratio = 0.8, seed = 50):
    """Randomly split ``sparkDF`` into (train, test) with weights ``train_ratio`` / ``1-train_ratio``."""
    train_data, test_data = sparkDF.randomSplit([train_ratio, 1-train_ratio], seed)
    return train_data, test_data

  def Embedding(self, num_feature):
    """Build the unfitted Spark ML pipeline that turns ``cmt`` into a ``features`` vector.

    Args:
      num_feature: Vocabulary size (wordcount), number of hash buckets (tfidf)
        or vector size (word2vec).

    Returns:
      A ``pyspark.ml.Pipeline``: Tokenizer -> vectorizer stage(s) -> VectorAssembler.

    Raises:
      ValueError: If ``embedding_type`` is not one of the supported values.
    """
    # Fail fast, before any Spark stage is constructed.
    if self.embedding_type not in self._EMBEDDING_TYPES:
      raise ValueError("Embedding phải là 'wordcount', 'tfidf' hoặc 'word2vec'. Các embedding khác chưa hỗ trợ.")

    tokenizer = Tokenizer(inputCol="cmt", outputCol="words")
    assembler = VectorAssembler(inputCols=["features_vec"], outputCol="features")

    if self.embedding_type == "wordcount":
      vectorizers = [CountVectorizer(inputCol="words", outputCol="features_vec", minDF=5, vocabSize=num_feature)]
    elif self.embedding_type == "tfidf":
      vectorizers = [
          HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=num_feature),
          IDF(inputCol="rawFeatures", outputCol="features_vec"),
      ]
    else:  # "word2vec"
      vectorizers = [Word2Vec(vectorSize=num_feature, seed=42, inputCol="words", outputCol="features_vec")]

    return Pipeline(stages=[tokenizer] + vectorizers + [assembler])
def convertCase(float_num):
  """Snap a raw score to the nearest label code among -2, 0, -1 and 1.

  Ties are resolved in the order -2, 0, -1, 1 (e.g. -1.5 -> -2.0, 0.5 -> 0.0).
  NaN is not closer to any code than to another, so it is mapped to -2.0,
  the same code the label encoder uses for values outside -1/0/1.

  Args:
    float_num: The raw score.

  Returns:
    One of -2.0, 0.0, -1.0, 1.0 as a Python float.
  """
  if float_num != float_num:  # NaN is the only value that differs from itself
    return -2.
  # min() returns the first candidate with the smallest distance, which gives the tie order above.
  return min((-2., 0., -1., 1.), key=lambda code: abs(code - float_num))

def edit_prediction_label(prediction_data):
  """Return ``prediction_data`` with each element of its ``prediction`` array passed through ``convertCase``.

  Args:
    prediction_data: Spark DataFrame with an array-valued ``prediction`` column.

  Returns:
    The DataFrame with ``prediction`` replaced by an array of floats.
  """
  def _snap_each(scores):
    return [convertCase(score) for score in scores]

  snap_udf = udf(_snap_each, ArrayType(FloatType()))
  return prediction_data.withColumn("prediction", snap_udf('prediction'))

In [18]:
# Full (non-basic) cleaning with TF-IDF features; the replacement table is read from the Acronyms workbook.
preprocessing = Preporcessing(basic_preprocessing=False, embedding_type="tfidf", path_acronyms='/content/drive/MyDrive/BigData/data/original/Acronyms.xlsx')
# Clean the comments and pack the seven encoded label columns into a `label` array.
sparkDF_cleaned = preprocessing.clean_df(dataDF)
# Uses the split_data defaults: train_ratio=0.8, seed=50.
train_data, test_data = preprocessing.split_data(sparkDF_cleaned)
# Unfitted feature pipeline with 300 features.
embedding = preprocessing.Embedding(300)


In [19]:
# Fit the feature pipeline on the training split only.
embedding_abc = embedding.fit(train_data)

In [20]:

#embedding.save('/content/drive/MyDrive/DoAn/model/Embedding/tfidf')


# Apply the fitted pipeline to both splits and keep only the model inputs.
# NOTE(review): this rebinds train_data/test_data to frames without the `cmt` column that the
# pipeline's Tokenizer reads, so this cell cannot be re-run without re-running the split cell first.
train_data = embedding_abc.transform(train_data).select('features','label')
test_data = embedding_abc.transform(test_data).select('features','label')

In [21]:
# Preview the first 3 transformed rows; 80 is the second (truncate) argument of DataFrame.show.
train_data.show(3,80)

+--------------------------------------------------------------------------------+------------------------------------------+
|                                                                        features|                                     label|
+--------------------------------------------------------------------------------+------------------------------------------+
|(300,[9,18,39,47,56,65,76,79,84,87,88,103,108,122,128,137,143,144,146,165,168...|   [-2.0, 1.0, 1.0, -2.0, -2.0, 1.0, -2.0]|
|(300,[7,9,16,18,19,20,24,43,44,56,69,77,80,82,83,89,103,104,110,118,124,128,1...|   [1.0, -2.0, 1.0, -2.0, -2.0, -2.0, 1.0]|
|(300,[2,3,4,9,16,19,37,40,54,56,75,81,86,97,100,102,107,143,144,164,168,171,1...|[-2.0, -1.0, -1.0, -2.0, -2.0, -2.0, -2.0]|
+--------------------------------------------------------------------------------+------------------------------------------+
only showing top 3 rows



# Model


In [22]:


def LSTM_model(input_size, hidden_size, output_size):
    """Build a Sequential network around a single LSTM cell.

    Layer order: InferReshape([-1, input_size], True) -> Recurrent(LSTM(input_size, hidden_size))
    -> Select(2, -1) -> Dropout(0.2) -> Linear(hidden_size, output_size).
    Presumably Select(2, -1) keeps the last step of the recurrent output — TODO confirm.

    Args:
        input_size: Width passed to InferReshape and to the LSTM cell.
        hidden_size: Hidden width of the LSTM cell and input width of the final Linear layer.
        output_size: Output width of the final Linear layer.

    Returns:
        The assembled Sequential model.
    """
    net = Sequential()
    rnn = Recurrent()
    rnn.add(LSTM(input_size, hidden_size))
    stack = (
        InferReshape([-1, input_size], True),
        rnn,
        Select(2, -1),
        Dropout(0.2),
        Linear(hidden_size, output_size),
    )
    for layer in stack:
        net.add(layer)
    return net

def GRU_model(input_size, hidden_size, output_size):
    """Build a Sequential network around a single GRU cell.

    Layer order: InferReshape([-1, input_size], True) -> Recurrent(GRU(input_size, hidden_size))
    -> Select(2, -1) -> Dropout(0.2) -> Linear(hidden_size, output_size).
    Presumably Select(2, -1) keeps the last step of the recurrent output — TODO confirm.

    Args:
        input_size: Width passed to InferReshape and to the GRU cell.
        hidden_size: Hidden width of the GRU cell and input width of the final Linear layer.
        output_size: Output width of the final Linear layer.

    Returns:
        The assembled Sequential model.
    """
    net = Sequential()
    rnn = Recurrent()
    rnn.add(GRU(input_size, hidden_size))
    stack = (
        InferReshape([-1, input_size], True),
        rnn,
        Select(2, -1),
        Dropout(0.2),
        Linear(hidden_size, output_size),
    )
    for layer in stack:
        net.add(layer)
    return net


def MLP(input_size, hidden_size, hidden_size2, output_size):
    """Build a three-layer perceptron: Linear -> ReLU -> Linear -> ReLU -> Linear.

    The two hidden widths are taken from the arguments. To get the previously
    hard-coded network, call ``MLP(input_size, 1000, 256, output_size)``.

    Args:
        input_size: Input width of the first Linear layer.
        hidden_size: Output width of the first Linear layer.
        hidden_size2: Output width of the second Linear layer.
        output_size: Output width of the final Linear layer.

    Returns:
        The assembled Sequential model.
    """
    model = Sequential()
    model.add(Linear(input_size, hidden_size))
    model.add(ReLU())
    model.add(Linear(hidden_size, hidden_size2))
    model.add(ReLU())
    model.add(Linear(hidden_size2, output_size))
    return model

def RNN_model(input_size, hidden_size, output_size):
    """Build a Sequential network around a single RnnCell with Tanh activation.

    Layer order: InferReshape([-1, input_size], True) -> Recurrent(RnnCell(input_size, hidden_size, Tanh()))
    -> Select(2, -1) -> Dropout(0.2) -> Linear(hidden_size, output_size).
    Presumably Select(2, -1) keeps the last step of the recurrent output — TODO confirm.

    Args:
        input_size: Width passed to InferReshape and to the RnnCell.
        hidden_size: Hidden width of the RnnCell and input width of the final Linear layer.
        output_size: Output width of the final Linear layer.

    Returns:
        The assembled Sequential model.
    """
    net = Sequential()
    rnn = Recurrent()
    rnn.add(RnnCell(input_size, hidden_size, Tanh()))
    stack = (
        InferReshape([-1, input_size], True),
        rnn,
        Select(2, -1),
        Dropout(0.2),
        Linear(hidden_size, output_size),
    )
    for layer in stack:
        net.add(layer)
    return net


In [23]:
# 300 inputs (same number as passed to Embedding above), 256 hidden units, 7 outputs (one per label column).
model_lstm = LSTM_model(300, 256,7)


creating: createSequential
creating: createRecurrent
creating: createTanh
creating: createSigmoid
creating: createLSTM
creating: createInferReshape
creating: createSelect
creating: createDropout
creating: createLinear


## Train

In [38]:
from bigdl.dllib.nn.layer import *
from bigdl.dllib.nn.criterion import *
from bigdl.dllib.utils.common import *
from bigdl.dllib.nnframes.nn_classifier import *
from bigdl.dllib.feature.common import *

In [39]:
# Wrap the LSTM network in an NNEstimator with an MSE criterion; the [300] and [7] sizes mirror
# the feature width and the number of label columns used above.
# NOTE(review): the recorded run of this cell ended in a TypeError. model_lstm was built before
# the `bigdl.dllib` imports in the previous cell — possibly a mismatch between the two BigDL
# APIs imported in this notebook; confirm before relying on `est`.
est =  NNEstimator(model_lstm, MSECriterion(), SeqToTensor([300]), ArrayToTensor([7])) \
            .setBatchSize(64).setLearningRate(0.2).setMaxEpoch(10) 

creating: createMSECriterion


TypeError: ignored

In [None]:
%%time
# Boot training process
trained_model = optimizer.optimize()
print("Optimization Done.")

In [None]:
# Persist the LSTM network to Drive.
# NOTE(review): this saves `model_lstm`, not the `trained_model` returned by the training cell —
# confirm that training updates `model_lstm` in place, otherwise the saved weights are untrained.
model_lstm.save_model('/content/drive/MyDrive/DoAn/model/Model_BigDL/lstm_tfidf')

#

## Test 