# IMPORT REQUIRED LIBRARIES

In [29]:
import pandas as pd
import os
import codecs
import gensim.corpora as corpora
from snownlp import SnowNLP
from snownlp import normal
from snownlp import seg
from tqdm import tqdm
import re
import numpy as np
import json
import matplotlib.pyplot as plt

from sentiment_modeling import Sentiment_classfifier

import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Embedding, Dropout, LSTM, Bidirectional
from keras.models import Model, Input
from sklearn.model_selection import train_test_split

# GET DATA

In [2]:
# Locate the dataset relative to the current working directory.
# os.path.join is used instead of hard-coded "\\" separators so the path
# resolves on every operating system, not only on Windows.
path = os.path.abspath(os.getcwd())
filename = os.path.join(path, "dataset", "organized_dataset", "csv-final.csv")

In [3]:
## Detect the text encoding of the CSV file (the result is used for read_csv below)
import chardet
# The whole file is read as raw bytes and handed to chardet in one call.
with open(filename, 'rb') as file:
    print(chardet.detect(file.read()))

{'encoding': 'UTF-16', 'confidence': 1.0, 'language': ''}


In [4]:
# Load the tab-separated file with the encoding reported by chardet above.
df = pd.read_csv(filename, encoding = 'UTF-16', sep='\t')

### Cut off the information of the original email

In [5]:
# Keep only the emails that actually have a body.
# `check` is True where 'Body' is missing; the boolean mask selects the
# remaining rows in one vectorised step and does not rely on the DataFrame
# having a default 0..n-1 index.
check = df['Body'].isnull()
contents = df.loc[~check, 'Body'].tolist()

In [6]:
# Drop everything from the first 'SecuShare' marker onwards (the information
# of the original email). str.partition returns the text before the first
# occurrence, or the whole string when the marker is absent, so this gives the
# same result as scanning every character position but in a single pass.
contents = [body.partition('SecuShare')[0] for body in contents]

### Preprocess function

In [7]:
def filter_chinese(mycontents):
    """Reduce every string in ``mycontents`` to its Chinese characters only.

    Characters in the range U+4E00..U+9FFF are kept in their original order;
    everything else is dropped. The list is modified in place and the same
    list object is returned.
    """
    chinese_char = re.compile("[\u4e00-\u9fff]")
    for position, sentence in enumerate(mycontents):
        mycontents[position] = "".join(chinese_char.findall(sentence))
    return mycontents

In [8]:
def handle(doc):
    """Turn one document string into a list of words.

    The text is first converted from Traditional to Simplified Chinese,
    then segmented into words, and finally passed through snownlp's
    ``filter_stop`` (presumably stop-word removal -- confirm against snownlp).
    """
    simplified = normal.zh2hans(doc)  # Traditional -> Simplified Chinese
    tokens = seg.seg(simplified)  # split the sentence into words
    return normal.filter_stop(tokens)

def sent_to_words(sentences):
    """Run ``handle`` on every sentence and return the list of word lists.

    A tqdm progress bar is shown while the sentences are processed.
    """
    return [handle(sentences[idx]) for idx in tqdm(range(len(sentences)))]

In [9]:
def preprocessing(data_words, max_length, token2id):
    """Encode tokenised documents as a fixed-width matrix of word ids.

    Args:
        data_words: list of token lists, one per document.
        max_length: number of columns; longer documents are truncated.
        token2id: mapping from token to integer id.

    Returns:
        Float array of shape (len(data_words), max_length). Known tokens get
        their id from ``token2id``; unknown tokens and the padding after the
        end of a document both get the id ``len(token2id)``.
    """
    fill_id = len(token2id)  # shared id for padding and out-of-dictionary words
    encoded = np.full((len(data_words), max_length), fill_id, dtype=float)
    for row_id in tqdm(range(len(data_words))):
        for col_id, token in enumerate(data_words[row_id][:max_length]):
            if token in token2id:
                encoded[row_id][col_id] = token2id[token]

    return encoded

### Preprocess data

In [10]:
# Keep only the Chinese characters of every email body (the list is updated in place).
contents = filter_chinese(contents)

In [11]:
# Segment every email into a list of words (slow: about 6 minutes in the recorded run).
data_words = sent_to_words(contents)           

100%|██████████████████████████████████████████████████████████████████████████████| 2525/2525 [05:48<00:00,  7.24it/s]


In [12]:
# Build the vocabulary from the segmented emails and keep its token -> id map.
id2word = corpora.Dictionary(data_words)
token2id = id2word.token2id
# The longest email (in words) fixes the width of the model input.
max_length = max(len(words) for words in data_words)

In [13]:
# Encode every email as a row of word ids, padded to max_length.
preprocessed_text = preprocessing(data_words, max_length, token2id)

100%|████████████████████████████████████████████████████████████████████████████| 2525/2525 [00:00<00:00, 3435.61it/s]


# LABEL DATA

## Sentiment label

#### Creating new labels (if the sentiment_label file does not exist)

In [14]:
# Path of the saved Bayes sentiment model, built with os.path.join so it
# resolves on every operating system rather than only with "\\" separators.
bayes_modelpath = os.path.join(
    path, "sentiment_modeling", "saved_model", "sentiment_bayes", "sentiment_bayes.marshal"
)
sentiment_classifier = Sentiment_classfifier(bayes_modelpath)

In [15]:
# Label every (Chinese-only) email with the Bayes sentiment classifier.
sentiment_labels = sentiment_classifier.predict(contents)

100%|██████████████████████████████████████████████████████████████████████████████| 2525/2525 [09:36<00:00,  4.38it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 2525/2525 [00:36<00:00, 68.66it/s]
100%|████████████████████████████████████████████████████████████████████████████| 2525/2525 [00:00<00:00, 5529.59it/s]


In [17]:
# Persist the labels, one per line, so they can be reloaded without
# re-running the classifier.
with open('sentiment_label.txt', 'w') as fp:
    fp.writelines("%s\n" % item for item in sentiment_labels)

#### Reading the available labels

In [18]:
# Reload the integer sentiment labels, one per line.
sentiment_labels = []
with open(r'sentiment_label.txt', 'r') as fp:
    for line in fp:
        # strip() removes the line ending whether or not it is present;
        # slicing off the last character would drop a digit when the final
        # line has no trailing newline.
        label = line.strip()
        if label:  # ignore blank lines instead of failing on int('')
            sentiment_labels.append(int(label))

In [19]:
# Convert the integer labels to one-hot vectors: row k of the identity matrix
# is the one-hot vector of class k.
# NOTE(review): this cell rebinds sentiment_labels from a list of ints to a
# 2-D array, so it must not be re-run without re-running the cell above.
n_values = np.max(sentiment_labels) + 1
sentiment_labels = np.eye(n_values)[sentiment_labels]

## Topic label (updating)

In [20]:
# Placeholder topic labels: one random class in [0, 20) per email, until real
# topic labels are available. A fixed seed makes the placeholder labels (and
# therefore the training run) reproducible after Restart & Run All.
topic_labels = np.random.default_rng(42).integers(20, size=len(sentiment_labels)).tolist()

In [21]:
# Convert the integer topic labels to one-hot vectors via rows of the identity matrix.
# NOTE(review): the width is max(label) + 1, so it only equals the model's
# 20-unit topic output when label 19 actually occurs in topic_labels.
n_values = np.max(topic_labels) + 1
topic_labels = np.eye(n_values)[topic_labels]

# BUILDING MODEL

In [22]:
class BiLstm:
    """Bidirectional-LSTM classifier with two softmax heads (sentiment and topic).

    Wraps a Keras functional model: Embedding -> Bidirectional(LSTM) ->
    Dropout -> Dense(relu), followed by two independent softmax outputs.
    """

    # Default so `history` is readable even before __init__/fit have set it.
    _history = None

    def __init__(self, vocabulary_size, seq_len,
                 num_sentiment_classes=2, num_topic_classes=20):
        """Build and compile the two-output model.

        Args:
            vocabulary_size: number of distinct token ids fed to the Embedding.
            seq_len: length of every input id sequence.
            num_sentiment_classes: width of the first softmax head (default 2).
            num_topic_classes: width of the second softmax head (default 20).
        """
        input_shape = Input((seq_len,))
        embedding_layer = Embedding(vocabulary_size, 128, input_length=seq_len)(input_shape)

        bilstm_layer = Bidirectional(LSTM(64))(embedding_layer)
        dropout_layer = Dropout(0.5)(bilstm_layer)
        dense_layer = Dense(64, activation='relu')(dropout_layer)

        out1 = Dense(num_sentiment_classes, activation='softmax')(dense_layer)
        out2 = Dense(num_topic_classes, activation='softmax')(dense_layer)

        self.model = Model(input_shape, [out1, out2])
        self.model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

        # Nothing has been trained yet. (The original bare `self.history`
        # expression assigned nothing.)
        self._history = None

    @property
    def history(self):
        """Object returned by the last ``fit`` call, or None before training.

        Exposed as a property so ``obj.history`` keeps working; the value is
        stored under ``_history`` so it no longer collides with this accessor.
        """
        return self._history

    @history.setter
    def history(self, value):
        # Keeps `obj.history = ...` assignments from existing code working.
        self._history = value

    def fit(self, x, y1, y2, mybatch_size=256, myepochs=5):
        """Train on inputs ``x`` with targets ``y1`` (first head) and ``y2`` (second head).

        The object returned by Keras is kept and available as ``self.history``.
        """
        self._history = self.model.fit(x, [y1, y2], batch_size=mybatch_size, epochs=myepochs)

    def prediction(self, x):
        """Return the model's raw predictions for ``x`` (one entry per output head)."""
        return self.model.predict(x)

    def test(self, x, y):
        """Predict on ``x`` and print a score against the reference ``y``.

        NOTE(review): ``mylossfunction`` is not defined in the visible part of
        this notebook; it must be provided elsewhere before calling this method.
        """
        y_pred = self.model.predict(x)
        y_test = np.array(y)

        print("Accuracy of testing: ", mylossfunction(y_pred, y_test))

    def save_model(self, name):
        """Save the Keras model under ``name + "-BiLSTM"``."""
        self.model.save(name + "-BiLSTM")

    def load_model(self, name):
        """Replace the current model with the one stored under ``name + "-BiLSTM"``."""
        self.model = keras.models.load_model(name + "-BiLSTM")

# TRAINING

### Define Training Dataset

In [23]:
# Model input and the two targets (one per output head).
X = preprocessed_text
y1 = sentiment_labels
y2 = topic_labels
# NOTE(review): the whole dataset is used for training; the split below is disabled.
#y = np.hstack((topic_labels,sentiment_labels))
# x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

In [24]:
# +1 leaves room for the extra id len(token2id) that preprocessing() assigns
# to padding and unknown words (presumably len(id2word) == len(token2id)).
vocabulary_size = len(id2word) + 1
seq_len = np.shape(X)[1] # Sequence length = number of columns of X
input_dim = vocabulary_size 
input_length = seq_len
# Training hyper-parameters passed to BiLstm.fit below.
mybatch_size = 256
myepochs = 5

### Training Model

In [25]:
# Build the two-headed model and train it on the full dataset.
myBiLSTM = BiLstm(input_dim, input_length)
myBiLSTM.fit(X,y1,y2,mybatch_size,myepochs)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


### Save model and parameters

In [26]:
# save_model appends "-BiLSTM" to this name; the load cell below uses the same name.
myBiLSTM.save_model("saved_model\\customer_preferences_model")  



INFO:tensorflow:Assets written to: saved_model\customer_preferences_model-BiLSTM\assets


INFO:tensorflow:Assets written to: saved_model\customer_preferences_model-BiLSTM\assets


In [27]:
# Save the preprocessing parameters needed at prediction time.
# ensure_ascii=False writes the Chinese tokens as-is instead of \u escapes.
saved_dict = {'max_length': max_length, 'token2id': token2id}
with codecs.open('saved_model\\token2id.json', 'w', encoding='utf-8') as f:
    json.dump(saved_dict, f, ensure_ascii=False)

# Prediction

In [30]:
# Reload the model saved above (load_model appends "-BiLSTM" to the name).
myBiLSTM.load_model("saved_model\\customer_preferences_model")

In [31]:
# Load the saved preprocessing parameters ('max_length' and 'token2id').
with codecs.open('saved_model\\token2id.json', 'r', 'utf-8') as data_file:
    parameters = json.load(data_file)

In [32]:
# Run one email (index 4) through the same preprocessing steps used for training,
# using the parameters loaded from the JSON file.
text = [contents[4]]
text = filter_chinese(text)
pre_data_words = sent_to_words(text)  
data = preprocessing(pre_data_words, parameters['max_length'], parameters['token2id']) 

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  2.13it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 1005.11it/s]


In [33]:
# Predict with the reloaded model; result[0] is indexed below as the sentiment
# output and result[1] as the topic output.
result = myBiLSTM.prediction(data)

In [34]:
print("Sentiment", result[0][0]) # Probabilities of 'neg', 'pos'
print("Topic", result[1][0]) # Probabilities of the 20 topics

Sentiment [0.6516181  0.34838185]
Topic [0.04384184 0.04987843 0.04468744 0.05048144 0.05874954 0.04817674
 0.05743834 0.05562602 0.05087141 0.04837254 0.05310418 0.0419022
 0.04885544 0.04682598 0.04890513 0.04816113 0.05538277 0.04923476
 0.04305987 0.0564447 ]


In [38]:
# Training labels of the same email (index 4) for comparison: y1 holds the
# one-hot sentiment labels and y2 the one-hot placeholder topic labels.
print("Sentiment", y1[4])
print("Topic", y2[4])

Sentiment [1. 0.]
Topic [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
