In [1]:
# Importing the libraries
import numpy as np
import pandas as pd

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/int3405-sentiment-analysis-problem/test.csv
/kaggle/input/int3405-sentiment-analysis-problem/full_train.csv


In [2]:
!wget "https://thiaisotajppub.s3-ap-northeast-1.amazonaws.com/publicfiles/baomoi.window2.vn.model.bin.gz"

--2022-12-13 09:41:48--  https://thiaisotajppub.s3-ap-northeast-1.amazonaws.com/publicfiles/baomoi.window2.vn.model.bin.gz
Resolving thiaisotajppub.s3-ap-northeast-1.amazonaws.com (thiaisotajppub.s3-ap-northeast-1.amazonaws.com)... 52.219.136.115, 52.219.197.118, 52.219.196.50, ...
Connecting to thiaisotajppub.s3-ap-northeast-1.amazonaws.com (thiaisotajppub.s3-ap-northeast-1.amazonaws.com)|52.219.136.115|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 494402382 (471M) [application/x-gzip]
Saving to: ‘baomoi.window2.vn.model.bin.gz’


2022-12-13 09:42:05 (29.2 MB/s) - ‘baomoi.window2.vn.model.bin.gz’ saved [494402382/494402382]



In [3]:
from gensim.models import KeyedVectors
from gensim import models

word2vec_path = '/kaggle/working/baomoi.window2.vn.model.bin.gz'
w2v_model = models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

In [4]:
vector = w2v_model['y']
vector.shape

(300,)

In [5]:
# Importing the dataset
trainData = pd.read_csv("/kaggle/input/int3405-sentiment-analysis-problem/full_train.csv")
testData = pd.read_csv("/kaggle/input/int3405-sentiment-analysis-problem/test.csv")
print("Train shape : ",trainData.shape)
print("Test shape : ",testData.shape)

Train shape :  (9073, 6)
Test shape :  (5103, 5)


In [6]:
trainData = trainData.dropna()
trainData

Unnamed: 0.1,Unnamed: 0,RevId,UserId,Comment,image_urls,Rating
0,0,3839333,10106093.0,"Xôi dẻo, đồ ăn đậm vị. Hộp xôi được lót lá trô...",['https://images.foody.vn/res/g97/966781/s800/...,1.0
1,1,2824877,786914.0,Gọi ship 1 xuất cari gà bánh naan và 3 miếng g...,['https://images.foody.vn/res/g69/688413/s800/...,0.0
2,2,9816702,22467889.0,"Thời tiết lạnh như này, cả nhà rủ nhau đến leg...",['https://images.foody.vn/res/g72/715078/s800/...,1.0
3,3,2684585,1889449.0,Em có đọc review thấy mng bảo trà sữa nướng đề...,['https://images.foody.vn/res/g90/895545/s800/...,0.0
4,4,2737987,8839942.0,"Đồ ăn rất ngon, nhà hàng cũng rất đẹp, tất cả ...",['https://images.foody.vn/res/g4/30186/s800/fo...,1.0
...,...,...,...,...,...,...
9068,9066,11236510,23044142.0,Thực sự mà nói thấy mọi người đánh giá nhiều q...,['https://images.foody.vn/res/g94/930433/s800/...,1.0
9069,9067,1831645,10117823.0,👉👉👉LẨU THÁI TÔMYUM\n👉👉Đc: 19 Lò Đúc-HBT-HN\n👉T...,['https://images.foody.vn/res/g16/150633/s800/...,1.0
9070,9068,2155495,11636069.0,Ngay từ lúc đầu tiên bước vào nhà hàng đã được...,['https://images.foody.vn/res/g74/736118/s800/...,1.0
9071,9069,7357032,12208284.0,Đặt ăn thử mà thấy ngón cá...! 🤩 sẽ còn ủn hộ ...,['https://images.foody.vn/res/g104/1036659/s80...,1.0


In [7]:
max_features = 500000
max_len = 30 
embed_size = 300

def clean_special_chars(text):
    punct = "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\×™√²—–&'
    
    punct_mapping = {
        "‘": "'", "´": "'", "°": "", "vnd": "đồng", "—": "-", "–": "-", "’": "'", "_": "-", "`": "'", '“': '"', 
        '”': '"', '“': '"', '•': '.', '−': '-'
    }
    
    for p in punct_mapping:
        text = text.replace(p, punct_mapping[p])
    
    for p in punct:
        text = text.replace(p, f' {p} ')
    
    return text

from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
def dataHandling(trainData, testData, maxfeatures, maxlen):
    Traindf, Valdf = train_test_split(trainData, test_size=0.1, random_state=2018)
    
    Train_y = Traindf['Rating'].values
    Val_y = Valdf['Rating'].values
    
    Train_X = Traindf['Comment'].astype(str).apply(lambda x: x.lower())
    Train_X = Train_X.apply(lambda x: clean_special_chars(x))
    
    Val_X = Valdf['Comment'].astype(str).apply(lambda x: x.lower())
    Val_X = Val_X.apply(lambda x: clean_special_chars(x))

    Test_X = testData['Comment'].astype(str).apply(lambda x: x.lower())
    Test_X = Test_X.apply(lambda x: clean_special_chars(x))

    tokenizer = Tokenizer(num_words = max_features)
    tokenizer.fit_on_texts(list(Train_X))
    Train_X = tokenizer.texts_to_sequences(Train_X)
    Val_X = tokenizer.texts_to_sequences(Val_X)
    Test_X = tokenizer.texts_to_sequences(Test_X)

    Train_X = pad_sequences(Train_X, maxlen=maxlen)
    Val_X = pad_sequences(Val_X, maxlen=maxlen)
    Test_X = pad_sequences(Test_X, maxlen=maxlen)

    return Train_X, Train_y, Test_X, Val_X, Val_y, tokenizer.word_index

Train_X, Train_y, Test_X, Val_X, Val_y, WordIndex = dataHandling(trainData, testData, max_features, max_len)

In [8]:
WordIndex

{'à': 1,
 'ăn': 2,
 'mình': 3,
 'l': 4,
 'v': 5,
 'có': 6,
 'quán': 7,
 'không': 8,
 'n': 9,
 'thì': 10,
 'm': 11,
 'ngon': 12,
 'cũng': 13,
 'nên': 14,
 'ở': 15,
 'rất': 16,
 'nhưng': 17,
 'o': 18,
 '1': 19,
 'y': 20,
 'đồ': 21,
 'đây': 22,
 'khá': 23,
 'ng': 24,
 'thấy': 25,
 'được': 26,
 'lại': 27,
 'h': 28,
 'bánh': 29,
 'vị': 30,
 'giá': 31,
 'nhiều': 32,
 'với': 33,
 'cho': 34,
 'như': 35,
 'gọi': 36,
 'cả': 37,
 'nh': 38,
 'món': 39,
 'sẽ': 40,
 'hơi': 41,
 'i': 42,
 'đi': 43,
 'ko': 44,
 'còn': 45,
 'các': 46,
 'nhân': 47,
 'nước': 48,
 'phải': 49,
 'đến': 50,
 '2': 51,
 'lần': 52,
 'quá': 53,
 'thịt': 54,
 'ra': 55,
 'viên': 56,
 'của': 57,
 'bạn': 58,
 'luôn': 59,
 'uống': 60,
 'lắm': 61,
 'để': 62,
 'k': 63,
 'vừa': 64,
 'vì': 65,
 'một': 66,
 'trong': 67,
 'g': 68,
 'thích': 69,
 'cái': 70,
 'người': 71,
 'tr': 72,
 'chỉ': 73,
 'hơn': 74,
 'nữa': 75,
 'về': 76,
 'rồi': 77,
 'thử': 78,
 'gian': 79,
 'ngọt': 80,
 'thêm': 81,
 'bị': 82,
 'mới': 83,
 'vụ': 84,
 'nói': 85,
 'hay

In [9]:
len(WordIndex)

13144

In [10]:
def embeddingsMatrix(word_index):
    embedding_matrix = np.zeros((len(word_index) + 1, 300))  
    for word, i in word_index.items():
        if i >= max_features: continue  #bỏ các ký tự có value lớn hơn max_features
        if word in w2v_model: 
            embedding_vector = w2v_model[word]
        else:
            embedding_vector = np.zeros(300,)

        embedding_matrix[i] = embedding_vector
    return embedding_matrix

matrix = embeddingsMatrix(WordIndex)

In [11]:
matrix.shape

(13145, 300)

In [12]:
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, Conv1D, GRU 
from keras.layers import Bidirectional, GlobalMaxPooling1D, Concatenate, SpatialDropout1D 
from keras.models import Model, Sequential 
from keras import initializers, regularizers, constraints, optimizers, layers 
from tensorflow import keras 
from numpy import argmax 
from tensorflow.keras.layers import BatchNormalization 
nmodel = Sequential() 
nmodel.add(layers.Embedding(len(WordIndex) + 1, embed_size, weights=[matrix], input_length=max_len)) 
nmodel.add(SpatialDropout1D(0.5)) 
nmodel.add(Bidirectional(LSTM(64, return_sequences=True))) 
nmodel.add(layers.GlobalMaxPool1D()) 
nmodel.add(layers.Dense(10)) 
nmodel.add(BatchNormalization()) 
nmodel.add(Activation('relu')) 
nmodel.add(Dropout(0.5)) 
nmodel.add(layers.Dense(1, activation='sigmoid')) 
nmodel.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy']) 
nmodel.summary()

2022-12-13 09:42:26.274681: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 30, 300)           3943500   
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 30, 300)           0         
_________________________________________________________________
bidirectional (Bidirectional (None, 30, 128)           186880    
_________________________________________________________________
global_max_pooling1d (Global (None, 128)               0         
_________________________________________________________________
dense (Dense)                (None, 10)                1290      
_________________________________________________________________
batch_normalization (BatchNo (None, 10)                40        
_________________________________________________________________
activation (Activation)      (None, 10)                0

In [13]:
numepochs = 10    
batchsize = 8
h2 = nmodel.fit(Train_X, Train_y, batch_size = batchsize, epochs = numepochs, validation_data=(Val_X, Val_y))

Epoch 1/10


2022-12-13 09:42:27.058097: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [14]:
testData

Unnamed: 0.1,Unnamed: 0,RevId,UserId,Comment,image_urls
0,0,781115,1326532,Trà táo 35k\nCookie socola 38k \nNước ở đây bì...,['https://images.foody.vn/res/g8/73091/s800/fo...
1,1,1219481,422306,Hôm rồi trung tâm mình tổ chức noel party ở đâ...,['https://images.foody.vn/res/g1/33/s800/foody...
2,2,1703765,9779143,Thịt gà của quán là nhất đấy. Đi ăn gọi liền 4...,['https://images.foody.vn/res/g66/659655/s800/...
3,3,4870346,12924388,Hai đứa ăn xong đau bụng cả ngày\nChân gà ok n...,['https://images.foody.vn/res/g78/772244/s800/...
4,4,2638711,1134279,Mình vừa thử trưa nay. Điểm cộng đầu tiên là b...,['https://images.foody.vn/res/g74/737874/s800/...
...,...,...,...,...,...
5098,5098,1025826,132889,"Bún riêu ở đây ngon mà, lúc nào cũng đông, thế...",['https://images.foody.vn/res/g10/92649/s800/f...
5099,5099,1278470,1691594,Quán ngồi thoải mái và cưc thích ❤ menu đa dạn...,['https://images.foody.vn/res/g25/247296/s800/...
5100,5100,2565212,9989615,"Quá thất vọng, chất lượng sản phẩm ngày càng đ...",['https://images.foody.vn/res/g89/888154/s800/...
5101,5101,3766155,17686862,"Giao hàng nhanh, miếng băm chả đều ngon ko bị...",['https://images.foody.vn/res/g13/128194/s800/...


In [15]:
import sklearn.metrics as metrics
def bestThresh(Val_y, pred_val_y, pr, minthresh, maxthresh): 
    best_threshhold = 0
    bestacc = 0
    for thresh in np.arange(minthresh, maxthresh, 0.01):
        thresh = np.round(thresh, 2)
        acc = metrics.accuracy_score(Val_y, (pred_val_y>thresh).astype(int))
        if pr: print("Accuracy score for threshold {0} is {1}".format(thresh,acc))
        if(acc > bestacc): 
            best_threshhold = thresh
            bestacc = acc
    print("")
    print("Highest accuracy score is {} and the threshold is {}".format(bestacc, best_threshhold))
    return best_threshhold, bestacc

In [16]:
pred_val = nmodel.predict([Val_X], batch_size= batchsize, verbose=1)
bestthresh, bestacc = bestThresh(Val_y, pred_val, True, 0.1, 0.9)
pred1 = (nmodel.predict([Test_X], batch_size= batchsize, verbose=1) > bestthresh).astype(int)

Accuracy score for threshold 0.1 is 0.8324145534729879
Accuracy score for threshold 0.11 is 0.836824696802646
Accuracy score for threshold 0.12 is 0.836824696802646
Accuracy score for threshold 0.13 is 0.8401323042998897
Accuracy score for threshold 0.14 is 0.8467475192943771
Accuracy score for threshold 0.15 is 0.8467475192943771
Accuracy score for threshold 0.16 is 0.8478500551267916
Accuracy score for threshold 0.17 is 0.8500551267916208
Accuracy score for threshold 0.18 is 0.8522601984564498
Accuracy score for threshold 0.19 is 0.8522601984564498
Accuracy score for threshold 0.2 is 0.856670341786108
Accuracy score for threshold 0.21 is 0.8610804851157663
Accuracy score for threshold 0.22 is 0.8654906284454245
Accuracy score for threshold 0.23 is 0.8654906284454245
Accuracy score for threshold 0.24 is 0.8665931642778391
Accuracy score for threshold 0.25 is 0.8676957001102535
Accuracy score for threshold 0.26 is 0.8676957001102535
Accuracy score for threshold 0.27 is 0.87100330760749

In [17]:
out_df1 = pd.DataFrame({"RevId":testData["RevId"].values})
out_df1['Rating'] = pred1
out_df1.to_csv("resultX.csv", index=False)
out_df1

Unnamed: 0,RevId,Rating
0,781115,1
1,1219481,1
2,1703765,1
3,4870346,1
4,2638711,1
...,...,...
5098,1025826,1
5099,1278470,1
5100,2565212,0
5101,3766155,1


In [18]:
count0 = 0
count1 = 0
for i in pred1:
    if i == 1: count1 += 1
    else: count0 += 1
print(count1)
print(count0)

4101
1002
