In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from scipy.sparse import hstack, csr_matrix, vstack

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score

from keras.preprocessing import sequence 

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Load data
train_df = pd.read_csv("./data/train.csv")
test_df = pd.read_csv("./data/test.csv")

In [3]:
train_df.head()

Unnamed: 0,id,comment,label
0,train_000000,Dung dc sp tot cam on \r\nshop Đóng gói sản ph...,0
1,train_000001,Chất lượng sản phẩm tuyệt vời . Son mịn nhưng...,0
2,train_000002,Chất lượng sản phẩm tuyệt vời nhưng k có hộp ...,0
3,train_000003,:(( Mình hơi thất vọng 1 chút vì mình đã kỳ vọ...,1
4,train_000004,Lần trước mình mua áo gió màu hồng rất ok mà đ...,1


In [4]:
train_df.shape

(16087, 3)

In [5]:
test_df.head()

Unnamed: 0,id,comment
0,test_000000,Chưa dùng thử nên chưa biết
1,test_000001,Không đáng tiềnVì ngay đợt sale nên mới mua n...
2,test_000002,Cám ơn shop. Đóng gói sản phẩm rất đẹp và chắc...
3,test_000003,Vải đẹp.phom oki luôn.quá ưng
4,test_000004,Chuẩn hàng đóng gói đẹp


In [6]:
test_df.shape

(10981, 2)

In [7]:
def count_nulls(df):
    null_counter = df.isnull().sum(axis=0)
    null_counter = null_counter[null_counter > 0]
    null_percent = df.isnull().sum(axis=0) / df.shape[0] * 100
    null_percent = null_percent[null_percent > 0]
    null_df = pd.concat([null_counter,null_percent],axis=1)
    null_df.columns = ['count','percent']
    display(null_df)

count_nulls(train_df)

Unnamed: 0,count,percent


In [39]:
vectorizer = TfidfVectorizer(max_features=100000,ngram_range=(1, 3))

In [40]:
x_train, x_val, y_train, y_val = train_test_split(train_df.comment, train_df.label, test_size=0.3,
    random_state=42)
x_train

2142      Chất lượng sản phẩm tuyệt vời \r\nRất đáng tiền.
3851                  Shop phục vụ rất kém giao thiếu hàng
5844                                          Thik lam\r\n
1082         Giao hàng kh đúng màu. Mặc dù đã chú thích.  
1671     Sản phẩm nhìn ko chất lượng giờ chạy ko đúng m...
8522                                  Giày như trong hình❤
10546                    Sản phẩm ok lắm. Nằm thấy dễ chịu
5145      Chất lượng sản phẩm tuyệt vời Đóng gói sản ph...
15380    Vỏ chặn bị dính bẩn như qua sd rôigiặt ko sạch...
1503          Võ ngoài bị lõm chắc do quá trình vận chuyển
1971                          Kim bị rớt... chất lượng kém
7372                                      "Thấy ok rồi đó"
3798     Shop giao hàng ko đúng 10 bộ size 6 mà có 4 bộ...
5508                         Loa dùng đc.giá phù hơp...
694       Chất lượng sản phẩm tuyệt vời không thích ngư...
12127           Hàng đẹp đóng cẩn thận nhưng giao hàng lâu
3855                      Trả tiền 2 mà mới nhận đc

In [41]:
vectorizer.fit(x_train)
x_tfidf_train = vectorizer.transform(x_train)
x_tfidf_val = vectorizer.transform(x_val)

In [42]:
x_tfidf_train.shape

(11260, 100000)

In [43]:
from sklearn.linear_model import LogisticRegression,RidgeClassifier
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor, RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import accuracy_score, f1_score

# model = GradientBoostingClassifier(
#                                            n_estimators=1000,
# #                                            learning_rate= 0.002,
#                                            max_depth = 7,
#                                            min_samples_leaf=10,
#                                            verbose =1)

model = LogisticRegression()

print("Train model.......")
sentiment_fit = model.fit(x_tfidf_train, y_train)
print("Predict ......")
y_pred = sentiment_fit.predict(x_tfidf_val)

threshold = 0.5
y_pred = (y_pred > threshold).astype(np.uint8)

accuracy = accuracy_score(y_val, y_pred)
print("accuracy score: {0:.2f}%".format(accuracy*100))

f1_score = f1_score(y_val, y_pred)
print("F1 score: {0:.2f}".format(f1_score))


Train model.......




Predict ......
accuracy score: 89.02%
F1 score: 0.88


In [44]:
from keras.preprocessing import sequence 
from keras.preprocessing.text import Tokenizer

### Create sequence
vocabulary_size = 20000
tokenizer = Tokenizer(num_words= vocabulary_size)
tokenizer.fit_on_texts(x_train)

max_review_length = 500
sequences = tokenizer.texts_to_sequences(x_train)
X_train = sequence.pad_sequences(sequences, max_review_length)

sequences = tokenizer.texts_to_sequences(x_val)
X_test = sequence.pad_sequences(sequences, max_review_length)

# X_train = x_tfidf_train
# X_test = x_tfidf_val

In [15]:
X_test

array([[   0,    0,    0, ...,  184,   52,   69],
       [   0,    0,    0, ...,   11,   30,   26],
       [   0,    0,    0, ...,    5,  832, 2178],
       ...,
       [   0,    0,    0, ...,    0,    0,  117],
       [   0,    0,    0, ..., 1126,   68,  201],
       [   0,    0,    0, ...,    9,   10,  257]])

In [52]:
# Build the model 
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense,Flatten

top_words = 100000 
embedding_vector_length = 32

model = Sequential() 
model.add(Embedding(top_words, embedding_vector_length, input_length=max_review_length)) 
model.add(LSTM(50))
# model.add(Flatten()) 
model.add(Dense(1, activation='sigmoid')) 
model.compile(loss='binary_crossentropy',optimizer='adam', metrics=['accuracy']) 
print(model.summary()) 

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 500, 32)           3200000   
_________________________________________________________________
lstm_3 (LSTM)                (None, 50)                16600     
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 51        
Total params: 3,216,651
Trainable params: 3,216,651
Non-trainable params: 0
_________________________________________________________________
None


In [53]:
#Train the model
model.fit(X_train, y_train, validation_data=(X_test, y_val), nb_epoch=5, batch_size=64) 

  


Train on 11260 samples, validate on 4827 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x2a5aec20908>

In [54]:
y_pred = model.predict(X_test)
y_pred

array([[2.1576427e-03],
       [9.7957075e-01],
       [5.2393969e-02],
       ...,
       [1.4999948e-01],
       [9.2515409e-01],
       [1.9849870e-04]], dtype=float32)

In [149]:
y_pred

array([[3.9474987e-03],
       [9.7973776e-01],
       [5.0319717e-03],
       ...,
       [1.3824444e-01],
       [8.8915288e-01],
       [4.1093484e-05]], dtype=float32)

In [55]:
from sklearn.metrics import accuracy_score, f1_score

threshold = 0.5
y_pred = (y_pred > threshold).astype(np.uint8)

accuracy = accuracy_score(y_val, y_pred)
print("accuracy score: {0:.2f}%".format(accuracy*100))

f1_score = f1_score(y_val, y_pred)
print("F1 score: {0:.2f}%".format(f1_score))

accuracy score: 88.54%
F1 score: 0.86%


In [146]:
from sklearn.metrics import accuracy_score, f1_score

y_pred = y_pred.astype(int)
f1 = f1_score(y_val, y_pred)
print("F1 score: {0:.2f}%".format(f1))

F1 score: 0.87%
