### Original Fasttext

In [1]:
import numpy as np
import pandas as pd
import re
import os
import fasttext
from sklearn.metrics import roc_auc_score

In [2]:
# Modelling
model = fasttext.train_supervised('train.ft.txt',label_prefix='__label__', thread=4, epoch = 10)
print(model.labels, 'are the labels or targets the model is predicting')

['__label__1', '__label__2'] are the labels or targets the model is predicting


In [3]:
# Load the test data
test_data = open('test.ft.txt')
test_data=test_data.readlines()
print(len(test_data), 'number of records in the test set') 

400000 number of records in the test set


In [4]:
#review the test data
test_data[:2]

['__label__2 Great CD: My lovely Pat has one of the GREAT voices of her generation. I have listened to this CD for YEARS and I still LOVE IT. When I\'m in a good mood it makes me feel better. A bad mood just evaporates like sugar in the rain. This CD just oozes LIFE. Vocals are jusat STUUNNING and lyrics just kill. One of life\'s hidden gems. This is a desert isle CD in my book. Why she never made it big is just beyond me. Everytime I play this, no matter black, white, young, old, male, female EVERYBODY says one thing "Who was that singing ?"\n',
 "__label__2 One of the best game music soundtracks - for a game I didn't really play: Despite the fact that I have only played a small portion of the game, the music I heard (plus the connection to Chrono Trigger which was great as well) led me to purchase the soundtrack, and it remains one of my favorite albums. There is an incredible mix of fun, epic, and emotional songs. Those sad and beautiful tracks I especially like, as there's not too 

In [5]:
# Remove the __label__1 and __label__2 from the testset 
test_reviews = [w.replace('__label__2 ', '').replace('__label__1 ', '').replace('\n', '') for w in test_data]

In [8]:
# Use the predict function 
pred = model.predict(test_reviews)

labels = [0 if x.split(' ')[0] == '__label__1' else 1 for x in test_data]
pred_labels = [0 if x == ['__label__1'] else 1 for x in pred[0]]

# run the accuracy measure. 
print(roc_auc_score(labels, pred_labels))

0.9172450000000001


### Classical ML algorithm 
Intro: Here below I use XGBOOST as classical ML algorithm.
TFIDF + XGBoost, and CountVectorizer + XGBoost

##### Split dataset
NOTE: When running FastText with the whole dataset, the code can be ran successfully and fast, but when I use classical ML-algorithm, there were always have memory error. So, firstly I only use 5000 data in train and 500 in test to do the main work.

In [4]:
#reviews and labels extract
def get_review(train,test):
    X_train = [w.replace('__label__2 ', '').replace('__label__1 ', '').replace('\n', '') for w in train]
    X_test = [w.replace('__label__2 ', '').replace('__label__1 ', '').replace('\n', '') for w in test]
    y_train = []
    y_test = []
    for i in range(0,len(train)):
        temp = train[i][9]
        y_train.append(temp)
    for i in range(0,len(test)):
        temp = test[i][9]
        y_test.append(temp)
    
    return X_train,X_test,y_train,y_test

In [5]:
# Load the train data
train_data = open('train.ft.txt')
train_data=train_data.readlines()
print(len(train_data), 'number of records in the test set') 

3600000 number of records in the test set


In [31]:
# small data set which can be ran succesefully in my computer
train = train_data[:5000]
test = test_data[:500]

In [32]:
#split reviews and labels
Xtrain,Xtest,ytrain,ytest = get_review(train,test)

In [6]:
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import accuracy_score, f1_score, classification_report
from nltk import word_tokenize
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

##### TfidfVectorizer+XGBoost

In [7]:
# set up TfidfVectorizer
tfv = TfidfVectorizer(min_df=3,  max_features=None, 
            strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3), use_idf=1,smooth_idf=1,sublinear_tf=1,
            stop_words = 'english')

In [21]:
# Fitting TF-IDF to both training and test sets
tfv.fit(Xtrain+Xtest)
Xtrain_tfv =  tfv.transform(Xtrain) 
Xtest_tfv = tfv.transform(Xtest)

Xtrain_tfv_vect = pd.concat([pd.DataFrame(Xtrain_tfv.toarray())], axis=1)
Xtest_tfv_vect = pd.concat([pd.DataFrame(Xtest_tfv.toarray())], axis=1)

In [8]:
# set up XGBoost classifier
clf_xgb = xgb.XGBClassifier(max_depth=3, colsample_bytree=0.8,
                        subsample=0.8, nthread=10, learning_rate=0.5)

In [24]:
# Fitting xgboost on Tfidf
clf_xgb_tfvmodel = clf_xgb.fit(Xtrain_tfv_vect, ytrain)
y_pred_tfv = clf_xgb_tfvmodel.predict(Xtest_tfv_vect)
score_xgb_tfv = accuracy_score(ytest, y_pred_tfv)

In [25]:
score_xgb_tfv

0.804

In [28]:
print(classification_report(ytest,y_pred_tfv))

              precision    recall  f1-score   support

           1       0.80      0.80      0.80       241
           2       0.81      0.81      0.81       259

    accuracy                           0.80       500
   macro avg       0.80      0.80      0.80       500
weighted avg       0.80      0.80      0.80       500



##### CountVectorizer+XGBoost

In [9]:
# set up CountVectorizer
ctv = CountVectorizer(analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3), stop_words = 'english')

In [30]:
# Fitting Count Vectorizer to both training and test sets (semi-supervised learning)
ctv.fit(Xtrain+Xtest)
Xtrain_ctv =  ctv.transform(Xtrain) 
Xvalid_ctv = ctv.transform(Xtest)

In [35]:
clf_xgb_ctvmodel = clf_xgb.fit(Xtrain_ctv, ytrain)
y_pred_ctv = clf_xgb_ctvmodel.predict(Xvalid_ctv)
score_ctv_xgb = accuracy_score(ytest, y_pred_ctv)

In [36]:
score_ctv_xgb

0.822

In [37]:
print(classification_report(ytest,y_pred_ctv))

              precision    recall  f1-score   support

           1       0.81      0.82      0.82       241
           2       0.83      0.83      0.83       259

    accuracy                           0.82       500
   macro avg       0.82      0.82      0.82       500
weighted avg       0.82      0.82      0.82       500



#### Increase train data from 5000 to 500000, apply CountVectorizer+XGBoost

In [10]:
#Lager data
train_larger = train_data[:500000]
test_larger = test_data[:50000]

In [11]:
#split reviews and labels
Xtrain_l,Xtest_l,ytrain_l,ytest_l = get_review(train_larger,test_larger)

In [12]:
# Fitting Count Vectorizer to both training and test sets (semi-supervised learning)
ctv.fit(Xtrain_l+Xtest_l)
Xtrain_ctv_l =  ctv.transform(Xtrain_l) 
Xvalid_ctv_l = ctv.transform(Xtest_l)

In [13]:
clf_ctvmodel_l = clf_xgb.fit(Xtrain_ctv_l, ytrain_l)
y_pred_ctv_l = clf_ctvmodel_l.predict(Xvalid_ctv_l)
score_ctv_l = accuracy_score(ytest_l, y_pred_ctv_l)

In [14]:
score_ctv_l

0.8462

In [15]:
print(classification_report(ytest_l,y_pred_ctv_l))

              precision    recall  f1-score   support

           1       0.86      0.82      0.84     24626
           2       0.83      0.87      0.85     25374

    accuracy                           0.85     50000
   macro avg       0.85      0.85      0.85     50000
weighted avg       0.85      0.85      0.85     50000



#### Conclusions:
1. The technical reason why I have memory error: I didn't use readline(), instead, I used readlines() when I read txt file. Because readline() method will return a line from the file when called, and readlines() method will return all the lines in a file in the format of a list where each element is a line in the file, which means use readlines means more CPU and RAM taking.
2. As the final result, I found that in this case CountVectorizer is better than TFIDF in preprocessing, and FastTest is more efficient in checking and predicting Amazon reviews.
3. Increase the number of data can improve accuracy.

##### Fully convelutional network
Inspired by https://www.kaggle.com/kevinautin/fully-convolutional-accuracy-94-4-15-min , which can run the whole data set within 1.5 hours and no need to worry about run out of ROM.

In [9]:
import re
from tqdm import tqdm
from sklearn.utils import shuffle
import numpy as np
from tqdm import tqdm
import bz2
from keras.layers import *
from keras.models import Model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [252]:
def splitReviewsLabels(lines):
    reviews = []
    labels = []
    for review in tqdm(lines):
        rev = reviewToX(review)
        label = reviewToY(review)
        reviews.append(rev[:512])
        labels.append(label)
    return reviews, labels

In [253]:
def reviewToY(review):
    return [1,0] if review.split(' ')[0] == '__label__1' else [0,1] 

In [254]:
def reviewToX(review):
    review = review.split(' ', 1)[1][:-1].lower()
    review = re.sub('\d','0',review)
    if 'www.' in review or 'http:' in review or 'https:' in review or '.com' in review:
        review = re.sub(r"([^ ]+(?<=\.[a-z]{3}))", "<url>", review)
    return review

In [257]:
# Load from the file
reviews_train, y_train = splitReviewsLabels(train_data)
reviews_test, y_test = splitReviewsLabels(test_data)

100%|██████████| 3600000/3600000 [01:27<00:00, 41233.34it/s]
100%|██████████| 400000/400000 [00:12<00:00, 31854.66it/s]


In [272]:
reviews_train, y_train = shuffle(reviews_train, y_train)
reviews_test, y_test = shuffle(reviews_test, y_test)

In [273]:
#data preparation
y_train = np.array(y_train)
y_test = np.array(y_test)

In [275]:
#set up padding features
max_features = 8192
maxlen = 128
embed_size = 64

In [276]:
tokenizer = Tokenizer(num_words=max_features)

In [277]:
tokenizer.fit_on_texts(reviews_train)
token_train = tokenizer.texts_to_sequences(reviews_train)
token_test = tokenizer.texts_to_sequences(reviews_test)

In [278]:
x_train = pad_sequences(token_train, maxlen=maxlen, padding='post')
x_test = pad_sequences(token_test, maxlen=maxlen, padding='post')

In [279]:
input = Input(shape=(maxlen,))
net = Embedding(max_features, embed_size)(input)
net = Dropout(0.5)(net)
net = BatchNormalization()(net)

net = Conv1D(32, 7, padding='same', activation='relu')(net)
net = BatchNormalization()(net)
net = Conv1D(32, 3, padding='same', activation='relu')(net)
net = BatchNormalization()(net)
net = Conv1D(32, 3, padding='same', activation='relu')(net)
net = BatchNormalization()(net)
net = Conv1D(32, 3, padding='same', activation='relu')(net)
net1 = BatchNormalization()(net)

net = Conv1D(2, 1)(net)
net = GlobalAveragePooling1D()(net)
output = Activation('softmax')(net)
model = Model(inputs = input, outputs = output)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
model.summary()

Model: "model_52"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_51 (InputLayer)        (None, 128)               0         
_________________________________________________________________
embedding_55 (Embedding)     (None, 128, 64)           524288    
_________________________________________________________________
dropout_52 (Dropout)         (None, 128, 64)           0         
_________________________________________________________________
batch_normalization_178 (Bat (None, 128, 64)           256       
_________________________________________________________________
conv1d_174 (Conv1D)          (None, 128, 32)           14368     
_________________________________________________________________
batch_normalization_179 (Bat (None, 128, 32)           128       
_________________________________________________________________
conv1d_175 (Conv1D)          (None, 128, 32)           310

In [281]:
model.fit(x_train, y_train, batch_size=2048, epochs=2, validation_split=0.5)

Train on 1800000 samples, validate on 1800000 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.callbacks.History at 0x1bd2f994d0>

In [282]:
model.evaluate (x_test, y_test)



[0.17004760593764484, 0.9375324845314026]

#### Conclusions:
1. NN network is more user-frinedly in practice.
2. When I tune some parametors such as optimizer,activation function,batch size etc. I found that there're only slight differences. Maybe I didn't try enough as re-running the code takes too much time.
3. In different environments have different results. In Anaconda I got 93.7 acc and spent 1.4h for running, while in Kaggle, got 94 acc only need a half-hour