In [2]:
import os
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import SelectKBest, f_classif
from TryAroundModels import *

Using TensorFlow backend.


In [3]:
def shuffle(X, y):
    """Shuffle two parallel arrays with one shared random permutation.

    Parameters
    ----------
    X, y : indexable by an integer array (e.g. ``np.ndarray``)
        Samples and their labels. Both must have the same length so that
        every sample keeps its own label after shuffling.

    Returns
    -------
    tuple
        ``(X[perm], y[perm])``, where ``perm`` is drawn from NumPy's global
        random state via ``np.random.permutation(len(X))``.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` differ in length.
    """
    if len(X) != len(y):
        # Without this check a longer `y` would be silently truncated by the
        # len(X)-sized permutation instead of failing.
        raise ValueError(
            "X and y must have the same length, got %d and %d" % (len(X), len(y))
        )
    perm = np.random.permutation(len(X))
    return X[perm], y[perm]

def load_imdb_dataset(path, encoding='utf-8'):
    """Load review texts and sentiment labels from a directory tree.

    The function reads every ``*.txt`` file found in
    ``<path>/{train,test}/{pos,neg}/`` (file names visited in sorted order).
    Files under ``pos`` get label 1, files under ``neg`` get label 0.

    Parameters
    ----------
    path : str
        Root directory that contains the ``train`` and ``test`` folders.
    encoding : str, optional
        Text encoding used to decode the review files. Passed explicitly so
        that decoding does not depend on the platform's locale default.

    Returns
    -------
    tuple of np.ndarray
        ``(train_texts, train_labels, test_texts, test_labels)``; each split
        is shuffled with ``shuffle`` (train first, then test).
    """
    texts = {'train': [], 'test': []}
    labels = {'train': [], 'test': []}

    # Load the dataset
    for dset in ('train', 'test'):
        for cat in ('pos', 'neg'):
            dset_path = os.path.join(path, dset, cat)
            label = 0 if cat == 'neg' else 1
            for fname in sorted(os.listdir(dset_path)):
                if not fname.endswith('.txt'):
                    continue
                with open(os.path.join(dset_path, fname), encoding=encoding) as f:
                    texts[dset].append(f.read())
                labels[dset].append(label)

    # Convert to np.array and shuffle each split; texts and labels share one
    # permutation so they stay aligned.
    train_texts, train_labels = shuffle(np.array(texts['train']), np.array(labels['train']))
    test_texts, test_labels = shuffle(np.array(texts['test']), np.array(labels['test']))

    # Return the dataset
    return train_texts, train_labels, test_texts, test_labels

In [4]:
# Read the train/test reviews and labels from the relative "data/" directory
# (expects data/{train,test}/{pos,neg}/*.txt, as walked by load_imdb_dataset).
train_texts, train_labels, test_texts, test_labels = load_imdb_dataset("data/")

In [5]:
NGRAM_RANGE = (1, 2)   # extract unigrams and bigrams
TOP_K = 20000          # upper bound on the number of features kept
TOKEN_MODE = 'word'    # tokenize into word n-grams (passed as `analyzer`)
MIN_DOC_FREQ = 2       # passed as `min_df`: drop terms seen in fewer documents


# No 'dtype' entry on purpose: TF-IDF weights are floating point, so the
# integer dtype that used to be passed here was not honoured by scikit-learn
# (recent versions warn and fall back to float64). The selected features are
# cast to float32 explicitly below.
kwargs = {
    'ngram_range' : NGRAM_RANGE,
    'strip_accents' : 'unicode',
    'decode_error' : 'replace',
    'analyzer' : TOKEN_MODE,
    'min_df' : MIN_DOC_FREQ,
}

# Learn the vocabulary from the train texts only, then vectorize both the
# train and the test sets with it.
tfidf_vectorizer = TfidfVectorizer(**kwargs)
X_tfidf_train = tfidf_vectorizer.fit_transform(train_texts)
X_tfidf_test = tfidf_vectorizer.transform(test_texts)

# Keep the k features with the highest ANOVA F-score against the train labels
# (k is capped by the vocabulary size); the selector is fit on train data only.
selector = SelectKBest(f_classif, k=min(TOP_K, X_tfidf_train.shape[1]))
selector.fit(X_tfidf_train, train_labels)
X_selected_tfidf_train = selector.transform(X_tfidf_train).astype('float32')
X_selected_tfidf_test = selector.transform(X_tfidf_test).astype('float32')



In [7]:
def TryAroundModel(X_train, X_test, Y_train, Y_test, X_raw_text_train = None, X_raw_text_test = None, Models = None):
    """Run a set of ``TryAroundModel_*`` functions and rank their results.

    Parameters
    ----------
    X_train, X_test
        Vectorized features handed to the feature-based models.
    Y_train, Y_test
        Labels handed to every model.
    X_raw_text_train, X_raw_text_test : optional
        Raw texts handed to the raw-text models (CNN / LSTM / FB_LSTM).
        Those models are skipped unless both are given.
    Models : list of str, optional
        Names of the model functions to run. When omitted, every global whose
        name starts with ``"TryAroundModel"`` is considered. Names that are
        not one of the known model functions are ignored.

    Returns
    -------
    list
        The values returned by the model functions, sorted in descending
        order of their element at index 1.

    Raises
    ------
    NameError
        If a known model name is requested but no such function is defined
        in the global namespace.
    """
    # Models that consume the vectorized feature matrices.
    feature_models = (
        "TryAroundModel_LG",
        "TryAroundModel_NB",
        "TryAroundModel_NBSVM",
        "TryAroundModel_RF",
        "TryAroundModel_GBM",
        "TryAroundModel_MPLNN",
    )
    # Models that consume the raw texts instead.
    raw_text_models = (
        "TryAroundModel_CNN",
        "TryAroundModel_LSTM",
        "TryAroundModel_FB_LSTM",
    )

    namespace = globals()
    if Models is None:
        # Plain prefix scan over the global names, in definition order. This
        # needs neither `re` nor a numpy round-trip over match objects.
        Models = [name for name in list(namespace) if name.startswith("TryAroundModel")]

    processed_arg = [X_train, X_test, Y_train, Y_test]
    raw_arg = [X_raw_text_train, X_raw_text_test, Y_train, Y_test]
    has_raw_text = X_raw_text_train is not None and X_raw_text_test is not None

    accuracy_list = []
    for m in Models:
        if m in feature_models:
            model_args = processed_arg
        elif has_raw_text and m in raw_text_models:
            model_args = raw_arg
        else:
            # Unknown name (including this function itself), or a raw-text
            # model requested without raw texts: skip silently.
            continue

        try:
            model_fn = namespace[m]
        except KeyError:
            # Same exception type a direct call to an undefined name raises.
            raise NameError("name '%s' is not defined" % m)
        accuracy_list.append(model_fn(*model_args))

    return sorted(accuracy_list, key = lambda x: x[1], reverse = True)


# Alternative call that also passes the raw texts, which enables the
# raw-text models (CNN / LSTM / FB_LSTM) inside TryAroundModel:
# accuracy_list = TryAroundModel(X_selected_tfidf_train, X_selected_tfidf_test, train_labels,
#                                test_labels, train_texts, test_texts)
# Active call: no raw texts are passed, so only the models that take the
# selected TF-IDF features are evaluated.
accuracy_list = TryAroundModel(X_selected_tfidf_train, X_selected_tfidf_test, train_labels,
                               test_labels)
accuracy_list

Logistic Regression -- Accuracy:  0.88356
Multinomial Naive Bayes -- Accuracy:  0.859
Random Forest -- Accuracy:  0.81924
Gradient Boosting Machine -- Accuracy:  0.70616
Naive Bayes SVM -- Accuracy:  0.86632
Multilayer Perceptron Neural Network(MLP) -- Accuracy:  0.90284


[('Multilayer Perceptron Neural Network(MLP)', 0.90284),
 ('Logistic Regression', 0.88356),
 ('Naive Bayes SVM', 0.86632),
 ('Multinomial Naive Bayes', 0.859),
 ('Random Forest', 0.81924),
 ('Gradient Boosting Machine', 0.70616)]

In [50]:
# Show the ranked results of TryAroundModel as a table.
# NOTE(review): `pd` is not imported in the visible import cell; presumably it
# leaks in through `from TryAroundModels import *` — confirm, or import pandas explicitly.
pd.DataFrame(accuracy_list)

Unnamed: 0,0,1
0,Multilayer Perceptron Neural Network(MLP),0.89868
1,Logistic Regression,0.88356
2,Convolutional Neural Network,0.87708
3,Naive Bayes SVM,0.86632
4,Multinomial Naive Bayes,0.859
5,LSTM Neural Network,0.84944
6,Random Forest,0.82364
7,Forward and Backward LSTM Neural Netword,0.75512
8,Gradient Boosting Machine,0.70032


In [17]:
# Count-based vectorization of the train texts with CountVectorizer's default
# settings; the vocabulary is learned from the train texts themselves.
count_vec = CountVectorizer()
train_vec = count_vec.fit_transform(train_texts)

[[478,
  240,
  226,
  119,
  624,
  161,
  176,
  156,
  228,
  234,
  155,
  92,
  216,
  121,
  108,
  116,
  301,
  161,
  27,
  159,
  442,
  392,
  240,
  33,
  43,
  46,
  459,
  185,
  46,
  787,
  114,
  216,
  83,
  214,
  141,
  415,
  192,
  303,
  99,
  153,
  128,
  208,
  469,
  104,
  113,
  170,
  356,
  255,
  385,
  279,
  110,
  56,
  220,
  152,
  164,
  179,
  142,
  133,
  154,
  128,
  785,
  74,
  813,
  335,
  113,
  155,
  244,
  165,
  241,
  272,
  457,
  105,
  228,
  338,
  131,
  138,
  147,
  77,
  115,
  111,
  143,
  111,
  201,
  35,
  120,
  58,
  117,
  256,
  45,
  117,
  183,
  107,
  189,
  180,
  259,
  119,
  152,
  59,
  219,
  217,
  206,
  954,
  251,
  258,
  394,
  142,
  321,
  124,
  287,
  274,
  315,
  105,
  236,
  148,
  543,
  291,
  186,
  935,
  245,
  117,
  266,
  105,
  148,
  128,
  108,
  113,
  103,
  224,
  98,
  243,
  190,
  56,
  124,
  59,
  465,
  160,
  255,
  329,
  426,
  425,
  62,
  153,
  105,
  499,
  108,
  50

In [36]:
# Column sums of the count matrix: one total per vocabulary term, taken over all train texts.
np.sum(train_vec, axis = 0)

matrix([[ 93, 300,   1, ...,   1,   4,   1]], dtype=int64)

In [38]:
# Number of whitespace-separated tokens in the first (post-shuffle) train text.
len(train_texts[0].split())

500

In [32]:
# Shape of the row sums of the count matrix: one total per train text.
np.sum(train_vec, axis = 1).shape

(25000, 1)