In [2]:
import os
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import SelectKBest, f_classif
from nltk.corpus import stopwords
from TryAroundModels import *

Using TensorFlow backend.


In [3]:
def shuffle(X, y):
    perm = np.random.permutation(len(X))
    X = X[perm]
    y = y[perm]
    return X, y

def load_imdb_dataset(path):
    imdb_path = os.path.join(path, '')

    # Load the dataset
    train_texts = []
    train_labels = []
    test_texts = []
    test_labels = []
    for dset in ['train', 'test']:
        for cat in ['pos', 'neg']:
            dset_path = os.path.join(imdb_path, dset, cat)
            for fname in sorted(os.listdir(dset_path)):
                if fname.endswith('.txt'):
                    with open(os.path.join(dset_path, fname)) as f:
                        if dset == 'train': train_texts.append(f.read())
                        else: test_texts.append(f.read())
                    label = 0 if cat == 'neg' else 1
                    if dset == 'train': train_labels.append(label)
                    else: test_labels.append(label)

    # Converting to np.array
    train_texts = np.array(train_texts)
    train_labels = np.array(train_labels)
    test_texts = np.array(test_texts)
    test_labels = np.array(test_labels)

    # Shuffle the dataset
    train_texts, train_labels = shuffle(train_texts, train_labels)
    test_texts, test_labels = shuffle(test_texts, test_labels)

    # Return the dataset
    return train_texts, train_labels, test_texts, test_labels

In [4]:
train_texts, train_labels, test_texts, test_labels = load_imdb_dataset("data/")

In [13]:
NGRAM_RANGE = (1, 2)
TOP_K = 20000
TOKEN_MODE = 'word'
MIN_DOC_FREQ = 2


kwargs = {
    'ngram_range' : NGRAM_RANGE,
    'dtype' : 'int32',
    'strip_accents' : 'unicode',
    'decode_error' : 'replace',
    'analyzer' : TOKEN_MODE,
    'min_df' : MIN_DOC_FREQ,
}

# Learn Vocab from train texts and vectorize train and val sets
tfidf_vectorizer = TfidfVectorizer(**kwargs)
X_tfidf_train = tfidf_vectorizer.fit_transform(train_texts)
X_tfidf_test = tfidf_vectorizer.transform(test_texts)

selector = SelectKBest(f_classif, k=min(TOP_K, X_tfidf_train.shape[1]))
selector.fit(X_tfidf_train, train_labels)
X_selected_tfidf_train = selector.transform(X_tfidf_train).astype('float32')
X_selected_tfidf_test = selector.transform(X_tfidf_test).astype('float32')



In [19]:
def TryAroundModel(X_train, X_test, Y_train, Y_test, X_raw_text_train = None, X_raw_text_test = None, Models = None):
    if Models is None:
        Models = []
        for i in np.nonzero([re.match("TryAroundModel", x) for x in globals().keys()])[0]:
            Models.append(list(globals().keys())[i])
    
    accuracy_list = []
    processed_arg = [X_train, X_test, Y_train, Y_test]
    
    for m in Models:
        if m == "TryAroundModel_LG":
            accuracy_list.append(TryAroundModel_LG(*processed_arg))
        elif m == "TryAroundModel_NB":
            accuracy_list.append(TryAroundModel_NB(*processed_arg))
        elif m == "TryAroundModel_NBSVM":
            accuracy_list.append(TryAroundModel_NBSVM(*processed_arg))
        elif m == "TryAroundModel_RF":
            accuracy_list.append(TryAroundModel_RF(*processed_arg))
        elif m == "TryAroundModel_GBM":
            accuracy_list.append(TryAroundModel_GBM(*processed_arg))
        elif m == "TryAroundModel_MPLNN":
            accuracy_list.append(TryAroundModel_MPLNN(*processed_arg))

        if X_raw_text_train is not None and X_raw_text_test is not None:

            raw_arg = [X_raw_text_train, X_raw_text_test, Y_train, Y_test]
            if m == "TryAroundModel_CNN":
                accuracy_list.append(TryAroundModel_CNN(*raw_arg))
            elif m == "TryAroundModel_LSTM":
                accuracy_list.append(TryAroundModel_LSTM(*raw_arg))
            elif m == "TryAroundModel_FB_LSTM":
                accuracy_list.append(TryAroundModel_FB_LSTM(*raw_arg))
        
    return sorted(accuracy_list, key = lambda x: x[1], reverse = True)


# accuracy_list = TryAroundModel(X_selected_tfidf_train, X_selected_tfidf_test, train_labels,
#                                test_labels, train_texts, test_texts)
accuracy_list = TryAroundModel(X_selected_tfidf_train, X_selected_tfidf_test, train_labels,
                               test_labels, Models=["TryAroundModel_LG"])
accuracy_list = TryAroundModel(train_vec, test_vec, train_labels,
                               test_labels, Models=["TryAroundModel_LG"])
accuracy_list

Logistic Regression -- Accuracy:  0.88356
Logistic Regression -- Accuracy:  0.86304




[('Logistic Regression', 0.86304)]

In [50]:
pd.DataFrame(accuracy_list)

Unnamed: 0,0,1
0,Multilayer Perceptron Neural Network(MLP),0.89868
1,Logistic Regression,0.88356
2,Convolutional Neural Network,0.87708
3,Naive Bayes SVM,0.86632
4,Multinomial Naive Bayes,0.859
5,LSTM Neural Network,0.84944
6,Random Forest,0.82364
7,Forward and Backward LSTM Neural Netword,0.75512
8,Gradient Boosting Machine,0.70032


In [5]:
sw = stopwords.words("english")
sw+=["would", "think", "might", "since", "else", "mr", "usually"]

In [18]:
def no_number_preprocessor(tokens):
    r = re.sub('(\d)+', 'NUM', tokens.lower())
    return r


count_vec = CountVectorizer(stop_words=sw, preprocessor=no_number_preprocessor)
train_vec = count_vec.fit_transform(train_texts)
test_vec = count_vec.transform(test_texts)

In [7]:
vocal_count_dict = dict(zip(count_vec.get_feature_names(), np.sum(train_vec, axis = 0).tolist()[0]))

In [8]:
dict(sorted(vocal_count_dict.items(), key = lambda x: x[1], reverse = True))

{'br': 101871,
 'movie': 44047,
 'film': 40159,
 'NUM': 35049,
 'one': 26795,
 'like': 20281,
 'good': 15147,
 'time': 12727,
 'even': 12655,
 'story': 11988,
 'really': 11738,
 'see': 11479,
 'well': 10667,
 'much': 9764,
 'get': 9311,
 'bad': 9308,
 'people': 9287,
 'also': 9159,
 'first': 9064,
 'great': 9060,
 'made': 8364,
 'way': 8026,
 'make': 8025,
 'could': 7922,
 'movies': 7668,
 'characters': 7159,
 'character': 7024,
 'watch': 6974,
 'two': 6906,
 'films': 6889,
 'seen': 6681,
 'many': 6675,
 'life': 6632,
 'plot': 6589,
 'acting': 6494,
 'never': 6485,
 'love': 6453,
 'little': 6438,
 'best': 6416,
 'show': 6295,
 'know': 6167,
 'ever': 5992,
 'man': 5979,
 'better': 5740,
 'end': 5651,
 'still': 5624,
 'say': 5396,
 'scene': 5383,
 'scenes': 5213,
 'go': 5158,
 'something': 5076,
 'back': 4972,
 'real': 4736,
 'watching': 4606,
 'though': 4566,
 'old': 4526,
 'thing': 4525,
 'years': 4514,
 'actors': 4488,
 'director': 4449,
 'work': 4374,
 'another': 4330,
 'new': 4311,


In [None]:
pos_rate_list = []
for feature in count_vec.get_feature_names():
    good_index = [i for i in range(train_vec.shape[0]) if feature in train_texts[i].lower()]
    pos_rate = sum([train_labels[i] for i in good_index])/len(good_index)
    pos_rate_list.append((feature, pos_rate))
    print(pos_rate_list[-1])

('00', 0.4933530280649926)
('000', 0.4120982986767486)
('0000000000001', 0.0)
('00001', 0.0)
('00015', 0.0)
('000s', 0.2222222222222222)
('001', 0.6)
('003830', 1.0)
('006', 0.6056338028169014)
('007', 0.6)
('0079', 1.0)
('0080', 1.0)
('0083', 1.0)
('0093638', 1.0)
('00am', 0.75)
('00pm', 0.5)
('00s', 0.4418604651162791)
('01', 0.5666666666666667)
('01pm', 0.0)
('02', 0.5303030303030303)
('020410', 1.0)
('029', 0.0)
('03', 0.5367647058823529)
('04', 0.5899280575539568)
('041', 0.5)
('05', 0.6428571428571429)
('050', 0.25)
('06', 0.5739644970414202)
('06th', 1.0)
('07', 0.6013986013986014)
('08', 0.5196078431372549)
('087', 0.5)
('089', 1.0)
('08th', 1.0)
('09', 0.5409836065573771)
('0f', 0.0)
('0ne', 0.5)
('0r', 1.0)
('0s', 0.5318246110325319)
('10', 0.49770642201834864)
('100', 0.44680851063829785)
('1000', 0.3333333333333333)
('1000000', 0.5)
('10000000000000', 1.0)
('1000lb', 0.0)
('1000s', 0.5)
('1001', 0.6666666666666666)
('100b', 0.0)
('100k', 1.0)
('100m', 0.25)
('100min', 0.0)


('1989', 0.546875)
('1990', 0.5303867403314917)
('1990s', 0.4745762711864407)
('1991', 0.5862068965517241)
('1992', 0.5)
('1993', 0.4266666666666667)
('1994', 0.6101694915254238)
('1995', 0.6756756756756757)
('1996', 0.6632653061224489)
('1997', 0.5526315789473685)
('1998', 0.5185185185185185)
('1999', 0.5)
('19k', 1.0)
('19th', 0.6388888888888888)
('19thc', 1.0)
('1am', 0.0)
('1and', 0.0)
('1d', 0.0)
('1h', 0.42857142857142855)
('1h30', 0.5)
('1h40', 0.5)
('1h40m', 0.0)
('1h53', 1.0)
('1hour', 0.0)
('1hr', 0.5)
('1million', 1.0)
('1min', 0.0)
('1mln', 0.0)
('1o', 1.0)
('1s', 0.4734042553191489)
('1st', 0.4808743169398907)
('1ton', 0.0)
('1tv', 0.0)
('1½', 1.0)
('1ç', 0.0)
('20', 0.5332645693656524)
('200', 0.5638095238095238)
('2000', 0.5031446540880503)
('20000', 0.0)
('20001', 1.0)
('2000ad', 1.0)
('2000s', 0.0)
('2001', 0.6241610738255033)
('2002', 0.5725806451612904)
('2003', 0.5)


0.637875751503006