In [12]:
import os
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import SelectKBest, f_classif
from TryAroundModels import *

In [2]:
def shuffle(X, y):
    perm = np.random.permutation(len(X))
    X = X[perm]
    y = y[perm]
    return X, y

def load_imdb_dataset(path):
    imdb_path = os.path.join(path, '')

    # Load the dataset
    train_texts = []
    train_labels = []
    test_texts = []
    test_labels = []
    for dset in ['train', 'test']:
        for cat in ['pos', 'neg']:
            dset_path = os.path.join(imdb_path, dset, cat)
            for fname in sorted(os.listdir(dset_path)):
                if fname.endswith('.txt'):
                    with open(os.path.join(dset_path, fname)) as f:
                        if dset == 'train': train_texts.append(f.read())
                        else: test_texts.append(f.read())
                    label = 0 if cat == 'neg' else 1
                    if dset == 'train': train_labels.append(label)
                    else: test_labels.append(label)

    # Converting to np.array
    train_texts = np.array(train_texts)
    train_labels = np.array(train_labels)
    test_texts = np.array(test_texts)
    test_labels = np.array(test_labels)

    # Shuffle the dataset
    train_texts, train_labels = shuffle(train_texts, train_labels)
    test_texts, test_labels = shuffle(test_texts, test_labels)

    # Return the dataset
    return train_texts, train_labels, test_texts, test_labels

In [3]:
train_texts, train_labels, test_texts, test_labels = load_imdb_dataset("data/")

In [5]:
NGRAM_RANGE = (1, 2)
TOP_K = 20000
TOKEN_MODE = 'word'
MIN_DOC_FREQ = 2


kwargs = {
    'ngram_range' : NGRAM_RANGE,
    'dtype' : 'int32',
    'strip_accents' : 'unicode',
    'decode_error' : 'replace',
    'analyzer' : TOKEN_MODE,
    'min_df' : MIN_DOC_FREQ,
}

# Learn Vocab from train texts and vectorize train and val sets
tfidf_vectorizer = TfidfVectorizer(**kwargs)
X_tfidf_train = tfidf_vectorizer.fit_transform(train_texts)
X_tfidf_test = tfidf_vectorizer.transform(test_texts)

selector = SelectKBest(f_classif, k=min(TOP_K, X_tfidf_train.shape[1]))
selector.fit(X_tfidf_train, train_labels)
X_selected_tfidf_train = selector.transform(X_tfidf_train).astype('float32')
X_selected_tfidf_test = selector.transform(X_tfidf_test).astype('float32')

In [49]:
def TryAroundModel(X_train, X_test, Y_train, Y_test, X_raw_text_train = None, X_raw_text_test = None, Models = None):
    if Models is None:
        Models = []
        for i in np.nonzero([re.match("TryAroundModel", x) for x in globals().keys()])[0]:
            Models.append(list(globals().keys())[i])
    
    accuracy_list = []
    processed_arg = [X_train, X_test, Y_train, Y_test]
    
    for m in Models:
        print(m, Models)
        if m == "TryAroundModel_LG":
            accuracy_list.append(TryAroundModel_LG(*processed_arg))
        elif m == "TryAroundModel_NB":
            accuracy_list.append(TryAroundModel_NB(*processed_arg))
        elif m == "TryAroundModel_NBSVM":
            accuracy_list.append(TryAroundModel_NBSVM(*processed_arg))
        elif m == "TryAroundModel_RF":
            accuracy_list.append(TryAroundModel_RF(*processed_arg))
        elif m == "TryAroundModel_GBM":
            accuracy_list.append(TryAroundModel_GBM(*processed_arg))
        elif m == "TryAroundModel_MPLNN":
            accuracy_list.append(TryAroundModel_MPLNN(*processed_arg))

        if X_raw_text_train is not None and X_raw_text_test is not None:

            raw_arg = [X_raw_text_train, X_raw_text_test, Y_train, Y_test]
            if m == "TryAroundModel_CNN":
                accuracy_list.append(TryAroundModel_CNN(*raw_arg))
            elif m == "TryAroundModel_LSTM":
                accuracy_list.append(TryAroundModel_LSTM(*raw_arg))
            elif m == "TryAroundModel_FB_LSTM":
                accuracy_list.append(TryAroundModel_FB_LSTM(*raw_arg))
        
    return sorted(accuracy_list, key = lambda x: x[1], reverse = True)


accuracy_list = TryAroundModel(X_selected_tfidf_train, X_selected_tfidf_test, train_labels,
                               test_labels, train_texts, test_texts)
accuracy_list

TryAroundModel_LG ['TryAroundModel_LG', 'TryAroundModel_NB', 'TryAroundModel_RF', 'TryAroundModel_GBM', 'TryAroundModel_NBSVM', 'TryAroundModel_MPLNN', 'TryAroundModel_LSTM', 'TryAroundModel_FB_LSTM', 'TryAroundModel_CNN', 'TryAroundModel']
Logistic Regression -- Accuracy:  0.88356
TryAroundModel_NB ['TryAroundModel_LG', 'TryAroundModel_NB', 'TryAroundModel_RF', 'TryAroundModel_GBM', 'TryAroundModel_NBSVM', 'TryAroundModel_MPLNN', 'TryAroundModel_LSTM', 'TryAroundModel_FB_LSTM', 'TryAroundModel_CNN', 'TryAroundModel']
Multinomial Naive Bayes -- Accuracy:  0.859
TryAroundModel_RF ['TryAroundModel_LG', 'TryAroundModel_NB', 'TryAroundModel_RF', 'TryAroundModel_GBM', 'TryAroundModel_NBSVM', 'TryAroundModel_MPLNN', 'TryAroundModel_LSTM', 'TryAroundModel_FB_LSTM', 'TryAroundModel_CNN', 'TryAroundModel']
Random Forest -- Accuracy:  0.82364
TryAroundModel_GBM ['TryAroundModel_LG', 'TryAroundModel_NB', 'TryAroundModel_RF', 'TryAroundModel_GBM', 'TryAroundModel_NBSVM', 'TryAroundModel_MPLNN', 'T



LSTM Neural Network -- Accuracy:  0.84944
TryAroundModel_FB_LSTM ['TryAroundModel_LG', 'TryAroundModel_NB', 'TryAroundModel_RF', 'TryAroundModel_GBM', 'TryAroundModel_NBSVM', 'TryAroundModel_MPLNN', 'TryAroundModel_LSTM', 'TryAroundModel_FB_LSTM', 'TryAroundModel_CNN', 'TryAroundModel']
Forward and Backward LSTM Neural Netword -- Accuracy:  0.75512
TryAroundModel_CNN ['TryAroundModel_LG', 'TryAroundModel_NB', 'TryAroundModel_RF', 'TryAroundModel_GBM', 'TryAroundModel_NBSVM', 'TryAroundModel_MPLNN', 'TryAroundModel_LSTM', 'TryAroundModel_FB_LSTM', 'TryAroundModel_CNN', 'TryAroundModel']
Convolutional Neural Network -- Accuracy:  0.87708
TryAroundModel ['TryAroundModel_LG', 'TryAroundModel_NB', 'TryAroundModel_RF', 'TryAroundModel_GBM', 'TryAroundModel_NBSVM', 'TryAroundModel_MPLNN', 'TryAroundModel_LSTM', 'TryAroundModel_FB_LSTM', 'TryAroundModel_CNN', 'TryAroundModel']


[('Multilayer Perceptron Neural Network(MLP)', 0.89868),
 ('Logistic Regression', 0.88356),
 ('Convolutional Neural Network', 0.87708),
 ('Naive Bayes SVM', 0.86632),
 ('Multinomial Naive Bayes', 0.859),
 ('LSTM Neural Network', 0.84944),
 ('Random Forest', 0.82364),
 ('Forward and Backward LSTM Neural Netword', 0.75512),
 ('Gradient Boosting Machine', 0.70032)]

In [50]:
pd.DataFrame(accuracy_list)

Unnamed: 0,0,1
0,Multilayer Perceptron Neural Network(MLP),0.89868
1,Logistic Regression,0.88356
2,Convolutional Neural Network,0.87708
3,Naive Bayes SVM,0.86632
4,Multinomial Naive Bayes,0.859
5,LSTM Neural Network,0.84944
6,Random Forest,0.82364
7,Forward and Backward LSTM Neural Netword,0.75512
8,Gradient Boosting Machine,0.70032


In [18]:
accuracy_list_old = accuracy_list.copy()

'0b0'

In [68]:
# Python3 code to find the element that 
# appears once 

def getSingle(arr, n): 
	ones = 0
	twos = 0
	
	for i in range(n): 
		# one & arr[i]" gives the bits that 
		# are there in both 'ones' and new 
		# element from arr[]. We add these 
		# bits to 'twos' using bitwise OR 
		twos = twos | (ones & arr[i]) 
		print(bin(twos))
		
		# one & arr[i]" gives the bits that 
		# are there in both 'ones' and new 
		# element from arr[]. We add these 
		# bits to 'twos' using bitwise OR 
		ones = ones ^ arr[i] 
		print(bin(ones))
		
		# The common bits are those bits 
		# which appear third time. So these 
		# bits should not be there in both 
		# 'ones' and 'twos'. common_bit_mask 
		# contains all these bits as 0, so 
		# that the bits can be removed from 
		# 'ones' and 'twos' 
		common_bit_mask = ~(ones & twos) 
		print(bin(common_bit_mask))
		
		# Remove common bits (the bits that 
		# appear third time) from 'ones' 
		ones &= common_bit_mask 
		print(bin(ones))
		
		# Remove common bits (the bits that 
		# appear third time) from 'twos' 
		twos &= common_bit_mask 
		print(bin(twos), '\n')
	return ones 
	
# driver code 
arr = [3, 2, 3, 4, 5, 6, 4] 
n = len(arr) 
print("The element with single occurrence is ", 
		getSingle(arr, n)) 

# This code is contributed by "Abhishek Sharma 44" 



0b0
0b11
-0b1
0b11
0b0 

0b10
0b1
-0b1
0b1
0b10 

0b11
0b10
-0b11
0b0
0b1 

0b1
0b100
-0b1
0b100
0b1 

0b101
0b1
-0b10
0b0
0b100 

0b100
0b110
-0b101
0b10
0b0 

0b0
0b110
-0b1
0b110
0b0 

The element with single occurrence is  6
