In [1]:
import warnings

import langid
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import sklearn
import tensorflow as tf

In [2]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
# Silence NumPy's VisibleDeprecationWarning through the stdlib `warnings` module:
# the `np.warnings` alias no longer exists in NumPy >= 1.24, and the warning class
# itself lives in `np.exceptions` on NumPy 2.x, so look it up in both places.
# (Presumably the filter targets the ragged-array warning raised while the
# variable-length reviews are packed into arrays - confirm.)
visible_deprecation = getattr(np, 'VisibleDeprecationWarning', None) or np.exceptions.VisibleDeprecationWarning
warnings.filterwarnings('ignore', category=visible_deprecation)

# Load the integer-encoded IMDB reviews (vocabulary capped with num_words=25000)
# and merge Keras' predefined train/test halves into a single pool (X, Y).
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.imdb.load_data(num_words = 25000)
X = np.concatenate((x_train, x_test))
Y = np.concatenate((y_train, y_test))
print(len(X))

50000


In [4]:
# get_word_index() returns {word: rank}; flip it so that a rank looks up its word
# (rank 1 is the most frequent word in the dataset).
# Note: these keys are raw ranks. load_data() shifts the ids stored inside the
# reviews by `index_from` (3 by default), so the two are not directly comparable.
word_index = tf.keras.datasets.imdb.get_word_index()
inverted_word_index = {rank: token for token, rank in word_index.items()}

In [5]:
#Build a string of the x most frequent words
def mostFreq(x):
    """Return the x most frequent words, most common first, joined by single spaces.

    Looks ranks 1..x up in the module-level `inverted_word_index`.
    """
    top_words = []
    for rank in range(1, x + 1):
        top_words.append(inverted_word_index[rank])
    return " ".join(top_words)

# Show the top 1, 25 and 100 words; the first two are followed by a blank line.
for top_n in (1, 25):
    print(mostFreq(top_n), end='\n\n')
print(mostFreq(100))

the

the and a of to is br in it i this that was as for with movie but film on not you are his have

the and a of to is br in it i this that was as for with movie but film on not you are his have he be one all at by an they who so from like her or just about it's out has if some there what good more when very up no time she even my would which only story really see their had can were me well than we much been bad get will do also into people other first great because how him most don't made its then way make them too could any movies after


In [6]:
#Get 90:10 train-test split while maintaining 50-50 positive-negative ratio
# Sorting by label groups the label-0 reviews in the first 25000 rows and the
# label-1 reviews in the last 25000, so both splits take an equal share of each.
df = pd.DataFrame({'X': X, 'Y': Y}).sort_values(by=['Y'])
dataset = df.to_numpy()
label0_rows, label1_rows = dataset[:25000], dataset[25000:]
train = np.concatenate((label0_rows[:22500], label1_rows[:22500]))
test = np.concatenate((label0_rows[22500:], label1_rows[22500:]))
# Sanity check: summing the labels counts the label-1 reviews in each split.
print(sum(train[i][1] for i in range(45000)))
print(sum(test[i][1] for i in range(5000)))

22500
2500


In [7]:
# Keras stores every review as word ranks shifted by `index_from` (3 by default):
# ids 0, 1 and 2 are the padding / start-of-review / out-of-vocabulary markers and
# id 3 is unused. `inverted_word_index` is keyed by the *unshifted* rank, so the
# offset has to be removed before the lookup and the reserved ids skipped;
# otherwise every word is decoded as the one three ranks more frequent.
INDEX_FROM = 3


def _decode_review(encoded):
    """Turn one integer-encoded review into its list of words (markers dropped)."""
    return [inverted_word_index[idx - INDEX_FROM] for idx in encoded if idx > INDEX_FROM]


def _to_object_array(docs):
    """Pack variable-length documents into a 1-D object array, one document per slot.

    Filled element by element because np.array() on ragged lists raises
    ValueError on NumPy >= 1.24 (and would build a 2-D array if all lengths matched).
    """
    packed = np.empty(len(docs), dtype=object)
    for pos, doc in enumerate(docs):
        packed[pos] = doc
    return packed


# Each row of train/test is (encoded review, label).
x_train = _to_object_array([_decode_review(doc[0]) for doc in train])
y_train = np.array([doc[1] for doc in train])
x_test = _to_object_array([_decode_review(doc[0]) for doc in test])
y_test = np.array([doc[1] for doc in test])

In [8]:
#Stemming, removing stop words, numbers, punctuation
# Each review (a list of words) is replaced in place by a single space-separated
# string of stemmed tokens. Because x_train / x_test are overwritten, this cell is
# not idempotent: a second run would iterate over the characters of those strings.
stops = set(stopwords.words('english'))
stemmer = SnowballStemmer('english')
for docs in (x_train, x_test):
    for i in range(len(docs)):
        # keep purely alphabetic tokens that are not English stop words
        kept = [w for w in docs[i] if w.isalpha() and w not in stops]
        docs[i] = " ".join(stemmer.stem(w) for w in kept)

In [9]:
#Build Tf-idf sparse matrix
# Unigrams and bigrams; terms seen in fewer than 2 documents or in more than half
# of them are ignored, and the vocabulary is capped at 10000 features.
vectorizer = TfidfVectorizer(min_df=2,max_df=0.5,ngram_range=(1,2), max_features = 10000)
tfidf_matrix = vectorizer.fit_transform(x_train)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() and fall back only on releases that predate it.
feature_names = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
# dtype=str keeps the unicode-string array the original np.array(list_of_str) produced.
vocab = np.asarray(feature_names(), dtype=str)

In [56]:
def randSample(docs, targets, pct_acq, pct_del = 0):
    """Draw a random subset of (docs, targets) pairs without replacement.

    The subset holds int((pct_acq + pct_del) * len(docs)) items, chosen with
    NumPy's global random state.

    Returns:
        (X, Y): two plain lists, aligned so that Y[k] is the target of X[k].
    """
    sample_size = int((pct_acq + pct_del) * len(docs))
    picked = np.random.choice(len(docs), sample_size, replace=False)
    sampled_docs, sampled_targets = [], []
    for idx in picked:
        sampled_docs.append(docs[idx])
        sampled_targets.append(targets[idx])
    return sampled_docs, sampled_targets

def dropout(X, Y, pct_acq, pct_del):
    """Randomly keep the pct_acq / (pct_acq + pct_del) share of (X, Y) pairs.

    The number kept is int(pct_acq / (pct_acq + pct_del) * len(X)); survivors are
    chosen without replacement using NumPy's global random state.

    Returns:
        (X, Y): two new lists holding the surviving pairs, still aligned.
    """
    keep_count = int(pct_acq / (pct_acq + pct_del) * len(X))
    survivors = np.random.choice(len(X), keep_count, replace=False)
    kept_docs, kept_targets = [], []
    for idx in survivors:
        kept_docs.append(X[idx])
        kept_targets.append(Y[idx])
    return kept_docs, kept_targets

In [57]:
def entropy(y_probs):
    """Shannon entropy of a probability vector, divided by log(number of entries).

    Machine epsilon is added inside the logarithm so zero probabilities do not
    produce log(0).
    """
    eps = np.finfo(float).eps
    weighted_logs = y_probs * np.log(y_probs + eps)
    return -1.0 * np.sum(weighted_logs) / np.log(y_probs.size)
    
def least_confidence(y_probs):
    """Least-confidence score: (1 - largest probability), scaled by k / (k - 1).

    k is the number of entries in y_probs; NaN entries are ignored when
    picking the largest probability.
    """
    num_classes = y_probs.size
    top_prob = np.nanmax(y_probs)
    return num_classes * (1 - top_prob) / (num_classes - 1)

In [62]:
def _fit_nb_on_tfidf(X, Y, features):
    """Fit a fresh TF-IDF vectorizer and Multinomial NB model on (X, Y).

    Returns:
        (score, y_probs): accuracy on the module-level (x_test, y_test) and the
        class probabilities predicted for every document in the module-level x_train.
    """
    vectorizer = TfidfVectorizer(min_df=2, max_df=0.5, ngram_range=(1,2), max_features = features)
    model = MultinomialNB()
    model.fit(vectorizer.fit_transform(X), Y)
    score = model.score(vectorizer.transform(x_test), y_test)
    y_probs = model.predict_proba(vectorizer.transform(x_train))
    return score, y_probs


def NB_with_TFIDF(iters, pct_acq, metric, pct_del = 0, features = 10000):
    """Run uncertainty-based acquisition rounds with TF-IDF + Multinomial Naive Bayes.

    Starts from a random sample of the module-level (x_train, y_train), then in each
    round adds the documents the current model is least certain about, refits from
    scratch and records the test accuracy.

    Args:
        iters: number of acquisition rounds after the initial random sample.
        pct_acq: fraction of x_train added to the labelled set per round.
        metric: uncertainty measure, 'LC' (least confidence) or 'entropy'.
        pct_del: extra fraction that is selected and then randomly discarded via
            dropout(); 0 disables the discard step.
        features: max_features passed to the TfidfVectorizer.

    Returns:
        List of iters + 1 accuracies (initial model first). The list is also printed.

    Raises:
        ValueError: if metric is not 'LC' or 'entropy'.
    """
    # Validate up front: previously an unknown metric only surfaced as a NameError
    # on `uncertainty` inside the first round.
    scorers = {'LC': least_confidence, 'entropy': entropy}
    if metric not in scorers:
        raise ValueError("metric must be 'LC' or 'entropy', got %r" % (metric,))
    uncertainty_of = scorers[metric]

    accuracy = list()
    X, Y = randSample(x_train, y_train, pct_acq, pct_del)
    if pct_del > 0:
        X, Y = dropout(X, Y, pct_acq, pct_del)
    score, y_probs = _fit_nb_on_tfidf(X, Y, features)
    accuracy.append(score)

    # Number of candidates picked per round; does not change between rounds.
    n = int((pct_acq + pct_del) * len(x_train))
    for itr in range(iters):
        # Rank every training document by uncertainty, most uncertain first.
        uncertainty = pd.DataFrame([uncertainty_of(y) for y in y_probs]).sort_values(by = 0, ascending = False, axis = 0)
        chosen = uncertainty.iloc[:n].index.tolist()
        # NOTE(review): candidates are ranked over all of x_train, so documents that
        # are already in X can be picked again and end up duplicated - confirm
        # whether the already-labelled pool should be excluded.
        subX = [x_train[i] for i in chosen]
        subY = [y_train[i] for i in chosen]
        if pct_del > 0:
            subX, subY = dropout(subX, subY, pct_acq, pct_del)
        X.extend(subX)
        Y.extend(subY)
        score, y_probs = _fit_nb_on_tfidf(X, Y, features)
        accuracy.append(score)
    print(accuracy)
    return accuracy