In [2]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import os
import string
import numpy as np

(1) IMDB Dataset
------------
The IMDB dataset consists of 15000 reviews in training set, 10000 reviews in validation set, and 25000 reviews in test set. This is a 2 class problem with class 1 being positive sentiment and class 0 being negative sentiment.

Here, we format the data so that each line in the train/val/test file is a data point, with each word occurrence encoded by its vocabulary index.

This takes a couple minutes to run, because we save the dataset as a
nb_example x 10000 feature matrix, where 10000 is the number of words 
in the vocabulary, and nb_example is the number of examples in the set.


In [13]:
# Load dataset split
train_dir = os.path.join(os.getcwd(), 'hwk3_datasets/IMDB-train.txt')
val_dir = os.path.join(os.getcwd(), 'hwk3_datasets/IMDB-valid.txt')
test_dir = os.path.join(os.getcwd(), 'hwk3_datasets/IMDB-test.txt')

# Each file is tab-separated with no header row: review text, then its score.
train_data = pd.read_csv(train_dir, sep='\t', names=['review', 'score'], header=None)
val_data = pd.read_csv(val_dir, sep='\t', names=['review', 'score'], header=None)
test_data = pd.read_csv(test_dir, sep='\t', names=['review', 'score'], header=None)

# Now, remove punctuation and capital letters - we want to keep only word
# characters (letters and numbers) and whitespace, so we drop [^\w\s].
# Note we also want to get rid of '<br />', which is kind of a special case.
#
# `regex` is passed explicitly on purpose: pandas 2.0 changed the default of
# Series.str.replace from regex=True to regex=False, which would make the
# punctuation pattern a literal string and silently leave the text uncleaned.
#
# NOTE(review): '<br />' is replaced by the empty string (as in the original),
# which glues together the words on either side of a line break - confirm
# whether a space was intended before changing it.
for split in (train_data, val_data, test_data):
    cleaned = split['review'].str.replace('<br />', '', regex=False)
    cleaned = cleaned.str.replace(r'[^\w\s]+', '', regex=True)
    split['review'] = cleaned.str.lower()


# Will take the 10000 most frequent words
vectorizer = CountVectorizer(max_features=10000)
vectorizer.fit(train_data['review'])
train_vectors = vectorizer.transform(train_data['review'])
test_vectors = vectorizer.transform(test_data['review'])
val_vectors = vectorizer.transform(val_data['review'])

# Feature names, ordered by column index. get_feature_names() was removed in
# scikit-learn 1.2 in favour of get_feature_names_out(), so support both.
if hasattr(vectorizer, 'get_feature_names_out'):
    words = vectorizer.get_feature_names_out().tolist()
else:
    words = vectorizer.get_feature_names()

# The fitted vocabulary may hold fewer than max_features words, so size
# everything from what was actually learned instead of a literal 10000.
vocab_size = len(words)
# Kept from the original cell; not referenced again in the visible cells.
train_matrix = np.asarray(train_vectors)
# Total number of occurrences of each vocabulary word over the training set.
frequency_vector = np.asarray(train_vectors.sum(axis=0)).reshape(vocab_size,)
frequency_list = frequency_vector.tolist()

index_list = range(vocab_size)
# Sort the (frequency, index, word) triples from most to least frequent.
frequency_list, index_list, words = zip(*sorted(zip(frequency_list, index_list, words), reverse=True))

# Get the vocabulary (word -> column index), used to encode the reviews.
vocabulary = vectorizer.vocabulary_

# One line per word: word, column index, training-set frequency.
with open("imdb-vocab.txt",'w') as vocab_file:
    for i in range(vocab_size):
        vocab_file.write("{0:<12}\t{1:>5}\t{2:>8}\n".format(words[i], index_list[i], frequency_list[i] ))

# Build train file
# Matrix sizes are derived from the loaded splits and the fitted vocabulary
# instead of being hard-coded, so the cell keeps working if a split has a
# different number of rows or the vocabulary is smaller than expected.
nb_examples_train = len(train_data)
nb_examples_val = len(val_data)
nb_examples_test = len(test_data)
nb_features = len(vocabulary)

# np.savetxt does not create missing directories; make sure the output
# folder exists before spending minutes building the matrices.
if not os.path.isdir("inputs"):
    os.makedirs("inputs")

# Set up BBoW with 1 if example has word in index, 0 if not
BBOW_trainx = np.zeros((nb_examples_train, nb_features))
BBOW_trainy = np.zeros((nb_examples_train,))
# Set up FBoW with word_count/total_count if example has word in index, 0 if not
FBOW_trainx = np.zeros((nb_examples_train, nb_features))
FBOW_trainy = np.zeros((nb_examples_train,))
example = 0
with open("imdb-train.txt",'w') as train_file:
    for review, score in zip(train_data['review'], train_data['score']):
        occurances = 0  # number of in-vocabulary tokens in this review
        paragraph = ""  # the review re-encoded as space-separated indices
        for word in review.split():
            index = vocabulary.get(word)
            # Words outside the vocabulary are skipped entirely.
            if index is not None:
                paragraph += str(index)+ " "
                BBOW_trainx[example, index] = 1
                FBOW_trainx[example, index] += 1
                occurances += 1
        BBOW_trainy[example] = score
        FBOW_trainy[example] = score
        train_file.write("{}\t{}\n".format(paragraph, score))
        # Turn raw counts into relative frequencies; a review with no
        # in-vocabulary word keeps its all-zero row.
        if occurances != 0:
            FBOW_trainx[example] /= occurances
        example += 1
np.savetxt("inputs/imdb-train-bbow_x.txt", BBOW_trainx, delimiter=",", fmt='%d')
np.savetxt("inputs/imdb-train-bbow_y.txt", BBOW_trainy, delimiter=",", fmt='%d')
np.savetxt("inputs/imdb-train-fbow_x.txt", FBOW_trainx, delimiter=",", fmt='%1.5f')
np.savetxt("inputs/imdb-train-fbow_y.txt", FBOW_trainy, delimiter=",", fmt='%d')

# Build validation file
# BBoW: 1 if the review contains the word at that index, 0 otherwise.
# FBoW: count of the word divided by the number of in-vocabulary tokens.
BBOW_valx = np.zeros((nb_examples_val, nb_features))
BBOW_valy = np.zeros((nb_examples_val,))
FBOW_valx = np.zeros((nb_examples_val, nb_features))
FBOW_valy = np.zeros((nb_examples_val,))
example = 0
with open("imdb-val.txt",'w') as val_file:
    for review in val_data['review']:
        occurances = 0  # number of in-vocabulary tokens in this review
        words = review.split()
        paragraph = ""  # the review re-encoded as space-separated indices
        for word in words:
            index = vocabulary.get(word)
            # Words outside the vocabulary are skipped entirely.
            if index is not None:
                paragraph += str(index)+ " "
                BBOW_valx[example, index] = 1
                FBOW_valx[example, index] += 1
                occurances += 1
        # Both label vectors must come from the validation split (the FBoW
        # labels were previously read from train_data by mistake).
        score = val_data['score'][example]
        BBOW_valy[example] = score
        FBOW_valy[example] = score
        val_file.write("{}\t{}\n".format(paragraph, score))
        # Turn raw counts into relative frequencies; a review with no
        # in-vocabulary word keeps its all-zero row.
        if occurances != 0:
            FBOW_valx[example] /= occurances
        example += 1
np.savetxt("inputs/imdb-val-bbow_x.txt", BBOW_valx, delimiter=",", fmt='%d')
np.savetxt("inputs/imdb-val-bbow_y.txt", BBOW_valy, delimiter=",", fmt='%d')
np.savetxt("inputs/imdb-val-fbow_x.txt", FBOW_valx, delimiter=",", fmt='%1.5f')
np.savetxt("inputs/imdb-val-fbow_y.txt", FBOW_valy, delimiter=",", fmt='%d')

# Build test file
# BBoW: 1 if the review contains the word at that index, 0 otherwise.
# FBoW: count of the word divided by the number of in-vocabulary tokens.
BBOW_testx = np.zeros((nb_examples_test, nb_features))
BBOW_testy = np.zeros((nb_examples_test,))
FBOW_testx = np.zeros((nb_examples_test, nb_features))
FBOW_testy = np.zeros((nb_examples_test,))
example = 0
with open("imdb-test.txt",'w') as test_file:
    for review in test_data['review']:
        occurances = 0  # number of in-vocabulary tokens in this review
        words = review.split()
        paragraph = ""  # the review re-encoded as space-separated indices
        for word in words:
            index = vocabulary.get(word)
            # Words outside the vocabulary are skipped entirely.
            if index is not None:
                paragraph += str(index)+ " "
                BBOW_testx[example, index] = 1
                FBOW_testx[example, index] += 1
                occurances += 1
        # Everything below must run once per review. It was previously
        # dedented out of the loop, so `example` never advanced: every review
        # was accumulated into row 0 and only one label/line was written.
        score = test_data['score'][example]
        BBOW_testy[example] = score
        FBOW_testy[example] = score
        test_file.write("{}\t{}\n".format(paragraph, score))
        # Turn raw counts into relative frequencies; a review with no
        # in-vocabulary word keeps its all-zero row.
        if occurances != 0:
            FBOW_testx[example] /= occurances
        example += 1
np.savetxt("inputs/imdb-test-bbow_x.txt", BBOW_testx, delimiter=",", fmt='%d')
np.savetxt("inputs/imdb-test-bbow_y.txt", BBOW_testy, delimiter=",", fmt='%d')
np.savetxt("inputs/imdb-test-fbow_x.txt", FBOW_testx, delimiter=",", fmt='%1.5f')
np.savetxt("inputs/imdb-test-fbow_y.txt", FBOW_testy, delimiter=",", fmt='%d')



(2) Yelp Dataset
------------
The Yelp dataset consists of 7000 reviews in the training set, 1000 reviews in the validation set, and 2000 reviews in the test set. This is a 5 class problem where each review is classified into one of the five ratings with rating-5 being the best score and rating-1 being the worst score.

Here, we format the data so that each line in the train/val/test file is a data point, with each word occurrence encoded by its vocabulary index.

As for the IMDB dataset, this takes a couple minutes to run, because we save the dataset as a
nb_example x 10000 feature matrix, where 10000 is the number of words 
in the vocabulary, and nb_example is the number of examples in the set.

In [12]:
# Load dataset split
train_dir = os.path.join(os.getcwd(), 'hwk3_datasets/yelp-train.txt')
val_dir = os.path.join(os.getcwd(), 'hwk3_datasets/yelp-valid.txt')
test_dir = os.path.join(os.getcwd(), 'hwk3_datasets/yelp-test.txt')

# Each file is tab-separated with no header row: review text, then its score.
train_data = pd.read_csv(train_dir, sep='\t', names=['review', 'score'], header=None)
val_data = pd.read_csv(val_dir, sep='\t', names=['review', 'score'], header=None)
test_data = pd.read_csv(test_dir, sep='\t', names=['review', 'score'], header=None)

# Now, remove punctuation and capital letters - we want to keep only word
# characters (letters and numbers) and whitespace, so we drop [^\w\s].
#
# `regex=True` is passed explicitly on purpose: pandas 2.0 changed the default
# of Series.str.replace to regex=False, which would make the pattern a literal
# string and silently leave the punctuation in place.
for split in (train_data, val_data, test_data):
    cleaned = split['review'].str.replace(r'[^\w\s]+', '', regex=True)
    split['review'] = cleaned.str.lower()

# Will take the 10000 most frequent words
vectorizer = CountVectorizer(max_features=10000)
vectorizer.fit(train_data['review'])
train_vectors = vectorizer.transform(train_data['review'])
test_vectors = vectorizer.transform(test_data['review'])
val_vectors = vectorizer.transform(val_data['review'])

# Feature names, ordered by column index. get_feature_names() was removed in
# scikit-learn 1.2 in favour of get_feature_names_out(), so support both.
if hasattr(vectorizer, 'get_feature_names_out'):
    words = vectorizer.get_feature_names_out().tolist()
else:
    words = vectorizer.get_feature_names()

# The fitted vocabulary may hold fewer than max_features words, so size
# everything from what was actually learned instead of a literal 10000.
vocab_size = len(words)
# Kept from the original cell; not referenced again in the visible cells.
train_matrix = np.asarray(train_vectors)
# Total number of occurrences of each vocabulary word over the training set.
frequency_vector = np.asarray(train_vectors.sum(axis=0)).reshape(vocab_size,)
frequency_list = frequency_vector.tolist()

index_list = range(vocab_size)
# Sort the (frequency, index, word) triples from most to least frequent.
frequency_list, index_list, words = zip(*sorted(zip(frequency_list, index_list, words), reverse=True))

# Get the vocabulary (word -> column index), used to encode the reviews.
vocabulary = vectorizer.vocabulary_

# One line per word: word, column index, training-set frequency.
with open("yelp-vocab.txt",'w') as vocab_file:
    for i in range(vocab_size):
        vocab_file.write("{0:<12}\t{1:>5}\t{2:>8}\n".format(words[i], index_list[i], frequency_list[i] ))

# Build train file
# Matrix sizes are derived from the loaded splits and the fitted vocabulary
# instead of being hard-coded, so the cell keeps working if a split has a
# different number of rows or the vocabulary is smaller than expected.
nb_examples_train = len(train_data)
nb_examples_val = len(val_data)
nb_examples_test = len(test_data)
nb_features = len(vocabulary)

# np.savetxt does not create missing directories; make sure the output
# folder exists before spending minutes building the matrices.
if not os.path.isdir("inputs"):
    os.makedirs("inputs")

example = 0
# Set up BBoW with 1 if example has word in index, 0 if not
BBOW_trainx = np.zeros((nb_examples_train, nb_features))
BBOW_trainy = np.zeros((nb_examples_train,))
# Set up FBoW with word_count/total_count if example has word in index, 0 if not
FBOW_trainx = np.zeros((nb_examples_train, nb_features))
FBOW_trainy = np.zeros((nb_examples_train,))
with open("yelp-train.txt",'w') as train_file:
    for review, score in zip(train_data['review'], train_data['score']):
        occurances = 0  # number of in-vocabulary tokens in this review
        paragraph = ""  # the review re-encoded as space-separated indices
        for word in review.split():
            index = vocabulary.get(word)
            # Words outside the vocabulary are skipped entirely.
            if index is not None:
                paragraph += str(index)+ " "
                BBOW_trainx[example, index] = 1
                FBOW_trainx[example, index] += 1
                occurances += 1
        BBOW_trainy[example] = score
        FBOW_trainy[example] = score
        train_file.write("{}\t{}\n".format(paragraph, score))
        # Turn raw counts into relative frequencies; a review with no
        # in-vocabulary word keeps its all-zero row.
        if occurances != 0:
            FBOW_trainx[example] /= occurances
        example += 1
np.savetxt("inputs/yelp-train-bbow_x.txt", BBOW_trainx, delimiter=",", fmt='%d')
np.savetxt("inputs/yelp-train-bbow_y.txt", BBOW_trainy, delimiter=",", fmt='%d')
np.savetxt("inputs/yelp-train-fbow_x.txt", FBOW_trainx, delimiter=",", fmt='%1.5f')
np.savetxt("inputs/yelp-train-fbow_y.txt", FBOW_trainy, delimiter=",", fmt='%1.5f')

# Build validation file
# Two encodings of the validation split are produced side by side:
#   BBoW - 1 where the review contains the word at that index, else 0
#   FBoW - count of the word divided by the review's in-vocabulary token count
val_shape = (nb_examples_val, nb_features)
BBOW_valx = np.zeros(val_shape)
BBOW_valy = np.zeros((nb_examples_val,))
FBOW_valx = np.zeros(val_shape)
FBOW_valy = np.zeros((nb_examples_val,))

val_scores = val_data['score']
with open("yelp-val.txt", 'w') as val_file:
    for example, review in enumerate(val_data['review']):
        # Vocabulary indices of the review's known words, in reading order;
        # out-of-vocabulary words are dropped.
        hits = [vocabulary[token] for token in review.split() if token in vocabulary]
        for index in hits:
            BBOW_valx[example, index] = 1
            FBOW_valx[example, index] += 1

        label = val_scores[example]
        BBOW_valy[example] = label
        FBOW_valy[example] = label

        # Every index is followed by a single space (trailing one included).
        paragraph = "".join(str(index) + " " for index in hits)
        val_file.write("{}\t{}\n".format(paragraph, label))

        # Normalise counts to frequencies; rows without any hit stay at zero.
        if hits:
            FBOW_valx[example] /= len(hits)

np.savetxt("inputs/yelp-val-bbow_x.txt", BBOW_valx, delimiter=",", fmt='%d')
np.savetxt("inputs/yelp-val-bbow_y.txt", BBOW_valy, delimiter=",", fmt='%d')
np.savetxt("inputs/yelp-val-fbow_x.txt", FBOW_valx, delimiter=",", fmt='%1.5f')
np.savetxt("inputs/yelp-val-fbow_y.txt", FBOW_valy, delimiter=",", fmt='%1.5f')

# Build test file
# Two encodings of the test split are produced side by side:
#   BBoW - 1 where the review contains the word at that index, else 0
#   FBoW - count of the word divided by the review's in-vocabulary token count
test_shape = (nb_examples_test, nb_features)
BBOW_testx = np.zeros(test_shape)
BBOW_testy = np.zeros((nb_examples_test,))
FBOW_testx = np.zeros(test_shape)
FBOW_testy = np.zeros((nb_examples_test,))

test_scores = test_data['score']
with open("yelp-test.txt", 'w') as test_file:
    for example, review in enumerate(test_data['review']):
        # Vocabulary indices of the review's known words, in reading order;
        # out-of-vocabulary words are dropped.
        hits = [vocabulary[token] for token in review.split() if token in vocabulary]
        for index in hits:
            BBOW_testx[example, index] = 1
            FBOW_testx[example, index] += 1

        label = test_scores[example]
        BBOW_testy[example] = label
        FBOW_testy[example] = label

        # Every index is followed by a single space (trailing one included).
        paragraph = "".join(str(index) + " " for index in hits)
        test_file.write("{}\t{}\n".format(paragraph, label))

        # Normalise counts to frequencies; rows without any hit stay at zero.
        if hits:
            FBOW_testx[example] /= len(hits)

np.savetxt("inputs/yelp-test-bbow_x.txt", BBOW_testx, delimiter=",", fmt='%d')
np.savetxt("inputs/yelp-test-bbow_y.txt", BBOW_testy, delimiter=",", fmt='%d')
np.savetxt("inputs/yelp-test-fbow_x.txt", FBOW_testx, delimiter=",", fmt='%1.5f')
np.savetxt("inputs/yelp-test-fbow_y.txt", FBOW_testy, delimiter=",", fmt='%1.5f')