(2) Binary Bag of Words (Yelp Dataset)
---------------------------------
For this question, we focus on the Yelp dataset with a binary bag-of-words (BBoW) representation and use the F1-measure as the evaluation metric.
As baselines, we use a random classifier and a majority-class classifier. We then train Naive Bayes (Bernoulli), Decision Tree and Linear SVM classifiers, tuning their hyperparameters with 5-fold cross-validation over the combined training and validation sets.

In [16]:
from sklearn.dummy import DummyClassifier
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
import os
import pandas as pd
import string
import numpy as np

In [17]:
def yelp_load(max_features=10000):
    """Build the Yelp bag-of-words datasets and save them to disk.

    Reads the tab-separated files ``hwk3_datasets/yelp-train.txt``,
    ``hwk3_datasets/yelp-valid.txt`` and ``hwk3_datasets/yelp-test.txt``
    (relative to the current working directory), strips punctuation,
    lowercases the reviews and keeps the ``max_features`` most frequent
    training words as the vocabulary.

    Files written (relative to the current working directory):

    * ``yelp-vocab.txt`` -- one ``word<TAB>index<TAB>frequency`` line per
      vocabulary word, most frequent first.
    * ``yelp-{train,val,test}.txt`` -- each review as a sequence of
      vocabulary indices, followed by a tab and its score.
    * ``inputs/yelp-{train,val,test}-{bbow,fbow}_{x,y}.txt`` -- the binary and
      frequency bag-of-words matrices and their labels, comma-separated.

    Parameters
    ----------
    max_features : int, optional
        Maximum vocabulary size passed to ``CountVectorizer`` (default 10000).

    Returns
    -------
    None
    """
    raw_files = (
        ('train', 'yelp-train.txt'),
        ('val', 'yelp-valid.txt'),
        ('test', 'yelp-test.txt'),
    )
    data = {}
    for split, filename in raw_files:
        path = os.path.join(os.getcwd(), 'hwk3_datasets', filename)
        frame = pd.read_csv(path, sep='\t', names=['review', 'score'], header=None)
        # Keep only word characters (letters, digits, underscore) and
        # whitespace, then lowercase. regex=True must be explicit: since
        # pandas 2.0 the pattern is otherwise treated as a literal string and
        # no punctuation would be removed.
        frame['review'] = (
            frame['review'].str.replace(r'[^\w\s]+', '', regex=True).str.lower()
        )
        data[split] = frame

    # The vocabulary is the (at most) max_features most frequent training words.
    vectorizer = CountVectorizer(max_features=max_features)
    train_vectors = vectorizer.fit_transform(data['train']['review'])

    # get_feature_names() was removed in scikit-learn 1.2; fall back to it
    # only on versions that predate get_feature_names_out().
    if hasattr(vectorizer, 'get_feature_names_out'):
        words = vectorizer.get_feature_names_out().tolist()
    else:
        words = vectorizer.get_feature_names()

    # The actual vocabulary may be smaller than max_features, so derive every
    # size from the data instead of hard-coding it.
    nb_features = len(words)
    frequencies = np.asarray(train_vectors.sum(axis=0)).ravel().tolist()

    # Most frequent words first (ties broken by descending index, then word).
    ranked = sorted(zip(frequencies, range(nb_features), words), reverse=True)
    with open("yelp-vocab.txt", 'w') as vocab_file:
        for frequency, index, word in ranked:
            vocab_file.write(
                "{0:<12}\t{1:>5}\t{2:>8}\n".format(word, index, frequency))

    vocabulary = vectorizer.vocabulary_

    # np.savetxt does not create missing directories.
    os.makedirs('inputs', exist_ok=True)
    for split in ('train', 'val', 'test'):
        _build_bow_split(
            data[split],
            vocabulary,
            nb_features,
            "yelp-{}.txt".format(split),
            "inputs/yelp-{}".format(split),
        )


def _build_bow_split(data, vocabulary, nb_features, index_path, output_prefix):
    """Encode one split as binary / frequency bag-of-words and save it.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a cleaned ``review`` column and a ``score`` column.
    vocabulary : dict
        Mapping from word to column index.
    nb_features : int
        Number of columns of the bag-of-words matrices.
    index_path : str
        File receiving one ``indices<TAB>score`` line per review.
    output_prefix : str
        Prefix of the four ``-{bbow,fbow}_{x,y}.txt`` matrix files.
    """
    nb_examples = len(data)
    # BBoW: 1 if the review contains the word, 0 otherwise.
    bbow_x = np.zeros((nb_examples, nb_features))
    # FBoW: word count divided by the number of in-vocabulary words in the review.
    fbow_x = np.zeros((nb_examples, nb_features))
    scores = np.zeros((nb_examples,))

    with open(index_path, 'w') as index_file:
        rows = zip(data['review'], data['score'])
        for example, (review, score) in enumerate(rows):
            indices = [vocabulary[word] for word in review.split()
                       if word in vocabulary]
            for index in indices:
                bbow_x[example, index] = 1
                fbow_x[example, index] += 1
            # Reviews without any vocabulary word keep an all-zero row.
            if indices:
                fbow_x[example] /= len(indices)
            scores[example] = score
            paragraph = "".join("{} ".format(index) for index in indices)
            index_file.write("{}\t{}\n".format(paragraph, score))

    # Both representations share the same labels.
    np.savetxt(output_prefix + "-bbow_x.txt", bbow_x, delimiter=",", fmt='%d')
    np.savetxt(output_prefix + "-bbow_y.txt", scores, delimiter=",", fmt='%d')
    np.savetxt(output_prefix + "-fbow_x.txt", fbow_x, delimiter=",", fmt='%1.5f')
    np.savetxt(output_prefix + "-fbow_y.txt", scores, delimiter=",", fmt='%d')

In [18]:
# Load up arrays
# Paths of the cached binary bag-of-words matrices (x) and labels (y).
yelp_train_bbow_x = 'inputs/yelp-train-bbow_x.txt'
yelp_train_bbow_y ='inputs/yelp-train-bbow_y.txt'
yelp_val_bbow_x ='inputs/yelp-val-bbow_x.txt'
yelp_val_bbow_y ='inputs/yelp-val-bbow_y.txt'
yelp_test_bbow_x ='inputs/yelp-test-bbow_x.txt'
yelp_test_bbow_y ='inputs/yelp-test-bbow_y.txt'

# Every file loaded below must exist for the cache to be usable; checking only
# the feature files would crash on a missing label file.
inputs_cached = all(
    os.path.isfile(path)
    for path in (yelp_train_bbow_x, yelp_train_bbow_y,
                 yelp_val_bbow_x, yelp_val_bbow_y,
                 yelp_test_bbow_x, yelp_test_bbow_y)
)

if inputs_cached:
    print("Input files already exist")
else:
    print("--------Creating input files (might take a while)---------------")
    if not os.path.exists('inputs'):
        os.makedirs('inputs')
    yelp_load()

# Load the arrays once, whichever branch ran. Progress messages are only
# printed after a rebuild, as before.
if not inputs_cached:
    print("Loading training set...")
train_x = np.loadtxt(yelp_train_bbow_x, delimiter=',')
train_y = np.loadtxt(yelp_train_bbow_y, delimiter=',')
if not inputs_cached:
    print("Loading validation set...")
val_x = np.loadtxt(yelp_val_bbow_x, delimiter=',')
val_y = np.loadtxt(yelp_val_bbow_y, delimiter=',')
if not inputs_cached:
    print("Loading test set...")
test_x = np.loadtxt(yelp_test_bbow_x, delimiter=',')
test_y = np.loadtxt(yelp_test_bbow_y, delimiter=',')

Input files already exist


In [19]:
def eval_classifier(classifier):
    """Print accuracy and micro-averaged F1 of a fitted classifier.

    The classifier is evaluated on the notebook-level train, validation and
    test arrays (``train_x``/``train_y``, ``val_x``/``val_y``,
    ``test_x``/``test_y``). All accuracies are printed first, then all F1
    scores, followed by a blank separator.

    Note: in the recorded outputs the micro-averaged F1 is always equal to
    the accuracy of the same split.
    """
    splits = (
        ("Train Acc: {}", "Train F1: {}", train_x, train_y),
        ("Val Acc: {}", "Val F1: {}", val_x, val_y),
        ("Test Acc: {}", "Test F1: {}", test_x, test_y),
    )

    # Score everything before printing so the report comes out in one piece.
    accuracy_lines = []
    f1_lines = []
    for accuracy_template, f1_template, features, labels in splits:
        predicted = classifier.predict(features)
        accuracy_lines.append(
            accuracy_template.format(accuracy_score(labels, predicted)))
        f1_lines.append(
            f1_template.format(f1_score(labels, predicted, average='micro')))

    for line in accuracy_lines + f1_lines:
        print(line)
    print("\n")

Hyperparameter Tuning
---------------------------------
We run a grid search over the main hyperparameter of each model. The candidate values
shown below have already been narrowed to the neighbourhood where performance is best.

In [20]:
# Baseline 1: predicts a class uniformly at random (seeded for reproducibility).
print('--------Random Classifier ----------------')
random = DummyClassifier(strategy='uniform', random_state=1).fit(train_x, train_y)
eval_classifier(random)

# Baseline 2: always predicts the most frequent training class.
print('--------Majority Class Classifier ----------------')
majority = DummyClassifier(strategy='most_frequent').fit(train_x, train_y)
eval_classifier(majority)

print('-------- Naive Bayes Classifier ----------------')
# Stack the training and validation sets: the grid searches below (and in the
# following cells) cross-validate over this combined array.
cv_x = np.concatenate((train_x, val_x), axis=0)
cv_y = np.concatenate((train_y, val_y), axis=0)

print('Hyperparameter tuning...')
# Tune the smoothing parameter 'alpha'. An earlier search located the best
# value in the range (0.04, 0.05); this grid refines it, with the optimum
# found at 0.045.
bayes = BernoulliNB()
alpha_space = np.linspace(0.042, 0.048, num=5)
alpha_list = list(alpha_space)
parameters = dict(alpha=alpha_list)
classifier = GridSearchCV(
    estimator=bayes, param_grid=parameters, scoring='f1_micro', cv=5
).fit(cv_x, cv_y)
print("Optimum hyper-parameters: ", classifier.best_params_)

# Refit on the training set alone with the selected alpha, then report.
print('\nEvaluating...')
opt_alpha = classifier.best_params_['alpha']
classifier = BernoulliNB(alpha=opt_alpha).fit(train_x, train_y)
eval_classifier(classifier)


--------Random Classifier ----------------
Train Acc: 0.2017142857142857
Val Acc: 0.195
Test Acc: 0.1915
Train F1: 0.2017142857142857
Val F1: 0.195
Test F1: 0.1915


--------Majority Class Classifier ----------------
Train Acc: 0.3525714285714286
Val Acc: 0.356
Test Acc: 0.351
Train F1: 0.3525714285714286
Val F1: 0.356
Test F1: 0.351


-------- Naive Bayes Classifier ----------------
Hyperparameter tuning...
Optimum hyper-parameters:  {'alpha': 0.045}

Evaluating...
Train Acc: 0.7252857142857143
Val Acc: 0.418
Test Acc: 0.434
Train F1: 0.7252857142857143
Val F1: 0.418
Test F1: 0.434




In [21]:
print('\n-------- Decision Tree Classifier ----------------')
print('Hyperparameter tuning...')
# Only the maximum depth is tuned; the other decision tree hyperparameters
# were not covered in class and keep their defaults.
max_depths = list(range(8, 11))

tree = DecisionTreeClassifier(random_state=1)
parameters = dict(max_depth=max_depths)
classifier = GridSearchCV(
    estimator=tree, param_grid=parameters, scoring='f1_micro', cv=5
).fit(cv_x, cv_y)
print(classifier.best_params_)

# Refit on the training set alone with the selected depth, then report.
print('\nEvaluating...')
opt_parameters = classifier.best_params_
classifier = DecisionTreeClassifier(
    max_depth=opt_parameters['max_depth'], random_state=1
).fit(train_x, train_y)
eval_classifier(classifier)


-------- Decision Tree Classifier ----------------
Hyperparameter tuning...
{'max_depth': 9}

Evaluating...
Train Acc: 0.5062857142857143
Val Acc: 0.397
Test Acc: 0.401
Train F1: 0.5062857142857143
Val F1: 0.3970000000000001
Test F1: 0.401




In [22]:
print('-------- Linear SVM Classifier ----------------')
print('Hyperparameter tuning...')
# Tune the penalty parameter 'C' of the error term. An earlier search located
# the best value in the range (0.004, 0.005); this grid refines it, with the
# optimum found at 0.0045.
parameters = dict(C=[0.0044, 0.0045, 0.0046])

svm_clf = svm.LinearSVC(random_state=1, max_iter=5000)
classifier = GridSearchCV(
    estimator=svm_clf, param_grid=parameters, scoring='f1_micro', cv=5
).fit(cv_x, cv_y)
print("Optimum hyper-parameters: ", classifier.best_params_)

# Refit on the training set alone with the selected C, then report.
opt_C = classifier.best_params_['C']
print('\nEvaluating...')
classifier = svm.LinearSVC(C=opt_C, random_state=1, max_iter=5000).fit(train_x, train_y)
eval_classifier(classifier)

-------- Linear SVM Classifier ----------------
Hyperparameter tuning...
Optimum hyper-parameters:  {'C': 0.0045}

Evaluating...
Train Acc: 0.7741428571428571
Val Acc: 0.501
Test Acc: 0.512
Train F1: 0.7741428571428571
Val F1: 0.501
Test F1: 0.512


