In [1]:
from sklearn.feature_extraction import DictVectorizer
from collections import Counter
from sklearn.model_selection import train_test_split
import numpy as np


In [2]:
#preprocesses the file into a usable format
def preprocess(file_name):
    
    data = []
    is_header = True
    with open(file_name) as file:
        for line in file:
            
            
            if(is_header):
                is_header = False
                continue
            
            line=line.strip().split(",")
            
            data.append(line)
    
    return tuple(data);
train_data10 = preprocess("train-top10.csv")
train_data100 = preprocess("train-top100.csv")

In [3]:
#remove all zero vectors
def remove_non_zero(data):
    non_zero_train_data = []

    for line in data:
        for i in range(len(line[1:-1])):
            if (line[i+1] != '0'):
                non_zero_train_data.append(line)
                break
    return non_zero_train_data



non_zero_train_data10 = remove_non_zero(train_data10)
non_zero_train_data100 = remove_non_zero(train_data100)

In [4]:
#X = frequency of the top words within the tweet, y = classes
def get_training(train_data):
    X_train = []
    y_train = []

    for line in train_data:

        token_freq = line[1:-1]

        token_freq = list(map(int, token_freq))

        X_train.append(token_freq)
        y_train.append(line[-1])
        
    return X_train, y_train

In [5]:
#trains the data on both with zero and non-zero data sets
non_zero_X_train10 , non_zero_y_train10 = get_training(non_zero_train_data10)
non_zero_X_train100 , non_zero_y_train100 = get_training(non_zero_train_data100)
X_train10, y_train10 = get_training(train_data10)
X_train100, y_train100 = get_training(train_data100)

In [6]:
#Used Bernoulli NB with hold out method to generate accuracies
from sklearn.naive_bayes import BernoulliNB

bnb = BernoulliNB()

non_zero_train10_acc = []
non_zero_train100_acc = []
train10_acc = []
train100_acc = []

for i in range(3):
    
    X_train_samp, X_test_samp, y_train_samp, y_test_samp = train_test_split(non_zero_X_train10, non_zero_y_train10, test_size=0.33, random_state=i)
    bnb.fit(X_train_samp, y_train_samp)
    non_zero_train10_acc.append(bnb.score(X_test_samp, y_test_samp))

    X_train_samp, X_test_samp, y_train_samp, y_test_samp = train_test_split(non_zero_X_train100, non_zero_y_train100, test_size=0.33, random_state=i)
    bnb.fit(X_train_samp, y_train_samp)
    non_zero_train100_acc.append(bnb.score(X_test_samp, y_test_samp))
    
    X_train_samp, X_test_samp, y_train_samp, y_test_samp = train_test_split(X_train10, y_train10, test_size=0.33, random_state=i)
    bnb.fit(X_train_samp, y_train_samp)
    train10_acc.append(bnb.score(X_test_samp, y_test_samp))

    X_train_samp, X_test_samp, y_train_samp, y_test_samp = train_test_split(X_train100, y_train100, test_size=0.33, random_state=i)
    bnb.fit(X_train_samp, y_train_samp)
    train100_acc.append(bnb.score(X_test_samp, y_test_samp))

print("bernoulliNB")
print("top 10 non-zero vectors accuracy:", np.mean(non_zero_train10_acc))
print("top 100 non-zero vectors accuracy:", np.mean(non_zero_train100_acc))
print("top 10 vectors accuracy:", np.mean(train10_acc))
print("top 100 vectors accuracy:", np.mean(train100_acc))



bernoulliNB
top 10 non-zero vectors accuracy: 0.7549213898797728
top 100 non-zero vectors accuracy: 0.6128102550866777
top 10 vectors accuracy: 0.29276968328887065
top 100 vectors accuracy: 0.33762324958713225


In [7]:
#Used Multinomial NB with hold out method to generate accuracies
from sklearn.naive_bayes import MultinomialNB

mnb = MultinomialNB()

non_zero_train10_acc = []
non_zero_train100_acc = []
train10_acc = []
train100_acc = []

for i in range(3):
    
    X_train_samp, X_test_samp, y_train_samp, y_test_samp = train_test_split(non_zero_X_train10, non_zero_y_train10, test_size=0.33, random_state=i)
    mnb.fit(X_train_samp, y_train_samp)
    non_zero_train10_acc.append(mnb.score(X_test_samp, y_test_samp))

    X_train_samp, X_test_samp, y_train_samp, y_test_samp = train_test_split(non_zero_X_train100, non_zero_y_train100, test_size=0.33, random_state=i)
    mnb.fit(X_train_samp, y_train_samp)
    non_zero_train100_acc.append(mnb.score(X_test_samp, y_test_samp))
    
    X_train_samp, X_test_samp, y_train_samp, y_test_samp = train_test_split(X_train10, y_train10, test_size=0.33, random_state=i)
    mnb.fit(X_train_samp, y_train_samp)
    train10_acc.append(mnb.score(X_test_samp, y_test_samp))

    X_train_samp, X_test_samp, y_train_samp, y_test_samp = train_test_split(X_train100, y_train100, test_size=0.33, random_state=i)
    mnb.fit(X_train_samp, y_train_samp)
    train100_acc.append(mnb.score(X_test_samp, y_test_samp))

print("MultinomialNB")
print("top 10 non-zero vectors accuracy:", np.mean(non_zero_train10_acc))
print("top 100 non-zero vectors accuracy:", np.mean(non_zero_train100_acc))
print("top 10 vectors accuracy:", np.mean(train10_acc))
print("top 100 vectors accuracy:", np.mean(train100_acc))


MultinomialNB
top 10 non-zero vectors accuracy: 0.7536002113885586
top 100 non-zero vectors accuracy: 0.6136275648470769
top 10 vectors accuracy: 0.2846100475897316
top 100 vectors accuracy: 0.33074374835097187


In [8]:
#Used LogisticRegression with hold out method to generate accuracies
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()

non_zero_train10_acc = []
non_zero_train100_acc = []
train10_acc = []
train100_acc = []

for i in range(3):
    
    X_train_samp, X_test_samp, y_train_samp, y_test_samp = train_test_split(non_zero_X_train10, non_zero_y_train10, test_size=0.33, random_state=i)
    lr.fit(X_train_samp, y_train_samp)
    non_zero_train10_acc.append(lr.score(X_test_samp, y_test_samp))

    X_train_samp, X_test_samp, y_train_samp, y_test_samp = train_test_split(non_zero_X_train100, non_zero_y_train100, test_size=0.33, random_state=i)
    lr.fit(X_train_samp, y_train_samp)
    non_zero_train100_acc.append(lr.score(X_test_samp, y_test_samp))
    
    X_train_samp, X_test_samp, y_train_samp, y_test_samp = train_test_split(X_train10, y_train10, test_size=0.33, random_state=i)
    lr.fit(X_train_samp, y_train_samp)
    train10_acc.append(lr.score(X_test_samp, y_test_samp))

    X_train_samp, X_test_samp, y_train_samp, y_test_samp = train_test_split(X_train100, y_train100, test_size=0.33, random_state=i)
    lr.fit(X_train_samp, y_train_samp)
    train100_acc.append(lr.score(X_test_samp, y_test_samp))

print("Logistic regression")
print("top 10 non-zero vectors accuracy:", np.mean(non_zero_train10_acc))
print("top 100 non-zero vectors accuracy:", np.mean(non_zero_train100_acc))
print("top 10 vectors accuracy:", np.mean(train10_acc))
print("top 100 vectors accuracy:", np.mean(train100_acc))


Logistic regression
top 10 non-zero vectors accuracy: 0.768001056942793
top 100 non-zero vectors accuracy: 0.6192627005635135
top 10 vectors accuracy: 0.28983807764846137
top 100 vectors accuracy: 0.33563952977045525


In [27]:
#error analysis with mnb
import random

locs = ["Brisbane", "Perth", "Melbourne", "Sydney"]
locs = [[0,0,0,0], [0,0,0,0], [0,0,0,0], [0,0,0,0]]

rand_num = random.randint(1,100000)

X_train_samp, X_test_samp, y_train_samp, y_test_samp = train_test_split(X_train100, y_train100, test_size=0.33, random_state=4)
mnb.fit(X_train_samp, y_train_samp)

for i in range(len(X_test_samp)):
    pred = mnb.predict([X_test_samp[i]])
    if pred == "Brisbane":
        if y_test_samp[i] == "Brisbane":
            locs[0][0] += 1
        if y_test_samp[i]  == "Perth":
            locs[0][1] += 1
        if y_test_samp[i]  == "Melbourne":
            locs[0][2] += 1
        if y_test_samp[i]  == "Sydney":
            locs[0][3] += 1
    if pred == "Perth":
        if y_test_samp[i]  == "Brisbane":
            locs[1][0] += 1
        if y_test_samp[i]  == "Perth":
            locs[1][1] += 1
        if y_test_samp[i]  == "Melbourne":
            locs[1][2] += 1
        if y_test_samp[i]  == "Sydney":
            locs[1][3] += 1
    if pred == "Melbourne":
        if y_test_samp[i]  == "Brisbane":
            locs[2][0] += 1
        if y_test_samp[i]  == "Perth":
            locs[2][1] += 1
        if y_test_samp[i]  == "Melbourne":
            locs[2][2] += 1
        if y_test_samp[i] == "Sydney":
            locs[2][3] += 1
    if pred == "Sydney":
        if y_test_samp[i]  == "Brisbane":
            locs[3][0] += 1
        if y_test_samp[i]  == "Perth":
            locs[3][1] += 1
        if y_test_samp[i]  == "Melbourne":
            locs[3][2] += 1
        if y_test_samp[i]  == "Sydney":
            locs[3][3] += 1
print(locs)

[[1217, 309, 255, 313], [6894, 7657, 6820, 6872], [180, 324, 1162, 291], [273, 199, 300, 1045]]


In [30]:
#error analysis with lr
import random

locs = ["Brisbane", "Perth", "Melbourne", "Sydney"]
locs = [[0,0,0,0], [0,0,0,0], [0,0,0,0], [0,0,0,0]]

rand_num = random.randint(1,100000)

X_train_samp, X_test_samp, y_train_samp, y_test_samp = train_test_split(X_train100, y_train100, test_size=0.33, random_state=7)
lr.fit(X_train_samp, y_train_samp)

for i in range(len(X_test_samp)):
    pred = mnb.predict([X_test_samp[i]])
    if pred == "Brisbane":
        if y_test_samp[i] == "Brisbane":
            locs[0][0] += 1
        if y_test_samp[i]  == "Perth":
            locs[0][1] += 1
        if y_test_samp[i]  == "Melbourne":
            locs[0][2] += 1
        if y_test_samp[i]  == "Sydney":
            locs[0][3] += 1
    if pred == "Perth":
        if y_test_samp[i]  == "Brisbane":
            locs[1][0] += 1
        if y_test_samp[i]  == "Perth":
            locs[1][1] += 1
        if y_test_samp[i]  == "Melbourne":
            locs[1][2] += 1
        if y_test_samp[i]  == "Sydney":
            locs[1][3] += 1
    if pred == "Melbourne":
        if y_test_samp[i]  == "Brisbane":
            locs[2][0] += 1
        if y_test_samp[i]  == "Perth":
            locs[2][1] += 1
        if y_test_samp[i]  == "Melbourne":
            locs[2][2] += 1
        if y_test_samp[i] == "Sydney":
            locs[2][3] += 1
    if pred == "Sydney":
        if y_test_samp[i]  == "Brisbane":
            locs[3][0] += 1
        if y_test_samp[i]  == "Perth":
            locs[3][1] += 1
        if y_test_samp[i]  == "Melbourne":
            locs[3][2] += 1
        if y_test_samp[i]  == "Sydney":
            locs[3][3] += 1
print(locs)

[[1209, 352, 247, 336], [6765, 7803, 6806, 6838], [190, 307, 1154, 282], [273, 209, 248, 1092]]
