In [22]:
import ngram_lm
import tweets_processor
import importlib
import vocabulary
import utils
import time
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

In [2]:
# Reload tweets_processor so recent edits to the module are picked up;
# otherwise the notebook kernel would have to be restarted.
importlib.reload(tweets_processor)
# Get the tweets and their region labels from the CSV file.
tweets_text, tweets_regions = tweets_processor.get_tweets_from_csv()

In [3]:
# Split into train and test sets: test_size=0.33 holds out 33% of the tweets,
# and random_state=0 fixes the shuffle so the split is repeatable.
# TODO: split into train/dev/test, or use k-fold cross-validation.
train_data, test_data, train_labels, test_labels = train_test_split(tweets_text, tweets_regions, test_size=0.33, random_state=0)

In [4]:
# processing the tweets
def get_tokenized_tweets(tweets, mode):
    """Preprocess, tokenize and pad every tweet with sentence markers.

    Each tweet goes through ``tweets_processor.preprocessor`` and then
    ``tweets_processor.tokenizer``; the resulting token list is wrapped as
    ``<s> <s> ... </s>``.

    Args:
        tweets: iterable of raw tweets.
        mode: when equal to ``'train'`` the unpadded tokens are additionally
            collected so a vocabulary can be built from them.

    Returns:
        A 3-tuple ``(padded_tweets, flat_tokens, vocab_tokens)``:
        the padded token list of each tweet, all padded tokens concatenated
        in tweet order, and the unpadded tokens (empty unless mode is
        ``'train'``).
    """
    collect_vocab = mode == 'train'
    padded_tweets, flat_tokens, vocab_tokens = [], [], []
    for raw_tweet in tweets:
        words = tweets_processor.tokenizer(tweets_processor.preprocessor(raw_tweet))
        if collect_vocab:
            # vocabulary tokens exclude the <s>/</s> markers
            vocab_tokens.extend(words)
        padded = ["<s>", "<s>"] + words + ["</s>"]
        padded_tweets.append(padded)
        flat_tokens.extend(padded)
    return padded_tweets, flat_tokens, vocab_tokens

In [10]:
# processing tokens to replace anything not in vocab with <unk>
def process_tokens_unk(tokens, wordset=None):
    """Canonicalize every token against a word set.

    Each token is passed through ``utils.canonicalize_word`` together with
    the word set (presumably mapping out-of-vocabulary words to ``<unk>``,
    per the comment above -- see ``utils.canonicalize_word`` to confirm).

    Args:
        tokens: iterable of token strings.
        wordset: optional collection of known words. When omitted, the
            notebook-level ``vocab.wordset`` is used, so the cell creating
            ``vocab`` must have been run before this function is called.

    Returns:
        A NumPy array of dtype ``object`` with one entry per input token.
    """
    if wordset is None:
        # Resolved at call time: `vocab` is created in a separate cell.
        wordset = vocab.wordset
    return np.array([utils.canonicalize_word(w, wordset=wordset)
                     for w in tokens], dtype=object)

In [7]:
# Get the per-tweet token lists, the flat token stream and the vocabulary
# tokens from the train data set.
train_tokenized_tweets, train_tokens, vocabulary_tokens = get_tokenized_tweets(train_data, 'train')

In [8]:
# Create the vocabulary from the training tokens, canonicalizing each word
# with digits=False.
# NOTE(review): process_tokens_unk calls canonicalize_word without
# digits=False -- confirm the two canonicalizations are meant to differ.
vocab = vocabulary.Vocabulary(utils.canonicalize_word(w,digits=False) for w in vocabulary_tokens)
# Show the vocabulary size.
print(vocab.size)

13711


In [11]:
# Run the flat training token stream through process_tokens_unk.
train_tokens = process_tokens_unk(train_tokens)

# Do the same for every tokenized training tweet, keeping one array per tweet.
processed_train_tokenized_tweets = [
    process_tokens_unk(tweet_tokens) for tweet_tokens in train_tokenized_tweets
]

In [13]:
# Tokenize the test tweets; the vocabulary tokens are not needed for the
# test set, so the third return value is discarded.
test_tokenized_tweets, test_tokens, _ = get_tokenized_tweets(test_data, 'test')
test_tokens = process_tokens_unk(test_tokens)

# Run every tokenized test tweet through process_tokens_unk, one array per tweet.
processed_test_tokenized_tweets = [
    process_tokens_unk(tweet_tokens) for tweet_tokens in test_tokenized_tweets
]

In [14]:
# Build the language model. `Model` is an alias for ngram_lm.KNTrigramLM so
# the model class can be swapped in one place.
Model = ngram_lm.KNTrigramLM

# Time the construction of the model from the processed training tokens.
t0 = time.time()
print("Building trigram LM... ", end="")
lm = Model(train_tokens)
print("done in {:.02f} s".format(time.time() - t0))
# Print the model's n-gram statistics.
lm.print_stats()

Building trigram LM... done in 0.36 s
=== N-gram Language Model stats ===
  13,710 unique 1-grams
  48,633 unique 2-grams
  57,948 unique 3-grams
Optimal memory usage (counts only): 2.72 MB


In [200]:
# Reload the local helper modules so edits to them are picked up without
# restarting the kernel.
# NOTE(review): objects created before the reload (e.g. `lm`) are instances of
# the old class definitions -- presumably the model cell should be re-run
# afterwards; confirm.
importlib.reload(ngram_lm)
importlib.reload(utils)

<module 'utils' from '/Users/divyagorantla/Documents/MIDS/w266/final-project/utils.py'>

In [15]:
# build feature vectors for passing to the classifier
def get_input_features(tokenized_tweets, ngram, max_len=50, pad_id=0.0):
    """Turn tokenized tweets into fixed-width feature vectors.

    For every tweet the notebook-level language model ``lm`` is queried via
    ``lm.get_seq_probability(sequence, n_order=ngram)``; the first element of
    its result is kept as that tweet's features. All rows are then passed to
    ``utils.pad_np_array_float`` to obtain a common width.

    Args:
        tokenized_tweets: iterable of token sequences, one per tweet.
        ngram: n-gram order forwarded to the language model as ``n_order``.
        max_len: width handed to ``utils.pad_np_array_float``. Defaults to
            50, the value that was previously hard-coded.
            TODO: choose this from the tweet-length distribution.
        pad_id: fill value handed to ``utils.pad_np_array_float``
            (default 0.0).

    Returns:
        The padded feature array returned by ``utils.pad_np_array_float``.
    """
    feature_vector = []
    for sequence in tokenized_tweets:
        # the second return value is not needed for the features
        probabilities, _ = lm.get_seq_probability(sequence, n_order=ngram)
        feature_vector.append(probabilities)
    padded_feature_vector, _ = utils.pad_np_array_float(
        feature_vector, max_len=max_len, pad_id=pad_id)
    return padded_feature_vector

In [16]:
# Trigram features (ngram=3).
# Get the train feature vectors.
classfier_train_data = get_input_features(processed_train_tokenized_tweets, 3)

# Get the test feature vectors.
classfier_test_data = get_input_features(processed_test_tokenized_tweets, 3)

In [17]:
# Inspect the trigram feature vector of the test tweet at index 2.
classfier_test_data[2]

array([5.4030586e-02, 8.5744292e-01, 7.6563349e-03, 4.2354516e-03,
       0.0000000e+00, 0.0000000e+00, 1.5627249e-03, 7.5044453e-02,
       5.7978872e-05, 1.2116993e-05, 7.7108137e-04, 0.0000000e+00,
       0.0000000e+00, 1.8505953e-04, 1.5421627e-05, 0.0000000e+00,
       2.0562169e-04, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00], dtype=float32)

In [18]:
# Inspect the processed token sequence of the test tweet at index 2, i.e. the
# input behind the feature vectors displayed for that index.
processed_test_tokenized_tweets[2]

array(['<s>', '<s>', 'happy', 'halloween', 'favorite', 'holiday', '<unk>',
       '<unk>', 'happyhalloween', 'halloween', 'witch', 'blackcat',
       'spooky', '<unk>', '<unk>', 'haunted', 'graveyard', '<unk>', 'the',
       '</s>'], dtype=object)

In [205]:
# Bigram features (ngram=2).
# Get the train feature vectors.
new_classfier_train_data = get_input_features(processed_train_tokenized_tweets, 2)

# Get the test feature vectors.
new_classfier_test_data = get_input_features(processed_test_tokenized_tweets, 2)

In [182]:
# Inspect the bigram feature vector of the test tweet at index 2.
new_classfier_test_data[2]

array([2.40327176e-02, 7.49111712e-01, 2.88975495e-03, 5.64726861e-03,
       0.00000000e+00, 0.00000000e+00, 1.56272494e-03, 7.50444531e-02,
       1.15957744e-04, 1.21169933e-05, 7.71081366e-04, 0.00000000e+00,
       0.00000000e+00, 1.85059529e-04, 1.54216268e-05, 0.00000000e+00,
       2.05621691e-04, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00], dtype=float32)

In [20]:
# Naive Bayes: fit MultinomialNB on the trigram feature vectors and report
# the accuracy on the held-out test set.
nb = MultinomialNB()
nb.fit(classfier_train_data, train_labels)
y_pred = nb.predict(classfier_test_data)
acc = accuracy_score(test_labels, y_pred)
print("Accuracy on test set: {:.02%}".format(acc))

Accuracy on test set: 13.12%


In [207]:
# Logistic Regression: multinomial model with the lbfgs solver and a fixed
# random_state, fitted on the trigram feature vectors.
lg = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial')
lg.fit(classfier_train_data, train_labels)

# lg.score is called on the held-out test features and labels; its result is
# printed as the test accuracy.
print("Accuracy on test set: {:.02%}".format(lg.score(classfier_test_data, test_labels)))

Accuracy on test set: 10.85%


In [23]:
# Multi-layer perceptron with a single hidden layer of 100 units, fitted on
# the trigram feature vectors. random_state is fixed (as in the train/test
# split and the logistic-regression cell) so that weight initialisation and
# shuffling -- and therefore the reported accuracy -- are repeatable.
mlp = MLPClassifier(hidden_layer_sizes=(100,), random_state=0)
mlp.fit(classfier_train_data, train_labels)
y_pred = mlp.predict(classfier_test_data)
acc = accuracy_score(test_labels, y_pred)
print("Accuracy on test set: {:.02%}".format(acc))

Accuracy on test set: 11.24%


