In [2]:
!pip -V
!pip install nltk

pip 19.1 from /home/vchrombie/anaconda3/lib/python3.7/site-packages/pip (python 3.7)


In [3]:
import re
import json
import string

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

import sklearn.linear_model as sk
from sklearn import preprocessing
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import classification_report, confusion_matrix, roc_curve
from sklearn.model_selection import KFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC, SVC

from nltk import SnowballStemmer

In [4]:
def tokenize(data):
    """ Tokenize tweets """
    stemmer = SnowballStemmer("english")
    stop_words = text.ENGLISH_STOP_WORDS
    temp = data
    regex = re.compile('[%s]' % re.escape(string.punctuation))
    temp = regex.sub(' ', temp)
    temp = "".join(b for b in temp if ord(b) < 128)
    temp = temp.lower()
    words = temp.split()
    no_stop_words = [w for w in words if not w in stop_words]
    stemmed = [stemmer.stem(item) for item in no_stop_words]

    return stemmed

In [5]:
def reduce_by_location(data):
    """ Reduces dataset to only contain tweets with location
        marked as either WA or MA """
    temp = data[(data.location.str.contains(r'[.]+ WA$'))
            | (data.location.str.contains(r'[.]+ MA$'))
            | (data.location.str.contains('Boston'))
            | (data.location.str.contains('Seattle'))
            | (data.location.str.contains(r'[.]+ Washington\s'))
            | (data.location.str.contains('Massachusetts'))]
    
    return temp

In [6]:
def map_locations(data):
    """ Creates target variables 1: for WA and 0: MA """
    targets = []
    data.location.apply(lambda x: x.encode('utf-8').strip())
    for location in data.location:
        if (r'[.]+ WA$' in location) or ('Seattle' in location) or (r'[.]+ Washington\s' in location):
            targets.append(1)
        else:
            targets.append(0)
    return np.array(targets)

In [7]:
def balance_datasets(data, targets):
    """ Balances datasets by selecting random points from
        the minority class """
    new_data = data.copy()
    if (len(targets[targets==0])) > (len(targets[targets==1])):
        points_needed = len(targets[targets==0]) - len(targets[targets==1])
        indices = np.where(targets == 1)
    else:
        points_needed = len(targets[targets==1]) - len(targets[targets==0])
        indices = np.where(targets == 0)

    np.random.shuffle(indices)
    indices = np.resize(indices, points_needed)
    new_data = new_data.append(data.iloc[indices])
    targets_to_add = targets[indices]
    new_targets = np.concatenate([targets, targets_to_add])
    return new_data, new_targets

In [8]:
DATA_FOLDER = 'tweet_data/'
filename = 'tweets_#superbowl.txt'

In [9]:
# Collect tweets from superbowl
count=0

tweets_ = []
with open(filename, 'r') as f:
    for row in f:
        if(count>40000):
            break
        jrow = json.loads(row)
        d = {
            'tweet': jrow['title'],
            'location': jrow['tweet']['user']['location']
        }
        tweets_.append(d)
        count=count+1

In [10]:
all_data = pd.DataFrame(tweets_)
all_data

Unnamed: 0,location,tweet
0,#Seahawks #Mariners,I'm so excited the road to #SuperBowlXLIX will...
1,,At http://t.co/Vd0RWOeAed -- #Seahawks #12thMA...
2,k-town Ak.,You been 12ed pass it on #SeahawkNation #LOB #...
3,Brazil,27 days to the SuperBowl 🏈🏈\n#Katyperry #KatyC...
4,Iowa,Check out the cool event that #budlight has p...
5,http://ask.fm/lKatzPerryl,#SuperBowl2015 #Halftime 🏈🎉 http://t.co/qk8Wew...
6,Cloud 9,Lenny Kravitz acompañará a Katy Perry en el #H...
7,LATINOAMÉRICA,#AlertaQRP NFL anuncia que Katy Perry estará a...
8,Westcoast right near da beach,So this official OMG Super Bowl #prediction \n...
9,"Boston, MA",Our @ButchStearns talked #Patriots w/3X #Super...


In [11]:
# Filter out tweets by appropriate location data
reduced_data = reduce_by_location(all_data)
reduced_data

Unnamed: 0,location,tweet
9,"Boston, MA",Our @ButchStearns talked #Patriots w/3X #Super...
13,Seattle,#MondayMotivation #GOHAWKS #SEAHAWKS #superbow...
35,"West Seattle, Wa",@DougBaldwinJr just this #mediocre #SuperBowlC...
37,"West Seattle, Wa",Just a #SuperBowlChampion #Pedestrian wide re...
72,"Boston, MA, USA",@jc_nogales Sunday February 1st. Se juega en A...
79,Central Massachusetts,@Deadspin Is it worse than #NFL with #Patriots...
116,Massachusetts,"@katyperry You have to play your ""Lose Your Lo..."
156,"Seattle, WA",Seattle #Seahawks #LegionofBoom Youth Tee #Sup...
157,"Seattle, WA",Seattle #Seahawks #LegionofBoom Youth Tee #Sup...
164,Massachusetts,@JessicCarpenter thanks for the follow Go #Pat...


In [12]:
# Create target label
# 0: MA 1: WA
all_targets = map_locations(reduced_data)
all_targets

array([0, 1, 1, ..., 0, 0, 0])

In [13]:
# Balance datset
data, train_targets = balance_datasets(reduced_data, all_targets)

In [14]:
# # Vectorize tweets
vectorizer = CountVectorizer(analyzer='word', stop_words='english', tokenizer=tokenize)
tfidf_transformer = TfidfTransformer()
train_counts = vectorizer.fit_transform(data.tweet)
train_tfidf = tfidf_transformer.fit_transform(train_counts)

In [15]:
# Truncate twitter data to 50 features
svd = TruncatedSVD(n_components=50, random_state=42)
train_reduced = svd.fit_transform(train_tfidf)

In [16]:
# Feature Scaling For Certain Algorithms Require Nonnegative Values
min_max_scaler = preprocessing.MinMaxScaler()
train_data = min_max_scaler.fit_transform(train_reduced)

In [17]:
k=5
kf = KFold(n_splits=k, shuffle=True, random_state=42)

In [18]:
def test_naivebayes():
    # Perform 5-Fold CV to fit Naive Bayes Model

    accuracies = 0
    for train_index, test_index in kf.split(train_data):
        X_train, X_test = train_data[train_index], train_data[test_index]
        y_train, y_test = train_targets[train_index], train_targets[test_index]

        clf = MultinomialNB().fit(X_train, y_train)
        predicted_bayes = clf.predict(X_test)
        accuracy_bayes = np.mean(predicted_bayes == y_test)
        accuracies += accuracy_bayes

    print("Average CV-Accuracy of Multinomial Naive Bayes: " + str(accuracies/k))
    print(classification_report(y_test, predicted_bayes))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, predicted_bayes))

In [19]:
def test_logreg():
    # Perform 5-Fold CV to fit Logistic Regression
    
    accuracies = 0
    for train_index, test_index in kf.split(train_data):
        X_train, X_test = train_data[train_index], train_data[test_index]
        y_train, y_test = train_targets[train_index], train_targets[test_index]

        logit = sk.LogisticRegression().fit(X_train, y_train)
        probabilities = logit.predict(X_test)
        predicted_lr = (probabilities > 0.5).astype(int)
        accuracy_lr = np.mean(predicted_lr == y_test)
        accuracies += accuracy_lr

    print("Average CV-Accuracy of Logistic Regression: " + str(accuracies/k))
    print(classification_report(y_test, predicted_lr))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, predicted_lr))

In [20]:
def test_svm():
    # Perform 5-Fold CV to fit Logistic Regression
    
    accuracies = 0
    for train_index, test_index in kf.split(train_data):
        X_train, X_test = train_data[train_index], train_data[test_index]
        y_train, y_test = train_targets[train_index], train_targets[test_index]

        linear_SVM = LinearSVC(dual=False, random_state=42).fit(X_train, y_train)
        predicted_svm = linear_SVM.predict(X_test)
        accuracy_svm = np.mean(predicted_svm == y_test)
        accuracies += accuracy_svm

    print("Average CV-Accuracy of Linear SVM: " + str(accuracies/k))
    print(classification_report(y_test, predicted_svm))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, predicted_svm))

In [21]:
print("The Comparisions are shown below\n\n")
test_naivebayes()
test_logreg()
test_svm()

The Comparisions are shown below


Average CV-Accuracy of Multinomial Naive Bayes: 0.8807153046668026
             precision    recall  f1-score   support

          0       0.89      0.93      0.91       350
          1       0.93      0.88      0.90       350

avg / total       0.91      0.91      0.91       700

Confusion Matrix:
[[326  24]
 [ 41 309]]
Average CV-Accuracy of Logistic Regression: 0.932646423476666
             precision    recall  f1-score   support

          0       0.91      0.94      0.93       350
          1       0.94      0.91      0.92       350

avg / total       0.93      0.93      0.93       700

Confusion Matrix:
[[329  21]
 [ 31 319]]
Average CV-Accuracy of Linear SVM: 0.932075402486244
             precision    recall  f1-score   support

          0       0.93      0.92      0.92       350
          1       0.92      0.93      0.92       350

avg / total       0.92      0.92      0.92       700

Confusion Matrix:
[[323  27]
 [ 26 324]]
