In [0]:
#unzipping

import zipfile
# Unpack every member of the bundled archive into the current working directory.
with zipfile.ZipFile(file="required_files.zip", mode='r') as archive:
    archive.extractall(path=".")

In [0]:
import json

In [0]:
def get_data(query, location):
    """Return the data for a (query, location) pair, using an on-disk cache.

    The cache file is ``required_files/<query>_<location>_cache.txt`` and holds
    the JSON-encoded data. When it cannot be read or parsed, the data is
    obtained through ``get_urls`` and ``get_data_from_internet`` and, if it is
    not an empty list, written to the cache file for the next call.

    Args:
        query: String used in the cache file name and passed to ``get_urls``.
        location: String used in the cache file name and passed to ``get_urls``.

    Returns:
        The decoded cache contents, or the value returned by
        ``get_data_from_internet`` when the cache could not be used.
    """
    path = "required_files/"
    cache_file = path + query + "_" + location + "_cache.txt"
    try:
        with open(cache_file, "r") as f:
            data = json.loads(f.read())
    except (OSError, ValueError):
        # OSError: the cache file is missing or unreadable.
        # ValueError: the file is empty or not valid JSON
        # (json.JSONDecodeError is a ValueError subclass).
        # Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        urls = get_urls(query, location)
        print("Collecting data from the internet")
        data = get_data_from_internet(urls)
        if data != []:
            # Serialise before opening the file so that a failure to encode
            # cannot leave a truncated cache behind, and so that an empty
            # result never creates an empty (unparseable) cache file.
            payload = json.dumps(data)
            with open(cache_file, "w") as f:
                f.write(payload)
    print(f"Collected {len(data):3d} unique tweets for this query and location")
    return data

In [9]:
#This url can be intuitively modified to suit our query
#This specific url searches for tweets mentioning @realDonaldTrump between the dates 2016-10-08 and 2016-11-08
#NOTE(review): no URL is built in this cell; the two comments above presumably
# describe the search URL produced inside get_urls -- confirm, or move them there.

# Search terms and location names; get_data is called once for every
# (query, location) combination below.
queries = ["trump", "clinton"]
locations = ["philadelphia", "chester", "belmont", "hamilton", "newhanover", 
             "wake", "watauga", "duval", "hillsborough", "miamidade", 
             "allegheny", "atlantic", "maricopa"]
# Maps each (query, location) tuple to the value returned by get_data for it.
data_dict = dict()
for query in queries:
    for location in locations:
        data_dict[(query, location)] = get_data(query, location)

Collected 147 unique tweets for this query and location
Collected  56 unique tweets for this query and location
Collected   7 unique tweets for this query and location
Collected  58 unique tweets for this query and location
Collected  15 unique tweets for this query and location
Collected  47 unique tweets for this query and location
Collected 238 unique tweets for this query and location
Collected  72 unique tweets for this query and location
Collected  72 unique tweets for this query and location
Collected 142 unique tweets for this query and location
Collected  76 unique tweets for this query and location
Collected  47 unique tweets for this query and location
Collected 210 unique tweets for this query and location
Collected 111 unique tweets for this query and location
Collected  22 unique tweets for this query and location
Collected   5 unique tweets for this query and location
Collected  44 unique tweets for this query and location
Collected   4 unique tweets for this query and l

# Sentiment Analysis

We will train a sentiment analysis model on Stanford's Sentiment140 dataset. It is a dataset of tweets, many of which are about political topics. Based on manual inspection, a model trained on it should generalize well to our own data, which also consists of tweets and is solely about politics. Note that this dataset is not the focus of this project; it is used only to train a sentiment analysis model, which will then be applied to our actual dataset.

In [0]:
import pandas as pd

# If you use this data, please cite Sentiment140 as your source.
data_path = "required_files/trainingandtestdata/training.1600000.processed.noemoticon.csv"

# The file is read without a header row, so columns are addressed by integer
# position (0, 1, 2, ...) from here on.
training_data = pd.read_csv(data_path, encoding="latin1", header=None)
# Shuffle all rows. The seed is fixed so that everything derived from the row
# order gives the same result after a kernel restart.
training_data = training_data.sample(frac=1, random_state=42).reset_index(drop=True)

In [20]:
# parse data into tweets and corresponding labels (numpy arrays)
import collections
import nltk
# Imported in this cell because stopwords.words() is called at the bottom of
# it; relying on a later cell's import raises NameError on a fresh kernel.
from nltk.corpus import stopwords
# Uncomment on a fresh environment to fetch the NLTK resources:
# for package in ["wordnet", "punkt", "stopwords"]:
#   nltk.download(package)

# Column 0 is used as the label and column 5 as the tweet text.
labels = training_data[0].to_numpy()

#TODO: add some more preprocessing, like punctuation, tokenization etc..
# Lower-case through the vectorised string accessor. This builds a new array
# instead of writing element-by-element into one that may share memory with
# training_data, so re-running the cell is idempotent.
tweets = training_data[5].str.lower().to_numpy()

nltk_stop_words = set(stopwords.words("english"))

def is_stop_word(word):
    """Return True when ``word`` should be left out of the feature set.

    A word is rejected when it is an NLTK stop word, an empty or
    single-space string, contains "http" (treated as a URL), or names one of
    the analysed subjects. Subject names are excluded so that they do not
    bias the sentiment estimate. Any other word returns False.
    """
    return (
        word in nltk_stop_words                                   # NLTK stop words
        or word in ("", " ")                                      # empty strings
        or "http" in word                                         # urls
        or word in ("trump", "donald", "hillary", "clinton")      # topic of analysis
    )


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [21]:
#TODO: make tweets into bag of words representation (check past hw)
#TODO: alternatively, make into reverse index representation

import random
import re
from nltk.classify import util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews, stopwords
from nltk.tokenize import TweetTokenizer

#strip_handles makes it remove usernames and reduce_len deletes repeated letters
#example: huuuuuuuuuuuuge would become huuuge
tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)

#lemmatizer maps words to a base form (eg. dogs becomes dog)
#NOTE(review): make_features calls lemmatize() without a part-of-speech
# argument, and inflected verbs such as "saddens"/"saddened" still appear as
# features in the classifier output -- so verbs are presumably not reduced;
# confirm against the WordNetLemmatizer docs before relying on it.
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()

def make_features(tweet):
    """Turn a tweet into a feature dict mapping each kept word to ``True``.

    The tweet is lower-cased, every apostrophe (together with an ``s`` that
    directly follows it) is removed, and the result is tokenized and
    lemmatized. Lemmas rejected by ``is_stop_word`` are dropped.
    """
    normalized = re.sub(r"'s?", "", tweet.lower())

    features = {}
    for token in tokenizer.tokenize(normalized):
        lemma = lemmatizer.lemmatize(token)
        if not is_stop_word(lemma):
            features[lemma] = True
    return features

#fraction of samples used for training; chosen as a good split to have low
#variance in test accuracy
train_frac = 80/100

#pair every tweet's feature dict with its label
processed = [(make_features(tweet), label) for tweet, label in zip(tweets, labels)]

#everything before the split point is training data, the rest is test data
split_at = int(len(processed)*train_frac)
train, test = processed[:split_at], processed[split_at:]

#per-label subsets of the test data (label 4 -> "pos", label 0 -> "neg")
testpos = [sample for sample in test if sample[1] == 4]
testneg = [sample for sample in test if sample[1] == 0]

print(f"Training sample size: {len(train)}")
print(f"Testing sample size: {len(test)}")
print(f"Split of pos/neg labels in test data: {len(testpos)} : {len(testneg)}")

classifier = NaiveBayesClassifier.train(train)

for caption, subset in (("\nOverall train accuracy:", train),
                        ("Overall test accuracy:", test)):
    print(caption, util.accuracy(classifier, subset))

#accuracy on each label separately, to check for bias
for caption, subset in (("\nTest accuracy on pos:", testpos),
                        ("Test accuracy on neg:", testneg)):
    print(caption, util.accuracy(classifier, subset))

#prints the words that were the strongest indicators of sentiment
classifier.show_most_informative_features()


Training sample size: 1280000
Testing sample size: 320000
Split of pos/neg labels in test data: 159743 : 160257

Overall train accuracy: 0.8041265625
Overall test accuracy: 0.77084375

Test accuracy on pos: 0.715192527998097
Test accuracy on neg: 0.8263164791553567
Most Informative Features
                 saddens = True                0 : 4      =     57.7 : 1.0
                saddened = True                0 : 4      =     54.6 : 1.0
                   saddd = True                0 : 4      =     51.0 : 1.0
                dividend = True                4 : 0      =     43.0 : 1.0
             shareholder = True                4 : 0      =     41.0 : 1.0
                  farrah = True                0 : 4      =     39.2 : 1.0
                  ouchhh = True                0 : 4      =     37.0 : 1.0
                     447 = True                0 : 4      =     35.6 : 1.0
                  ouchie = True                0 : 4      =     33.8 : 1.0
                    sadd = True  

In [0]:
#try it out
# Imported here so this cell does not depend on a later cell having been run.
from textblob import TextBlob

text = "vote for trump"
#TODO: mixture model is probably gonna be the only thing that classifies this
# properly. at least i hope

#TODO: filtering out subject of sentiment 
# analysis since we want sentiment about the subject and don't want the subject's own associated sentiment to bias our results

# The classifier was trained on make_features() output, so the raw text must
# go through the same function before being classified.
print(classifier.classify(make_features(text)))
print(TextBlob(text).sentiment)

In [0]:
# compare to famous TextBlob's accuracy, we actually do much better
from textblob import TextBlob

# Score TextBlob on the first fifth of the samples: a polarity above zero
# counts as a prediction of label 4, anything else as label 0.
correct_count = 0
n = len(labels)//5
for i in range(n):
  #Note: adding our preprocessing makes it perform worse so I skip that for
  # a fair comparison
  # Each iteration must score the i-th tweet so the prediction lines up with
  # labels[i].
  if (TextBlob(tweets[i]).sentiment.polarity > 0):
    if labels[i] == 4:
      correct_count += 1
  else:
    if labels[i] == 0:
      correct_count += 1

# accuracy as a percentage of the n samples scored
acc = (correct_count/n)*100
print(acc)
