### Step 1: Accept a search term from the user and download the last 100 tweets with that term.

In [1]:
# The python-twitter module is used because it wraps Twitter's REST API and returns
# pre-parsed objects instead of raw JSON, which makes the data easier to work with.

import twitter
import json

# python-twitter talks to the service through an Api object authenticated with app keys.
# The keys are kept out of this repo for privacy and read from a local JSON file instead.

with open('twitter_keys.json') as keystore:
    keys = json.load(keystore)

# Pass exactly the four credentials the Api constructor is given, looked up by name.
api = twitter.Api(**{
    field: keys[field]
    for field in ('consumer_key', 'consumer_secret',
                  'access_token_key', 'access_token_secret')
})

In [2]:
# Accept a search term and fetch tweets containing that term.
def fetchTestData(search_string):
    """Search Twitter for ``search_string`` and return the matching tweets.

    Requests up to 100 results through the module-level ``api`` object.

    Returns a list of dicts, one per tweet, with keys 'text' (the tweet text)
    and 'label' (always None here), or None if the fetch fails.
    """
    try:
        tweets_fetched = api.GetSearch(search_string, count=100)
        print("Fetched",str(len(tweets_fetched)),"tweets with the term",search_string)
        return [{'text':status.text,'label':None} for status in tweets_fetched]
    except Exception as error:
        # Best-effort fetch: report the failure instead of raising. Catching Exception
        # (not a bare except) lets Ctrl-C / kernel interrupts still stop the cell.
        print("Failed to fetch tweets with the term",search_string)
        print("Reason:",repr(error))
        return None

In [3]:
# Ask for a search term, then fetch matching tweets.
# testData is a list of {'text', 'label'} dicts, or None if the fetch failed.
search_string = input("Hello. Please enter a search term: ")
testData = fetchTestData(search_string)

Hello. Please enter a search term: apple
Fetched 96 tweets with the term apple


In [4]:
# First 10 tweets that were fetched.
# The slice end is exclusive, so [:10] yields ten items; `or []` keeps the cell from
# raising when the fetch failed and testData is None.
for tweet in (testData or [])[:10]:
    print(tweet['text'])

The new trailer for #FiftyShadesDarker features #IDontWannaLiveForever! Check it out below &amp; then pick up the song:… https://t.co/9XrxHNm0xL
Hey @Apple, I'm 35. Stop autocorrecting it to ducking. Thanks!
Surprise "Thank You Followers" Twitter Giveaway! Win I Love Juicy Couture To enter follow @davelackie &amp; RT Taffy ap… https://t.co/Lkaya4upeT
@qszell najlepsza opcja to https://t.co/ikM3Q9lpTN i Cortland
RT @GSMHProductions: https://t.co/2BTnybR4Ma #cannabiscommunity #RT #Podcast #PodcastTrending #PodernFamily #Therapy #psychology #selfhelp…
RT @bekotaizi: taiji hasegawa「PixelEscape」

無事iOS版も配信されました（＾ω＾） https://t.co/IGyUOKeICm
Qualcomm’s Snapdragon 835 will bring improved battery life and better VR experiences #androbrix1 #android #apple #r… https://t.co/P8RnIeMx0I
雷音のゲーム実況 ドラゴンクエストヒーローズ2 2017.01.03 2
#マスクドＤＪ雷音 #ドラゴンクエストヒーローズ #ゲーム実況 https://t.co/wflF8WzZup
https://t.co/cwD9NAJnWJ
Guile: deceit; trickery #word #vocabulary #interesting  https://t.co/RENl8NNdll https://t.co/IEcdM9

### Step 2: Classify these tweets as positive or negative.

In [5]:
# This requires downloading a corpus of tweet data. However, Twitter only allows
# tweet IDs to be shared, and not the tweets themselves. The API can be used to
# cross-reference the corpus, but considering that Niek Sanders' corpus contains
# 5000 tweets and Twitter limits API pulls to 180 per 15 minutes, it will take
# several hours to complete the pull.

def createTrainingCorpus(corpusFile,tweetFile,rate_limit=180,window_seconds=900):
    """Download the text of every tweet listed in ``corpusFile`` and save it to ``tweetFile``.

    Parameters:
        corpusFile: path to a CSV whose rows are (topic, label, tweet_id).
        tweetFile: path of the CSV to write, with rows (tweet_id, text, label, topic).
        rate_limit: number of API requests allowed per rate-limit window.
        window_seconds: length of the rate-limit window, in seconds.

    Returns:
        A list of dicts with keys 'tweet_id', 'label', 'topic' and 'text', one per
        tweet that was fetched. Tweets that cannot be fetched are skipped. A row that
        cannot be written to ``tweetFile`` is left out of the file but is still
        present in the returned list.

    Raises:
        ValueError: if ``rate_limit`` is not positive.
    """
    import csv
    import time

    if rate_limit <= 0:
        raise ValueError("rate_limit must be a positive number of requests")

    with open(corpusFile,'r',newline='') as csvfile:
        lineReader = csv.reader(csvfile,delimiter=',',quotechar='"')
        # Rows with fewer than three fields (including blank lines) carry no tweet id.
        corpus = [{'tweet_id':row[2],'label':row[1],'topic':row[0]}
                  for row in lineReader if len(row) >= 3]

    # The tweet pull is self-throttled to work around the rate limit.
    sleep_time = window_seconds / rate_limit
    trainingData = []
    for tweet in corpus:
        try:
            tweet['text'] = api.GetStatus(tweet['tweet_id']).text
        except Exception as error:
            # Best-effort: a tweet that cannot be fetched is skipped.
            print('skipped tweet',tweet['tweet_id'],'-',repr(error))
        else:
            trainingData.append(tweet)
            print('fetched tweet',str(len(trainingData)),'of',str(len(corpus)))
        finally:
            # Pause after every attempt, failed ones included; otherwise a run of
            # failures would fire the remaining requests back-to-back.
            time.sleep(sleep_time)

    with open(tweetFile,'w',newline='') as csvfile:
        lineWriter = csv.writer(csvfile,delimiter=',',quotechar='"')
        for tweet in trainingData:
            # Ignore failures on line write so the trainingData doesn't get lost due to a single encoding error.
            try:
                lineWriter.writerow([tweet['tweet_id'],tweet['text'],tweet['label'],tweet['topic']])
            except (UnicodeError, csv.Error) as error:
                print('could not write tweet',tweet['tweet_id'],'-',repr(error))
    return trainingData

In [6]:
## Commented out so 'Run All' doesn't kick off a 10-hour loop.
#trainingData = createTrainingCorpus('./corpus.csv','./tweets.csv')

In [7]:
# Re-downloading the tweets takes about 10 hours, so this loads the training data
# from a pre-saved CSV file instead.
def loadTrainingData(tweetsFile):
    """Read previously downloaded tweets from ``tweetsFile``.

    The file is a CSV whose rows are (tweet_id, text, label, topic).

    Returns a list of dicts with keys 'tweet_id', 'text', 'label' and 'topic'.
    Rows with fewer than four fields (blank lines included) are skipped, since
    indexing them would raise an 'out of range' error.
    """
    import csv
    trainingData = []
    # newline='' lets the csv module do its own line-ending handling.
    with open(tweetsFile,'r',newline='') as dataFile:
        lineReader = csv.reader(dataFile, delimiter=',', quotechar='"')
        for row in lineReader:
            if len(row) < 4:
                continue
            trainingData.append({'tweet_id':row[0],'text':row[1],'label':row[2],'topic':row[3]})
    return trainingData

In [8]:
# Load the previously downloaded tweets; each entry is a dict with
# 'tweet_id', 'text', 'label' and 'topic' keys.
trainingData = loadTrainingData('./tweets.csv')

In [9]:
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation

# A class to preprocess all tweets (both test & training)
class TweetSweeper:
    """Turn raw tweet text into lists of lower-case word tokens.

    Links and user mentions are replaced by the placeholders 'URL' and 'AT_USER'.
    Both placeholders are listed as stopwords, so they are filtered out again
    after tokenizing, together with English stopwords and punctuation characters.
    """

    # Raw strings keep the regex escapes (\. and \s) away from Python's own
    # string-escape processing; compiling once avoids re-parsing them per tweet.
    _LINK_PATTERN = re.compile(r'((www\.[^\s]+)|(https?://[^\s]+))')
    _MENTION_PATTERN = re.compile(r'@[^\s]+')
    # The post-tag word is grouped '()' so it can be referred to as \1.
    _HASHTAG_PATTERN = re.compile(r'#([^\s]+)')

    def __init__(self):
        self._at_user = 'AT_USER'
        self._url = 'URL'
        self._stopwords=set(stopwords.words('english')+list(punctuation)+[self._at_user,self._url])

    # Accepts tweets as a list of dictionaries with keys, "text" and "label".
    # Returns a list of tuples, each with a list of words and the label.
    def tidy(self, tweets):
        """Clean every tweet and pair its word list with its label."""
        return [(self._tidy(tweet['text']),tweet['label']) for tweet in tweets]

    # Accepts a tweet which is then scrubbed in several steps.
    # Returns a tokenized list of words in the tweet, sans any stopwords.
    def _tidy(self, tweet):
        """Clean one tweet's text and return its tokens without stopwords."""
        # 1. Convert to lower case. This happens first so the upper-case
        #    placeholders inserted below stay distinguishable from tweet words.
        tweet = tweet.lower()
        # 2. Replace links with the URL placeholder.
        tweet = self._LINK_PATTERN.sub(self._url, tweet)
        # 3. Replace user mentions with the AT_USER placeholder.
        tweet = self._MENTION_PATTERN.sub(self._at_user, tweet)
        # 4. Replace hashtags with the raw word. (i.e. '#word' => 'word')
        tweet = self._HASHTAG_PATTERN.sub(r'\1', tweet)
        # Finally, tokenize the tweet into a list of words...
        tokens = word_tokenize(tweet)
        # ...and return, minus any stopwords.
        return [word for word in tokens if word not in self._stopwords]

  regargs, varargs, varkwargs, defaults = inspect.getargspec(func)


In [10]:
# Clean both data sets with the same TweetSweeper so they are tokenized identically.
# Each result is a list of (words, label) tuples.
twitterMaid = TweetSweeper()
cleanTrainingData = twitterMaid.tidy(trainingData)
cleanTestData = twitterMaid.tidy(testData)

In [11]:
# Extract features and train classifier. For this exercise, two methods will be used.

In [18]:
# NAIVE BAYES
import nltk

def buildVocab(tidyData):
    """Collect the vocabulary used across a tidied data set.

    tidyData is an iterable of (words, label) tuples. The return value is the
    key view of an nltk.FreqDist counted over every word, i.e. each distinct
    word once.
    """
    # Flatten every tweet's word list into a single list; the labels are not needed.
    every_word = [token for tokens, _label in tidyData for token in tokens]
    # FreqDist maps each word to its frequency, so its keys are the unique words.
    return nltk.FreqDist(every_word).keys()

# NLTK's apply_features takes a user-defined feature extractor such as this one.
def extract_features(tweet):
    """Build the presence/absence feature dict for one tokenized tweet.

    For every word in the module-level ``word_features`` vocabulary the result
    has a key 'contains(<word>)' whose value is True when the word occurs in
    ``tweet`` and False otherwise.
    """
    present = set(tweet)  # set membership keeps each lookup cheap
    return {'contains(%s)' % candidate: candidate in present
            for candidate in word_features}

# word_features is the vocabulary that extract_features reads as a module-level name,
# so it must be built before any features are extracted.
word_features = buildVocab(cleanTrainingData)
trainingSet = nltk.classify.apply_features(extract_features, cleanTrainingData)
# apply_features applies the previously defined extract_features function to each element of cleanTrainingData.
# Because each element is a tuple, it is treated as a (text, label) pair: the function is applied
# to the text and the label is carried along unchanged.

# Train the Naive Bayes classifier on the resulting (features, label) pairs.
NBayesClassifier = nltk.NaiveBayesClassifier.train(trainingSet)

In [21]:
# SUPPORT VECTOR MACHINE
from nltk.corpus import sentiwordnet as swn
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

# Format has to be altered for CountVectorizer because it takes in the document directly and
# builds its own vocabulary; therefore, data and labels will be fed separately (as opposed to in tuples).
# Of note is that Naive Bayes allows for N classes natively, while an SVM is at heart a binary
# classifier; scikit-learn's SVC handles the four classes used here with a one-vs-one scheme.

# Tokens are joined with a space so the vectorizer sees individual words. Joining with ''
# would glue every tweet into one long pseudo-word.
svmTrainingData = [' '.join(tweet[0]) for tweet in cleanTrainingData]
vectorizer = CountVectorizer(min_df=1)
X = vectorizer.fit_transform(svmTrainingData).toarray()
# Newer scikit-learn releases replaced get_feature_names() with get_feature_names_out();
# use whichever this installation provides.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocabulary = list(vectorizer.get_feature_names_out())
else:
    vocabulary = vectorizer.get_feature_names()

# Now, use swn to weight these features.
def swnWeight(word):
    """Return a signed SentiWordNet weight for ``word``.

    Uses the first sense returned for the word (treated as its most common meaning):
    the positive score if it dominates, minus the negative score if that dominates,
    and 0 when the two are equal or the word is not in SentiWordNet.
    """
    senses = list(swn.senti_synsets(word))
    if not senses:
        return 0 # some words may not be in swn
    common_meaning = senses[0]
    pos_score = common_meaning.pos_score()
    neg_score = common_meaning.neg_score()
    if pos_score > neg_score:
        return pos_score
    if pos_score < neg_score:
        return -neg_score
    return 0

swn_weights = [swnWeight(word) for word in vocabulary]

# Scale every column (word count) by that word's weight; broadcasting applies the
# weight vector to each row of X.
swn_X = X * np.array(swn_weights)

# Prepare labels
array_labels = {'irrelevant':0, 'positive':1, 'negative':2, 'neutral':3}
labels = [array_labels[tweet[1]] for tweet in cleanTrainingData]
y = np.array(labels)

# Finally, build the SVM classifier
from sklearn.svm import SVC
SVMClassifier = SVC()
SVMClassifier.fit(swn_X,y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [24]:
# Run classifier on downloaded tweets.

# Naive Bayes
NBResultLabels = [NBayesClassifier.classify(extract_features(tweet[0])) for tweet in cleanTestData]

# SVM
weight_vector = np.array(swn_weights) # built once instead of once per tweet
SVMResultLabels = []
for tweet in cleanTestData:
    # The vectorizer needs a full sentence. Tokens are joined with a space so it can
    # split them back into individual words; joining with '' would produce one long
    # pseudo-word that matches nothing in the vocabulary.
    tweet_sentence = ' '.join(tweet[0])
    svmFeatures = np.multiply(vectorizer.transform([tweet_sentence]).toarray(),weight_vector)
    # predict returns an array with one label per input row. There's only one row, so grab [0].
    SVMResultLabels.append(SVMClassifier.predict(svmFeatures)[0])