### Step 1: Accept a search term from the user and download the last 100 tweets with that term.

In [1]:
# The python-twitter module is used because it wraps Twitter's REST API and returns
# pre-parsed objects instead of raw JSON, which makes the data easier to work with.

import twitter
import json

# The twitter library uses an API object authenticated with app keys to access the API.
# For privacy reasons, these keys are loaded from a local JSON file not included in this repo.

# Read the credentials from the local key file, then pass the four values the
# client needs through as keyword arguments.
with open('twitter_keys.json') as keystore:
    keys = json.load(keystore)

api = twitter.Api(**{
    field: keys[field]
    for field in ('consumer_key', 'consumer_secret',
                  'access_token_key', 'access_token_secret')
})

In [2]:
# This is a function to accept a search term and fetch the tweets with that term.
def fetchTestData(search_string, count=100):
    """Search Twitter for a term and return the matching tweets as unlabeled records.

    Args:
        search_string: The term passed to the module-level ``api`` search call.
        count: Number of tweets to request (defaults to 100, the value this
            notebook has always used).

    Returns:
        A list of ``{'text': <tweet text>, 'label': None}`` dicts, or ``None``
        if the search raised an error.
    """
    try:
        tweets_fetched = api.GetSearch(search_string, count=count)
    except Exception as err:
        # Catch Exception rather than using a bare ``except`` so Ctrl-C
        # (KeyboardInterrupt) and SystemExit still stop the notebook.
        print("Failed to fetch tweets with the term", search_string)
        print("Reason:", repr(err))
        return None
    print("Fetched", str(len(tweets_fetched)), "tweets with the term", search_string)
    return [{'text': status.text, 'label': None} for status in tweets_fetched]

In [3]:
# Ask for the search term interactively, then fetch the matching tweets.
# fetchTestData returns None when the fetch fails, so testData may be None here.
search_string = input("Hello. Please enter a search term: ")
testData = fetchTestData(search_string)

Hello. Please enter a search term: apple
Fetched 100 tweets with the term apple


In [4]:
# First 10 tweets that were fetched.
# The slice end is exclusive, so [:10] (not [0:9]) is what yields ten items.
for tweet in testData[:10]:
    print(tweet['text'])

Hey @Apple, I'm 35. Stop autocorrecting it to ducking. Thanks!
Happy NYE! Wanted to give you all a little something special...my new updated emojis are on sale for .99 cents! ✨… https://t.co/FeQsifsvV6
What capitulation and normalization looks like: WSJ won't call Donald Trump's lies 'lies'  https://t.co/zf4zvatiEs
Kindly follow us and ask your follower to follow us if they use apple app #retweet # followback #weretweet
Me gustó un video de @YouTube https://t.co/SwLSkwqyak Uncharted 4 Multiplayer - Hilarious Hook Melee Only Gameplay! (22 Downs)
iPhone用　無料アプリ　医学部門　第40位
循環器疾患　ナースフル疾患...
https://t.co/NQDvn2IDuc
看護師（ナース）＆看護学生のための学習・勉強アプリ！...
#メディカル #アプリ https://t.co/0xIHoPx9iL
Apple refugees dish on how iPhone development culture echoes into Pearl Automation https://t.co/OddbWXCSJn… https://t.co/HgtnmkJ0fx
RT @fe_city_boy: Компания Apple пытается сделать все продукты беспроводными, потому что до сих пор не научилась делать провода.
@chadwildclay I'm #bingewatching your videos. Pen Pineappl

### Step 2: Classify these tweets as positive or negative.

In [5]:
# This requires downloading a corpus of tweet data. However, Twitter only allows
# tweet IDs to be shared, not the tweets themselves. The API can be used to
# look up each ID in the corpus, but since Niek Sanders' corpus contains
# 5000 tweets and Twitter limits API pulls to 180 per 15 minutes, the pull
# takes several hours to complete.

def createTrainingCorpus(corpusFile, tweetFile, rate_limit=180, window_seconds=900):
    """Fetch the text of every tweet listed in a corpus file and save the results.

    The corpus CSV is read as ``topic, label, tweet_id`` per row. Each tweet ID
    is looked up through the module-level ``api`` object; tweets that cannot be
    fetched are skipped. The successfully fetched tweets are written to
    ``tweetFile`` as ``tweet_id, text, label, topic`` rows.

    Args:
        corpusFile: Path of the CSV file listing topic, label and tweet ID.
        tweetFile: Path of the CSV file to write the fetched tweets to.
        rate_limit: Number of lookups allowed per rate-limit window.
        window_seconds: Length of the rate-limit window in seconds.

    Returns:
        A list of dicts with keys ``tweet_id``, ``label``, ``topic`` and ``text``,
        one per tweet that was fetched.

    Raises:
        ValueError: If ``rate_limit`` is not positive.
    """
    import csv
    import time

    if rate_limit <= 0:
        raise ValueError("rate_limit must be positive")

    corpus = []
    # newline='' is what the csv module expects from the file object it reads.
    with open(corpusFile, 'r', newline='') as csvfile:
        lineReader = csv.reader(csvfile, delimiter=',', quotechar='"')
        for row in lineReader:
            if len(row) < 3:
                # Blank or truncated line: there is no tweet ID to look up.
                continue
            corpus.append({'tweet_id': row[2], 'label': row[1], 'topic': row[0]})

    # The tweet pull is self-throttled to work around twitter's rate limit.
    sleep_time = window_seconds / rate_limit
    trainingData = []
    for tweet in corpus:
        try:
            status = api.GetStatus(tweet['tweet_id'])
            tweet['text'] = status.text
        except Exception as err:
            # Best effort: skip tweets that cannot be fetched, but say so
            # instead of dropping them silently.
            print('skipped tweet', tweet['tweet_id'], '-', repr(err))
        else:
            trainingData.append(tweet)
            print('fetched tweet', str(len(trainingData)), 'of', str(len(corpus)))
        finally:
            # Pause after every request, successful or not, so a run of
            # failed lookups is throttled as well.
            time.sleep(sleep_time)

    # newline='' stops the csv writer's '\r\n' terminator from being translated
    # a second time, which would leave blank rows in the file on Windows.
    with open(tweetFile, 'w', newline='') as csvfile:
        lineWriter = csv.writer(csvfile, delimiter=',', quotechar='"')
        for tweet in trainingData:
            # Ignore failures on line write so the trainingData doesn't get
            # lost due to a single encoding error.
            try:
                lineWriter.writerow([tweet['tweet_id'], tweet['text'], tweet['label'], tweet['topic']])
            except Exception as err:
                print('could not write tweet', tweet['tweet_id'], '-', repr(err))
    return trainingData

In [6]:
## Commented out so 'Run All' doesn't kick off a 10-hour loop.
#trainingData = createTrainingCorpus('./corpus.csv','./tweets.csv')

In [7]:
# Since I don't want to spend 10 hours re-downloading tweets, this loads the
# training data from my pre-saved CSV file.
def loadTrainingData(tweetsFile):
    """Load previously saved training tweets from a CSV file.

    Each row is read as ``tweet_id, text, label, topic``. Rows with fewer than
    four fields (including blank lines) are skipped, since they cannot be
    mapped onto those four keys.

    Args:
        tweetsFile: Path of the CSV file to read.

    Returns:
        A list of dicts with keys ``tweet_id``, ``text``, ``label`` and ``topic``.
    """
    import csv

    trainingData = []
    # newline='' lets the csv module handle line endings itself, so newlines
    # embedded in quoted tweet text are read back unchanged.
    with open(tweetsFile, 'r', newline='') as dataFile:
        lineReader = csv.reader(dataFile, delimiter=',', quotechar='"')
        for row in lineReader:
            if len(row) < 4:
                # Blank or truncated row: indexing it would raise IndexError.
                continue
            trainingData.append({'tweet_id': row[0], 'text': row[1], 'label': row[2], 'topic': row[3]})
    return trainingData

In [8]:
# Load the pre-saved tweets as a list of dicts keyed by tweet_id, text, label and topic.
trainingData = loadTrainingData('./tweets.csv')

In [9]:
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation

# A class to preprocess all tweets (both test & training)
class TweetSweeper:
    """Turns raw tweet text into lists of lower-cased tokens with stopwords removed.

    Links and user mentions are replaced by the placeholder tokens ``URL`` and
    ``AT_USER``. Both placeholders are part of the stopword set, so they are
    dropped from the final token list along with English stopwords and single
    punctuation characters.
    """

    # Raw string literals keep `\.` and `\S` as regex escapes instead of invalid
    # string escapes. The patterns are compiled once rather than on every call.
    _URL_PATTERN = re.compile(r'(www\.\S+)|(https?://\S+)')
    _MENTION_PATTERN = re.compile(r'@\S+')
    # The text after '#' is captured so the substitution can refer to it as \1.
    _HASHTAG_PATTERN = re.compile(r'#(\S+)')

    def __init__(self):
        self._at_user = 'AT_USER'
        self._url = 'URL'
        self._stopwords = set(
            stopwords.words('english') + list(punctuation) + [self._at_user, self._url]
        )

    def tidy(self, tweets):
        """Clean a batch of tweets.

        Args:
            tweets: A list of dicts, each with the keys ``'text'`` and ``'label'``.

        Returns:
            A list of ``(words, label)`` tuples, where ``words`` is the token
            list produced by ``_tidy`` for that tweet's text.
        """
        return [(self._tidy(tweet['text']), tweet['label']) for tweet in tweets]

    def _tidy(self, tweet):
        """Scrub one tweet's text and return its tokens, minus any stopwords."""
        # 1. Convert to lower case. This happens before the placeholders are
        #    inserted, so the upper-case placeholders survive as-is.
        tweet = tweet.lower()
        # 2. Replace links with the URL placeholder.
        tweet = self._URL_PATTERN.sub(self._url, tweet)
        # 3. Replace user mentions with the AT_USER placeholder.
        tweet = self._MENTION_PATTERN.sub(self._at_user, tweet)
        # 4. Replace hashtags with the raw word. (i.e. '#word' => 'word')
        tweet = self._HASHTAG_PATTERN.sub(r'\1', tweet)
        # Finally, tokenize the tweet into a list of words and drop stopwords.
        return [word for word in word_tokenize(tweet) if word not in self._stopwords]

  regargs, varargs, varkwargs, defaults = inspect.getargspec(func)


In [10]:
# Clean the training and test tweets with the same TweetSweeper instance.
# Each result is a list of (word list, label) tuples.
# NOTE: testData is None if the earlier fetch failed; tidy iterates over its
# argument, so it would raise in that case.
twitterMaid = TweetSweeper()
cleanTrainingData = twitterMaid.tidy(trainingData)
cleanTestData = twitterMaid.tidy(testData)

In [11]:
# Extract features and train classifier

In [12]:
# Run classifier on downloaded tweets