### Import relevant libraries

In [22]:
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk import download
import string
import re
import json
import itertools
import collections
import matplotlib.pyplot as plt
%matplotlib inline 

### Install some required packages

In [23]:
!sudo pip install oauth oauth2

Cleaning up...


### Define some utilities

In [24]:
def process(tweet):
    # removes links, usernames, twitter special word 'RT' (retweet) and emoticons
    p = re.compile('(http[s]*://[^\s]+|@[^\s]*|RT|(\:\w+\:|\<[\/\\]?3|[\(\)\\\D|\*\$][\-\^]?[\:\;\=]|[\:\;\=B8][\-\^]?[3DOPp\@\$\*\\\)\(\/\|])(?=\s|[\!\.\?]|$))|&\w*')
    return p.sub("", tweet)

In [25]:
def sanitize(text):
    # removes punctuation character
    return text.translate(None, string.punctuation)

In [26]:
def removeStopWords(tokens):
    return [token for token in tokens if  token not in ["an", "a", "the"]]

In [27]:
def find_ngrams(input_list, n=2): 
  return zip(*[input_list[i:] for i in range(n)])

In [28]:
def chain2(nested_list):
    for l in neg_clean:
        for tup in l:
            yield(tup)

In [29]:
def autolabel(rects):
    # attach some text labels
    for rect in rects:
        height = rect.get_height()
        ax.text(rect.get_x() + rect.get_width()/2., 1.05*height,
                '%d' % int(height),
                ha='center', va='bottom')

In [30]:
def plotTagDistrubition(pos_dist):
    N = len(pos_dist)
    neg_counts = pos_dist.values()

    ind = np.arange(N)  # the x locations for the groups
    width = 0.35       # the width of the bars

    fig, ax = plt.subplots()
    rects1 = ax.bar(ind, neg_counts, width, color='r')
    fig.set_size_inches(25,8)

    # add some text for labels, title and axes ticks
    ax.set_ylabel('Count')
    ax.set_title(r'P^T values')
    ax.set_xticks(ind + width)
    ax.set_xticklabels(pos_dist.keys())

    #ax.legend((rects1[0], rects2[0]), ('Men', 'Women'))
    ax.legend((rects1[0], ('Negative')))

    autolabel(rects1)
    #autolabel(rects2)

    plt.show()

In [31]:
def getTokenizedTweets(rawTweets):
    clean = [removeStopWords(word_tokenize(process(tweet["text"]))) for tweet in json.loads(rawTweets[0])["statuses"]]
    return clean

In [32]:
def getPosTweetsFromTokenized(tokenizedTweets):
    return [pos_tag(tweet) for tweet in getTokenizedTweets(neu_tweets)]

In [33]:
def nl():
    print("\n")

### Download sentence tokenizer

In [34]:
download("averaged_perceptron_tagger")
download("punkt")

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/ds/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package punkt to /home/ds/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# Scraping some data

### Fetching some tweets 

In [293]:
neg_tweets = !python ./twitterstream.py "https://api.twitter.com/1.1/search/tweets.json?q=%3A%28&lang=en&count=100"

In [307]:
max_id = json.loads(neg_tweets[0])["search_metadata"]["max_id"]

In [308]:
negs = []

In [309]:
neg_tweets = !python ./twitterstream.py "https://api.twitter.com/1.1/search/tweets.json?q=%3A%28&lang=en&count=100"
negs = negs + getTokenizedTweets(neg_tweets)
max_id = json.loads(neg_tweets[0])["search_metadata"]["max_id"]
for i in range(2):
    neg_tweets = !python ./twitterstream.py "https://api.twitter.com/1.1/search/tweets.json?q=%3A%28&lang=en&count=100&max_id="+max_id
    negs = negs + getTokenizedTweets(neg_tweets)
    max_id = json.loads(neg_tweets[0])["search_metadata"]["max_id"]

In [295]:
pos_tweets = !python ./twitterstream.py "https://api.twitter.com/1.1/search/tweets.json?q=%3A%29&lang=eu&count=100"

In [296]:
neu_tweets = !python ./twitterstream.py "https://api.twitter.com/1.1/search/tweets.json?q=from%3AHuffingtonPost&count=100"

In [301]:
print ("Negative tweets dataset size: " + str(len(getTokenizedTweets(neg_tweets))))
print ("Positive tweets dataset size: " + str(len(getTokenizedTweets(pos_tweets))))
print ("Neutral tweets dataset size: " + str(len(getTokenizedTweets(neu_tweets))))

Negative tweets dataset size: 100
Positive tweets dataset size: 100
Neutral tweets dataset size: 100


In [189]:
import pandas as pd

In [217]:
tweets = pd.read_csv("./tweets_20160803_2207.csv", nrows=10000)

In [218]:
tweets.head()

Unnamed: 0,TAG,TEXT
0,positive,Guarda CarloBlack76officialtube con me su Twit...
1,positive,RT @RomanAtwood: IT IS VLOG TIME!! :) thank yo...
2,positive,RT @RomanAtwood: TONIGHT!! Show starts at 7pm ...
3,negative,I was literally saying this the other day. I n...
4,positive,RT @RomanAtwood: IT IS VLOG TIME!! :) thank yo...


In [219]:
def clean(tweet):
    processed = process(tweet.decode("utf-8"))
    return [w.lower() for w in removeStopWords(word_tokenize(processed)) if w not in string.punctuation]

In [220]:
tweets["CLEAN"] = tweets["TEXT"].map(clean)
tweets.head()

Unnamed: 0,TAG,TEXT,CLEAN
0,positive,Guarda CarloBlack76officialtube con me su Twit...,"[guarda, carloblack76officialtube, con, me, su..."
1,positive,RT @RomanAtwood: IT IS VLOG TIME!! :) thank yo...,"[it, is, vlog, time, thank, you, so, much, for..."
2,positive,RT @RomanAtwood: TONIGHT!! Show starts at 7pm ...,"[tonight, show, starts, at, 7pm, columbus, ohi..."
3,negative,I was literally saying this the other day. I n...,"[i, was, literally, saying, this, other, day, ..."
4,positive,RT @RomanAtwood: IT IS VLOG TIME!! :) thank yo...,"[it, is, vlog, time, thank, you, so, much, for..."


In [221]:
all_words = []
for index, tweet in tweets.iterrows():
    all_words.extend(tweet["CLEAN"])

In [222]:
len(all_words)

131624

In [223]:
from nltk import FreqDist
def get_word_features(wordlist):
    wordlist = FreqDist(wordlist)
    word_features = wordlist.keys()
    return word_features

In [224]:
word_features = get_word_features(all_words)

In [225]:
def extract_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features

In [226]:
for word in word_features:
    tweets[word] = tweets.CLEAN.apply(lambda x: word in x)

In [227]:
test_data = tweets.sample(100)

Unnamed: 0,TAG,TEXT,CLEAN,raining,todays,gatekeeper,magnetic,doubts,oxywhey,overwatch,...,indiegamelover,4s,mohenjodaro,extras,sci…,balst,trims,staygold,flyinhongkong,8th
3417,positive,RT @RomanAtwood: IT IS VLOG TIME!! :) thank yo...,"[it, is, vlog, time, thank, you, so, much, for...",False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1943,positive,RT @RomanAtwood: IT IS VLOG TIME!! :) thank yo...,"[it, is, vlog, time, thank, you, so, much, for...",False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2463,positive,RT @RomanAtwood: Unreal guys! We smashed throu...,"[unreal, guys, we, smashed, through, 2, billio...",False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
153,positive,RT @RomanAtwood: PHILLY!! You guys are absolut...,"[philly, you, guys, are, absolutely, amazing, ...",False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4535,positive,RT @RomanAtwood: New Vlog in 2 hours!! :),"[new, vlog, in, 2, hours]",False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
5430,positive,RT @RomanAtwood: New Vlog in 2 hours!! :),"[new, vlog, in, 2, hours]",False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
7734,positive,RT @RomanAtwood: WE ARE BACK!!! Thanks so much...,"[we, are, back, thanks, so, much, for, waiting...",False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
6946,positive,RT @RomanAtwood: HAHAHA!! This was a blast! Th...,"[hahaha, this, was, blast, thank, you, for, sh...",False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1270,positive,RT @JonLeeBrody: Such a bummer @caitylotz! Our...,"[such, bummer, our, booths, were, gon, na, be,...",False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
789,positive,RT @RomanAtwood: IT IS VLOG TIME!! :) thank yo...,"[it, is, vlog, time, thank, you, so, much, for...",False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [228]:
import numpy as np
X = tweets[word_features]
y = tweets["TAG"]
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(X, y)
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [229]:
predictions = clf.predict(test_data[word_features])
print(predictions)

['positive' 'positive' 'positive' 'positive' 'positive' 'positive'
 'positive' 'positive' 'negative' 'positive' 'positive' 'positive'
 'positive' 'positive' 'positive' 'negative' 'positive' 'positive'
 'positive' 'positive' 'positive' 'positive' 'positive' 'positive'
 'positive' 'positive' 'positive' 'positive' 'positive' 'positive'
 'positive' 'positive' 'negative' 'negative' 'positive' 'positive'
 'positive' 'positive' 'positive' 'negative' 'positive' 'positive'
 'positive' 'positive' 'positive' 'positive' 'positive' 'positive'
 'positive' 'positive' 'positive' 'negative' 'positive' 'positive'
 'negative' 'negative' 'positive' 'positive' 'positive' 'positive'
 'positive' 'positive' 'positive' 'positive' 'negative' 'positive'
 'positive' 'positive' 'positive' 'positive' 'positive' 'positive'
 'positive' 'positive' 'positive' 'positive' 'positive' 'positive'
 'positive' 'negative' 'positive' 'positive' 'positive' 'negative'
 'positive' 'positive' 'positive' 'positive' 'positive' 'negat

In [230]:
precision = predictions == test_data["TAG"] 
sum(precision)/float(len(precision))

0.89000000000000001

In [233]:
sum(tweets["TAG"] == "positive") / float(len(tweets))

0.93430000000000002

In [234]:
precision = ["positive" == k for k in test_data["TAG"] ]
sum(precision)/float(len(precision))

0.97

# Feature extraction

### We remove URL links, Twitter user names and Twitter special words (e.g. "RT" (retweet)) and emoticons   

In [47]:
processed = process(tweet)
print ("Before: " + tweet)
print ("After: " + processed)

Before: And now for something !!don't I'm completely different the http://google.fr
After: And now for something !!don't I'm completely different the 


### Tokenization 

In [52]:
tokenized = word_tokenize(processed)
print ("Before: " + tweet)
print ("After: " + str(tokenized))

Before: And now for something !!don't I'm completely different the http://google.fr
After: ['And', 'now', 'for', 'something', '!', '!', 'do', "n't", 'I', "'m", 'completely', 'different', 'the']


### Removing stop words from bag of words

In [55]:
withoutStopWords = removeStopWords(tokenized)
print ("Before: " + tweet)
print ("After: " + str(withoutStopWords))

Before: And now for something !!don't I'm completely different the http://google.fr
After: ['And', 'now', 'for', 'something', '!', '!', 'do', "n't", 'I', "'m", 'completely', 'different']


### Compute Part-Of-Speech tags 

In [69]:
pos = pos_tag(withoutStopWords)
print ("Before: " + tweet)
print ("After: " + str(pos))

Before: And now for something !!don't I'm completely different the http://google.fr
After: [('And', 'CC'), ('now', 'RB'), ('for', 'IN'), ('something', 'NN'), ('!', '.'), ('!', '.'), ('do', 'VBP'), ("n't", 'RB'), ('I', 'PRP'), ("'m", 'VBP'), ('completely', 'RB'), ('different', 'JJ')]


### Remove punctuation

In [70]:
features_pos = [(k,v) for (k,v) in pos if k not in  string.punctuation ]
features_ngrams = find_ngrams([k for (k,_) in features_pos])
print ("Features set 1: " + str(features_pos))
nl()
print ("Features set 2: " + str(features_ngrams))

Features set 1: [('And', 'CC'), ('now', 'RB'), ('for', 'IN'), ('something', 'NN'), ('do', 'VBP'), ("n't", 'RB'), ('I', 'PRP'), ("'m", 'VBP'), ('completely', 'RB'), ('different', 'JJ')]


Features set 2: [('And', 'now'), ('now', 'for'), ('for', 'something'), ('something', 'do'), ('do', "n't"), ("n't", 'I'), ('I', "'m"), ("'m", 'completely'), ('completely', 'different')]


# Feature engineering (Please bring some coffee)

In [164]:
neutral_pos = getPosTweetsFromTokenized(getTokenizedTweets(neu_tweets))

In [None]:
features_pos = [(k,v) for (k,v) in chain2(neutral_pos) if k not in  string.punctuation]

In [274]:
neu_pos = [(k,v) for (k,v) in chain2(neutral_pos) if k not in  string.punctuation]

In [279]:
neu_pos_dist = collections.Counter([pos for (_,pos) in neu_pos])

In [281]:
plotTagDistrubition(neu_pos_dist)