### Import relevant libraries

In [237]:
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk import download
import string
import re
import json
import itertools
import collections
import matplotlib.pyplot as plt
%matplotlib inline 

### Install some required packages

In [289]:
# Shell escape: installs the `oauth` and `oauth2` packages using the system `pip` under sudo.
# NOTE(review): no versions are pinned, and `sudo pip` installs into whichever Python that
# `pip` belongs to, which may not be the interpreter running this kernel -- confirm before re-running.
!sudo pip install oauth oauth2

Cleaning up...


### Define some utilities

In [30]:
# Emoticon alternatives. An emoticon is only stripped when it is followed by
# whitespace, one of "!.?" or the end of the string.  Raw strings are required:
# in a plain string literal "\\" collapses to a single backslash, which turns
# "[\/\\]" into an unterminated character class and silently merges the heart
# and mirrored-face alternatives into one broken pattern.
_EMOTICON_PATTERN = (
    r"(?:"
    r":\w+:"                            # :shortcode: style names
    r"|<[/\\]?3"                        # hearts: <3, </3, <\3
    r"|(?<!\w)[()\\D|*$][-^]?[:;=]"     # mirrored faces such as (: or D-: (not mid-word, e.g. "ID:")
    r"|[:;=B8][-^]?[3DOPp@$*\\)(/|]"    # regular faces such as :) ;-D =P 8)
    r")(?=\s|[!.?]|$)"
)

# Alternatives are tried in this order at every position of the tweet.
_TWEET_NOISE = re.compile("|".join([
    r"https?://\S+",      # links
    r"@\S*",              # user mentions (up to the next whitespace)
    r"\bRT\b",            # retweet marker, only as a standalone word
    _EMOTICON_PATTERN,
    r"&\w*;?",            # HTML entities such as &amp; (including the closing ';')
]))


def process(tweet):
    """Strip Twitter-specific noise from a tweet.

    Removes links, user mentions, the standalone retweet marker ``RT``,
    emoticons and HTML entities.  Surrounding whitespace is left untouched,
    so removed items may leave double spaces behind.

    Args:
        tweet: the raw tweet text.

    Returns:
        The tweet text with every matched item replaced by the empty string.
    """
    return _TWEET_NOISE.sub("", tweet)

In [9]:
def sanitize(text):
    """Return ``text`` with every ASCII punctuation character removed.

    The previous ``text.translate(None, string.punctuation)`` form only works
    on Python 2 byte strings; it raises ``TypeError`` for unicode text and on
    Python 3.  Filtering character by character works for all of them.

    Args:
        text: the string to clean.

    Returns:
        The characters of ``text`` that are not in ``string.punctuation``,
        joined back together in their original order.
    """
    punctuation = frozenset(string.punctuation)
    return "".join(ch for ch in text if ch not in punctuation)

In [10]:
def removeStopWords(tokens, stopWords=("an", "a", "the")):
    """Drop stop words from a list of tokens.

    Matching is exact and case-sensitive, so "The" is kept while "the" is
    removed.

    Args:
        tokens: iterable of string tokens.
        stopWords: iterable of tokens to discard; defaults to the English
            articles "an", "a" and "the".

    Returns:
        A new list with the remaining tokens in their original order.
    """
    excluded = frozenset(stopWords)
    return [token for token in tokens if token not in excluded]

In [11]:
def find_ngrams(input_list, n=2):
    """Return all contiguous n-grams of ``input_list`` as a list of tuples.

    The result is always materialized as a list, so it can be printed, indexed
    and iterated more than once regardless of whether ``zip`` is lazy.

    Args:
        input_list: sliceable sequence of items (e.g. a list of tokens).
        n: size of each n-gram; defaults to 2 (bigrams).

    Returns:
        A list of ``n``-tuples; empty when ``input_list`` has fewer than
        ``n`` items.
    """
    # The i-th shifted copy starts at item i; zipping them aligns neighbours.
    shifted = [input_list[offset:] for offset in range(n)]
    return list(zip(*shifted))

In [265]:
def chain2(nested_list):
    """Lazily flatten one level of nesting.

    Args:
        nested_list: iterable of iterables (e.g. a list of POS-tagged tweets).

    Yields:
        Every item of every inner iterable, in order.
    """
    # Iterate the argument itself rather than a notebook-level global.
    for inner in nested_list:
        for item in inner:
            yield item

In [266]:
def autolabel(rects, ax=None):
    """Write each bar's integer height just above the bar.

    Args:
        rects: iterable of bar patches (as returned by ``Axes.bar``).
        ax: axes to draw the labels on.  When omitted, each label is drawn on
            the axes its bar belongs to (``rect.axes``), so the function does
            not depend on a notebook-level ``ax`` variable.
    """
    for rect in rects:
        target = rect.axes if ax is None else ax
        height = rect.get_height()
        # Centre the label horizontally on the bar, 5% above its top.
        target.text(rect.get_x() + rect.get_width() / 2., 1.05 * height,
                    '%d' % int(height),
                    ha='center', va='bottom')

In [288]:
def plotTagDistrubition(pos_dist, label='Negative', color='r'):
    """Draw a bar chart of part-of-speech tag counts.

    Args:
        pos_dist: mapping of tag -> count (e.g. a ``collections.Counter``).
        label: legend entry for the bars; defaults to 'Negative'.
        color: bar colour; defaults to red.
    """
    # Materialize tags and counts in one consistent order.
    tags = list(pos_dist.keys())
    counts = [pos_dist[tag] for tag in tags]

    ind = np.arange(len(tags))  # the x locations for the bars
    width = 0.35                # the width of the bars

    fig, ax = plt.subplots()
    # Centre each bar on its x location so the tick labels sit under the bars.
    rects1 = ax.bar(ind, counts, width, color=color, align='center')
    fig.set_size_inches(25, 8)

    # add some text for labels, title and axes ticks
    ax.set_ylabel('Count')
    ax.set_title(r'P^T values')
    ax.set_xticks(ind)
    ax.set_xticklabels(tags)

    # legend() expects a sequence of handles and a sequence of labels;
    # skip it when there are no bars to describe.
    if len(rects1):
        ax.legend((rects1[0],), (label,))

    autolabel(rects1)

    plt.show()

In [300]:
def getTokenizedTweets(rawTweets):
    """Tokenize the text of every status found in the first line of ``rawTweets``.

    Args:
        rawTweets: sequence of output lines; only ``rawTweets[0]`` is read and
            it is parsed as a JSON document with a "statuses" list.

    Returns:
        A list with one token list per status: the status text is passed
        through ``process``, ``word_tokenize`` and ``removeStopWords``.
    """
    statuses = json.loads(rawTweets[0])["statuses"]
    tokenized = []
    for status in statuses:
        cleaned_text = process(status["text"])
        tokenized.append(removeStopWords(word_tokenize(cleaned_text)))
    return tokenized

In [None]:
def getPosTweetsFromTokenized(tokenizedTweets):
    """Part-of-speech tag already tokenized tweets.

    Args:
        tokenizedTweets: iterable of token lists, one per tweet.

    Returns:
        A list with the ``pos_tag`` result for each tweet, in the same order.
    """
    # Tag the tweets that were passed in instead of re-reading a global.
    return [pos_tag(tweet) for tweet in tokenizedTweets]

In [12]:
def nl():
    """Print a newline character, which print then terminates with its own newline."""
    line_break = "\n"
    print(line_break)

### Download the NLTK sentence tokenizer and part-of-speech tagger

In [41]:
# Fetch the NLTK resources named "averaged_perceptron_tagger" and "punkt".
# The value of the last call is what the cell displays as its output.
download("averaged_perceptron_tagger")
download("punkt")

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/ds/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /home/ds/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Scraping some data

### Fetching some tweets 

In [293]:
# Run the local twitterstream.py script against the Twitter search endpoint and capture its
# output lines.  The query q=%3A%28 is the URL-encoded text ":(", with lang=en and count=100.
neg_tweets = !python ./twitterstream.py "https://api.twitter.com/1.1/search/tweets.json?q=%3A%28&lang=en&count=100"

In [307]:
# Parse the first captured output line as JSON and keep its search_metadata["max_id"] value.
max_id = json.loads(neg_tweets[0])["search_metadata"]["max_id"]

In [308]:
# Accumulator for tokenized negative tweets; starts out empty.
negs = []

In [309]:
neg_tweets = !python ./twitterstream.py "https://api.twitter.com/1.1/search/tweets.json?q=%3A%28&lang=en&count=100"
negs = negs + getTokenizedTweets(neg_tweets)
max_id = json.loads(neg_tweets[0])["search_metadata"]["max_id"]
for i in range(2):
    neg_tweets = !python ./twitterstream.py "https://api.twitter.com/1.1/search/tweets.json?q=%3A%28&lang=en&count=100&max_id="+max_id
    negs = negs + getTokenizedTweets(neg_tweets)
    max_id = json.loads(neg_tweets[0])["search_metadata"]["max_id"]

In [315]:
# Show every tokenized negative tweet, one token list per line.
# print is used in its call form, like the other cells of this notebook.
for tokenizedNegTweet in negs:
    print(tokenizedNegTweet)

[u'I', u"'m", u'gon', u'na', u'miss', u'this', u'hot', u'summer', u'we', u'had', u'.']
[u'sorry', u'crainer', u'i', u'didnt', u'mean', u'what', u'i', u'said', u'!', u'friends', u'?', u'...']
[u'i', u'hope', u'u', u'had', u'good', u'sleep', u'i', u'love', u'u']
[u'garing', u'bege', u':', u'(', u'(']
[u'Can', u'not', u'baaaaalive', u'ewe', u'stopped', u'doing', u'rock', u'cakes', u'and', u'scones', u'What', u'next', u'sausage', u'rolls', u'and', u'fresh', u'cream', u'cakes', u'lol', u'xx']
[u'JUST', u'WHEN', u'I', u'WENT', u'ONLINE', u'?', u'\U0001f624', u'lol', u'jk', u'bb', u'I', u'hope', u'you', u"'re", u'ok']
[u'taeyong', u',', u',', u',', u'pls', u'eat', u'more', u'babe', u':', u'(', u'(', u'(', u'(', u'(', u'you', u"'re", u'so', u'thin', u'I', u'get', u'so', u'worried']
[u'it', u'depends', u'uno', u'.', u'maybe', u'hes', u'broke', u'atm', u'i', u'wouldnt', u'end', u'it', u'but', u'certainly', u'not', u'having', u'that']
[u'THREE', u'HUNDRED', u'AND', u'SIXTY', u'PELLETS']
[u'I', u'

In [295]:
pos_tweets = !python ./twitterstream.py "https://api.twitter.com/1.1/search/tweets.json?q=%3A%29&lang=eu&count=100"

In [296]:
# Neutral sample: the query q=from%3AHuffingtonPost is the URL-encoded text "from:HuffingtonPost",
# with count=100; the script's output lines are captured in neu_tweets.
neu_tweets = !python ./twitterstream.py "https://api.twitter.com/1.1/search/tweets.json?q=from%3AHuffingtonPost&count=100"

In [301]:
# Report how many tweets each captured search response tokenizes to.
size_reports = (
    ("Negative tweets dataset size: ", neg_tweets),
    ("Positive tweets dataset size: ", pos_tweets),
    ("Neutral tweets dataset size: ", neu_tweets),
)
for caption, raw_output in size_reports:
    print(caption + str(len(getTokenizedTweets(raw_output))))

Negative tweets dataset size: 100
Positive tweets dataset size: 100
Neutral tweets dataset size: 100


# Feature extraction

### We remove URL links, Twitter user names and Twitter special words (e.g. "RT" (retweet)) and emoticons   

In [47]:
# Sample tweet for the walkthrough; defined here so the cell runs on a fresh kernel.
tweet = "And now for something !!don't I'm completely different the http://google.fr"
processed = process(tweet)
print ("Before: " + tweet)
print ("After: " + processed)

Before: And now for something !!don't I'm completely different the http://google.fr
After: And now for something !!don't I'm completely different the 


### Tokenization 

In [52]:
# Split the cleaned tweet into tokens and show the result under the raw tweet.
tokenized = word_tokenize(processed)
print("\n".join(["Before: " + tweet, "After: " + str(tokenized)]))

Before: And now for something !!don't I'm completely different the http://google.fr
After: ['And', 'now', 'for', 'something', '!', '!', 'do', "n't", 'I', "'m", 'completely', 'different', 'the']


### Removing stop words from bag of words

In [55]:
# Filter the stop words out of the token list and show the result under the raw tweet.
withoutStopWords = removeStopWords(tokenized)
print("\n".join(["Before: " + tweet, "After: " + str(withoutStopWords)]))

Before: And now for something !!don't I'm completely different the http://google.fr
After: ['And', 'now', 'for', 'something', '!', '!', 'do', "n't", 'I', "'m", 'completely', 'different']


### Compute Part-Of-Speech tags 

In [69]:
# Tag every remaining token with its part of speech and show the (token, tag) pairs.
pos = pos_tag(withoutStopWords)
print("\n".join(["Before: " + tweet, "After: " + str(pos)]))

Before: And now for something !!don't I'm completely different the http://google.fr
After: [('And', 'CC'), ('now', 'RB'), ('for', 'IN'), ('something', 'NN'), ('!', '.'), ('!', '.'), ('do', 'VBP'), ("n't", 'RB'), ('I', 'PRP'), ("'m", 'VBP'), ('completely', 'RB'), ('different', 'JJ')]


### Remove punctuation and build bigram features

In [70]:
# Drop tokens made up solely of punctuation characters.  Checking every character
# (instead of `k not in string.punctuation`, which is a substring test) also removes
# multi-character tokens such as "..." while keeping tokens like "n't".
features_pos = [(k, v) for (k, v) in pos
                if not all(ch in string.punctuation for ch in k)]
# Materialize the bigrams so they print as a list even when find_ngrams is lazy.
features_ngrams = list(find_ngrams([k for (k, _) in features_pos]))
print ("Features set 1: " + str(features_pos))
nl()
print ("Features set 2: " + str(features_ngrams))

Features set 1: [('And', 'CC'), ('now', 'RB'), ('for', 'IN'), ('something', 'NN'), ('do', 'VBP'), ("n't", 'RB'), ('I', 'PRP'), ("'m", 'VBP'), ('completely', 'RB'), ('different', 'JJ')]


Features set 2: [('And', 'now'), ('now', 'for'), ('for', 'something'), ('something', 'do'), ('do', "n't"), ("n't", 'I'), ('I', "'m"), ("'m", 'completely'), ('completely', 'different')]


# Feature engineering (Please bring some coffee)

In [164]:
# Tokenize the neutral tweets and part-of-speech tag each of them.
neutral_pos = getPosTweetsFromTokenized(getTokenizedTweets(neu_tweets))

In [None]:
# Flatten the tagged neutral tweets and drop tokens made up solely of punctuation
# characters (a per-character check; `k not in string.punctuation` is only a substring test).
# NOTE(review): this duplicates the purpose of the `neu_pos` cell that follows, and rebinding
# `features_pos` replaces the single-tweet feature list built in the walkthrough above.
features_pos = [(k, v) for (k, v) in chain2(neutral_pos)
                if not all(ch in string.punctuation for ch in k)]

In [274]:
# Flatten the tagged neutral tweets into (token, tag) pairs and drop tokens made up
# solely of punctuation characters.  Every character is checked because
# `k not in string.punctuation` is a substring test that lets tokens like "..." through.
neu_pos = [(k, v) for (k, v) in chain2(neutral_pos)
           if not all(ch in string.punctuation for ch in k)]

In [279]:
# Count how often each part-of-speech tag occurs among the neutral tokens.
# A generator expression with its own names is used because a Python 2 list
# comprehension leaks its loop variables and would overwrite the earlier `pos` variable.
neu_pos_dist = collections.Counter(tag for (_word, tag) in neu_pos)

In [281]:
# Plot the tag counts of the neutral tweets as a bar chart.
# NOTE(review): plotTagDistrubition labels the legend 'Negative' unless told otherwise,
# which is misleading for this neutral distribution -- confirm the intended label.
plotTagDistrubition(neu_pos_dist)