# Step 0 : Downloading/loading libraries and data 

## Downloading data

In [5]:
# Downloading tweets 
! wget http://172.22.218.81/tweets+labels.tsv.gz

--2016-11-20 13:51:21--  http://172.22.218.81/tweets+labels.tsv.gz
Connecting to 172.22.218.81:80... failed: Connection refused.


In [None]:
## Extracting and cleaning tweets 
! gunzip tweets+labels.tsv.gz 
! cat tweets+labels.tsv | tr -d "\r" > cleaned_dataset.tsv

gzip: tweets+labels.tsv already exists; do you wish to overwrite (y or n)? 

## Downloading libraries

In [9]:
import nltk
# Downloading required nltk data 
nltk.download('punkt') ## data used for the word tokenizer 
nltk.download("wordnet") ## data used for the lemmatizer
nltk.download("stopwords") ## data used for removing stopwords

[nltk_data] Downloading package punkt to /home/ds/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ds/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/ds/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Loading libraries

In [172]:
# pandas for data manipulation 
import pandas as pd
# nltk for natural language utilities 
from nltk import word_tokenize, WordNetLemmatizer
# re stands for Regular Expression, this lib gathers tools to work with regex
import re
# string is useful to access punctuation characters list
import string
# useful to access english stop words list
from nltk.corpus import stopwords

# Step 1 : loading data 

In [74]:
data = pd.read_csv("cleaned_dataset.tsv", sep= "\t", header=None, names=["id", "sentiment", "text"])

In [75]:
## Sampling 6800 tweets positive / negative / neutral 
pos = data[data["sentiment"] == "positive"].sample(6800)
neg = data[data["sentiment"] == "negative"].sample(6800)
neu = data[data["sentiment"] == "neutral"].sample(6800)

In [210]:
## Combining pos, neg, neu 
dat = pd.concat([pos, neg, neu]).sample(frac=1)

# Step 2 : Preprocessing

### Defining some custom utilities to process tweets

In [211]:
# removes links, usernames, twitter special word 'RT' (retweet) and emoticons
def process(tweet):
    p = re.compile('(http[s]*://[^\s]+|@[^\s]*|RT|(\:\w+\:|\<[\/\\]?3|[\(\)\\\D|\*\$][\-\^]?[\:\;\=]|[\:\;\=B8][\-\^]?[3DOPp\@\$\*\\\)\(\/\|])(?=\s|[\!\.\?]|$))|&\w*')
    return p.sub("", tweet)

In [212]:
# linking the different forms of the same word (for example, price and prices, is and are) to each other
wordnet_lemmatizer = WordNetLemmatizer()
lemmatizer = lambda list_of_words : [wordnet_lemmatizer.lemmatize(word) for word in list_of_words]

In [213]:
# Tokenize i.e. transform a sentence in a list of words
tokenizer = lambda x: nltk.word_tokenize(x.decode('utf-8'))

In [214]:
# removes stop words and converts to lowercase
remove_stop_word = lambda list_of_words: [word.lower() for word in list_of_words if word not in stopwords.words("english")]

In [215]:
remove_punctuation = lambda list_of_words: [word for word in list_of_words if word not in string.punctuation]

### Applying text processing

In [216]:
dat["text"] = dat["text"].apply(process).apply(tokenizer).apply(remove_stop_word).apply(remove_punctuation).apply(lemmatizer)

In [217]:
dat.head()["text"]

1597378                       [awww, hockey, went, baseball]
1731055    [ny, time, risking, integrity, campaign, hillary]
1500931             [it, 's, always, freezing, cold, office]
1276011    [first, time, baking, new, flat, think, ruined...
582523     [awesome, ahah, be, honest, video, man, we, ge...
Name: text, dtype: object

# Step 3 : Feature extraction

bi-gram works better for our problem (A.Pak & P. Paroubek)

### Constructing bi-grams

In [218]:
# source : http://locallyoptimal.com/blog/2013/01/20/elegant-n-gram-generation-in-python/
def find_ngrams(input_list, n):
  return zip(*[input_list[i:] for i in range(n)])

In [219]:
def test_find_ngrams():
    words = ["There", "is", "a", "dog", "out", "there"]
    print(find_ngrams(words, 2))
    assert(find_ngrams(words, 2)==[('There', 'is'), ('is', 'a'), ('a', 'dog'), ('dog', 'out'), ('out', 'there')])
test_find_ngrams()

[('There', 'is'), ('is', 'a'), ('a', 'dog'), ('dog', 'out'), ('out', 'there')]


In [220]:
def find_bigrams(input_list):
  return find_ngrams(input_list,2)

In [221]:
dat["bigrams"] = dat["text"].apply(find_bigrams)

In [222]:
dat.head()

Unnamed: 0,id,sentiment,text,bigrams
1597378,unavailable,negative,"[awww, hockey, went, baseball]","[(awww, hockey), (hockey, went), (went, baseba..."
1731055,unavailable,neutral,"[ny, time, risking, integrity, campaign, hillary]","[(ny, time), (time, risking), (risking, integr..."
1500931,790624145113350144,negative,"[it, 's, always, freezing, cold, office]","[(it, 's), ('s, always), (always, freezing), (..."
1276011,790480414284455940,negative,"[first, time, baking, new, flat, think, ruined...","[(first, time), (time, baking), (baking, new),..."
582523,789933764642213889,positive,"[awesome, ahah, be, honest, video, man, we, ge...","[(awesome, ahah), (ahah, be), (be, honest), (h..."


In [223]:
def get_bigrams_in_tweets(tweet_bigrams):
    all_bigrams = []
    for bigrams in tweet_bigrams:
        all_bigrams.extend(bigrams)
    return all_bigrams

In [140]:
get_bigrams_in_tweets(dat["bigrams"]) 

[(u'trump', u'happens'),
 (u'happens', u'allegedly'),
 (u'allegedly', u'good'),
 (u'good', u'people'),
 (u'people', u'stand'),
 (u'stand', u'silent'),
 (u'silent', u'there'),
 (u'there', u'trump'),
 (u'trump', u'team'),
 (u'team', u'noeffingwaytrump'),
 (u'hey', u'sorry'),
 (u'sorry', u'annoying'),
 (u'annoying', u'rt'),
 (u'rt', u'help'),
 (u'help', u'much'),
 (u'much', u'please'),
 (u'please', u'\U0001f614'),
 (u'i', u'disappointed'),
 (u'disappointed', u'find'),
 (u'find', u'1'),
 (u'1', u'product'),
 (u'product', u'shelf'),
 (u'shelf', u'hair'),
 (u'hair', u'type'),
 (u'cute', u'i'),
 (u'i', u"n't"),
 (u"n't", u'collect'),
 (u'collect', u'idolish7'),
 (u'idolish7', u'thing'),
 (u'good', u'night'),
 (u'night', u'chel'),
 (u'chel', u'\u2764\ufe0f'),
 (u"'s", u'weird'),
 (u'weird', u'one'),
 (u'one', u'but'),
 (u'but', u'i'),
 (u'i', u'really'),
 (u'really', u'hope'),
 (u'hope', u"n't"),
 (u"n't", u'fucked'),
 (u'feel', u'better'),
 (u'better', u'si'),
 (u'let', u'drift'),
 (u'ca', u"

In [224]:
def get_bigram_features(bigramlist):
    bigramlist = nltk.FreqDist(bigramlist)
    bigram_features = bigramlist.keys()
    return bigram_features

In [290]:
bigram_features = get_bigram_features(get_bigrams_in_tweets(dat.head()["bigrams"]))

In [291]:
bigram_features

[(u'time', u'baking'),
 (u'campaign', u'hillary'),
 (u'hockey', u'went'),
 (u'video', u'man'),
 (u'awesome', u'ahah'),
 (u'risking', u'integrity'),
 (u'genuinely', u'notice'),
 (u"'s", u'always'),
 (u'upside-down', u'cake'),
 (u'banana', u'upside-down'),
 (u'think', u'ruined'),
 (u'time', u'risking'),
 (u'cake', u'fml'),
 (u'we', u'genuinely'),
 (u'freezing', u'cold'),
 (u'always', u'freezing'),
 (u'ny', u'time'),
 (u'integrity', u'campaign'),
 (u'went', u'baseball'),
 (u'it', u"'s"),
 (u'ruined', u'banana'),
 (u'baking', u'new'),
 (u'first', u'time'),
 (u'ahah', u'be'),
 (u'cold', u'office'),
 (u'honest', u'video'),
 (u'be', u'honest'),
 (u'new', u'flat'),
 (u'man', u'we'),
 (u'flat', u'think'),
 (u'awww', u'hockey')]

In [292]:
for bigram in bigram_features : 
    dat[str(bigram)] = dat["bigrams"].apply(lambda x: bigram in x)

In [301]:
from sklearn import preprocessing
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(dat.iloc[:,4:], dat.iloc[:,1], test_size=0.4, random_state=0)

MemoryError: 

In [296]:
from sklearn.naive_bayes import GaussianNB
# fit a Naive Bayes model to the data
model = GaussianNB()
model.fit(X_train, y_train)
print(model)


GaussianNB()


In [297]:
# make predictions
expected = y_test
predicted = model.predict(X_test)

In [299]:
from sklearn import metrics

# summarize the fit of the model
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))

             precision    recall  f1-score   support

   negative       0.00      0.00      0.00         2
    neutral       0.00      0.00      0.00         0

avg / total       0.00      0.00      0.00         2

[[0 2]
 [0 0]]


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [300]:
X_train

Unnamed: 0,"(u'time', u'baking')","(u'campaign', u'hillary')","(u'hockey', u'went')","(u'video', u'man')","(u'awesome', u'ahah')","(u'risking', u'integrity')","(u'genuinely', u'notice')","(u""'s"", u'always')","(u'upside-down', u'cake')","(u'banana', u'upside-down')",...,"(u'slaved', u'stove')","(u'billionaire', u'buffet')","(u'know', u'much')","(u'working', u'2')","(u'trigger', u'go')","(u'free', u'till')","(u'small', u'blessing')","(u'continue', u'page')","(u'robb', u'bank')","(u'month', u""n't"")"
1731055,False,True,False,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1276011,True,False,False,False,False,False,False,False,True,True,...,False,False,False,False,False,False,False,False,False,False
582523,False,False,False,True,True,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [226]:
def extract_features(tweet):
    tweet_bigrams = set(find_bigrams(lemmatizer(remove_punctuation(remove_stop_word(tokenizer(process(tweet)))))))
    #features = {}
    features = []
    for bigram in bigram_features:
        #features['contains(%s)' % str(bigram)] = (bigram in tweet_bigrams)
        features.append(bigram in tweet_bigrams)
    return features

In [227]:
find_bigrams(lemmatizer(remove_punctuation(remove_stop_word(tokenizer(process("The dog is in the kitchen and I hate it"))))))

[(u'the', u'dog'), (u'dog', u'kitchen'), (u'kitchen', u'i'), (u'i', u'hate')]

In [282]:
dat.head().iloc[:,1]

1597378    negative
1731055     neutral
1500931    negative
1276011    negative
582523     positive
Name: sentiment, dtype: object

In [285]:
dat.head().iloc[:,4:]

Unnamed: 0,"(u'time', u'baking')","(u'campaign', u'hillary')","(u'hockey', u'went')","(u'video', u'man')","(u'awesome', u'ahah')","(u'risking', u'integrity')","(u'genuinely', u'notice')","(u""'s"", u'always')","(u'upside-down', u'cake')","(u'banana', u'upside-down')",...,"(u'baking', u'new')","(u'first', u'time')","(u'ahah', u'be')","(u'cold', u'office')","(u'honest', u'video')","(u'be', u'honest')","(u'new', u'flat')","(u'man', u'we')","(u'flat', u'think')","(u'awww', u'hockey')"
1597378,False,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
1731055,False,True,False,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1500931,False,False,False,False,False,False,False,True,False,False,...,False,False,False,True,False,False,False,False,False,False
1276011,True,False,False,False,False,False,False,False,True,True,...,True,True,False,False,False,False,True,False,True,False
582523,False,False,False,True,True,False,True,False,False,False,...,False,False,True,False,True,True,False,True,False,False


In [235]:
from sklearn import preprocessing
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(dat["text"], dat["sentiment"], test_size=0.4, random_state=0)

In [194]:
training_set = nltk.classify.apply_features(extract_features, X_train.tolist())

In [175]:
import numpy as np
import scipy
import sklearn

In [51]:
# fit a Naive Bayes model to the data
model = GaussianNB()
model.fit(dataset.data, dataset.target)
print(model)
# make predictions
expected = dataset.target
predicted = model.predict(dataset.data)
# summarize the fit of the model
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))

Hello  World     


In [242]:
[extract_features(X) for X in X_train.tolist()]

AttributeError: 'list' object has no attribute 'decode'

In [237]:
X_train.tolist()

[[u'did', u"n't", u'get', u'picked', u'purchase', u'world', u'series', u'tix'],
 [u'beautiful-model',
  u'dress',
  u'the',
  u'fabric',
  u'seems',
  u'sheer',
  u'style',
  u'good',
  u'see',
  u'``',
  u'runway',
  u"''"],
 [u'life',
  u'short',
  u'i',
  u'lost',
  u'good',
  u'friend',
  u'enjoy',
  u'moment',
  u'appreciate',
  u'people',
  u'love',
  u'never',
  u'know',
  u'\U0001f622'],
 [u'darya',
  u'prystupa',
  u'ukrainian',
  u'400m',
  u'runner',
  u'diagnosed',
  u'cancer',
  u'sadnews',
  u'athletics',
  u'prystupa'],
 [u'link', u'po', u'please\U0001f62d\U0001f64f\U0001f618'],
 [u'well', u'least', u'bernie', u'supporter', u'alternative'],
 [u'i',
  u'know',
  u'puppy',
  u'miss',
  u'since',
  u'stuck',
  u'cause',
  u'i',
  u"n't",
  u'home',
  u'day'],
 [u'phone', u'exo'],
 [u'thanks',
  u'allesandra',
  u'\U0001f60a\U0001f389',
  u'yeah',
  u'i',
  u'wish',
  u'i',
  u'\U0001f62d'],
 [u'nooo',
  u'i',
  u"'ve",
  u'trying',
  u'multiple',
  u'time',
  u'really',
  u

In [195]:
classifier = nltk.NaiveBayesClassifier.train(training_set)

In [196]:
print (classifier.most_informative_features(5))

[("contains((u'like', u'\\U0001f440\\U0001f44f\\U0001f3fd'))", False), ("contains((u'really', u'quite'))", False), ("contains((u'job', u'look'))", False), ("contains((u'look', u'perfect'))", False), ("contains((u'close', u'outside'))", False)]


In [197]:
tweet = "Larry is my friend"

In [201]:
print (classifier.classify(extract_features(tweet)))

xx


In [186]:
# Feature extraction

In [201]:
from collections import Counter

In [202]:
def get_features(text, setting):
    if setting=='bow':
        return {word: count for word, count in Counter(preprocess(text)).items() if not word in stoplist}
    else:
        return {word: True for word in preprocess(text) if not word in stoplist}

In [None]:
all_features = [(get_features(email, 'bow'), label) for (email, label) in all_emails]