# Step 0 : Downloading/loading libraries and data 

## Downloading data

In [1]:
# Downloading tweets 
! wget http://172.22.218.81/tweets+labels.tsv.gz

--2016-11-16 09:16:18--  http://172.22.218.81/tweets+labels.tsv.gz
Connecting to 172.22.218.81:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 79966529 (76M) [application/octet-stream]
Saving to: ‘tweets+labels.tsv.gz’


2016-11-16 09:16:47 (2.66 MB/s) - ‘tweets+labels.tsv.gz’ saved [79966529/79966529]



In [2]:
## Extracting and cleaning tweets 
! gunzip tweets+labels.tsv.gz 
! cat tweets+labels.tsv | tr -d "\r" > cleaned_dataset.tsv

## Downloading libraries

In [3]:
import nltk
# Downloading required nltk data 
nltk.download('punkt') ## data used for the word tokenizer 
nltk.download("wordnet") ## data used for the lemmatizer
nltk.download("stopwords") ## data used for removing stopwords

[nltk_data] Downloading package punkt to /home/ds/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /home/ds/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /home/ds/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

## Loading libraries

In [4]:
# pandas for data manipulation 
import pandas as pd
# nltk for natural language utilities 
from nltk import word_tokenize, WordNetLemmatizer
# re stands for Regular Expression, this lib gathers tools to work with regex
import re
# string is useful to access punctuation characters list
import string
# useful to access english stop words list
from nltk.corpus import stopwords

# Step 1 : loading data 

In [5]:
data = pd.read_csv("cleaned_dataset.tsv", sep= "\t", header=None, names=["id", "sentiment", "text"])

In [6]:
## Sampling 6800 tweets positive / negative / neutral 
pos = data[data["sentiment"] == "positive"].sample(6800)
neg = data[data["sentiment"] == "negative"].sample(6800)
neu = data[data["sentiment"] == "neutral"].sample(6800)

In [7]:
## Combining pos, neg, neu 
dat = pd.concat([pos, neg, neu]).sample(frac=1)

# Step 2 : Preprocessing

### Defining some custom utilities to process tweets

In [8]:
# removes links, usernames, twitter special word 'RT' (retweet) and emoticons
def process(tweet):
    p = re.compile('(http[s]*://[^\s]+|@[^\s]*|RT|(\:\w+\:|\<[\/\\]?3|[\(\)\\\D|\*\$][\-\^]?[\:\;\=]|[\:\;\=B8][\-\^]?[3DOPp\@\$\*\\\)\(\/\|])(?=\s|[\!\.\?]|$))|&\w*')
    return p.sub("", tweet)

In [9]:
# linking the different forms of the same word (for example, price and prices, is and are) to each other
wordnet_lemmatizer = WordNetLemmatizer()
lemmatizer = lambda list_of_words : [wordnet_lemmatizer.lemmatize(word) for word in list_of_words]

In [10]:
# Tokenize i.e. transform a sentence in a list of words
tokenizer = lambda x: nltk.word_tokenize(x.decode('utf-8'))

In [11]:
# removes stop words and converts to lowercase
remove_stop_word = lambda list_of_words: [word.lower() for word in list_of_words if word not in stopwords.words("english")]

In [12]:
remove_punctuation = lambda list_of_words: [word for word in list_of_words if word not in string.punctuation]

### Applying text processing

In [13]:
dat["text"] = dat["text"].apply(process).apply(tokenizer).apply(remove_stop_word).apply(remove_punctuation).apply(lemmatizer)

# Step 3 : Feature extraction

bi-gram works better for our problem (A.Pak & P. Paroubek)

### Constructing bi-grams

In [15]:
# source : http://locallyoptimal.com/blog/2013/01/20/elegant-n-gram-generation-in-python/
def find_ngrams(input_list, n):
  return zip(*[input_list[i:] for i in range(n)])

In [28]:
def test_find_ngrams():
    words = ["There", "is", "a", "dog", "out", "there"]
    print(find_ngrams(words, 2))
    assert(find_ngrams(words, 2)==[('There', 'is'), ('is', 'a'), ('a', 'dog'), ('dog', 'out'), ('out', 'there')])
test_find_ngrams()

[('There', 'is'), ('is', 'a'), ('a', 'dog'), ('dog', 'out'), ('out', 'there')]


Hello  World     


In [186]:
# Feature extraction

In [14]:
dat

Unnamed: 0,id,sentiment,text
244437,783352688826122240,positive,"[trickster, ep1, pretty, promising, wow, i, n'..."
283925,783435526040788992,negative,"[fricken, hoo]"
1393047,790564385076940800,positive,[]
318641,783541537808482304,positive,"[h, vape, god, glad, spent, whole, summer, w, ..."
1718342,unavailable,neutral,"[be, back, bit, ✌🏻]"
645777,789982378294775808,negative,"[aw, i, needed, go]"
1407474,790572397837504512,positive,"[hey, you, check, live, piano, cover, heathen,..."
1160505,790378489631248384,negative,"[shoulder, cry, come]"
1690844,unavailable,positive,"[consider, long, way, 've, come, today]"
1693940,unavailable,neutral,"[i, 'm, glad, 're, page]"


In [201]:
from collections import Counter

In [202]:
def get_features(text, setting):
    if setting=='bow':
        return {word: count for word, count in Counter(preprocess(text)).items() if not word in stoplist}
    else:
        return {word: True for word in preprocess(text) if not word in stoplist}

In [None]:
all_features = [(get_features(email, 'bow'), label) for (email, label) in all_emails]