# Step 0 : Downloading/loading libraries and data 

## Downloading data

In [33]:
# Downloading tweets
# NOTE: 172.22.218.81 is a private-network (RFC 1918) address, so this only works from that network.
# -nc (no-clobber) skips the download when the archive is already present, so re-running
# the cell does not pile up tweets+labels.tsv.gz.1, .2, ... copies.
! wget -nc http://172.22.218.81/tweets+labels.tsv.gz

--2016-10-31 21:46:01--  http://172.22.218.81/tweets+labels.tsv.gz
Connecting to 172.22.218.81:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 79966529 (76M) [application/octet-stream]
Saving to: ‘tweets+labels.tsv.gz’


2016-10-31 21:46:32 (2.48 MB/s) - ‘tweets+labels.tsv.gz’ saved [79966529/79966529]



In [34]:
## Extracting and cleaning tweets
# -f overwrites a previously extracted tweets+labels.tsv, so the cell can be re-run
# without gunzip refusing (or prompting) because the output file already exists.
! gunzip -f tweets+labels.tsv.gz
# Delete every carriage-return character and write the result to cleaned_dataset.tsv.
! tr -d "\r" < tweets+labels.tsv > cleaned_dataset.tsv

## Downloading libraries

In [35]:
# Downloading required nltk data
import nltk  # needed here: this cell runs before the "Loading libraries" cell, which only imports names *from* nltk

nltk.download('punkt') ## data used for the word tokenizer
nltk.download("wordnet") ## data used for the lemmatizer
nltk.download("stopwords") ## data used for the English stop-word list (stopwords.words("english"))

[nltk_data] Downloading package punkt to /home/ds/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ds/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Loading libraries

In [36]:
import pandas as pd
from nltk import word_tokenize, WordNetLemmatizer

# Step 1 : Loading data

In [37]:
# Read the cleaned tab-separated file. It has no header row, so the three
# columns are named explicitly.
data = pd.read_csv(
    "cleaned_dataset.tsv",
    sep="\t",
    header=None,
    names=["id", "sentiment", "text"],
)

In [38]:
## Sampling 6800 tweets positive / negative / neutral
N_PER_CLASS = 6800  # number of tweets drawn from each sentiment class
SAMPLE_SEED = 42    # fixed seed so that re-running the notebook draws the same tweets

pos = data[data["sentiment"] == "positive"].sample(N_PER_CLASS, random_state=SAMPLE_SEED)
neg = data[data["sentiment"] == "negative"].sample(N_PER_CLASS, random_state=SAMPLE_SEED)
neu = data[data["sentiment"] == "neutral"].sample(N_PER_CLASS, random_state=SAMPLE_SEED)

In [39]:
## Combining pos, neg, neu
# sample(frac=1) returns every row in random order, i.e. it shuffles the combined
# frame; random_state pins that shuffle so a re-run yields the same row order.
dat = pd.concat([pos, neg, neu]).sample(frac=1, random_state=42)

# Step 2 : Preprocessing

In [23]:
# One shared WordNetLemmatizer instance; preprocess() calls its lemmatize()
# method on every token.
wordnet_lemmatizer = WordNetLemmatizer()

In [24]:
def preprocess(sentence):
    """Turn one tweet into a list of lowercased, lemmatized tokens.

    Args:
        sentence: The tweet, either as text or as UTF-8 encoded bytes.

    Returns:
        list: One entry per token produced by ``word_tokenize``; each token
        is lowercased and then passed through
        ``wordnet_lemmatizer.lemmatize``.
    """
    # Only byte strings need decoding. Calling .decode() unconditionally
    # raises AttributeError on Python 3, where text is already str.
    if isinstance(sentence, bytes):
        sentence = sentence.decode('utf-8')
    return [wordnet_lemmatizer.lemmatize(token.lower()) for token in word_tokenize(sentence)]

Splitting the text on whitespace and punctuation marks – the tools used for this purpose are called tokenizers, and you can use a tokenizer provided with NLTK.

In [26]:
# Preview the first five rows of the shuffled sample.
dat.head(n=5)

Unnamed: 0,id,sentiment,text
933680,790230694211489792,positive,@BonusFPL oh dear I've got 1 Chelsea :)
1726869,unavailable,neutral,I remember when you tweeted about it lol https...
21568,788102159288020992,negative,@combusts we failed to protect mia :(
1036210,790293176208723968,negative,@LyricOfWisdom no melee pools yet :(
1133364,790361468428574720,negative,You'll never know. I hope the next person you ...


In [173]:
def tokenizer(x):
    """Split a tweet into tokens with NLTK's ``word_tokenize``.

    Args:
        x: The tweet, either as text or as UTF-8 encoded bytes.

    Returns:
        list: The tokens returned by ``word_tokenize``.
    """
    # Decode only byte strings; text has no .decode() on Python 3.
    if isinstance(x, bytes):
        x = x.decode('utf-8')
    # Use the word_tokenize name imported in the "Loading libraries" cell:
    # the bare `nltk` module name is not imported there.
    return word_tokenize(x)

In [29]:
# Replace each tweet's text with the token list returned by preprocess().
# NOTE: this overwrites the "text" column, so the cell is not safe to run twice
# on the same frame.
tokenized_text = dat["text"].apply(preprocess)
dat["text"] = tokenized_text

In [184]:
# Preview the first five rows; "text" now holds token lists.
dat.head(n=5)

Unnamed: 0,id,sentiment,text
1383809,790558958679302144,positive,"[@, matthewlau18, @, jeraldawj, trade, for, mb..."
1825340,unavailable,neutral,"[@, theeconomist, it, appears, poll, will, ret..."
826728,790153184236961792,positive,"[post, scene…, lovely, bruise, :, ), http, :, ..."
1188372,790398248418553856,negative,"[all, i, want, in, life, is, to, hug, a, panda..."
1720396,unavailable,neutral,"[griersunflower, http, :, //t.co/gjbcxzpzt1]"


Linking the different forms of the same word (for example, price and prices, is and are) to each other – the tools that can do this are called lemmatizers, and you can again use one of those that come with NLTK.

Converting all words to lowercase so that the classifier does not treat People, people and PEOPLE as three separate features.

In [185]:
# Show a bounded preview instead of evaluating the whole frame.
dat.head(10)

Unnamed: 0,id,sentiment,text
1383809,790558958679302144,positive,"[@, matthewlau18, @, jeraldawj, trade, for, mb..."
1825340,unavailable,neutral,"[@, theeconomist, it, appears, poll, will, ret..."
826728,790153184236961792,positive,"[post, scene…, lovely, bruise, :, ), http, :, ..."
1188372,790398248418553856,negative,"[all, i, want, in, life, is, to, hug, a, panda..."
1720396,unavailable,neutral,"[griersunflower, http, :, //t.co/gjbcxzpzt1]"
537744,789904006978035713,positive,"[@, keplermessiah, @, maggyw519, @, ejlandwehr..."
1525166,790636482532958208,positive,"[@, everythingrobyn, @, seattlegators, we, 'd,..."
1704587,unavailable,neutral,"[happy, birthday, !, !, !, wish, u, many, more..."
69935,782720905566838784,positive,"[@, hh_kevriel91, thanks, for, welcoming, ,, k..."
1717237,unavailable,neutral,"[kai, is, not, a, good, guy, ., he, is, not, a..."


In [186]:
# Step 3 : Feature extraction

In [187]:
from nltk.corpus import stopwords

In [189]:
# English stop words from NLTK's stopwords corpus (that corpus must be present
# in nltk_data for this call to succeed).
stoplist = stopwords.words("english")

In [191]:
# Extra tokens to drop in addition to the English stop words: "@" appears as
# its own token in the tokenized tweets shown above; "rt" is presumably the
# retweet marker (TODO confirm).
twitter_stop_words = ["@", "rt"]

In [196]:
def remove_stop_word(list_of_words):
    """Return the words of ``list_of_words`` that are not stop words.

    A word is dropped when it appears in ``stoplist`` or in
    ``twitter_stop_words``; the order of the remaining words is preserved.

    Args:
        list_of_words: Iterable of token strings.

    Returns:
        list: The tokens that are not stop words.
    """
    # Build the lookup once per call rather than concatenating the two lists
    # for every single word; a set also makes each membership test constant-time.
    blocked = set(stoplist).union(twitter_stop_words)
    return [word for word in list_of_words if word not in blocked]

In [197]:
# Quick sanity check: the stop word "a" is dropped and "dog" is kept.
remove_stop_word("a dog".split())

['dog']

In [198]:
# Filter the stop words out of every token list (overwrites the "text" column).
filtered_text = dat["text"].apply(remove_stop_word)
dat["text"] = filtered_text

In [201]:
from collections import Counter

In [202]:
def get_features(text, setting):
    """Build a feature dictionary for one piece of raw text.

    The text is run through ``preprocess`` and every token found in
    ``stoplist`` is discarded.

    Args:
        text: Raw text to featurize.
        setting: ``'bow'`` for bag-of-words counts; any other value yields
            presence features.

    Returns:
        dict: token -> occurrence count when ``setting == 'bow'``,
        otherwise token -> ``True``.
    """
    kept = [token for token in preprocess(text) if token not in stoplist]
    if setting != 'bow':
        # Presence features: each remaining token maps to True.
        return dict.fromkeys(kept, True)
    # Bag of words: each remaining token maps to how often it occurs.
    return dict(Counter(kept))

In [None]:
# `all_emails` is not defined anywhere in this notebook; the labelled data lives in `dat`.
# By this point dat["text"] already holds lemmatized, stop-word-filtered token lists,
# so the bag-of-words counts are taken directly from those tokens (running them through
# get_features would tokenize them a second time and fail on a list).
all_features = [
    (dict(Counter(tokens)), label)
    for tokens, label in zip(dat["text"], dat["sentiment"])
]