# Importing necessary libraries

In [293]:
import pandas as pd  
import numpy as np
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import re
from nltk.tokenize import WordPunctTokenizer

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# Preparing dataset

More information about the dataset can be found here - http://help.sentiment140.com/for-students/ .
<br/>Tweets are labelled either positive or negative.

The data is a CSV with emoticons removed. Data file format has 6 fields:

0 - the polarity of the tweet (0 = negative, 2 = neutral, 4 = positive)
<br/>1 - the id of the tweet (2087)
<br/>2 - the date of the tweet (Sat May 16 23:58:44 UTC 2009)
<br/>3 - the query (lyx). If there is no query, then this value is NO_QUERY.
<br/>4 - the user that tweeted (robotickilldozr)
<br/>5 - the text of the tweet (Lyx is cool)

In [108]:
# Column names for the six header-less fields of the Sentiment140 CSV.
columns = ['sentiment', 'id', 'date', 'query_string', 'user', 'text']

# The file has no header row, so the names are supplied explicitly;
# it is read as latin-1 rather than UTF-8.
df = pd.read_csv(
    "data/training.1600000.processed.noemoticon.csv",
    names=columns,
    header=None,
    encoding='latin-1',
)

In [72]:
# Preview the first rows of the raw data.
df.head()

Unnamed: 0,sentiment,id,date,query_string,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [73]:
# Count how many tweets carry each polarity label.
df.sentiment.value_counts()

4    800000
0    800000
Name: sentiment, dtype: int64

As we can see, the dataset is perfectly balanced: 50% of the tweets are labelled negative and the other 50% positive.

To make the dataset more convenient to work with, I will remove the unnecessary columns and change the value of the positive label from 4 to 1.

In [109]:
# Keep only the label and the tweet text. Re-assigning the result (instead of
# dropping in place) with errors='ignore' makes the cell safe to re-run: columns
# that are already gone are simply skipped rather than raising a KeyError.
df = df.drop(columns=['id','date','query_string','user'], errors='ignore')

# Recode the positive label 4 -> 1. The identity entry 1 -> 1 keeps the cell
# idempotent: without it, a second run would map the already-recoded 1s to NaN.
df['sentiment'] = df['sentiment'].map({0: 0, 4: 1, 1: 1})
df.sentiment.value_counts()

1    800000
0    800000
Name: sentiment, dtype: int64

In [110]:
# Preview the first 20 rows after dropping the unused columns and recoding the label.
df.head(20)

Unnamed: 0,sentiment,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."
5,0,@Kwesidei not the whole crew
6,0,Need a hug
7,0,@LOLTrish hey long time no see! Yes.. Rains a...
8,0,@Tatiana_K nope they didn't have it
9,0,@twittera que me muera ?


# Data preparation

Step 1: HTML decoding

In [111]:
# Pick an example tweet whose text contains HTML entities (&amp;, &quot;).
tweet = df.text[492]
tweet

"pears &amp; Brie, bottle of Cabernet, and &quot;Win a Date With Tad Hamilton&quot;... oh gawwd my life flashed forward to when I'm 40 with my 75 cats "

In [112]:
# Parse the tweet as HTML and extract the plain text, which turns the
# entities back into the characters they stand for.
example1 = BeautifulSoup(tweet, 'html.parser')
print(example1.get_text())

pears & Brie, bottle of Cabernet, and "Win a Date With Tad Hamilton"... oh gawwd my life flashed forward to when I'm 40 with my 75 cats 


Step 2: dealing with @mention

In [113]:
# Pick an example tweet that starts with an @mention.
tweet = df.text[19]
tweet

'@FakerPattyPattz Oh dear. Were you drinking out of the forgotten table drinks? '

In [114]:
# Strip @mentions. Twitter handles may contain underscores as well as letters
# and digits, so '_' is part of the character class; otherwise a handle such as
# "@Tatiana_K" would only be removed up to the underscore.
re.sub(r'@[A-Za-z0-9_]+','',tweet)

' Oh dear. Were you drinking out of the forgotten table drinks? '

Step 3: dealing with URL links

In [305]:
# Pick an example tweet that contains an http:// link.
tweet = df.text[0]
tweet

"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D"

In [307]:
# Strip http:// and https:// links. A URL is taken to run up to the next space,
# so links containing '-', '_', '?', '=', '&' etc. are removed in full instead
# of leaving their tail behind in the text.
re.sub(r'https?://[^ ]+','', tweet)

"@switchfoot  - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D"

In [308]:
# Strip links written without a scheme ("www.example.com"). The dot is escaped
# so that only a literal "www." prefix matches; an unescaped '.' matches any
# character and would also delete stretched words such as "awwwwww".
re.sub(r'www\.[^ ]+','', tweet)

"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D"

Step 4: UTF-8 BOM (Byte Order Mark)

In [179]:
# Pick an example tweet that shows mis-decoded characters.
tweet = df.text[226]
tweet

'Tuesdayï¿½ll start with reflection ï¿½n then a lecture in Stress reducing techniques. That sure might become very useful for us accompaniers '

In [180]:
# Encode the string back to bytes as ISO-8859-1, then decode those bytes as
# UTF-8 ('utf-8-sig' additionally drops a leading BOM if one is present).
testing = bytes(tweet,'iso-8859-1').decode('utf-8-sig')
testing

'Tuesday�ll start with reflection �n then a lecture in Stress reducing techniques. That sure might become very useful for us accompaniers '

In [181]:
# Swap the Unicode replacement character (U+FFFD) for a plain "?".
testing.replace("\ufffd", "?")

'Tuesday?ll start with reflection ?n then a lecture in Stress reducing techniques. That sure might become very useful for us accompaniers '

Step 5: hashtag / numbers

In [291]:
# Pick an example tweet that contains a hashtag.
tweet = df.text[175]
tweet

"@machineplay I'm so sorry you're having to go through this. Again.  #therapyfail"

In [292]:
# Replace every character that is not an ASCII letter (digits, punctuation,
# '#', '@', ...) with a space.
re.sub("[^a-zA-Z]", " ", tweet)

' machineplay I m so sorry you re having to go through this  Again    therapyfail'

Now I'll put all these methods into one function in order to apply them to the whole dataset. (TODO: CLASS IN THE FILE.)




Below is the updated data-cleaning function. The cleaning steps are applied in this order:

    Souping (HTML decoding with BeautifulSoup)
    BOM removal
    removal of URLs ('http:' pattern) and Twitter @mentions
    removal of URLs ('www.' pattern)
    lower-casing
    negation handling
    removal of numbers and special characters
    tokenizing and joining



In [310]:
tok = WordPunctTokenizer()

# Twitter handles may contain underscores as well as letters and digits.
pat1 = r'@[A-Za-z0-9_]+'
# A URL runs up to the next space, so links containing '-', '_', '?', '=' etc.
# are removed in full instead of leaving their tail behind.
pat2 = r'https?://[^ ]+'
combined_pat = r'|'.join((pat1, pat2))
# The dot is escaped so that only a literal "www." prefix matches
# (an unescaped '.' would also match stretched words such as "awwwwww").
www_pat = r'www\.[^ ]+'
# Contracted negations and their expanded forms; expanding them keeps the "not"
# that would otherwise be lost when the apostrophe is stripped.
negations_dic = {"isn't":"is not", "aren't":"are not", "wasn't":"was not", "weren't":"were not",
                "haven't":"have not","hasn't":"has not","hadn't":"had not","won't":"will not",
                "wouldn't":"would not", "don't":"do not", "doesn't":"does not","didn't":"did not",
                "can't":"can not","couldn't":"could not","shouldn't":"should not","mightn't":"might not",
                "mustn't":"must not"}
neg_pattern = re.compile(r'\b(' + '|'.join(negations_dic.keys()) + r')\b')

def tweet_cleaner(tweet):
    """Clean a raw tweet into lower-case, space-separated alphabetic tokens.

    Steps, in order: HTML decoding, repair of mis-decoded UTF-8 / BOM removal,
    removal of @mentions and http(s) URLs, removal of "www." URLs, lower-casing,
    expansion of contracted negations, replacement of every non-letter with a
    space, and finally tokenizing and re-joining to collapse whitespace.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned text. Tokens of length 1 are dropped; the result may be an
        empty string when nothing is left after cleaning.
    """
    soup = BeautifulSoup(tweet, 'html.parser')
    souped = soup.get_text()
    try:
        # Re-encode with the codec the file was read with and decode as UTF-8;
        # 'utf-8-sig' also drops a leading BOM. (The previous code called the
        # non-existent `str.bytes`, so this step always failed silently.)
        bom_removed = souped.encode('iso-8859-1').decode('utf-8-sig').replace("\ufffd", "?")
    except (UnicodeEncodeError, UnicodeDecodeError):
        # Text that is not representable in ISO-8859-1, or whose bytes are not
        # valid UTF-8, is kept as it is.
        bom_removed = souped

    stripped = re.sub(combined_pat, '', bom_removed)
    stripped = re.sub(www_pat, '', stripped)
    lower_case = stripped.lower()
    neg_handled = neg_pattern.sub(lambda x: negations_dic[x.group()], lower_case)
    letters_only = re.sub("[^a-zA-Z]", " ", neg_handled)
    # Replacing non-letters with spaces creates runs of unnecessary whitespace;
    # tokenizing and joining with single spaces removes them again.
    words = [x for x  in tok.tokenize(letters_only) if len(x) > 1]
    return (" ".join(words)).strip()

# Remove all Non-ASCII characters
# tweet = re.sub(r'[^\x00-\x7F]+',' ', tweet)
# tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','URL',tweet)

In [299]:
# Number of tweets (rows) to be cleaned.
df.shape[0]

1600000

In [311]:
print("Cleaning and parsing the tweets...\n")
total_tweets = df.shape[0]
clean_tweet_texts = []
# Iterate over the column positionally instead of looking up df['text'][i] on
# every pass: this avoids 1.6M label-based lookups and does not depend on the
# index being exactly 0..n-1.
for processed, raw_text in enumerate(df['text'], start=1):
    # Report progress once every 100,000 tweets.
    if processed % 100000 == 0:
        print("Tweets {} of {} has been processed".format(processed, total_tweets))
    clean_tweet_texts.append(tweet_cleaner(raw_text))

Cleaning and parsing the tweets...

Tweets 100000 of 1600000 has been processed
Tweets 200000 of 1600000 has been processed
Tweets 300000 of 1600000 has been processed
Tweets 400000 of 1600000 has been processed
Tweets 500000 of 1600000 has been processed
Tweets 600000 of 1600000 has been processed
Tweets 700000 of 1600000 has been processed
Tweets 800000 of 1600000 has been processed
Tweets 900000 of 1600000 has been processed
Tweets 1000000 of 1600000 has been processed
Tweets 1100000 of 1600000 has been processed
Tweets 1200000 of 1600000 has been processed
Tweets 1300000 of 1600000 has been processed
Tweets 1400000 of 1600000 has been processed
Tweets 1500000 of 1600000 has been processed
Tweets 1600000 of 1600000 has been processed


In [None]:
# TODO: check for null values and remove rows with nulls

# Saving prepared dataset

In [312]:
# Assemble the cleaned texts into a frame and attach the labels from `df`;
# `assign` aligns the label Series on the index, as a column assignment would.
new_df = (
    pd.DataFrame({'text': clean_tweet_texts})
    .assign(sentiment=df.sentiment)
)
new_df.head()

Unnamed: 0,text,sentiment
0,awww that bummer you shoulda got david carr of...,0
1,is upset that he can not update his facebook b...,0
2,dived many times for the ball managed to save ...,0
3,my whole body feels itchy and like its on fire,0
4,no it not behaving at all mad why am here beca...,0


In [303]:
# Write the cleaned dataset to disk as UTF-8. The DataFrame index is written
# as well, since `index` is left at its pandas default.
new_df.to_csv('cleaned_data.csv',encoding='utf-8')