# Preprocessing of Twitter Dataset

In [1]:
# importing libraries
import pandas as pd

In [2]:
import numpy as np

# Importing Twitter Dataset

In [3]:
# Read Twitter dataset from .csv file and store into a DataFrame object
df = pd.read_csv('../datasets/training.1600000.processed.noemoticon.csv', names=['sentiment', 'id', 'date', 'query', 'user_name', 'text'], encoding='ISO-8859-1')

In [4]:
# Read the first 5 entries in the dataset using dataframe.head(n) method
df.head(5)

Unnamed: 0,sentiment,id,date,query,user_name,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [5]:
# Read the last 5 entries  in the dataset using dataframe.tail(n) method
df.tail(5)

Unnamed: 0,sentiment,id,date,query,user_name,text
1599995,4,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke up. Having no school is the best fee...
1599996,4,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDB.com - Very cool to hear old Walt interv...
1599997,4,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover? Ask me f...
1599998,4,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! ...
1599999,4,2193602129,Tue Jun 16 08:40:50 PDT 2009,NO_QUERY,RyanTrevMorris,happy #charitytuesday @theNSPCC @SparksCharity...


In [6]:
# Drop all rows with null values
df.dropna(inplace=True)

In [7]:
# Create new column "rating"
# assign all positive tweets with 1
# assign all negative tweets with 0
df['rating'] = np.where(df['sentiment'] > 0, 1, 0)

In [8]:
# Check the dataset for last change
df.head()

Unnamed: 0,sentiment,id,date,query,user_name,text,rating
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",0
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,0
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,0
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,0
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",0


In [9]:
# We drop the following columns: sentiment, id, date, query, user_name
df_dataset = df.drop(['sentiment', 'id', 'date', 'query', 'user_name'], axis=1)

In [10]:
# We check the new dataset for last changes
df_dataset.head()

Unnamed: 0,text,rating
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",0
1,is upset that he can't update his Facebook by ...,0
2,@Kenichan I dived many times for the ball. Man...,0
3,my whole body feels itchy and like its on fire,0
4,"@nationwideclass no, it's not behaving at all....",0


In [11]:
# We perform lowering case to all tweets 
df_dataset['text'] = df_dataset['text'].str.lower()

In [12]:
# We check the new dataset for last changes
df_dataset.head()

Unnamed: 0,text,rating
0,"@switchfoot http://twitpic.com/2y1zl - awww, t...",0
1,is upset that he can't update his facebook by ...,0
2,@kenichan i dived many times for the ball. man...,0
3,my whole body feels itchy and like its on fire,0
4,"@nationwideclass no, it's not behaving at all....",0


In [13]:
# We perform transformation on negative words: e.g can't to cant
df_dataset['text'] = df_dataset['text'].replace("can't", 'cant', regex=True)

In [14]:
# We check the new dataset for last changes
df_dataset.head()

Unnamed: 0,text,rating
0,"@switchfoot http://twitpic.com/2y1zl - awww, t...",0
1,is upset that he cant update his facebook by t...,0
2,@kenichan i dived many times for the ball. man...,0
3,my whole body feels itchy and like its on fire,0
4,"@nationwideclass no, it's not behaving at all....",0


In [15]:
# We perform transformation on negative words: e.g don't to dont
df_dataset['text'] = df_dataset['text'].replace("don't", 'dont', regex=True)

In [16]:
# We check the new dataset for last changes
df_dataset.head(10)

Unnamed: 0,text,rating
0,"@switchfoot http://twitpic.com/2y1zl - awww, t...",0
1,is upset that he cant update his facebook by t...,0
2,@kenichan i dived many times for the ball. man...,0
3,my whole body feels itchy and like its on fire,0
4,"@nationwideclass no, it's not behaving at all....",0
5,@kwesidei not the whole crew,0
6,need a hug,0
7,@loltrish hey long time no see! yes.. rains a...,0
8,@tatiana_k nope they didn't have it,0
9,@twittera que me muera ?,0


In [17]:
# We perform transformation on negative words: e.g didn't to didnt
df_dataset['text'] = df_dataset['text'].replace("didn't", 'didnt', regex=True)

In [18]:
# We check the new dataset for last changes
df_dataset.head(10)

Unnamed: 0,text,rating
0,"@switchfoot http://twitpic.com/2y1zl - awww, t...",0
1,is upset that he cant update his facebook by t...,0
2,@kenichan i dived many times for the ball. man...,0
3,my whole body feels itchy and like its on fire,0
4,"@nationwideclass no, it's not behaving at all....",0
5,@kwesidei not the whole crew,0
6,need a hug,0
7,@loltrish hey long time no see! yes.. rains a...,0
8,@tatiana_k nope they didnt have it,0
9,@twittera que me muera ?,0


In [19]:
# We perform transformation on negative words: e.g couldn't to couldnt
df_dataset['text'] = df_dataset['text'].replace("couldn't", 'couldnt', regex=True)

In [20]:
# We check the new dataset for last changes
df_dataset.head(20)

Unnamed: 0,text,rating
0,"@switchfoot http://twitpic.com/2y1zl - awww, t...",0
1,is upset that he cant update his facebook by t...,0
2,@kenichan i dived many times for the ball. man...,0
3,my whole body feels itchy and like its on fire,0
4,"@nationwideclass no, it's not behaving at all....",0
5,@kwesidei not the whole crew,0
6,need a hug,0
7,@loltrish hey long time no see! yes.. rains a...,0
8,@tatiana_k nope they didnt have it,0
9,@twittera que me muera ?,0


In [21]:
# We perform transformation on negative words: e.g wouldn't to wouldnt
df_dataset['text'] = df_dataset['text'].replace("wouldn't", 'wouldnt', regex=True)

In [22]:
# We perform transformation on negative words: e.g shouldn't to shouldnt
df_dataset['text'] = df_dataset['text'].replace("shouldn't", 'shouldnt', regex=True)

In [23]:
# We perform transformation on negative words: e.g won't to wont
df_dataset['text'] = df_dataset['text'].replace("won't", 'wont', regex=True)

In [24]:
# We perform transformation on negative words: e.g weren't to werent
df_dataset['text'] = df_dataset['text'].replace("weren't", 'werent', regex=True)

In [25]:
# We perform transformation on negative words: e.g doesn't to doesnt
df_dataset['text'] = df_dataset['text'].replace("doesn't", 'doesnt', regex=True)

In [26]:
# We perform transformation on negative words: e.g haven't to havent
df_dataset['text'] = df_dataset['text'].replace("haven't", 'havent', regex=True)

In [27]:
# We check the new dataset for last changes
df_dataset.head(20)

Unnamed: 0,text,rating
0,"@switchfoot http://twitpic.com/2y1zl - awww, t...",0
1,is upset that he cant update his facebook by t...,0
2,@kenichan i dived many times for the ball. man...,0
3,my whole body feels itchy and like its on fire,0
4,"@nationwideclass no, it's not behaving at all....",0
5,@kwesidei not the whole crew,0
6,need a hug,0
7,@loltrish hey long time no see! yes.. rains a...,0
8,@tatiana_k nope they didnt have it,0
9,@twittera que me muera ?,0


In [28]:
# Remove urls starting e.g http, https
df_dataset['text'] = df_dataset['text'].replace(r'http\S+', '', regex=True).replace(r'www\S+', '', regex=True)

In [29]:
# We check the new dataset for last changes
df_dataset.head()

Unnamed: 0,text,rating
0,@switchfoot - a that's a bummer. you shoulda...,0
1,is upset that he cant update his facebook by t...,0
2,@kenichan i dived many times for the ball. man...,0
3,my whole body feels itchy and like its on fire,0
4,"@nationwideclass no, it's not behaving at all....",0


In [30]:
# Remove all entries (users) starting with @
df_dataset['text'] = df_dataset['text'].replace(r'@\S+', '', regex=True)

In [31]:
# We check the new dataset for last changes
df_dataset.head()

Unnamed: 0,text,rating
0,- a that's a bummer. you shoulda got david ...,0
1,is upset that he cant update his facebook by t...,0
2,i dived many times for the ball. managed to s...,0
3,my whole body feels itchy and like its on fire,0
4,"no, it's not behaving at all. i'm mad. why am...",0


In [32]:
# Remove all numbers from tweet text
df_dataset['text'] = df_dataset['text'].str.replace('\d+', '')

In [33]:
# We check the new dataset for last changes
df_dataset.head()

Unnamed: 0,text,rating
0,- a that's a bummer. you shoulda got david ...,0
1,is upset that he cant update his facebook by t...,0
2,i dived many times for the ball. managed to s...,0
3,my whole body feels itchy and like its on fire,0
4,"no, it's not behaving at all. i'm mad. why am...",0


In [34]:
# Remove punctuations from tweet text
df_dataset['text'] = df_dataset['text'].str.replace('[^\w\s]',' ')

In [35]:
# We check the new dataset for last changes
df_dataset.head()

Unnamed: 0,text,rating
0,a that s a bummer you shoulda got david ...,0
1,is upset that he cant update his facebook by t...,0
2,i dived many times for the ball managed to s...,0
3,my whole body feels itchy and like its on fire,0
4,no it s not behaving at all i m mad why am...,0


In [36]:
# Remove 'quot' string from tweet text  
df_dataset['text'] = df_dataset['text'].replace(r'quot\S+', '', regex=True)

In [37]:
# Removing all english stopwords from tweet text
!pip3 install --user nltk
import nltk
nltk.download('stopwords')



[nltk_data] Downloading package stopwords to /home/dacy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [38]:
from nltk.corpus import stopwords

In [39]:
stopword_list = stopwords.words('english')

In [40]:
df_dataset['text'] = df_dataset['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stopword_list)]))

In [41]:
# We check the new dataset for last changes
df_dataset.head()

Unnamed: 0,text,rating
0,bummer shoulda got david carr third day,0
1,upset cant update facebook texting might cry r...,0
2,dived many times ball managed save rest go bounds,0
3,whole body feels itchy like fire,0
4,behaving mad cant see,0


In [42]:
# Remove white spaces in the tweet text
df_dataset['text'] = df_dataset['text'].replace('\s+', ' ', regex=True)

In [43]:
# We check the new dataset for last changes
df_dataset.head()

Unnamed: 0,text,rating
0,bummer shoulda got david carr third day,0
1,upset cant update facebook texting might cry r...,0
2,dived many times ball managed save rest go bounds,0
3,whole body feels itchy like fire,0
4,behaving mad cant see,0


# Export dataset and preprocess to fastText format

In [44]:
import codecs

In [45]:
import csv

In [46]:
df_dataset.to_csv('../datasets/preprocessed_data/sentiment140.csv', index=False)

In [47]:
fastText_file = open('../datasets/preprocessed_data/fastText.txt', 'w')

In [48]:
csv_file_path = '../datasets/preprocessed_data/sentiment140.csv'

In [49]:
with codecs.open(csv_file_path, 'r', encoding='utf-8', errors='ignore') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    prefix = '__label__'
    #read line by line from csv file and parse data to fasttext format
    for row in csv_reader:
        fastText_file.write(prefix + row[1] + ' ' + row[0] + '\n')
    print("Successfully parsed to fastText format!")

Successfully parsed to fastText format!
