    TWITTER SENTIMENT ANALYSIS

IMPORTING REQUIRED LIBRARIES

In [36]:
import pandas as pd
import numpy as np
import re
import emoji
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize, TweetTokenizer
from nltk.stem import WordNetLemmatizer
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sweek\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

TAKING A LOOK AT THE DATASET


In [37]:
df = pd.read_csv(r'C:\Projects\Twitter sentiment analysis\tweets.csv', encoding="Latin-1", header=None)
df.head()

Unnamed: 0,0,1,2,3,4,5
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


THE DATASET DOES NOT HAVE COLUMNS, ADDING COLUMN NAMES

In [39]:
column_names = ["target", "id", "date", "flag", "user", "text"]
df.columns = column_names
df.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [40]:
features, columns = df.shape
print(f"The dataset has {features} features and {columns} columns.")
print(df.dtypes)

The dataset has 1600000 features and 6 columns.
target     int64
id         int64
date      object
flag      object
user      object
text      object
dtype: object


CHECKING FOR EMOJIS

In [41]:
df["emoji"] = df["text"].apply(lambda text: any(char in emoji.EMOJI_DATA for char in text))
print(df["emoji"].unique())

[False  True]


DATA PREPROCESSING

In [42]:
# Text Lowercasing
df['preprocessed'] = df["text"].apply(lambda texts: texts.lower())
df.head()

Unnamed: 0,target,id,date,flag,user,text,emoji,preprocessed
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",False,"@switchfoot http://twitpic.com/2y1zl - awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,False,is upset that he can't update his facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,False,@kenichan i dived many times for the ball. man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,False,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",False,"@nationwideclass no, it's not behaving at all...."


In [43]:
# Removing URLs
df['preprocessed'] = df['preprocessed'].str.replace(r'http\S+|www\S+', '', regex=True)
df.head()

Unnamed: 0,target,id,date,flag,user,text,emoji,preprocessed
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",False,@switchfoot - a that's a bummer. you shoulda...
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,False,is upset that he can't update his facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,False,@kenichan i dived many times for the ball. man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,False,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",False,"@nationwideclass no, it's not behaving at all...."


In [44]:
# We will be using TweetTokenizer to tokenize our text. We use TweetTokenizer so that our contractions don't split into two tokens.

# TweetTokenizer required the values to be string
df['preprocessed'] = df['preprocessed'].astype(str)

In [45]:
#Tokenization

# Used tqdm to create a progress bar to report progress.
from tqdm import tqdm
import time

tokenizer = TweetTokenizer()

total_tweets = len(df)

tokenized_tweets = []
for index, row in tqdm(df.iterrows(), total=total_tweets, desc="Tokenizing tweets"):
    tweet = row['preprocessed']
    tokens = tokenizer.tokenize(tweet)
    tokenized_tweets.append(tokens)

Tokenizing tweets: 100%|██████████| 1600000/1600000 [02:10<00:00, 12269.44it/s]


In [46]:
total_tweets = len(df)
num_tweets_to_print = 2  # Defining the number of tokenized tweets to print

# Printing the tokenized tweets
for i in range(min(num_tweets_to_print, total_tweets)):
    print(f"Tokenized tweet {i+1}: {tokenized_tweets[i]}")

Tokenized tweet 1: ['@switchfoot', '-', 'a', "that's", 'a', 'bummer', '.', 'you', 'shoulda', 'got', 'david', 'carr', 'of', 'third', 'day', 'to', 'do', 'it', '.', ';d']
Tokenized tweet 2: ['is', 'upset', 'that', 'he', "can't", 'update', 'his', 'facebook', 'by', 'texting', 'it', '...', 'and', 'might', 'cry', 'as', 'a', 'result', 'school', 'today', 'also', '.', 'blah', '!']


In [47]:
df['preprocessed'] = tokenized_tweets
df.head()

Unnamed: 0,target,id,date,flag,user,text,emoji,preprocessed
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",False,"[@switchfoot, -, a, that's, a, bummer, ., you,..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,False,"[is, upset, that, he, can't, update, his, face..."
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,False,"[@kenichan, i, dived, many, times, for, the, b..."
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,False,"[my, whole, body, feels, itchy, and, like, its..."
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",False,"[@nationwideclass, no, ,, it's, not, behaving,..."


In [48]:
#Removing punctuation
df['preprocessed'] = df['preprocessed'].apply(lambda tokens: [token for token in tokens if token not in string.punctuation])
df.head()

Unnamed: 0,target,id,date,flag,user,text,emoji,preprocessed
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",False,"[@switchfoot, a, that's, a, bummer, you, shoul..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,False,"[is, upset, that, he, can't, update, his, face..."
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,False,"[@kenichan, i, dived, many, times, for, the, b..."
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,False,"[my, whole, body, feels, itchy, and, like, its..."
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",False,"[@nationwideclass, no, it's, not, behaving, at..."


In [49]:
#Removing stopwords
stop_words = set(stopwords.words('english'))
df['preprocessed'] = df['preprocessed'].apply(lambda tokens: [token for token in tokens if token not in stop_words])
df.head()

Unnamed: 0,target,id,date,flag,user,text,emoji,preprocessed
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",False,"[@switchfoot, that's, bummer, shoulda, got, da..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,False,"[upset, can't, update, facebook, texting, ...,..."
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,False,"[@kenichan, dived, many, times, ball, managed,..."
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,False,"[whole, body, feels, itchy, like, fire]"
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",False,"[@nationwideclass, behaving, i'm, mad, can't, ..."


In [50]:
lemmatizer = WordNetLemmatizer()

df['preprocessed'] = df['preprocessed'].apply(lambda tokens: [lemmatizer.lemmatize(token, pos='v') for token in tokens])
df.head()

Unnamed: 0,target,id,date,flag,user,text,emoji,preprocessed
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",False,"[@switchfoot, that's, bummer, shoulda, get, da..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,False,"[upset, can't, update, facebook, texting, ...,..."
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,False,"[@kenichan, dive, many, time, ball, manage, sa..."
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,False,"[whole, body, feel, itchy, like, fire]"
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",False,"[@nationwideclass, behave, i'm, mad, can't, see]"


In [51]:
#Combining tokens into a sentence
df['clean_tweets'] = df['preprocessed'].apply(lambda tokens: ' '.join(tokens))
df.head()

Unnamed: 0,target,id,date,flag,user,text,emoji,preprocessed,clean_tweets
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",False,"[@switchfoot, that's, bummer, shoulda, get, da...",@switchfoot that's bummer shoulda get david ca...
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,False,"[upset, can't, update, facebook, texting, ...,...",upset can't update facebook texting ... might ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,False,"[@kenichan, dive, many, time, ball, manage, sa...",@kenichan dive many time ball manage save 50 r...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,False,"[whole, body, feel, itchy, like, fire]",whole body feel itchy like fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",False,"[@nationwideclass, behave, i'm, mad, can't, see]",@nationwideclass behave i'm mad can't see


In [52]:
#Saving the cleaned dataframe
df.to_csv("cleaned_tweets.csv", index=False)