# Data pre-processing

### Combining dataframes

In [1]:
# import necessary libraries

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from wordcloud import WordCloud

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer

In [2]:
# importing all comment csv files

# one scraped-comments csv per source video, read in file-name order
comment_csv_paths = [
    "./data/01_scmp_comments.csv",
    "./data/02_aj_comments.csv",
    "./data/03_ewd_comments.csv",
    "./data/04_cs_comments.csv",
    "./data/05_ofs_comments.csv",
    "./data/06_pm_comments.csv",
    "./data/07_nd_comments.csv",
    "./data/08_ng_comments.csv",
]

df1, df2, df3, df4, df5, df6, df7, df8 = map(pd.read_csv, comment_csv_paths)

In [3]:
# combine all dataframes into 1

all_comments = pd.concat([df1, df2, df3, df4, df5, df6, df7, df8], axis=0)

In [4]:
# reset index

# Renumber the rows 0..n-1 after the concat. drop=True discards the old
# per-file index directly instead of turning it into an 'index' column that
# then has to be dropped in a second step.
all_comments.reset_index(drop=True, inplace=True)

In [5]:
all_comments.shape

(25444, 3)

After combining all comments into a single dataframe, there are 25,444 rows - which should give us enough data to train and test our classification model.

In [6]:
# save as csv

all_comments.to_csv('./data/all_comments.csv', index=False)

### Data cleaning (part 1)
- Check if values are in the correct column
- Check dtypes
- Check for duplicates and null values
- Identify language, drop non-English comments

#### Check if values are in the correct column

In [7]:
# check values are in the right column

all_comments['video_id'].value_counts()

8XNu282FkvM                     8637
Hkxf4SC_SBk                     4408
xi6r3hZe5Tg                     4350
GL1JdIeoo4A                     3808
XSOgcpRbrCo                     2294
XDYy8z7krAI                      951
vyfJgJBB3Vk                      748
xJlgtV8L7Jc                      247
documentary on my country :)       1
Name: video_id, dtype: int64

In [8]:
all_comments[all_comments['video_id'] == 'documentary on my country :)'].index

Int64Index([23114], dtype='int64')

After examining the csv file, it seems that the original comment was split and placed into the wrong columns. Since it is only one line, we can correct this manually in order to preserve the number of rows.

In [9]:
# correcting extraction mistake

# Append the stray fragment (which landed in video_id of row 23114) to the
# comment it was split from (row 23113).
# A single .loc call assigns directly on all_comments; the chained form
# `all_comments[col].iloc[i] = ...` writes to an intermediate Series and can
# leave the frame unchanged (SettingWithCopyWarning / Copy-on-Write).
# Row labels equal row positions here because the index was reset after concat.
all_comments.loc[23113, 'comment'] = (
    all_comments.loc[23113, 'comment'] + all_comments.loc[23114, 'video_id']
)

In [10]:
# The real timestamp of the split comment ended up in the 'comment' column of
# row 23114; copy it into comment_date of row 23113.
# A single .loc call assigns directly on all_comments, whereas the chained
# `all_comments[col].iloc[i] = ...` form may only modify a temporary Series.
# Row labels equal row positions here because the index was reset after concat.
all_comments.loc[23113, 'comment_date'] = all_comments.loc[23114, 'comment']

In [11]:
all_comments.drop(index=23114, inplace=True)

In [12]:
all_comments['video_id'].value_counts()

8XNu282FkvM    8637
Hkxf4SC_SBk    4408
xi6r3hZe5Tg    4350
GL1JdIeoo4A    3808
XSOgcpRbrCo    2294
XDYy8z7krAI     951
vyfJgJBB3Vk     748
xJlgtV8L7Jc     247
Name: video_id, dtype: int64

#### Check dtypes

In [13]:
all_comments.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25443 entries, 0 to 25443
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   video_id      25443 non-null  object
 1   comment       25441 non-null  object
 2   comment_date  25443 non-null  object
dtypes: object(3)
memory usage: 795.1+ KB


In [14]:
all_comments['comment_date'] = pd.to_datetime(all_comments['comment_date'])

In [15]:
all_comments['comment_date'].head()

0   2023-07-26 04:11:40+00:00
1   2023-08-03 03:52:04+00:00
2   2023-08-02 15:10:36+00:00
3   2023-08-01 17:07:10+00:00
4   2023-08-01 08:46:42+00:00
Name: comment_date, dtype: datetime64[ns, UTC]

#### Check duplicates and null values

In [16]:
# check for duplicates

all_comments.duplicated().value_counts()

False    25440
True         3
dtype: int64

In [17]:
# remove duplicates

all_comments.drop_duplicates(keep='first', inplace=True)

In [18]:
# check for null values

all_comments.isnull().value_counts()

video_id  comment  comment_date
False     False    False           25438
          True     False               2
dtype: int64

In [19]:
# isnull() already yields a boolean mask, so it can index the frame directly
all_comments[all_comments['comment'].isnull()]

Unnamed: 0,video_id,comment,comment_date
7350,vyfJgJBB3Vk,,2023-03-23 11:09:43+00:00
21361,xi6r3hZe5Tg,,2022-01-25 20:03:32+00:00


In [20]:
# drop null rows as it is only two data rows

all_comments.dropna(axis=0, inplace=True)

In [21]:
all_comments.shape

(25438, 3)

#### Identify language

In [22]:
# detect language and drop if not english

from langdetect import detect

def get_lang(text):
    """Return the language code that langdetect assigns to ``text``, or None.

    Parameters
    ----------
    text : str
        Raw comment text.

    Returns
    -------
    str or None
        The code returned by ``detect`` (e.g. 'en'), or None when ``text`` is
        not a non-blank string or when detection raises.

    Notes
    -----
    NOTE(review): langdetect documents its results as non-deterministic unless
    ``DetectorFactory.seed`` is set; consider seeding it so the language counts
    are reproducible across runs.
    """
    # Missing values (None/NaN) and blank strings have nothing to detect.
    if not isinstance(text, str) or not text.strip():
        return None
    try:
        return detect(text)
    except Exception:
        # Best-effort labelling: a comment langdetect cannot classify is
        # treated as "unknown". Catching Exception (not a bare except) lets
        # KeyboardInterrupt / SystemExit still stop a long .apply() run.
        return None

In [23]:
all_comments['language'] = all_comments['comment'].apply(get_lang)

In [24]:
all_comments['language'].value_counts()

en       22164
tl         559
it         273
id         251
af         222
de         166
so         129
nl         116
es         101
no          97
fr          96
ro          89
pl          83
cy          82
et          76
sw          73
da          59
ca          58
pt          49
hr          45
vi          42
tr          41
sl          40
fi          38
sq          36
ko          34
sv          31
zh-cn       30
ar          24
ru          16
sk          13
hu          13
cs          11
lt           9
bn           8
th           8
lv           6
ja           6
zh-tw        5
ta           4
hi           4
fa           4
he           1
mr           1
el           1
uk           1
kn           1
Name: language, dtype: int64

In [25]:
# drop non-english rows, as there will still be enough rows for analysis after dropping
# dropped rows are 12.9% of total rows

# keep only the rows detected as English; .copy() gives an independent frame
# so later in-place edits do not operate on a slice of all_comments
is_english = all_comments['language'] == 'en'
all_comments_en = all_comments.loc[is_english].copy()

In [26]:
# drop language column, as all will be in english

all_comments_en.drop(columns = 'language' , inplace=True)

In [27]:
all_comments_en

Unnamed: 0,video_id,comment,comment_date
0,xJlgtV8L7Jc,For more on this: https://sc.mp/gjys,2023-07-26 04:11:40+00:00
1,xJlgtV8L7Jc,"Yes, punish the criminals , not the innocent ...",2023-08-03 03:52:04+00:00
2,xJlgtV8L7Jc,Flor Contemplacion had been there. Filipinos k...,2023-08-02 15:10:36+00:00
4,xJlgtV8L7Jc,Don't mind these westoids. I'm half Indonesian...,2023-08-01 08:46:42+00:00
5,xJlgtV8L7Jc,No wonder Singapore has the most powerful pass...,2023-08-01 07:57:54+00:00
...,...,...,...
25437,xi6r3hZe5Tg,+1 science is difficult,2018-11-24 13:06:36+00:00
25440,xi6r3hZe5Tg,I live in Australia and have always wanted to ...,2018-11-24 13:03:39+00:00
25441,xi6r3hZe5Tg,yeah! second here ❤,2018-11-24 13:03:13+00:00
25442,xi6r3hZe5Tg,Awesome... civilization is getting to it's pea...,2018-11-24 13:02:55+00:00


In [28]:
# save as pickle to save dtypes

all_comments_en.to_pickle('./data/all_comments_en.pkl')

### Sentiment analysis using a pre-trained model

The objective of this project is to build a classification model for sentiment analysis. The raw data does not have a `y_true` label for use in testing. As such, we will use a pre-trained sentiment analysis model to get a sentiment score for each comment.

We will start with the VADER model as it was trained on social media texts. It can deal with word-shapes (e.g. ALL CAPS), emojis, and slang, and includes these heuristics in its sentiment calculation. As such, we have not cleaned our text of these attributes.

#### Getting the labels

In [29]:
# opening pickle file

comments_en = pd.read_pickle("data/all_comments_en.pkl")

In [30]:
comments_en.head()

Unnamed: 0,video_id,comment,comment_date
0,xJlgtV8L7Jc,For more on this: https://sc.mp/gjys,2023-07-26 04:11:40+00:00
1,xJlgtV8L7Jc,"Yes, punish the criminals , not the innocent ...",2023-08-03 03:52:04+00:00
2,xJlgtV8L7Jc,Flor Contemplacion had been there. Filipinos k...,2023-08-02 15:10:36+00:00
4,xJlgtV8L7Jc,Don't mind these westoids. I'm half Indonesian...,2023-08-01 08:46:42+00:00
5,xJlgtV8L7Jc,No wonder Singapore has the most powerful pass...,2023-08-01 07:57:54+00:00


In [31]:
# instantiating the model 

from nltk.sentiment.vader import SentimentIntensityAnalyzer

sia = SentimentIntensityAnalyzer()

In [32]:
def get_sentiment_score(text):
    """Return the polarity scores that the notebook-level analyzer ``sia`` gives ``text``.

    The result is whatever ``sia.polarity_scores`` returns; the notebook output
    shows a dict with 'neg', 'neu', 'pos' and 'compound' entries.
    """
    return sia.polarity_scores(text)

In [33]:
comments_en.columns

Index(['video_id', 'comment', 'comment_date'], dtype='object')

In [34]:
comments_en['sentiment_score'] = comments_en['comment'].apply(get_sentiment_score)

In [35]:
# pull the 'compound' entry out of every score dict, preserving row order
compound_senti = [
    score_dict.get('compound')
    for score_dict in comments_en['sentiment_score'].values
]

In [36]:
comments_en['compound'] = compound_senti

The generally accepted threshold values for VADER are:

- positive sentiment: compound score >= 0.05
- neutral sentiment: (compound score > -0.05) and (compound score < 0.05)
- negative sentiment: compound score <= -0.05

In addition, as these labels will be used to train a binary classifier, we will combine the positive and neutral classes into one group.

In [37]:
# label each compound score with the VADER thresholds:
# >= 0.05 positive, <= -0.05 negative, anything in between neutral
sentiment = [
    'positive' if score >= 0.05
    else 'negative' if score <= -0.05
    else 'neutral'
    for score in comments_en['compound'].values
]

In [38]:
comments_en['sentiment'] = sentiment

In [39]:
comments_en.reset_index(inplace=True)

In [40]:
comments_en.drop(columns='index', inplace=True)

In [41]:
comments_en['binary'] = comments_en['sentiment'].map({'neutral' : 1, 'positive' : 1, 'negative': 0})

In [42]:
comments_en

Unnamed: 0,video_id,comment,comment_date,sentiment_score,compound,sentiment,binary
0,xJlgtV8L7Jc,For more on this: https://sc.mp/gjys,2023-07-26 04:11:40+00:00,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0000,neutral,1
1,xJlgtV8L7Jc,"Yes, punish the criminals , not the innocent ...",2023-08-03 03:52:04+00:00,"{'neg': 0.577, 'neu': 0.253, 'pos': 0.17, 'com...",-0.7533,negative,0
2,xJlgtV8L7Jc,Flor Contemplacion had been there. Filipinos k...,2023-08-02 15:10:36+00:00,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0000,neutral,1
3,xJlgtV8L7Jc,Don't mind these westoids. I'm half Indonesian...,2023-08-01 08:46:42+00:00,"{'neg': 0.0, 'neu': 0.824, 'pos': 0.176, 'comp...",0.4588,positive,1
4,xJlgtV8L7Jc,No wonder Singapore has the most powerful pass...,2023-08-01 07:57:54+00:00,"{'neg': 0.099, 'neu': 0.763, 'pos': 0.139, 'co...",0.2247,positive,1
...,...,...,...,...,...,...,...
22159,xi6r3hZe5Tg,+1 science is difficult,2018-11-24 13:06:36+00:00,"{'neg': 0.455, 'neu': 0.545, 'pos': 0.0, 'comp...",-0.3612,negative,0
22160,xi6r3hZe5Tg,I live in Australia and have always wanted to ...,2018-11-24 13:03:39+00:00,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0000,neutral,1
22161,xi6r3hZe5Tg,yeah! second here ❤,2018-11-24 13:03:13+00:00,"{'neg': 0.0, 'neu': 0.445, 'pos': 0.555, 'comp...",0.3595,positive,1
22162,xi6r3hZe5Tg,Awesome... civilization is getting to it's pea...,2018-11-24 13:02:55+00:00,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0000,neutral,1


#### Testing the labels

We will manually sample 100 random rows to check if VADER has accurately labelled the sentiment.

In [43]:
# use np.random.choice to draw 100 distinct row positions from our dataset
# (the seed is fixed so the same sample is drawn on every run)

np.random.seed(123)
row_no = np.random.choice(range(0,len(comments_en)), size=100, replace=False)
row_no

array([15361, 20914, 14231, 20369, 11848, 14385, 15052,  9924,  2671,
        3541,  6826, 17065,  6041, 14213,   358, 15657,   149, 10211,
       20185, 16834,  2487, 14704,  3649, 16701,  4332,  5974,  4549,
       17203, 20620,   605, 10154,  1770,  4349,  3172,  2260, 17974,
        5464, 16786, 11257, 10756,  8333,  5278, 12461,  4301, 11613,
        2335, 18963,  1984,  1043,  4576, 13297, 17499,  6460, 15585,
        8493,  9914,  5137, 12724, 18734,  4011, 21959, 15368,  2295,
       19901,  8171,  5914,  1833, 17785, 11059, 14665,  1370, 16412,
        8490, 20997, 10314, 10346, 20902,  7349,  8790,  6730, 12093,
        8873, 12981,  9182, 21826,  2688, 15471, 19618,  8161, 11477,
       11000, 21890, 17656, 17686, 21651, 19344, 14202, 18203,   827,
       11827])

In [44]:
# get VADER's analysis

# show full cell contents instead of truncating long text columns
pd.set_option('display.max_colwidth', None)

# print the binary VADER label of each sampled row, in sample order
for position in row_no:
    print(comments_en.iloc[position]['binary'])

1
1
1
1
0
0
1
0
0
0
1
1
1
0
1
1
1
1
1
1
0
1
0
0
1
1
1
1
1
1
0
0
1
0
0
1
0
1
1
1
1
1
1
0
1
1
0
0
0
1
1
1
1
1
0
1
0
1
1
1
0
1
1
1
1
1
0
1
1
1
1
1
1
1
1
1
1
0
1
0
1
1
1
1
0
0
1
1
1
0
1
1
1
1
1
1
1
1
0
1


In [45]:
# identifying the 100 comments

# print the comment text of each sampled row, in the same order as the labels
for position in row_no:
    sampled_comment = comments_en.iloc[position][['comment']]
    print(sampled_comment)

comment    Now this is why I want to go to Singapore 🇸🇬
Name: 15361, dtype: object
comment    I love there schooling.
Name: 20914, dtype: object
comment    I love your teshirt bro
Name: 14231, dtype: object
comment    Singapore is the most underrated tech savvy and wealthy city on earth, it’s becoming bigger and better than Dubai, and most people don’t even talk about it or see it as a poor region
Name: 20369, dtype: object
comment    I live there and when I went to us I was like wtf
Name: 11848, dtype: object
comment    stupid people here on Balkans once have had that multi ethnic policy based country, and they ruined it. They think are better with hating each others.
Name: 14385, dtype: object
comment    Singapore is my fav country, USA, Canada, Russia, Indonesia and Malaysia (my country)
Name: 15052, dtype: object
comment    i can guarantee there are a lot of unhappy people in Singapore
Name: 9924, dtype: object
comment    Here in philippines thousands of drug pushers and users are 

When comparing VADER sentiment predictions to the manual sentiment labels (please see VADER_manual.pdf in the assets folder), there is a 73% match. Studies show that humans agree on sentiment only around [65-80% of the time](https://link.springer.com/article/10.1007/s11069-022-05307-w). As such, we will accept the VADER sentiment analysis and use it as the ground truth against which to compare our train and test scores.

### Data cleaning (part 2)

- Clean text of special characters, punctuation, excessive letters, numbers, etc.
- Tokenize
- Lemmatizing/stemming

#### Cleaning text

In [46]:
# remove punctuation, special characters, url, html
# convert all text to lowercase

import re

def get_clean(text):
    """Normalise a raw comment for tokenizing.

    Steps, in order: lowercase the text, replace http(s) links with the token
    'url', turn newlines into spaces, strip every character that is neither a
    word character nor whitespace, and finally remove underscores.

    Parameters
    ----------
    text : str
        Raw comment text.

    Returns
    -------
    str
        The cleaned text.
    """
    # (pattern, replacement) pairs; order matters - links must be replaced
    # before punctuation is stripped, otherwise the url pattern cannot match
    substitutions = (
        (r'https?://(?:www\.)?[\w.-]+\.[a-zA-Z]{2,}(?:/[\w./%?=&-]*)?', 'url'),
        (r'\n', ' '),
        (r'[^\w\s]', ''),
    )
    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    # '_' counts as a word character, so the punctuation pass leaves it behind
    return cleaned.replace('_', '')

In [47]:
comments_en['clean_text'] = comments_en['comment'].apply(get_clean)

In [48]:
comments_en

Unnamed: 0,video_id,comment,comment_date,sentiment_score,compound,sentiment,binary,clean_text
0,xJlgtV8L7Jc,For more on this: https://sc.mp/gjys,2023-07-26 04:11:40+00:00,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}",0.0000,neutral,1,for more on this url
1,xJlgtV8L7Jc,"Yes, punish the criminals , not the innocent people",2023-08-03 03:52:04+00:00,"{'neg': 0.577, 'neu': 0.253, 'pos': 0.17, 'compound': -0.7533}",-0.7533,negative,0,yes punish the criminals not the innocent people
2,xJlgtV8L7Jc,"Flor Contemplacion had been there. Filipinos know it, and a movie had been made out of her experience.",2023-08-02 15:10:36+00:00,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}",0.0000,neutral,1,flor contemplacion had been there filipinos know it and a movie had been made out of her experience
3,xJlgtV8L7Jc,Don't mind these westoids. I'm half Indonesian-Malaysian and I applaud Singapore's effort for combating drug trafficking,2023-08-01 08:46:42+00:00,"{'neg': 0.0, 'neu': 0.824, 'pos': 0.176, 'compound': 0.4588}",0.4588,positive,1,dont mind these westoids im half indonesianmalaysian and i applaud singapores effort for combating drug trafficking
4,xJlgtV8L7Jc,"No wonder Singapore has the most powerful passport, toppling Japan this year.\n\nThey don't fck around with their laws.",2023-08-01 07:57:54+00:00,"{'neg': 0.099, 'neu': 0.763, 'pos': 0.139, 'compound': 0.2247}",0.2247,positive,1,no wonder singapore has the most powerful passport toppling japan this year they dont fck around with their laws
...,...,...,...,...,...,...,...,...
22159,xi6r3hZe5Tg,+1 science is difficult,2018-11-24 13:06:36+00:00,"{'neg': 0.455, 'neu': 0.545, 'pos': 0.0, 'compound': -0.3612}",-0.3612,negative,0,1 science is difficult
22160,xi6r3hZe5Tg,I live in Australia and have always wanted to visit Singapore.,2018-11-24 13:03:39+00:00,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}",0.0000,neutral,1,i live in australia and have always wanted to visit singapore
22161,xi6r3hZe5Tg,yeah! second here ❤,2018-11-24 13:03:13+00:00,"{'neg': 0.0, 'neu': 0.445, 'pos': 0.555, 'compound': 0.3595}",0.3595,positive,1,yeah second here
22162,xi6r3hZe5Tg,Awesome... civilization is getting to it's peak...,2018-11-24 13:02:55+00:00,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}",0.0000,neutral,1,awesome civilization is getting to its peak


In [49]:
# replace numbers with a placeholder token

def no_nums(text):
    """Replace every run of digits in ``text`` with the token 'number '.

    The previous pattern ``[/\\d+/]`` was a character class, so it replaced
    each single digit - and each '/' or '+' - separately: a year such as
    '2023' became four 'number ' tokens. ``\\d+`` treats a whole run of digits
    as one number and leaves '/' and '+' untouched.

    Parameters
    ----------
    text : str
        Text to process.

    Returns
    -------
    str
        ``text`` with each digit run replaced by 'number ' (note the trailing
        space, which keeps the token apart from a directly following word).
    """
    return re.sub(r'\d+', 'number ', text)

In [50]:
comments_en['clean_text'] = comments_en['clean_text'].apply(no_nums)

In [51]:
comments_en[['clean_text']]

Unnamed: 0,clean_text
0,for more on this url
1,yes punish the criminals not the innocent people
2,flor contemplacion had been there filipinos know it and a movie had been made out of her experience
3,dont mind these westoids im half indonesianmalaysian and i applaud singapores effort for combating drug trafficking
4,no wonder singapore has the most powerful passport toppling japan this year they dont fck around with their laws
...,...
22159,number science is difficult
22160,i live in australia and have always wanted to visit singapore
22161,yeah second here
22162,awesome civilization is getting to its peak


In [52]:
# remove repeated words

def no_repeats(text):
    """Collapse immediately repeated words into a single occurrence.

    For example 'very very very good' becomes 'very good'. Words that merely
    occur more than once in different places are left alone.

    The previous pattern ``\\b(\\w+)\\b.*\\b\\1\\b`` used a greedy ``.*``
    between the two occurrences, so it deleted *everything* between the first
    and last appearance of any repeated word ('the cat sat on the mat' became
    'the mat'), discarding most of a typical comment.

    Parameters
    ----------
    text : str
        Text to process.

    Returns
    -------
    str
        ``text`` with each run of consecutive identical words reduced to one.
    """
    # (\w+) captures a word; (?:\s+\1\b)+ matches one or more whitespace-
    # separated copies of that same whole word directly after it.
    return re.sub(r'\b(\w+)(?:\s+\1\b)+', r'\1', text)

In [53]:
comments_en['clean_text'] = comments_en['clean_text'].apply(no_repeats)

#### Tokenizing

In [54]:
def get_tokens(text):
    """Split ``text`` into tokens with NLTK's ``word_tokenize`` and return them."""
    tokens = word_tokenize(text)
    return tokens

In [55]:
comments_en['tokenize'] = comments_en['clean_text'].apply(get_tokens)

In [56]:
# clean up dataframe for eda

comments_en.drop(columns = ['sentiment_score', 'compound', 'sentiment', 'clean_text'], inplace=True)


In [57]:
# save as pickle

comments_en.to_pickle('./data/comments_tokenize.pkl')

### Data dictionary

|column|description|
|-|-|
|`video_id`| ID of the youtube video the comment was posted on|
|`comment`| comments from the youtube video|
|`comment_date`| date comment was posted and published|
|`binary`| class labels for sentiment of the comment - 1 is positive or neutral, 0 is negative|
|`tokenize`| tokenized comments for data exploration and further pre-processing|
