# **Import Necessary Libraries**

In [3]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

import re
import string
import nltk
from nltk.corpus import words
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.stem.porter import PorterStemmer

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix


# **Data Loading**

In [4]:
train_df  = pd.read_csv('../Twitter_Sentiment_Analysis_Project/Corona_NLP_train.csv',encoding='ISO-8859-1')
test_df = pd.read_csv('../Twitter_Sentiment_Analysis_Project/Corona_NLP_test.csv')

In [5]:
train_df.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative


In [6]:
train_df.shape

(41157, 6)

In [7]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41157 entries, 0 to 41156
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   UserName       41157 non-null  int64 
 1   ScreenName     41157 non-null  int64 
 2   Location       32567 non-null  object
 3   TweetAt        41157 non-null  object
 4   OriginalTweet  41157 non-null  object
 5   Sentiment      41157 non-null  object
dtypes: int64(2), object(4)
memory usage: 1.9+ MB


Check for duplicated data

In [8]:
train_df.duplicated().values.any()

False

Check for null data

In [9]:
def missing_data(df):
    total = df.isnull().sum().sort_values(ascending = False)
    Percentage = (df.isnull().sum()/df.isnull().count()*100).sort_values(ascending = False)
    return pd.concat([total, Percentage], axis=1, keys=['Total', 'Percentage'])

missing_data(train_df)


Unnamed: 0,Total,Percentage
Location,8590,20.871298
UserName,0,0.0
ScreenName,0,0.0
TweetAt,0,0.0
OriginalTweet,0,0.0
Sentiment,0,0.0


# **Exploratory Data Analysis**

In [40]:
sent_per = pd.DataFrame(data=train_df.groupby('Sentiment').size()/train_df['UserName'].count()*100,columns=['Percentage']).reset_index()
sent_per


Unnamed: 0,Sentiment,Percentage
0,Extremely Negative,13.317297
1,Extremely Positive,16.094468
2,Negative,24.095537
3,Neutral,18.740433
4,Positive,27.752266


In [41]:
order = ['Extremely Negative', 'Negative', 'Neutral', 'Positive', 'Extremely Positive']

fig = px.bar(
    sent_per, 
    x='Sentiment', 
    y='Percentage', 
    color='Sentiment',
    title='Percentage of Twitter Sentiment',
    labels={'Sentiment': 'Sentiment', 'Percentage': 'Percentage (%)'},
    text='Percentage'
)

fig.update_layout(
    xaxis_title='Sentiment',
    yaxis_title='Percentage (%)',
    xaxis=dict(
        categoryorder='array',
        categoryarray=order
    ),
    yaxis=dict(
        range=[0, 30] 
    ),
    template='plotly_dark'
)

fig.update_traces(
    texttemplate='%{text:.2f}%',
    textposition='outside',
    textfont=dict(size=12, color='white')
)

fig.show()

In [11]:
train_df['TweetAt'] = pd.to_datetime(train_df['TweetAt'])

tweet_counts = train_df.groupby('TweetAt').size().reset_index(name='TweetCount')

fig = px.line(tweet_counts, x='TweetAt', y='TweetCount', title='Daily Tweet Count Over Time')

fig.update_layout(
    xaxis_title='Date',
    yaxis_title='Tweet Count',
    template='plotly_dark'  
)

fig.show()

  train_df['TweetAt'] = pd.to_datetime(train_df['TweetAt'])


In [12]:
tweets_per_location = train_df['Location'].value_counts().loc[lambda x:x>100].reset_index(name='counts')
tweets_per_location

Unnamed: 0,Location,counts
0,London,540
1,United States,528
2,"London, England",520
3,"New York, NY",395
4,"Washington, DC",373
5,United Kingdom,337
6,"Los Angeles, CA",281
7,India,268
8,UK,232
9,Australia,225


In [13]:
def categorize_country(location):
    if 'London' in location:
        return 'United Kingdom'
    elif 'United Kingdom' in location:
        return 'United Kingdom'
    elif 'UK' in location:
        return 'United Kingdom'
    elif 'India' in location:
        return 'India'
    elif 'Toronto' in location:
        return 'Canada'
    elif 'Canada' in location:
        return 'Canada'
    elif 'Australia' in location:
        return 'Australia'
    elif 'Global' in location:
        return 'Worldwide'
    elif 'Worldwide' in location:
        return 'Worldwide'
    else:
        return 'United States'
    

In [14]:
tweets_per_location['Country'] = tweets_per_location['Location'].apply(categorize_country)
tweet_counts__per_country = tweets_per_location.groupby('Country', as_index=False)['counts'].sum()

fig = px.bar(
    tweet_counts__per_country, 
    x='Country', 
    y='counts', 
    color='Country',
    title='Tweet Count by Country',
    labels={'Country': 'Country', 'counts': 'Count'},
    text='counts'
)

fig.update_layout(
    xaxis_title='Country',
    yaxis_title='Count',
    yaxis=dict(
        range=[0, 3000] 
    ),
    template='plotly_dark'
)

fig.update_traces(
    texttemplate='%{text}',
    textposition='outside',
    textfont=dict(size=12, color='white')
)

fig.show()

# **Text Preprocessing**

Remove HTML tags

In [15]:
def remove_html_tags(text):
    pattern = re.compile('<.*?>')
    return pattern.sub('', text)

Remove URLs

In [16]:
def remove_url(text):
    pattern=re.compile(r'https?://\S+|www\.\S+')
    return pattern.sub(r'',text)

Remove punctuations

In [17]:
exclude=string.punctuation

def remove_punc(text):
    return text.translate(str.maketrans('','',exclude))

Remove stopwords

In [18]:
def remove_stopwords(text):
    new_text=[]
    for word in text.split():
        if word in stopwords.words('english'):
            new_text.append('')
        else:
            new_text.append(word)
            
    x=new_text[:]
    new_text.clear()
    return " ".join(x)

Remove non-english word

In [19]:
nltk.download('words')

english_words = set(words.words())

def remove_non_english_words(text):
    word_list = re.findall(r'\b\w+\b', text)

    filtered_words = [word for word in word_list if word.lower() in english_words]
    
    return ' '.join(filtered_words)

[nltk_data] Downloading package words to
[nltk_data]     /Users/suntaetangsatgatham/nltk_data...
[nltk_data]   Package words is already up-to-date!


Remove emojis

In [20]:
def remove_emoji(text):
    emoji_pattern=re.compile("["
                             u"\U0001F600-\U0001F64F" #emoticons
                             u"\U0001F300-\U0001F5FF" #symbols, pictograph
                              u"\U0001F680-\U0001F6FF" #transport and map symbol
                              u"\U0001F1E0-\U0001F1FF" # flags(IOS)
                              u"\U00002702-\U000027B0"
                              u"\U00002FC2-\U0001F251"
                             "]+",flags=re.UNICODE)
    return emoji_pattern.sub(r'',text)

Stemming

In [21]:
ps=PorterStemmer()
def stem_words(text):
    return " ".join([ps.stem(word) for word in text.split()])

Create a text cleaning function

In [22]:
def clean_text(text):
    text = text.lower()
    text = remove_html_tags(text)
    text = remove_url(text)
    text = remove_punc(text)
    text = remove_stopwords(text)
    text = remove_non_english_words(text)
    text = remove_emoji(text)
    text = stem_words(text)
    return text

In [23]:
train_df['Cleaned_Text'] = train_df['OriginalTweet'].apply(clean_text)
test_df['Cleaned_Text'] = test_df['OriginalTweet'].apply(clean_text)

train_df['Sentiment_Flag'] = train_df['Sentiment'].map({'Extremely Negative':0,'Negative':0,'Neutral':1,'Positive':2,'Extremely Positive':2})
test_df['Sentiment_Flag'] = test_df['Sentiment'].map({'Extremely Negative':0,'Negative':0,'Neutral':1,'Positive':2,'Extremely Positive':2})

X_train = train_df['Cleaned_Text']
y_train = train_df['Sentiment_Flag']
X_test = test_df['Cleaned_Text']
y_test = test_df['Sentiment_Flag']