In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

import re
import string,time
from textblob import TextBlob
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize,sent_tokenize
import spacy

ModuleNotFoundError: No module named 'spacy'

# **Data Loading**

In [None]:
# Load the train and test tweet CSVs; both paths are relative to the notebook's working directory.
# The training file is decoded as ISO-8859-1.
train_df  = pd.read_csv('../Twitter_Sentiment_Analysis_Project/Corona_NLP_train.csv',encoding='ISO-8859-1')
# NOTE(review): no encoding is passed here, so pandas' default applies — confirm the test file
# does not need ISO-8859-1 like the training file.
test_df = pd.read_csv('../Twitter_Sentiment_Analysis_Project/Corona_NLP_test.csv')

In [None]:
# Preview the first rows of the raw training data.
train_df.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative


In [None]:
# (rows, columns) of the training data.
train_df.shape

(41157, 6)

In [None]:
# Column dtypes and non-null counts; a quick way to spot columns with missing values.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41157 entries, 0 to 41156
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   UserName       41157 non-null  int64 
 1   ScreenName     41157 non-null  int64 
 2   Location       32567 non-null  object
 3   TweetAt        41157 non-null  object
 4   OriginalTweet  41157 non-null  object
 5   Sentiment      41157 non-null  object
dtypes: int64(2), object(4)
memory usage: 1.9+ MB


Check for duplicated data

In [None]:
# True if at least one row repeats an earlier row across all columns.
train_df.duplicated().values.any()

False

Check for null data

In [None]:
def missing_data(df):
    """Summarise missing values per column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with two columns: ``Total`` (number of
        null cells) and ``Percentage`` (null cells as a percentage of the row
        count). Both are sorted in descending order.
    """
    null_mask = df.isnull()
    null_counts = null_mask.sum()

    total = null_counts.sort_values(ascending=False)
    # count() on the boolean mask equals the number of rows, since the mask itself has no nulls.
    share = null_counts / null_mask.count() * 100
    percentage = share.sort_values(ascending=False)

    return pd.concat([total, percentage], axis=1, keys=['Total', 'Percentage'])

# Missing-value summary for the training data.
missing_data(train_df)


Unnamed: 0,Total,Percentage
Location,8590,20.871298
UserName,0,0.0
ScreenName,0,0.0
TweetAt,0,0.0
OriginalTweet,0,0.0
Sentiment,0,0.0


# **Exploratory Data Analysis**

In [None]:
# TweetAt strings are day-first (e.g. '16-03-2020'). Passing the exact format stops pandas
# from guessing the layout, so day and month cannot be swapped for ambiguous values such as
# '04-03-2020', and it removes the parsing warning the unformatted call emitted.
train_df['TweetAt'] = pd.to_datetime(train_df['TweetAt'], format='%d-%m-%Y')

  train_df['TweetAt'] = pd.to_datetime(train_df['TweetAt'])


In [None]:
# Split the parsed timestamp into calendar parts.
# Note: 'Date' holds the day of the month, not a full date.
train_df['Date'], train_df['Month'], train_df['Year'] = (
    train_df['TweetAt'].dt.day,
    train_df['TweetAt'].dt.month,
    train_df['TweetAt'].dt.year,
)

In [None]:
# Preview again to check the parsed TweetAt column and the new Date/Month/Year columns.
train_df.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment,Date,Month,Year
0,3799,48751,London,2020-03-16,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral,16,3,2020
1,3800,48752,UK,2020-03-16,advice Talk to your neighbours family to excha...,Positive,16,3,2020
2,3801,48753,Vagabonds,2020-03-16,Coronavirus Australia: Woolworths to give elde...,Positive,16,3,2020
3,3802,48754,,2020-03-16,My food stock is not the only one which is emp...,Positive,16,3,2020
4,3803,48755,,2020-03-16,"Me, ready to go at supermarket during the #COV...",Extremely Negative,16,3,2020


In [None]:
# Number of tweets per distinct TweetAt value: group on the parsed timestamp and count rows.
tweet_counts = train_df.groupby('TweetAt').size().reset_index(name='TweetCount')

# Line chart of the counts over time.
fig = px.line(tweet_counts, x='TweetAt', y='TweetCount', title='Daily Tweet Count Over Time')

# Axis titles and the dark Plotly theme.
fig.update_layout(
    xaxis_title='Date',
    yaxis_title='Tweet Count',
    template='plotly_dark'  
)

fig.show()

In [None]:
# Locations that appear in more than 100 tweets, most frequent first.
# value_counts() skips missing locations by default.
# rename_axis pins the label column to 'Location' so the result has the same columns
# regardless of how value_counts() names its index (this changed between pandas versions);
# later cells read tweets_per_location['Location'].
tweets_per_location = (
    train_df['Location']
    .value_counts()
    .loc[lambda counts: counts > 100]
    .rename_axis('Location')
    .reset_index(name='counts')
)
tweets_per_location

Unnamed: 0,Location,counts
0,London,540
1,United States,528
2,"London, England",520
3,"New York, NY",395
4,"Washington, DC",373
5,United Kingdom,337
6,"Los Angeles, CA",281
7,India,268
8,UK,232
9,Australia,225


In [None]:
# Ordered (keyword, country) rules: the first keyword found in the location wins.
_COUNTRY_KEYWORDS = (
    ('London', 'United Kingdom'),
    ('United Kingdom', 'United Kingdom'),
    ('UK', 'United Kingdom'),
    ('India', 'India'),
    ('Toronto', 'Canada'),
    ('Canada', 'Canada'),
    ('Australia', 'Australia'),
    ('Global', 'Worldwide'),
    ('Worldwide', 'Worldwide'),
)


def categorize_country(location):
    """Map a free-text location string to a coarse country label.

    Keywords are matched case-sensitively as whole words, in the order listed
    in ``_COUNTRY_KEYWORDS``. Whole-word matching prevents a keyword from
    firing inside a longer word (for example ``'India'`` inside ``'Indiana'``).

    Parameters
    ----------
    location : str
        Location text as entered by the user. Non-string values (``None``,
        ``NaN``) are accepted.

    Returns
    -------
    str
        ``'United Kingdom'``, ``'India'``, ``'Canada'``, ``'Australia'`` or
        ``'Worldwide'`` when a keyword matches; ``'Unknown'`` for non-string
        input; ``'United States'`` for every other string (the catch-all
        default, which also applies to the empty string).
    """
    # Missing locations arrive as NaN/None; report them explicitly instead of
    # raising TypeError or silently counting them under the catch-all default.
    if not isinstance(location, str):
        return 'Unknown'

    for keyword, country in _COUNTRY_KEYWORDS:
        if re.search(r'\b' + re.escape(keyword) + r'\b', location):
            return country

    return 'United States'
    

In [None]:
# Label each frequent location with a country, then total the tweet counts per country.
tweets_per_location['Country'] = tweets_per_location['Location'].apply(categorize_country)
tweet_counts__per_country = tweets_per_location.groupby('Country', as_index=False)['counts'].sum()

# One bar per country, coloured by country, with the count used as the bar label.
fig = px.bar(
    tweet_counts__per_country, 
    x='Country', 
    y='counts', 
    color='Country',
    title='Tweet Count by Country',
    labels={'Country': 'Country', 'counts': 'Count'},
    text='counts'
)

# NOTE(review): the y-axis is fixed to 0-3000; a country total above 3000 would be cut off —
# confirm this still fits if the location threshold or the data changes.
fig.update_layout(
    xaxis_title='Country',
    yaxis_title='Count',
    yaxis=dict(
        range=[0, 3000] 
    ),
    template='plotly_dark'
)

# Draw the count above each bar in white so it stays readable on the dark theme.
fig.update_traces(
    texttemplate='%{text}',
    textposition='outside',
    textfont=dict(size=12, color='white')
)

fig.show()

# **Text Preprocessing**

Remove HTML tags

In [None]:
def remove_html_tags(text):
    """Return ``text`` with every ``<...>`` span removed.

    The match is non-greedy, so each tag is removed on its own and the text
    between tags is kept. A ``<`` with no closing ``>`` on the same line is
    left untouched.
    """
    return re.sub('<.*?>', '', text)

 Hello 


Remove URLs

In [None]:
def remove_url(text):
    """Return ``text`` with URLs deleted.

    A URL is anything starting with ``http://``, ``https://`` or ``www.`` and
    running up to the next whitespace character. Surrounding whitespace is
    left as it was.
    """
    return re.sub(r'https?://\S+|www\.\S+', '', text)

hello  na kub


Remove punctuation

In [None]:
# Characters treated as punctuation: the ASCII punctuation set from the standard library.
exclude = string.punctuation


def remove_punc1(text):
    """Return ``text`` with every character listed in ``exclude`` deleted."""
    # Mapping a character to None in a translation table deletes it.
    deletion_table = str.maketrans(dict.fromkeys(exclude))
    return text.translate(deletion_table)

In [None]:
# Character class of emoji-related code points, compiled once and reused by remove_emoji.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
    "\U00002702-\U000027B0"  # dingbats
    "\U0001F000-\U0001F251"  # mahjong/domino/cards, enclosed alphanumerics and ideographs
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
    "\u3030\u303D\u3297\u3299"  # individual BMP emoji the previous wide range also covered
    "\uFE0F"  # variation selector-16 (emoji presentation)
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(text):
    """Return ``text`` with emoji characters removed.

    The previous pattern ended with the range U+2FC2..U+1F251, which spans
    CJK ideographs, kana, Hangul and most of the Basic Multilingual Plane, so
    it deleted ordinary non-Latin text along with emoji. That range is
    replaced by the emoji-related parts it was covering (U+1F000..U+1F251 and
    a few individual BMP code points). The Supplemental Symbols and
    Pictographs block (U+1F900..U+1F9FF) is added because the old pattern did
    not match it.

    Parameters
    ----------
    text : str
        Input string.

    Returns
    -------
    str
        ``text`` with every run of matched characters deleted; all other
        characters, including whitespace, are unchanged.
    """
    return _EMOJI_PATTERN.sub(r'', text)