# Introduction

As of Aug 26, 2020:
* Coronavirus Cases: 24,005,460
* Deaths: 821,578
* Recovered: 16,485,396

Stay Home, Stay Safe.

We will dive deep into the dataset to find interesting insights. The visualisations are created with the plotly library, and the data manipulation is done with pandas and numpy.

Contents:
* Importing Libraries
* EDA and Visualisations
* Sentiment Analysis
* Results

Hope you like it! <font color = "red">Please Upvote!</font>

![](https://pbs.twimg.com/media/EQgP2pUW4AA0BkC?format=jpg&name=medium)

# Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import missingno as msno

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud, STOPWORDS
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize #(word tokenize, sentence tokenize)
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from bs4 import BeautifulSoup
import re, string, unicodedata

from PIL import Image
import requests
from io import BytesIO
from wordcloud import ImageColorGenerator
from textblob import TextBlob

import plotly.offline
import plotly.express as px
import plotly.graph_objects as go
import cufflinks as cf
# Presumably switches cufflinks to offline (in-notebook) rendering for the .iplot() calls used later.
cf.go_offline()
# NOTE(review): offline=False looks at odds with go_offline() above — confirm the intended cufflinks config.
cf.set_config_file(offline=False, world_readable = True)
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Dataset

In [None]:
# Load the COVID-19 tweets CSV (path is relative to the notebook's working directory) and preview the first rows.
tweets = pd.read_csv('../input/covid19-tweets/covid19_tweets.csv')
tweets.head()

In [None]:
# Column names, dtypes and non-null counts.
tweets.info()

In [None]:
# Summary statistics of the frame's columns.
tweets.describe()

**Missing Values Visualisation**

In [None]:
msno.matrix(tweets)
# White lines in the matrix mark the missing values.

**Adding Additional Columns**

In [None]:
# Parse the raw timestamp once, then derive the calendar features from it with
# the vectorised .dt accessor instead of per-row Python lambdas.
tweets['date'] = pd.to_datetime(tweets["date"])
tweets['count'] = 1  # constant 1, so summing 'count' over a group gives its number of tweets
tweets['tweet_date'] = tweets['date'].dt.date
tweets['day_sent'] = tweets['date'].dt.strftime('%a')    # abbreviated weekday name
tweets['month_sent'] = tweets['date'].dt.strftime('%b')  # abbreviated month name
tweets['hour_sent'] = tweets['date'].dt.hour

In [None]:
# Preview the first three rows of the frame.
tweets.head(3)

# Exploratory Data Analysis

**Tweet Counts Vs Verified User Tweets**

In [None]:
# Daily totals. Only the two plotted columns are aggregated: summing the whole
# frame would also try to sum the text and datetime columns, which pandas >= 2.0
# no longer silently drops (the datetime column makes .sum() raise).
groupedby_date = (
    tweets.groupby('tweet_date')[['user_verified', 'count']]
    .sum()
    .reset_index()
)

fig = go.Figure(data=[
    go.Bar(name = 'Verified Users', x = groupedby_date['tweet_date'], y = groupedby_date['user_verified'].tolist()),
    go.Bar(name = 'Count Of Tweets', x = groupedby_date['tweet_date'], y= groupedby_date['count'].tolist())])

fig.update_layout(barmode='stack')
fig.show()

**Correlation Matrix**

In [None]:
# Pairwise correlation between the user-level features, drawn as a heatmap.
user_features = ['user_followers', 'user_friends', 'user_favourites', 'user_verified']
feature_corr = tweets[user_features].corr()
feature_corr.iplot(kind='heatmap', colorscale="Blues", title="Feature Correlation Matrix")

**Word Cloud (Hashtags)**

In [None]:
# Concatenate every non-null hashtag entry into one string for the word cloud.
hashtags = tweets['hashtags'].dropna().tolist()
unique_hashtags = " ".join(hashtags)

# Download the image used as the word-cloud mask. The timeout keeps the cell from
# hanging indefinitely on a stalled connection, and raise_for_status() surfaces an
# HTTP error here instead of as an opaque image-decoding failure further down.
response = requests.get('https://www.lifewire.com/thmb/Q-QChfPXsb8id3pvLrcXsn2oQNs=/768x0/filters:no_upscale():max_bytes(150000):strip_icc()/twitterlogo-6471b86764ac4076b70f645e632b899e.jpg', timeout=30)
response.raise_for_status()
char_mask = np.array(Image.open(BytesIO(response.content)))
image_colors = ImageColorGenerator(char_mask)
plt.figure(figsize = (15,15))
wc = WordCloud(background_color="black", max_words=200, width=400, height=400, mask=char_mask, random_state=1).generate(unique_hashtags)
# Recolour the words with the colours of the mask image.
plt.imshow(wc.recolor(color_func=image_colors))

**Top 15 Regions : Tweet Counts**

In [None]:
# Top 15 user locations by number of tweets.
# value_counts() already returns the counts in descending order, so the first 15
# entries are the top regions. Keeping the result as a Series (location -> count)
# avoids transposing to a DataFrame and reading cells with a positional [0] on a
# label-indexed Series, which newer pandas versions deprecate.
Top15_regions = tweets['user_location'].value_counts().head(15)

# Highlight the leading bar; size the colour list to the data actually present.
colors = ['lightslategray'] * len(Top15_regions)
if colors:
    colors[0] = 'crimson'

fig = go.Figure(data=[go.Bar(x=Top15_regions.index,
                             y=Top15_regions.values,
                             marker_color=colors)])
fig.update_layout(title_text='Tweets on User Location')

**Top 10 Sources Used to Post Tweets**

In [None]:
# Top 10 tweet sources (client applications) by number of tweets.
# value_counts() is already sorted descending; the Series (source -> count) is
# used directly instead of a transposed DataFrame read with a positional [0],
# which newer pandas versions deprecate on label-indexed Series.
Top10_source = tweets['source'].value_counts().head(10)

# Highlight the leading bar; size the colour list to the data actually present.
colors = ['lightslategray'] * len(Top10_source)
if colors:
    colors[0] = 'crimson'

fig = go.Figure(data=[go.Bar(x=Top10_source.index,
                             y=Top10_source.values,
                             marker_color=colors)])
fig.update_layout(title_text='Different source used for tweeting.')

**Heatmap: Tweet Counts as per month and days**

In [None]:
# Row/column order for the heatmap (calendar order rather than alphabetical).
days = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
months = ['Jul', 'Aug']

# Sum only the 'count' column: summing the whole frame would also try to sum the
# text and datetime columns, which fails on pandas >= 2.0.
grouped_by_month_and_day = (
    tweets.groupby(['month_sent', 'day_sent'])['count']
    .sum()
    .reset_index()
)
pt = grouped_by_month_and_day.pivot_table(index = 'month_sent', columns = 'day_sent', values = 'count').reindex(index = months, columns = days)
pt.iplot(kind='heatmap',colorscale="Blues", title="Heatmap of tweets count as per month and days")

**Timeline: Tweet Counts on 24hrs basis**

In [None]:
# Tweets per hour of day. Only 'count' is summed: summing the whole frame would
# also try to sum the text and datetime columns, which fails on pandas >= 2.0.
grouped_by_time = (
    tweets.groupby('hour_sent')['count']
    .sum()
    .reset_index()
    .sort_values(by = 'count', ascending = False)
)
# The labels mapping must be keyed by a plotted column name ('count'); the old
# 'pop' key matched no column, so the axis label was never applied.
fig = px.bar(grouped_by_time, x='hour_sent', y='count', color='hour_sent',
             labels={'count':'Count Of Tweets'}, height=400)
fig.show()

**Most Used Words: Tweet**

In [None]:
# Per-user word frequencies: {user_name: {word: occurrences}}.
# Start with an empty counter for every distinct user name.
word_dict = {user: {} for user in tweets['user_name'].unique()}

# Placeholder tokens that are never counted.
media_tokens = ['<Media', 'omitted>']

for name, msg in zip(tweets['user_name'], tweets['text']):
    user_counts = word_dict[name]
    for word in msg.split():
        if word in media_tokens:
            continue
        user_counts[word] = user_counts.get(word, 0) + 1

# Re-order each user's words from most to least frequent.
word_dict = {
    name: dict(sorted(counts.items(), key=lambda pair: pair[1], reverse=True))
    for name, counts in word_dict.items()
}

In [None]:
# One row per user. Only 'count' is summed: summing the whole frame would also
# try to sum the text and datetime columns, which fails on pandas >= 2.0.
grouped_df = tweets.groupby('user_name')['count'].sum().reset_index()
# Attach each user's word-frequency dictionary built in word_dict.
grouped_df['Most used words'] = grouped_df['user_name'].apply(lambda x : word_dict[x])
grouped_df[['user_name', 'Most used words']]

# Tweet Analysis 

# Vocabulary

In [None]:
# Creating the vocabulary (flat list of tokens) for the tweets
def get_corpus(text):
    """Return every whitespace-separated token of each string in `text`, in order."""
    return [token.strip() for document in text for token in document.split()]

# Flat list of every token across all tweet texts.
corpus = get_corpus(tweets['text'])

In [None]:
from collections import Counter

# Tally every token and keep the ten most frequent as a {word: count} mapping.
counter = Counter(corpus)
most_common_words = dict(counter.most_common(10))
most_common_words

# Data Cleaning

In [None]:
# Data Cleaning - Part 1
# Tokens to discard: the English stop words plus every punctuation character,
# all gathered into a single set.
punctuation = list(string.punctuation)
stop_words = set(stopwords.words('english')).union(punctuation)

def strip_html(text):
    """Parse `text` with the 'html.parser' backend and return its text content."""
    return BeautifulSoup(text, 'html.parser').get_text()

def square_brackets(text):
    """Remove every ``[...]`` span (brackets included) from `text`."""
    # Raw string literal: '\[' in a normal string is an invalid escape sequence,
    # which recent Python versions warn about.
    return re.sub(r'\[[^]]*\]', '', text)

def url_extract(text):
    """Delete every run that starts with ``http`` and extends to the next whitespace."""
    url_pattern = r'http\S+'
    return re.sub(url_pattern, '', text)

def remove_stopwords(text, ignored=None):
    """Return `text` with stop words removed, tokens re-joined by single spaces.

    Named ``remove_stopwords`` rather than ``stopwords`` so that it does not
    rebind the imported ``nltk.corpus.stopwords`` module name; with the old name,
    re-running the cell that builds ``stop_words`` failed because ``stopwords``
    had become this function.

    Parameters
    ----------
    text : str
        Text to filter; it is split on whitespace.
    ignored : collection of str, optional
        Lower-case tokens to drop. Defaults to the notebook-level ``stop_words`` set.
    """
    if ignored is None:
        ignored = stop_words
    # Membership is tested on the lower-cased token; the original casing is kept in the output.
    kept = [token.strip() for token in text.split() if token.strip().lower() not in ignored]
    return " ".join(kept)

# Full cleaning pipeline: applies each of the cleaning helpers above in turn.
def preprocess(text):
    """Strip HTML, drop ``[...]`` spans, remove URLs, then remove stop words."""
    text = strip_html(text)
    text = square_brackets(text)
    text = url_extract(text)
    text = remove_stopwords(text)
    return text

In [None]:
# Clean every tweet; the 'text' column is overwritten with its preprocessed version.
tweets['text'] = tweets['text'].apply(preprocess)

In [None]:
# Preview the first five processed tweets.
tweets['text'].head(5)

# WordCloud: Tweets

In [None]:
# Download the image used as the word-cloud mask. The timeout keeps the cell from
# hanging indefinitely on a stalled connection, and raise_for_status() surfaces an
# HTTP error here instead of as an opaque image-decoding failure further down.
response = requests.get('https://miro.medium.com/proxy/1*SZq4F67FpMACqyQ1-doAFA.jpeg', timeout=30)
response.raise_for_status()
char_mask = np.array(Image.open(BytesIO(response.content)))
image_colors = ImageColorGenerator(char_mask)

plt.figure(figsize = (20,20))
# Build the cloud from all tweet texts joined into one string.
wc = WordCloud(background_color="black", max_words=200, width=400, height=400, mask=char_mask, random_state=1).generate(" ".join(tweets.text))
# Recolour the words with the colours of the mask image.
plt.imshow(wc.recolor(color_func=image_colors), interpolation="bilinear")

# Sentiment Analysis

In [None]:
sid = SentimentIntensityAnalyzer()
tweet_text = tweets['text']

# VADER: keep the 'compound' entry of each tweet's polarity scores.
tweets['sentiment_vader'] = tweet_text.map(lambda tweet: sid.polarity_scores(tweet)['compound'])
# TextBlob: keep the polarity component of each tweet's sentiment.
tweets['sentiment_textblob'] = tweet_text.map(lambda tweet: TextBlob(tweet).sentiment.polarity)

**Negative Tweets: Top 15**

In [None]:
# Fifteen tweets with the lowest TextBlob polarity.
(
    tweets
    .sort_values(by='sentiment_textblob')
    .loc[:, ['user_name', 'text', 'sentiment_vader', 'sentiment_textblob']]
    .head(15)
)

**Positive Tweets: Top 15**

In [None]:
# Fifteen tweets with the highest TextBlob polarity.
(
    tweets
    .sort_values(by='sentiment_textblob', ascending=False)
    .loc[:, ['user_name', 'text', 'sentiment_vader', 'sentiment_textblob']]
    .head(15)
)

**Neutral Tweets: Top 15**

In [None]:
# First fifteen tweets whose TextBlob polarity is exactly zero.
is_neutral = tweets['sentiment_textblob'] == 0.0
tweets.loc[is_neutral, ['user_name', 'text', 'sentiment_vader', 'sentiment_textblob']].head(15)

# Results

In [None]:
# Split the tweets by the sign of their TextBlob polarity, label each part, and
# combine the parts (neutral, positive, negative) into one frame for plotting.
# .assign() returns a new frame, so the label is never written onto a filtered
# slice of `tweets` — the pattern that triggers pandas' SettingWithCopyWarning.
polarity = tweets['sentiment_textblob']

neutral = tweets[polarity == 0.0].assign(**{'Sentiment Category': 'Neutral'})
positive = tweets[polarity > 0.0].assign(**{'Sentiment Category': 'Positive'})
negative = tweets[polarity < 0.0].assign(**{'Sentiment Category': 'Negative'})

frames = [neutral, positive, negative]
result = pd.concat(frames)

In [None]:
# Pie chart of the 'count' column summed per sentiment category.
colors = ['gold', 'mediumturquoise', 'darkorange']
slice_outline = dict(color='#000000', width=2)

fig = px.pie(
    result,
    names='Sentiment Category',
    values='count',
    title='Tweets Distribution Based on Sentiments',
    color_discrete_sequence=px.colors.sequential.RdBu,
)
fig.update_traces(
    marker=dict(colors=colors, line=slice_outline),
    textfont_size=20,
    textinfo='percent+label',
    textposition='inside',
)
fig.show()

# The End