In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from os import path
from PIL import Image

import nltk
from nltk.corpus import stopwords
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

import matplotlib.pyplot as plt
from matplotlib import cm
import seaborn as sns

# Importing TextBlob
from textblob import TextBlob

In [None]:
# Load the vaccination tweets CSV into a DataFrame and preview the first rows
csv_path = "../input/pfizer-vaccine-tweets/vaccination_tweets.csv"
tweets = pd.read_csv(csv_path)

tweets.head()

In [None]:
# Data type of every column as inferred by read_csv
tweets.dtypes

In [None]:
# Per-column non-null counts, dtypes and memory usage
tweets.info()

In [None]:
# Summary statistics of the numeric columns
tweets.describe()

In [None]:
# A bit of cleaning
#
# Every pattern below is a regular expression, so regex=True is passed
# explicitly: since pandas 2.0 `Series.str.replace` treats the pattern as a
# literal string by default, which would silently turn this cell into a no-op.
#
# Order matters:
#   1. URLs and @handles are removed first, while the '@' that marks a handle
#      is still present in the text.
#   2. Whole tokens (URLs, handles, numbers, single letters) are replaced by a
#      space rather than '' so the neighbouring words are not glued together.
#   3. Whitespace is collapsed last, after all removals.
tweets['text'] = (
    tweets['text']
    # Remove URLs
    .str.replace(r'http\S+', ' ', regex=True)
    # Remove twitter handles (must run before '@' itself is stripped)
    .str.replace(r'@[^\s]+', ' ', regex=True)
    # Remove special characters; the comma is kept in the class because the
    # original pattern '[#,@,&]' removed commas as well
    .str.replace(r'[#@&,]', '', regex=True)
    # Remove standalone numbers
    .str.replace(r' \d+ ', ' ', regex=True)
    # Remove standalone single letters; the lookahead leaves the following
    # whitespace in place so consecutive single letters are all caught
    .str.replace(r'\s+[a-zA-Z](?=\s)', '', regex=True)
    # Replace runs of whitespace with a single space and trim the ends
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)

# WordCloud

In [None]:
# Stop words: NLTK's English list plus a few extra tokens
# ('ha' and 'wa' are presumably lemmatizer leftovers of "has"/"was" - confirm).
# Kept as a list because later cells reuse `stop_words`.
stop_words = stopwords.words('english')
stop_words.extend(['ha', 'wa', '-'])

# Build one string from all tweets that do not mention covid.
# Tweets are lower-cased BEFORE the filter so that "COVID" / "Covid" are
# excluded too; the original test ran on the raw text and only caught the
# all-lowercase spelling.
lowered_tweets = (review.lower() for review in tweets.text)
tweet_text = ",".join(review for review in lowered_tweets if 'covid' not in review)

# Create and generate a word cloud image
wordcloud = WordCloud(max_font_size=50,
                      max_words=100,
                      stopwords=stop_words,
                      scale=5,
                      background_color="white").generate(tweet_text)

# Display the generated image
plt.figure(figsize=(10,7))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.title('Most repeated words in tweets',fontsize=15)
plt.show()

# Word frequency lemmatized

In [None]:
# Helper applied to every row of the text column
def lemmatize_text(text):
    """Lower-case ``text``, split it on whitespace and lemmatize each token.

    Relies on the module-level ``w_tokenizer`` and ``lemmatizer`` objects,
    which must exist by the time the function is first called.

    Returns a list with one lemmatized string per token.
    """
    tokens = w_tokenizer.tokenize(text.lower())
    lemmas = []
    for token in tokens:
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas


# Initialize the whitespace tokenizer and the WordNet lemmatizer used by
# lemmatize_text
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

# Lemmatize every tweet, then drop stop words.
# A set is used for the membership test: `stop_words` is a list, and scanning
# it for every token of every tweet is needlessly slow.
stop_word_set = set(stop_words)
tweets['lemmatized'] = tweets.text.apply(lemmatize_text).apply(
    lambda words: [word for word in words if word not in stop_word_set]
)

# Use explode to expand the token lists into one row per word
wf_tweets = tweets.lemmatized.explode().to_frame().reset_index(drop=True)

# Bar chart of the 10 most frequent words
top_words = wf_tweets.lemmatized.value_counts().iloc[:10].index
sns.countplot(x='lemmatized', data=wf_tweets, order=top_words)
plt.xlabel('Most common used words')
# countplot draws absolute occurrence counts, not percentages
plt.ylabel('Count')
plt.xticks(rotation=70)
plt.show()

# Check top 5 most used hashtags

In [None]:
# Five most frequent values of the `hashtags` column (value_counts already
# returns them in descending order). The variable name is kept because the
# plotting code below refers to it.
MostUsedTweets = tweets.hashtags.value_counts().head(5)
colors = ['lightcoral', 'lightskyblue', 'yellowgreen', 'grey', 'orange']

# One offset per wedge, with the second wedge pulled out a little further.
# Sized from the data: ax.pie requires len(explode) == number of wedges, so a
# hard-coded 5-tuple fails when there are fewer than five distinct values.
explode = tuple(0.2 if position == 1 else 0.1
                for position in range(len(MostUsedTweets)))

# Wedge properties
wp = { 'linewidth' : 0.5, 'edgecolor' : "red" }

# Formatter for the `autopct` argument of ax.pie
def func(pct, allvalues):
    """Format a pie-wedge label as percentage plus absolute count.

    Parameters
    ----------
    pct : float
        Share of the wedge in percent, as passed by matplotlib's ``autopct``.
    allvalues : sequence of numbers
        All values plotted in the pie; their sum is the 100 % reference.

    Returns
    -------
    str
        Two lines: the percentage with one decimal, then the count in
        parentheses.
    """
    # Round instead of truncating: pct is a float, so pct/100*total often lands
    # just below the true integer (e.g. 0.996) and int() would report one less.
    absolute = int(round(pct / 100. * np.sum(allvalues)))
    # The values are counts, so no unit suffix is shown.
    return "{:.1f}%\n({:d})".format(pct, absolute)
  
# Pie chart of the most used hashtags, each wedge labelled with its share and
# absolute count
fig, ax = plt.subplots(figsize=(10, 7))
wedges, texts, autotexts = ax.pie(MostUsedTweets,
                                  autopct=lambda pct: func(pct, MostUsedTweets),
                                  explode=explode,
                                  labels=MostUsedTweets.keys(),
                                  shadow=True,
                                  colors=colors,
                                  startangle=90,
                                  wedgeprops=wp,
                                  textprops=dict(color="black"))

# Legend placed to the right of the pie.
# The data are counts of the `hashtags` column, so the titles say "hashtags".
ax.legend(wedges, MostUsedTweets.keys(),
          title="Most used hashtags",
          loc="center left",
          bbox_to_anchor=(1, 0, 0.5, 1))

plt.setp(autotexts, size=9, weight="bold")
ax.set_title("Most used hashtags")
# Equal aspect ratio so the pie is drawn as a circle
ax.axis('equal')
plt.show()

# Tweets source

In [None]:

# plt.get_cmap is used instead of matplotlib.cm.get_cmap, which was deprecated
# in Matplotlib 3.7 and removed in 3.9.
cmap = plt.get_cmap('Spectral')

# Bar chart of the 5 most frequent values of the `source` column.
# NOTE: despite its name, `countries` holds the Axes returned by .plot();
# the name is kept in case other cells refer to it.
countries = tweets['source'].value_counts().head(5).plot(
    kind='bar',
    cmap=cmap,
    edgecolor='None')
countries.set(xlabel='Source', ylabel='Number of tweets', title='Top 5 tweet sources')
plt.show()

# Top 5 user locations by number of tweets

In [None]:

# plt.get_cmap is used instead of matplotlib.cm.get_cmap, which was deprecated
# in Matplotlib 3.7 and removed in 3.9.
cmap = plt.get_cmap('Spectral')

# Horizontal bar chart of the 5 most frequent values of `user_location`.
# NOTE: `countries` holds the Axes returned by .plot(); the name is kept in
# case other cells refer to it.
countries = tweets['user_location'].value_counts().head(5).plot(
    kind='barh',
    cmap=cmap,
    edgecolor='None')
countries.set(xlabel='Number of tweets', ylabel='User location', title='Top 5 user locations')
plt.show()

# Sentiment analysis with TextBlob

[TextBlob](https://textblob.readthedocs.io/en/dev/) is a Python (2 and 3) library for processing textual data. It provides a simple API for diving into common natural language processing (NLP) tasks such as part-of-speech tagging, noun phrase extraction, sentiment analysis, classification, translation, and more.

In [None]:
# Score every tweet once with TextBlob. `.sentiment` carries both `polarity`
# and `subjectivity`, so each tweet is parsed a single time rather than once
# per column.
tweet_sentiments = tweets.text.apply(lambda x: TextBlob(x).sentiment)
tweets['polarity'] = tweet_sentiments.map(lambda s: s.polarity)
tweets['subjectivity'] = tweet_sentiments.map(lambda s: s.subjectivity)

tweets.head()

TextBlob returns two scores for each text: polarity, in the range [-1, 1], and subjectivity, in the range [0, 1]. Only the polarity is used here to label a tweet: if polarity > 0 the tweet is considered positive, if polarity < 0 it is considered negative, and if polarity == 0 it is considered neutral.

In [None]:
# Map the polarity score to a label: positive above zero, negative below zero,
# neutral for everything else
polarity_conditions = [tweets.polarity > 0, tweets.polarity < 0]
polarity_labels = ['positive', 'negative']
tweets['sentiment'] = np.select(polarity_conditions, polarity_labels, default='neutral')
tweets.head()

In [None]:
# Text of the 5 tweets with the highest polarity scores (most positive)
tweets.nlargest(5,'polarity')['text']

In [None]:
# Text of the 5 tweets with the highest polarity; tweets with equal polarity
# are ordered by subjectivity (highest first)
tweets.nlargest(5, ['polarity', 'subjectivity'])['text']

In [None]:
# Text of the 5 tweets with the lowest polarity scores (most negative)
tweets.nsmallest(5,'polarity')['text']

In [None]:
# Text of the 5 tweets with the lowest polarity; tweets with equal polarity
# are ordered by subjectivity (lowest first)
tweets.nsmallest(5, ['polarity', 'subjectivity'])['text']

In [None]:
# Number of tweets per sentiment label
tweets['sentiment'].value_counts()