<a href="https://colab.research.google.com/github/zakstucke/geo-sentiment-analysis/blob/master/ScratchSpace%2BVisualisation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sentiment analysis tools


In [None]:
## Test notebook

# import pandas
import nltk

# VADER = Valence Aware Dictionary for Sentiment Reasoning
# Gives polarity and intensity of sentiment
nltk.download('vader_lexicon')

# Right from the documentation https://www.nltk.org/howto/sentiment.html
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import subjectivity
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...




In [None]:
n_instances = 100
subj_docs = [(sent, 'subj') for sent in subjectivity.sents(categories='subj')[:n_instances]]
obj_docs = [(sent, 'obj') for sent in subjectivity.sents(categories='obj')[:n_instances]]
len(subj_docs), len(obj_docs)

In [None]:
# Feedback on a simple list of sentences
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sentences = [
             "Covid is a good thing",
             "Covid is a bad thing"
             ]
pss = []   # polarity score's
for sentence in sentences:
  sia = SentimentIntensityAnalyzer()
  pss.append(sia.polarity_scores(sentence))

for ps in pss:
  print(f"neg: {ps['neg']}, pos: {ps['pos']}, neu: {ps['neu']}, cmp: {ps['compound']}")

neg: 0.0, pos: 0.492, neu: 0.508, cmp: 0.4404
neg: 0.538, pos: 0.0, neu: 0.462, cmp: -0.5423


In [None]:
# Converting to dataframe for small scale processing
import pandas as pd
df = pd.DataFrame(pss)
print(df)

# Select confidently negative answers
negatives = df[(df['neg']>0.5) & (df['pos']<0.2)]
# Print confidently negative sentences
print(sentences[negatives.index[0]])

     neg    neu    pos  compound
0  0.000  0.508  0.492    0.4404
1  0.538  0.462  0.000   -0.5423
Covid is a bad thing


In [None]:
!pip install nrclex
import pandas as pd
import numpy as np
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nrclex import NRCLex

nltk.download("punkt")
nltk.download('vader_lexicon')

def create_emotions(df, text_column_name):
    emotions = ["emotion_1", "emotion_2", "emotion_3"]
    
    # Add in the new emotion columns:
    for emotion in emotions:
        df[emotion] = np.nan

    def analyze_emotions(row):
        # Analyse and add the emotions of the row:
        analyzer = NRCLex(text=row[text_column_name])
        for index, emotion in enumerate(emotions):
            # If not enough emotions in the list, leave column as NaN:
            if len(analyzer.top_emotions) <= index:
                break
            row[emotion] = analyzer.top_emotions[index]

        return row

    df = df.apply(analyze_emotions, axis=1)

    return df

def create_bias(df, text_column_name):
    # Uses nltk vader to calc positive, negative, neutrality and compound columns

    fields = ["pos", "neg", "neu", "compound"]

    # Setup the new columns:
    for field in fields:
        df[field] = np.nan

    sia = SentimentIntensityAnalyzer()

    def analyze_bias(row):
        nonlocal sia
        
        results = sia.polarity_scores(row[text_column_name])

        for field in fields:
            row[field] = results[field]

        return row

    df = df.apply(analyze_bias, axis=1)

    return df

df = pd.DataFrame({"text": ["What a great day", "Today is awful", "I'm so happy", "Looking forward to a great day tomorrow.", "Yesterday was scary"]})

df = create_emotions(df, "text")
df = create_bias(df, "text")
df

Collecting nrclex
  Downloading NRCLex-3.0.0.tar.gz (396 kB)
[?25l[K     |▉                               | 10 kB 24.7 MB/s eta 0:00:01[K     |█▋                              | 20 kB 27.7 MB/s eta 0:00:01[K     |██▌                             | 30 kB 28.4 MB/s eta 0:00:01[K     |███▎                            | 40 kB 14.7 MB/s eta 0:00:01[K     |████▏                           | 51 kB 10.9 MB/s eta 0:00:01[K     |█████                           | 61 kB 12.1 MB/s eta 0:00:01[K     |█████▉                          | 71 kB 13.5 MB/s eta 0:00:01[K     |██████▋                         | 81 kB 13.3 MB/s eta 0:00:01[K     |███████▍                        | 92 kB 14.6 MB/s eta 0:00:01[K     |████████▎                       | 102 kB 11.6 MB/s eta 0:00:01[K     |█████████                       | 112 kB 11.6 MB/s eta 0:00:01[K     |██████████                      | 122 kB 11.6 MB/s eta 0:00:01[K     |██████████▊                     | 133 kB 11.6 MB/s eta 0:00:01[K  

Unnamed: 0,text,emotion_1,emotion_2,emotion_3,pos,neg,neu,compound
0,What a great day,"(fear, 0.0)","(anger, 0.0)","(anticip, 0.0)",0.672,0.0,0.328,0.6249
1,Today is awful,"(fear, 0.2)","(anger, 0.2)","(negative, 0.2)",0.0,0.6,0.4,-0.4588
2,I'm so happy,"(trust, 0.25)","(positive, 0.25)","(joy, 0.25)",0.666,0.0,0.334,0.6115
3,Looking forward to a great day tomorrow.,"(positive, 0.5)","(anticipation, 0.5)",,0.451,0.0,0.549,0.6249
4,Yesterday was scary,"(fear, 0.0)","(anger, 0.0)","(anticip, 0.0)",0.0,0.615,0.385,-0.4939


# Can we observe fake news? / "Most shared covid link by region"

Steps:
- Get regional data
- Clean data (Match the twitter id to 're-hydrate' the data, get the content)
- Follow highly shared links
- Way to determine misinformation?
- Plot globaly
- help: https://link.springer.com/article/10.1007/s11042-021-11621-5

## Fetching data from github

In [None]:
import pandas as pd
main_data = pd.read_csv("https://media.githubusercontent.com/media/zakstucke/geo-sentiment-analysis/master/covidvaccine.csv?token=ARLQAMR52AXM3H4W7ITZ6O3CGRV6G")

  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
# EDA
main_data.columns

Index(['user_name', 'user_location', 'user_description', 'user_created',
       'user_followers', 'user_friends', 'user_favourites', 'user_verified',
       'date', 'text', 'hashtags', 'source', 'is_retweet'],
      dtype='object')

In [None]:
data_head = main_data.head(5000)

In [None]:
import re
test_url = re.findall('http',str(main_data['text']))

375651

#Geo Sentiment
https://zenodo.org/record/5090588#.YjBMJHrP2Um

# Question 3:



# Question 4:

 # Visualisation

In [None]:
!pip install tsne
!pip install umap

import tsne
import umap

import numpy as np 
import pandas as pd 

from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

from scipy.io import loadmat
from sklearn.utils import shuffle

import matplotlib.pyplot as plt
import seaborn as sns

from mpl_toolkits.mplot3d import Axes3D

from tsne import bh_sne
from sklearn.manifold import TSNE

# Ensure up to date version of plotly
!pip install plotly==4.14.3
import plotly.express as px

Collecting tsne
  Downloading tsne-0.3.1.tar.gz (547 kB)
[?25l[K     |▋                               | 10 kB 22.9 MB/s eta 0:00:01[K     |█▏                              | 20 kB 29.2 MB/s eta 0:00:01[K     |█▉                              | 30 kB 15.7 MB/s eta 0:00:01[K     |██▍                             | 40 kB 11.0 MB/s eta 0:00:01[K     |███                             | 51 kB 8.3 MB/s eta 0:00:01[K     |███▋                            | 61 kB 9.5 MB/s eta 0:00:01[K     |████▏                           | 71 kB 10.8 MB/s eta 0:00:01[K     |████▉                           | 81 kB 10.4 MB/s eta 0:00:01[K     |█████▍                          | 92 kB 11.3 MB/s eta 0:00:01[K     |██████                          | 102 kB 10.8 MB/s eta 0:00:01[K     |██████▋                         | 112 kB 10.8 MB/s eta 0:00:01[K     |███████▏                        | 122 kB 10.8 MB/s eta 0:00:01[K     |███████▉                        | 133 kB 10.8 MB/s eta 0:00:01[K     |██

In [None]:
def scatter2d(X, y):
    fig, ax = plt.subplots(figsize=(15,10))
    ax.scatter(X[:,0], X[:,1], c=y)
    palette = sns.color_palette("husl", len(np.unique(y)))

    for i in range(len(y)):
        ax.text(X[i,0], X[i,1], y[i], color=palette[y[i]-1], fontsize='small')
        
def scatter3d(X, y):
    fig = plt.figure(figsize=(15,10))
    fig.add_subplot(111, projection='3d')
    ax = Axes3D(fig)
    palette = sns.color_palette("husl", len(np.unique(y)))

    ax.mouse_init()
    ax.scatter(X[:,0], X[:,1], X[:,2], c=y)
    for i in range(len(y)):
        ax.text(X[i,0], X[i,1], X[i,2], y[i], color=palette[y[i]-1], fontsize='small')

## MAP VISUALISATION

In [None]:
fig = px.choropleth(DATASET, # dataset to use
                    locations="Country Code", # column which includes 3 letter country code
                    color="2019", # column which dictates the colour of the map
                    hover_name="Country Name", # column to add to hover information
                    range_color=(0, 30), # range of the colour scale
                    color_continuous_scale="aggrnyl") # colour scale (these can be predefined or you can create your own)
fig.show()

# Start to end data wrangling


In [None]:
!pip install black==22.1.0
!pip install flake8==3.7.9
!pip install notebook==6.4.3
!pip install ipython==7.31.0
!pip install requests==2.27.1
!pip install pandas==1.4.1
!pip install nltk==3.7
!pip install NRCLex==3.0.0
!pip install geopy==2.2.0

Collecting black==22.1.0
  Downloading black-22.1.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.4 MB)
[K     |████████████████████████████████| 1.4 MB 13.2 MB/s 
[?25hCollecting click>=8.0.0
  Downloading click-8.0.4-py3-none-any.whl (97 kB)
[K     |████████████████████████████████| 97 kB 5.0 MB/s 
Collecting typed-ast>=1.4.2
  Downloading typed_ast-1.5.2-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (843 kB)
[K     |████████████████████████████████| 843 kB 51.2 MB/s 
[?25hCollecting platformdirs>=2
  Downloading platformdirs-2.5.1-py3-none-any.whl (14 kB)
Collecting mypy-extensions>=0.4.3
  Downloading mypy_extensions-0.4.3-py2.py3-none-any.whl (4.5 kB)
Collecting pathspec>=0.9.0
  Downloading pathspec-0.9.0-py2.py3-none-any.whl (31 kB)
Installing collected packages: typed-ast, platformdirs, pathspec, mypy-extensions, click, black
  Attempting uninstall: click
    Found existing installation: click 7.1.2
    Unin

Collecting flake8==3.7.9
  Downloading flake8-3.7.9-py2.py3-none-any.whl (69 kB)
[?25l[K     |████▊                           | 10 kB 27.1 MB/s eta 0:00:01[K     |█████████▍                      | 20 kB 9.7 MB/s eta 0:00:01[K     |██████████████                  | 30 kB 6.7 MB/s eta 0:00:01[K     |██████████████████▊             | 40 kB 7.4 MB/s eta 0:00:01[K     |███████████████████████▌        | 51 kB 2.6 MB/s eta 0:00:01[K     |████████████████████████████▏   | 61 kB 3.1 MB/s eta 0:00:01[K     |████████████████████████████████| 69 kB 1.4 MB/s 
[?25hCollecting entrypoints<0.4.0,>=0.3.0
  Downloading entrypoints-0.3-py2.py3-none-any.whl (11 kB)
Collecting pycodestyle<2.6.0,>=2.5.0
  Downloading pycodestyle-2.5.0-py2.py3-none-any.whl (51 kB)
[K     |████████████████████████████████| 51 kB 3.4 MB/s 
[?25hCollecting mccabe<0.7.0,>=0.6.0
  Downloading mccabe-0.6.1-py2.py3-none-any.whl (8.6 kB)
Collecting pyflakes<2.2.0,>=2.1.0
  Downloading pyflakes-2.1.1-py2.py3-none-an

Collecting ipython==7.31.0
  Downloading ipython-7.31.0-py3-none-any.whl (792 kB)
[?25l[K     |▍                               | 10 kB 26.0 MB/s eta 0:00:01[K     |▉                               | 20 kB 30.7 MB/s eta 0:00:01[K     |█▎                              | 30 kB 36.8 MB/s eta 0:00:01[K     |█▋                              | 40 kB 15.7 MB/s eta 0:00:01[K     |██                              | 51 kB 11.9 MB/s eta 0:00:01[K     |██▌                             | 61 kB 13.4 MB/s eta 0:00:01[K     |███                             | 71 kB 11.7 MB/s eta 0:00:01[K     |███▎                            | 81 kB 12.7 MB/s eta 0:00:01[K     |███▊                            | 92 kB 13.9 MB/s eta 0:00:01[K     |████▏                           | 102 kB 13.8 MB/s eta 0:00:01[K     |████▌                           | 112 kB 13.8 MB/s eta 0:00:01[K     |█████                           | 122 kB 13.8 MB/s eta 0:00:01[K     |█████▍                          | 133 kB 13.8 M

Collecting requests==2.27.1
  Downloading requests-2.27.1-py2.py3-none-any.whl (63 kB)
[?25l[K     |█████▏                          | 10 kB 24.8 MB/s eta 0:00:01[K     |██████████▍                     | 20 kB 32.2 MB/s eta 0:00:01[K     |███████████████▋                | 30 kB 27.2 MB/s eta 0:00:01[K     |████████████████████▊           | 40 kB 11.9 MB/s eta 0:00:01[K     |██████████████████████████      | 51 kB 12.4 MB/s eta 0:00:01[K     |███████████████████████████████▏| 61 kB 14.3 MB/s eta 0:00:01[K     |████████████████████████████████| 63 kB 1.4 MB/s 
Installing collected packages: requests
  Attempting uninstall: requests
    Found existing installation: requests 2.23.0
    Uninstalling requests-2.23.0:
      Successfully uninstalled requests-2.23.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires ipython~=5

Collecting geopy==2.2.0
  Downloading geopy-2.2.0-py3-none-any.whl (118 kB)
[K     |████████████████████████████████| 118 kB 14.1 MB/s 
Installing collected packages: geopy
  Attempting uninstall: geopy
    Found existing installation: geopy 1.17.0
    Uninstalling geopy-1.17.0:
      Successfully uninstalled geopy-1.17.0
Successfully installed geopy-2.2.0


In [None]:
import numpy as np
import nltk
import string


from geopy.geocoders import Nominatim
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nrclex import NRCLex

nltk.download("punkt")
nltk.download("vader_lexicon")

import pandas as pd
pd.set_option("display.max_rows", 50)
pd.set_option("display.max_columns", 50)


def create_emotions(df, text_column_name):
    emotions = ["emotion_1", "emotion_2", "emotion_3"]

    # Add in the new emotion columns:
    for emotion in emotions:
        df[emotion] = np.nan

    def analyze_emotions(row):
        # Analyse and add the emotions of the row:
        analyzer = NRCLex(text=row[text_column_name])
        for index, emotion in enumerate(emotions):
            # If not enough emotions in the list, leave column as NaN:
            if len(analyzer.top_emotions) <= index:
                break
            row[emotion] = analyzer.top_emotions[index]

        return row

    df = df.apply(analyze_emotions, axis=1)

    return df


def create_bias(df, text_column_name):
    # Uses nltk vader to calc positive, negative, neutrality and compound columns

    fields = ["pos", "neg", "neu", "compound"]

    # Setup the new columns:
    for field in fields:
        df[field] = np.nan

    sia = SentimentIntensityAnalyzer()

    def analyze_bias(row):
        nonlocal sia

        results = sia.polarity_scores(row[text_column_name])

        for field in fields:
            row[field] = results[field]

        return row

    df = df.apply(analyze_bias, axis=1)

    return df


def create_cleaned(df, text_column_name):
    fields = ["cleaned_words"]
    for field in fields:
        df[field] = np.nan

    def clean(row):

        text = row[text_column_name]
        # First remove punctuation:
        text = text.translate(str.maketrans("", "", string.punctuation))

        # Split text into words:
        words = text.split()

        # Remove filler words:
        cleaned_words = stripFillerWords(words)

        row["cleaned_words"] = cleaned_words

        return row

    df = df.apply(clean, axis=1)

    return df


def create_geo(df, address_column_name):
    # Adds in the lat long, city and country for the geo information if readable

    fields = ["lat", "long", "city", "country", "country_code"]
    for field in fields:
        df[field] = np.nan

    geolocator = Nominatim(user_agent="bristol_geo_sentiment_analysis_project")

    def _create_geo(row):
        address = row[address_column_name]

        location = geolocator.geocode(address, exactly_one=True, addressdetails=True)
        if location:
            row["lat"] = location.latitude
            row["long"] = location.longitude

            if "address" in location.raw:
                if "city" in location.raw["address"]:
                    row["city"] = location.raw["address"]["city"]
                if "country" in location.raw["address"]:
                    row["country"] = location.raw["address"]["country"]
                if "country_code" in location.raw["address"]:
                    row["country_code"] = location.raw["address"]["country_code"]

        return row

    df = df.apply(_create_geo, axis=1)

    return df

import spacy


def getSingleTweet(data):  # Used for original Main data scraped by TwitterScraper
    # Dictionary has 2 keys
    #       data - for tweet data
    #       includes - for user data
    # Data is made up of a list of tweets, each tweet is its own dictionary
    # This dictionary consists of:
    #                           -id
    #                           -referencedtweets (A list containing a dict for each referenced tweet
    #                           -public_metrics (A dict with: retweet_count,reply_count,like_count,quote_count
    #                           -possibly_sensitive: Bool
    #                           -created_at
    #                           -text
    #                           -lang
    tweetList = data["data"]
    singleTweet = tweetList[0]
    return singleTweet


def getText(data):  # Used for original Main data scraped by TwitterScraper
    tweetList = data["data"]
    allText = []
    counter = 0
    for tweet in tweetList:
        allText.append(str(counter) + ": " + tweet["text"] + "\n\n")
        counter += 1
    return allText


def stripFillerWords(
    words,
):  # Takes a list of words as input and returns list with filler/stop words removed

    # Remove normal filler words (so, was etc):
    nlp = spacy.load("en_core_web_sm")
    result = [word for word in words if word not in nlp.Defaults.stop_words]

    return result

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [None]:
df = pd.read_csv("https://media.githubusercontent.com/media/zakstucke/geo-sentiment-analysis/master/covidvaccine.csv?token=ARLQAMUQQLR3BHVWWW56WG3CHRRXC", nrows=10)

df = create_cleaned(df, "text")
df = create_bias(df, "text")
df = create_emotions(df, "text")
df = create_geo(df, "user_location")


In [None]:
print(df)

In [None]:
filepath = "covidvaccine_geo_emotions"
df.to_csv(filepath)

### Once checkpointed, skip to here

In [None]:
# import pandas as pd
geo_df = pd.read_csv("covidvaccine_geo_emotions")
geo_df.shape

In [None]:
# Some issue with this module in default install 
!pip uninstall tornado
!pip install tornado==4.5.3
# Ensure up to date version of plotly
!pip install plotly==4.14.3
import plotly.express as px

Found existing installation: tornado 6.1
Uninstalling tornado-6.1:
  Would remove:
    /usr/local/lib/python3.7/dist-packages/tornado-6.1.dist-info/*
    /usr/local/lib/python3.7/dist-packages/tornado/*
Proceed (y/n)? y
  Successfully uninstalled tornado-6.1
Collecting tornado==4.5.3
  Downloading tornado-4.5.3.tar.gz (484 kB)
[K     |████████████████████████████████| 484 kB 12.0 MB/s 
[?25hBuilding wheels for collected packages: tornado
  Building wheel for tornado (setup.py) ... [?25l[?25hdone
  Created wheel for tornado: filename=tornado-4.5.3-cp37-cp37m-linux_x86_64.whl size=434031 sha256=7af482dede19ff8c2dbac1285c59d0e4ccb548f4d4b600c08cc0c8cd19adb00b
  Stored in directory: /root/.cache/pip/wheels/a2/45/43/36ec7a893e16c1212a6b1505ded0a2d73cf8e863a0227c8e04
Successfully built tornado
Installing collected packages: tornado
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following 



In [None]:
geo_df.columns

Index(['Unnamed: 0', 'user_name', 'user_location', 'user_description',
       'user_created', 'user_followers', 'user_friends', 'user_favourites',
       'user_verified', 'date', 'text', 'hashtags', 'source', 'is_retweet',
       'cleaned_words', 'pos', 'neg', 'neu', 'compound', 'emotion_1',
       'emotion_2', 'emotion_3', 'lat', 'long', 'city', 'country',
       'country_code'],
      dtype='object')

In [None]:
# Check for current country codes, looks like geo_df is tiny for some reason
cc_present = geo_df["country_code"].unique()

In [None]:
# geo_df["len"] = len(geo_df["emotion_1"])
# geo_df
import regex as re

# f = lambda x: re.search('\([a-zA-Z]*,',x)
# geo_df["main_emo"] = geo_df.apply(f)

# f = lambda x: re.search('\([a-zA-Z]*,',x)
# geo_df["main_emo"] = geo_df.apply(re.search('\([a-zA-Z]*,',geo_df["emotion_1"]))


def extract_emotion(df, text_column_name):
    
    def emotion(row):

        text = row[text_column_name]
        # First remove punctuation:
    #    text = re.search('\([a-zA-Z]*,',text)

        # Split text into words:
        # emo = text.split(",")
        emo = text[0]
        
        # Take out () brackets
        emo = emo.strip("()")
        
        row["extracted_emotion"] = emo

        return row

    df = df.apply(emotion, axis=1)

    return df


def extract_url(df, text_column_name):  
    
    def url(row):

        text = row[text_column_name]
        
        # Look for urls
        url = re.findall("https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()!@:%_\+.~#?&\/\/=]*)", text)
        
        row["extracted_url"] = url

        return row

    df = df.apply(url, axis=1)

    return df

df2 = extract_emotion(df, "emotion_1")
df2 = extract_url(df2, "text")


In [None]:
df2

Unnamed: 0,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,is_retweet,cleaned_words,pos,neg,neu,compound,emotion_1,emotion_2,emotion_3,lat,long,city,country,country_code,extracted_emotion,extracted_url
0,MyNewsNE,Assam,MyNewsNE a dedicated multi-lingual media house...,24-05-2020 10:18,64.0,11.0,110.0,False,18-08-2020 12:55,Australia to Manufacture Covid-19 Vaccine and ...,['CovidVaccine'],Twitter Web App,False,"[Australia, Manufacture, Covid19, Vaccine, Cit...",0.148,0.0,0.852,0.5106,"(fear, 0.0)","(anger, 0.0)","(anticip, 0.0)",26.407384,93.25513,,India,in,fear,[]
1,Shubham Gupta,,I will tell about all experiences of my life f...,14-08-2020 16:42,1.0,17.0,0.0,False,18-08-2020 12:55,#CoronavirusVaccine #CoronaVaccine #CovidVacci...,"['CoronavirusVaccine', 'CoronaVaccine', 'Covid...",Twitter for Android,False,"[CoronavirusVaccine, CoronaVaccine, CovidVacci...",0.285,0.0,0.715,0.4927,"(trust, 0.2)","(surprise, 0.2)","(positive, 0.2)",46.314475,11.048029,Ville d'Anaunia,Italia,it,trust,[]
2,Journal of Infectiology,,Journal of Infectiology (ISSN 2689-9981) is ac...,14-12-2017 07:07,143.0,566.0,8.0,False,18-08-2020 12:46,Deaths due to COVID-19 in Affected Countries\n...,,Twitter Web App,False,"[Deaths, COVID19, Affected, Countries, Read, M...",0.0,0.118,0.882,-0.1531,"(fear, 0.0)","(anger, 0.0)","(anticip, 0.0)",46.314475,11.048029,Ville d'Anaunia,Italia,it,fear,[]
3,Zane,,Fresher than you.,18-09-2019 11:01,29.0,25.0,620.0,False,18-08-2020 12:45,@Team_Subhashree @subhashreesotwe @iamrajchoco...,,Twitter for Android,False,"[TeamSubhashree, subhashreesotwe, iamrajchoco,...",0.209,0.0,0.791,0.4404,"(trust, 0.3333333333333333)","(positive, 0.3333333333333333)","(joy, 0.3333333333333333)",46.314475,11.048029,Ville d'Anaunia,Italia,it,trust,[]
4,Ann-Maree O’Connor,"Adelaide, South Australia",Retired university administrator. Melburnian b...,24-01-2013 14:53,83.0,497.0,10737.0,False,18-08-2020 12:45,@michellegrattan @ConversationEDU This is what...,,Twitter Web App,False,"[michellegrattan, ConversationEDU, This, passe...",0.0,0.0,1.0,0.0,"(trust, 1.0)",,,-34.928181,138.599931,Adelaide,Australia,au,trust,[]
5,Raunak Scherbatsky DankWorth,,Neuro surgeon + Diagnostician.👨‍⚕️\na good phy...,03-08-2020 13:39,3.0,27.0,918.0,False,18-08-2020 12:44,The Multi-system Inflammatory Syndrome-Childre...,"['COVID19', 'COVID19India']",Twitter for Android,False,"[The, Multisystem, Inflammatory, SyndromeChild...",0.0,0.0,1.0,0.0,"(fear, 0.2)","(anger, 0.2)","(negative, 0.2)",46.314475,11.048029,Ville d'Anaunia,Italia,it,fear,[]
6,Rajesh Tadepalli,"Hyderabad, India",Growth Initiatives @tech_mahindra | AIMer | Br...,07-05-2013 03:57,918.0,2561.0,31837.0,False,18-08-2020 12:34,@PrivilRodrigues @yatish57 @deepkaranahuja @sh...,,Twitter for Android,False,"[PrivilRodrigues, yatish57, deepkaranahuja, sh...",0.0,0.0,1.0,0.0,"(fear, 0.0)","(anger, 0.0)","(anticip, 0.0)",17.360589,78.474061,Hyderabad,India,in,fear,[]
7,AKisASocialisolationist wash yer damn hands,The Great Pacific Northwest,"Iconoclast, cat person, soccer fan, textile & ...",07-02-2015 07:24,2321.0,3236.0,264351.0,False,18-08-2020 12:30,"@MSNBC Well, let’s qualify that: would anyone ...",['CovidVaccine'],Twitter for iPhone,False,"[MSNBC, Well, let’s, qualify, party, vaccine, ...",0.202,0.0,0.798,0.5859,"(positive, 0.5)","(anticipation, 0.5)",,,,,,,positive,[]
8,Dr. Joseph Santoro,"Washington, DC 20009","Neuro PhD, #Innovator, #Technologist, #Startup...",17-01-2009 21:10,19091.0,20986.0,128119.0,False,18-08-2020 12:15,"Most countries, without the ability to make #V...",['Vaccines'],Hootsuite Inc.,False,"[Most, countries, ability, Vaccines, locally, ...",0.098,0.195,0.707,-0.3532,"(fear, 0.3333333333333333)","(positive, 0.3333333333333333)","(negative, 0.3333333333333333)",38.895037,-77.036543,Washington,United States,us,fear,[]
9,VUMC OAP,"Nashville, TN","Office of Advanced Practice, Vanderbilt Univer...",16-03-2017 20:22,282.0,96.0,788.0,False,18-08-2020 11:57,#DNA zooms up charts in 1st week; hear #vacci...,"['DNA', 'vaccines', 'pandemic', 'COVID19', 'Co...",Twitter Web App,False,"[DNA, zooms, charts, 1st, week, hear, vaccines...",0.0,0.0,1.0,0.0,"(fear, 0.3333333333333333)","(negative, 0.3333333333333333)","(sadness, 0.3333333333333333)",36.162277,-86.774298,Nashville-Davidson,United States,us,fear,[]


In [None]:
# df2.columns
group_df = df2.groupby("country_code")["country_code", "extracted_emotion"].agg(pd.Series.mode)
# mode_emotion = geo_df.groupby(["country_code", "emotion_1"]).count()

group_df.columns


Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



Index(['country_code', 'extracted_emotion'], dtype='object')

In [None]:
fig = px.choropleth(group_df, # dataset to use
                    locations=["Uk","Poland","Brazil"], # column which includes 3 letter country code
                    locationmode = "country names",
                    color=[97,6,170], # column which dictates the colour of the map
                    #hover_name=["au","it"], # column to add to hover information
                    range_color=(0, 200), #,# range of the colour scale
                    color_continuous_scale=["red","blue"] # colour scale (these can be predefined or you can create your own)
)
fig.show()
# "#ff009f","cyan"

('#ff009f', 'cyan')

# Vaccine chloropleth

In [None]:
# imports
import pandas as pd
import plotly.express as px

# Data IN
vaccine_data = "https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/vaccinations/vaccinations.csv"
df = pd.read_csv(vaccine_data)

In [None]:
# Features for chloropleth
features = df
print(features.shape)

col = "people_vaccinated"

# Delete rows where case numbers are zero
# This deletion is completed by "selecting" rows where case numbers are non zero
features = features.loc[features[col] != 0]
features = features.loc[features["location"] != "World"].dropna()
print(features.shape)

max_total = features[col].max()
min_total = features[col].min()

# Delete rows where there have been no cases in 24 hours AND no cases in 7 days
# Note that you must put each condition within parenthesis
# data = data.loc[(data["deaths_7_days"] > 0) & (data["deaths_24_hours"] > 0)]
# data.shape

(91808, 16)
(17759, 16)
3462431058.0


Unnamed: 0_level_0,iso_code,date,total_vaccinations,people_vaccinated,people_fully_vaccinated,total_boosters,daily_vaccinations_raw,daily_vaccinations,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,total_boosters_per_hundred,daily_vaccinations_per_million,daily_people_vaccinated,daily_people_vaccinated_per_hundred
location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Africa,OWID_AFR,2022-03-28,4.493217e+08,2.809913e+08,2.084275e+08,1.653224e+07,4169685.0,2949068.0,32.71,20.46,15.18,1.20,2147.0,1940354.0,0.141
Albania,ALB,2022-03-15,2.737859e+06,1.275907e+06,1.209791e+06,2.521610e+05,11284.0,10183.0,95.30,44.41,42.11,8.78,3544.0,3927.0,0.137
Argentina,ARG,2022-03-28,9.639951e+07,4.089994e+07,3.690642e+07,1.889703e+07,564530.0,448185.0,211.38,89.68,80.92,41.44,9827.0,320718.0,0.703
Asia,OWID_ASI,2022-03-28,7.673087e+09,3.462431e+09,3.123384e+09,1.035172e+09,36987971.0,33760230.0,164.01,74.01,66.76,22.13,7216.0,16269965.0,0.348
Australia,AUS,2022-03-27,5.610779e+07,2.219348e+07,2.115337e+07,1.276094e+07,351226.0,294264.0,217.57,86.06,82.03,49.48,11411.0,108417.0,0.420
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Upper middle income,OWID_UMC,2022-03-28,4.952664e+09,2.041224e+09,1.920964e+09,9.104451e+08,30754199.0,27402006.0,197.03,81.20,76.42,36.22,10901.0,10073059.0,0.401
Uruguay,URY,2022-03-28,8.017224e+06,2.972438e+06,2.841908e+06,2.202878e+06,63775.0,49499.0,230.04,85.29,81.54,63.21,14203.0,34775.0,0.998
Vietnam,VNM,2022-03-22,2.031444e+08,7.994719e+07,7.775411e+07,4.544308e+07,1345407.0,612180.0,206.93,81.44,79.20,46.29,6236.0,39004.0,0.040
Wales,OWID_WLS,2022-03-27,6.925183e+06,2.528051e+06,2.398676e+06,1.998456e+06,324429.0,61187.0,218.46,79.75,75.67,63.04,19302.0,2507.0,0.079


In [None]:
# Chloropleth
fig = px.choropleth(features, # dataset to use
                    locations="iso_code", # column which includes 3 letter country code
                    locationmode = "ISO-3",
                    color=col, # column which dictates the colour of the map
                    hover_name="location", # column to add to hover information
                    range_color=(min_total, max_total), #,# range of the colour scale
                    color_continuous_scale=["red","blue"] # colour scale (these can be predefined or you can create your own)
)
fig.show()

## People vaccinated per 100

In [None]:
# Features for chloropleth
features = df
print(features.shape)

col = "people_fully_vaccinated_per_hundred"

# Delete rows where case numbers are zero
# features = features.loc[features[col] != 0]

# Delete "World" entries, these cause anomalies 
features = features.loc[features["location"] != "World"]


def featurise(df, text_column_name):   
  def f(row):
    if ("OWID" not in row[text_column_name]):
      return row
    else:
      row[text_column_name] = None
      return row

  df = df.apply(f, axis=1)
  return df




features = featurise(features, "location")
# features = features.loc[features["iso_code"].find("OWID")]

features = features.loc[features["location"] != "Gibraltar"].dropna()
print(features.shape)




max_total = features[col].max()
min_total = features[col].min()
print(max_total)

test = features.groupby("location").agg("max")
test = test.sort_values("people_fully_vaccinated_per_hundred")
print(test)

# Delete rows where there have been no cases in 24 hours AND no cases in 7 days
# Note that you must put each condition within parenthesis
# data = data.loc[(data["deaths_7_days"] > 0) & (data["deaths_24_hours"] > 0)]
# data.shape

(91808, 16)
(17689, 16)
92.6
                      iso_code        date  total_vaccinations  \
location                                                         
Luxembourg                 LUX  2021-04-13            141441.0   
Low income            OWID_LIC  2022-03-27         136775532.0   
Kenya                      KEN  2022-03-19          17327143.0   
Africa                OWID_AFR  2022-03-28         449321666.0   
Libya                      LBY  2022-03-24           3415834.0   
...                        ...         ...                 ...   
United Arab Emirates       ARE  2021-11-25          21757061.0   
Chile                      CHL  2022-03-22          50302179.0   
Malta                      MLT  2022-03-27           1255418.0   
Singapore                  SGP  2022-03-21          13780673.0   
Portugal                   PRT  2022-03-10          22923599.0   

                      people_vaccinated  people_fully_vaccinated  \
location                                    

In [None]:
# Chloropleth
fig = px.choropleth(features, # dataset to use
                    locations="iso_code", # column which includes 3 letter country code
                    locationmode = "ISO-3",
                    color=col, # column which dictates the colour of the map
                    hover_name="location", # column to add to hover information
                    range_color=(min_total, max_total), #,# range of the colour scale
                    color_continuous_scale=["red","blue"] # colour scale (these can be predefined or you can create your own)
)
fig.show()