# Import libriaries, connect to twitter API

In [1]:
import tweepy as tw
import numpy as np
import pandas as pd
import time
import random

from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.decomposition import NMF, TruncatedSVD, LatentDirichletAllocation

from geopy.geocoders import Nominatim
import gmplot

pd.set_option("max_colwidth", 500)

In [2]:
#my key & secret
consumer_key=  ********
consumer_secret=  ********
access_token_key=  ********
access_token_secret=  ********

In [3]:
auth = tw.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token_key, access_token_secret)
api = tw.API(auth)

In [4]:
res = []
for i in range(10):  #50 times, 100 tweets each
    tweets = tw.Cursor(api.search,
                  q='vaccine-filter:retweets',
                  lang="en",
                  tweet_mode='extended').items(500)

    tweets_list = [[tweet.id, tweet.user.id, tweet.created_at, 
                    tweet.user.location, tweet.full_text] for tweet in tweets]
    res.extend(tweets_list)
    time.sleep(300+5*random.random()) #every 5 miutes
    print(i, 'done')
    
tweets_df = pd.DataFrame(res,columns=['Tweet_id', 'User_id', 'TweetAt', 'Location', 'OriginalTweet'])

In [5]:
#check mark
tweets_df.to_csv("data/new_tweets_May_26_2021.csv")

In [7]:
#read csv file
new_tweets = pd.read_csv("data/new_tweets_May_26_2021.csv", index_col = 0)
new_tweets = new_tweets.drop_duplicates(subset=['Tweet_id'])
new_tweets.drop_duplicates(subset=['OriginalTweet'], inplace = True)

In [8]:
#only select English tweets for analysis
def isEnglish(s):
    try:
        s.encode(encoding='utf-8').decode('ascii')
    except UnicodeDecodeError:
        return False
    else:
        return True
new_tweets = new_tweets[new_tweets.OriginalTweet.apply(isEnglish)]
new_tweets.shape #2483 English tweets

(2483, 5)

In [9]:
geolocator = Nominatim(user_agent = "geoapiExercises")

# Go through all tweets and add locations to 'coordinates' dictionary
coordinates = {'latitude': [], 'longitude': []}
for count, user_loc in enumerate(new_tweets.sample(500).Location):
    try:
        location = geolocator.geocode(user_loc)
        
        # If coordinates are found for location
        if location:
            coordinates['latitude'].append(location.latitude)
            coordinates['longitude'].append(location.longitude)
            
    # If too many connection requests
    except:
        pass
    
# Instantiate and center a GoogleMapPlotter object to show our map
gmap = gmplot.GoogleMapPlotter(30, 0, 3)

# Insert points on the map passing a list of latitudes and longitudes
gmap.heatmap(coordinates['latitude'], coordinates['longitude'], radius=20)

# Save the map to html file
gmap.draw("visualization/Coronavirus_May26_2021.html")

### Extract Country information from messy Location column.

In [10]:
def country_convert(loc):
    try:
        geolocator = Nominatim(user_agent = 'myencoder')#"geoapiExercises"
        location = geolocator.geocode(loc)
        return location.address.split(',')[-1].lstrip()
    except AttributeError:
        # print("Unrecognizable: ", loc)
        # time.sleep(5)
        return 'N/A'    
    except:
        pass

In [11]:
new_tweets['Country'] = new_tweets['Location'].apply(country_convert)

In [12]:
new_tweets.to_csv("data/new_tweets_wlocation_May_26_2021.csv")
new_tweets = pd.read_csv("data/new_tweets_wlocation_May_26_2021.csv", index_col = 0)

In [13]:
# Country counts - not much data from UK because the time I collect data is around 7pm CST which is around 1am in UK
new_tweets.Country.value_counts() 

Italia             760
United States      693
Australia          209
Canada             209
United Kingdom      96
                  ... 
Zimbabwe             1
Perú                 1
Barbados             1
North America        1
Κύπρος - Kıbrıs      1
Name: Country, Length: 77, dtype: int64

# Step 2: Text Preprocessing(TF-IDF: tokenization)

### standardizing text

In [15]:
def standardize_text(text):
    text = text.replace(r"http\S+", "")
    text = text.replace(r"http", "")
    text = text.replace(r"@\S+", "")
    text = text.replace(r"[^A-Za-z0-9(),!?@\'\`\"\_\n]", " ")
    text = text.replace(r"@", "at")
    text = text.lower()
    return text

In [16]:
new_tweets['Clean_Tweet'] = new_tweets['OriginalTweet'].apply(standardize_text)

In [17]:
corpus = new_tweets['Clean_Tweet']

stop_words = ENGLISH_STOP_WORDS.union(['Covid', '19', 'Covid19', 'coronavirus', 'covid', 'covid19', 'covid_19'])
vectorizer = TfidfVectorizer(stop_words=stop_words) #(stop_words = 'english')

tweet_word_matrix = vectorizer.fit_transform(corpus) #sparse matrix
vocab = vectorizer.get_feature_names()  #list of all the unique values/words

In [18]:
new_tweets_clean = new_tweets[['Location', 'OriginalTweet','TweetAt', 'Clean_Tweet','Country']] #, 'Country'

# Step 3: NMF Topic Modeling

### Tweet-Topic Matrix(10 components)

In [19]:
nmf_10 = NMF(n_components=10)  #top 10 topic
tweet_topic_matrix = nmf_10.fit_transform(tweet_word_matrix) 

tweet_topic_matrix_df = pd.DataFrame(tweet_topic_matrix, columns = ["topic_1","topic_2", "topic_3","topic_4", 
                      "topic_5","topic_6","topic_7","topic_8", "topic_9", "topic_10"])

tweet_topic_matrix_df[['OriginalTweet', 'Clean_Tweet', 'TweetAt','Country']] = new_tweets_clean.reset_index()[['OriginalTweet', 'Clean_Tweet','TweetAt','Country']]
tweet_topic_matrix_df.head()



Unnamed: 0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,topic_10,OriginalTweet,Clean_Tweet,TweetAt,Country
0,0.009245,0.0,0.047754,0.0,0.017704,0.014144,0.0,0.066847,0.002193,0.003454,"@DaronKnol @ronnoconiv @patriottakes I think the fact that Covid is not the Black Plague is what will stop them. If 2/3 of people who got Covid died, then vaccines would be made mandatory.\n\nCovid is simply not deathly enough to force unwilling Americans to get injections of a new vaccine.","atdaronknol atronnoconiv atpatriottakes i think the fact that covid is not the black plague is what will stop them. if 2/3 of people who got covid died, then vaccines would be made mandatory.\n\ncovid is simply not deathly enough to force unwilling americans to get injections of a new vaccine.",2021-05-26 22:50:22,United States
1,0.010963,0.0,0.0,0.0,0.0,0.000265,0.082131,0.0,0.0,0.0,@jacds68 @AuthenticVegas @BNODesk Is that why they have such a slow vaccine rollout?,atjacds68 atauthenticvegas atbnodesk is that why they have such a slow vaccine rollout?,2021-05-26 22:50:20,Italia
2,0.042257,0.0,0.0,0.0,0.00063,0.0,0.0,0.0,0.0,6.3e-05,Coronavirus: Influencers offered money to badmouth Pfizer vaccine https://t.co/6GXBw1k6v5,coronavirus: influencers offered money to badmouth pfizer vaccine s://t.co/6gxbw1k6v5,2021-05-26 22:50:19,United States
3,0.028503,0.0,0.028908,0.0,0.0,0.0,0.018887,0.024351,0.0,0.0,@shahid_siddiqui Oh Haan your leader Akhilesh said it's a BJP vaccine now why does he want it...Stop this shit we are not fools and we also know if it was any other government we would have struggled to get even one dose.....India will win under him...not by your hate against him..,atshahid_siddiqui oh haan your leader akhilesh said it's a bjp vaccine now why does he want it...stop this shit we are not fools and we also know if it was any other government we would have struggled to get even one dose.....india will win under him...not by your hate against him..,2021-05-26 22:50:19,Italia
4,0.020996,0.0,0.0,0.0,0.0,0.0,0.0,0.01085,0.0,0.0,"@KatiePavlich But y'all fine to inhale it, not prevent it's spread with a piece of cloth and deny the vaccine. \nWho's stupid now?","atkatiepavlich but y'all fine to inhale it, not prevent it's spread with a piece of cloth and deny the vaccine. \nwho's stupid now?",2021-05-26 22:50:18,Italia


### Have a look at top 6 words for 10 topics to understand the topics

In [20]:
def get_top_topic_words(nmf):
    topic_words = [] # list of lists, one for each topic

    ### find the top 6 word weights for each topic component,
    ### using terms to create a corresponding list of the top 8 words. 
    ### Add each list to topic_words 
    for topic in nmf.components_:  # for each components
        top_terms = topic.argsort()[-6:]  #argsort: only asending order. top 8 words index for later to locate #a array
        topic_words.append([vocab[top_term] for top_term in top_terms])  #use position to get word out from terms
                                                                         #list of list

    return topic_words

In [21]:
get_top_topic_words(nmf_10)
# Topic1: 'pfizer vaccine'
# Topic2: (#need more investigation here, and go back to text cleaning phase to remove the time slots. )
# Topic3: 'people are receiving second dose of the vaccine'
# Topic4: news - 'Tennessee anti-vaxxer arrested after deliberately plowing car through vaccination site '
# Topic5: 'new appointment availible at pharmary(vaccine)'
# Topic6: news - 'LEAKED INTERNAL DOCS(fb)'
# Topic7: Breaking News- 'COVID OUTBREAK IN MELBOURNE, Australia'
# Topic8: still about vaccine
# Topic9: "side effects of vaccine"
# Topic10: vaccine

[['atcatsandbuds420', 'pfizer', 'like', 'getting', 'just', 'vaccine'],
 ['03', '01', 'pune', '11', '00am', '00pm'],
 ['shot', 'today', 'vaccine', 'second', 'dose', 'got'],
 ['site', 'arrested', 'deliberately', 'plowing', 'tennessee', 'anti'],
 ['05', 'registration', 'available', '26', 'appointments', 'new'],
 ['global', 'concerns', 'atfacebook', 'secretly', 'censor', 'exposefacebook'],
 ['auspol', 'government', 'facilities', 'federal', 'rollout', 'quarantine'],
 ['vaccine', 'need', 'want', 'vaccinated', 'don', 'people'],
 ['dies', 'linked', 'rare', 'strokes', 'clots', 'astrazeneca'],
 ['health', 'county', 'second', 'administered', 'cases', 'doses']]

### Topic Interpretation(10 components)

In [22]:
words_topic_df = pd.DataFrame(nmf_10.components_.round(5),
             index = ["topic_1","topic_2", "topic_3","topic_4", 
                      "topic_5","topic_6","topic_7","topic_8", "topic_9", "topic_10"],
             columns = vectorizer.get_feature_names()).T
def topic_check(topic_name, n = 6):
    print(words_topic_df.sort_values(by=topic_name, ascending=False)[topic_name].head(6))  
    print('\n')
    print(tweet_topic_matrix_df.sort_values(by=topic_name, ascending=False)['OriginalTweet'].head(n))

In [23]:
topic_check('topic_8', 5)

people        1.65559
don           0.43263
vaccinated    0.39031
want          0.36502
need          0.17682
vaccine       0.16785
Name: topic_8, dtype: float64


1175    @stephens_ben @shawna_burley @nickiclyne Probably the point that the vaccinated people can sit next to the unvaccinated people and seeing how the vaccine doesn't prevent the virus but reduces the effects of it then vaccinated people can be carriers just like unvaccinated people. So what difference does it make?
888                                       @neontaster So what?  Nobody should be forced to take any vaccine.  Calling people anti-vaxx is just a way of vilifying them.  Besides, by now, people who want/need the vaccine have gotten it.  It doesn't matter that some people will not get the vaccine; don't worry about them.
2276                                                                                                                                                                                              

### Visualization - scattertext

### comparison between USA & Italia

In [24]:
import scattertext as st

In [25]:
corpus = st.CorpusFromPandas(tweet_topic_matrix_df[(tweet_topic_matrix_df['Country']== 'United States') | (tweet_topic_matrix_df['Country']== 'Italia')], #dataframe
                             category_col='Country',  #distinct category
                             text_col='Clean_Tweet',
                             nlp=st.whitespace_nlp_with_sentences
                            ).build()

In [26]:
html = st.produce_scattertext_explorer(
        corpus,
        category='United States',
        category_name='United States',
        not_category_name='Italia',
        minimum_term_frequency=10,
        pmi_threshold_coefficient=5,
        width_in_pixels=1000
        )

In [27]:
open('comparison USA vs.Italia-May_26_2021.html', 'wb').write(html.encode('utf-8'));

### Important words comparison based on scattertext: 
* USA: 'appointments'(56 vs. 8 per 1000 tweets), 'avaliable'(55 vs. 9) ...
* Italia: 'vaccine rollout' (0 vs. 34 per 1000 tweets)quantine'(0 vs. 25), 'federal goverment'(3 vs. 20), 'astrazeneca'(3 vs. 16) ...
* sharing words: 'mask', 'side effects', 'vaccines'...