In [1]:
import re
import json
import pandas as pd
import numpy as np
from tweet import config
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from itertools import combinations

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import scipy.sparse as ssp
from nltk.stem import WordNetLemmatizer 

# Seed NumPy's global random generator so random draws repeat from run to run.
# NOTE(review): presumably this is what pins the `.sample(n=1)` tweet picks made
# later in the notebook (no random_state is passed there) — confirm.
np.random.seed(47)

In [2]:
# Load the topic-level tweet table, keep only rows that have a topic, and
# renumber the rows 0..n-1.
topics_csv = config.data / 'covid19_tweets_final_denormalized_topic.csv'
df = (
    pd.read_csv(topics_csv, index_col=0, encoding='utf-8')
    .dropna(subset=['topic'])
    .reset_index(drop=True)
)

# Signed sentiment: the score is negated when the tag is "NEGATIVE".
sentiment_sign = np.where(df['sentiment_tag_hf'] == "NEGATIVE", -1, 1)
df['senti'] = sentiment_sign * df['sentiment_score_hf']

# Parse the timestamp and derive the calendar month used for grouping.
df['date'] = pd.to_datetime(df['date'])
df['month'] = df['date'].dt.month

print(df.shape)
df.head()

(308065, 27)


Unnamed: 0,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,full_text,...,tweet_length,sentiment_tag_hf,sentiment_score_hf,sentiment_score_tb,subjectivity_score_tb,sentiment_score_nltk,id,topic,senti,month
0,Taya ❤,,I don't follow back,2011-10-01 05:55:50,283,0,8324,False,2020-01-27 00:24:01,Wildfires\nWar\nTaal Erruption\nJadine's break...,...,112,NEGATIVE,0.997167,0.0,0.0,-0.743,0,year grabe january,-0.997167,1
1,Taya ❤,,I don't follow back,2011-10-01 05:55:50,283,0,8324,False,2020-01-27 00:24:01,Wildfires\nWar\nTaal Erruption\nJadine's break...,...,112,NEGATIVE,0.997167,0.0,0.0,-0.743,0,coronavirus kobe,-0.997167,1
2,Eric Ng,Hong Kong,"Hong Kong-based senior business reporter, Sout...",2015-02-23 12:28:27,1000,191,1,True,2020-01-27 00:37:26,Wuhan virus shot in the arm for health care st...,...,102,NEGATIVE,0.999039,0.0,0.0,-0.0516,1,casinos hurt s,-0.999039,1
3,Eric Ng,Hong Kong,"Hong Kong-based senior business reporter, Sout...",2015-02-23 12:28:27,1000,191,1,True,2020-01-27 00:37:26,Wuhan virus shot in the arm for health care st...,...,102,NEGATIVE,0.999039,0.0,0.0,-0.0516,1,wuhan virus shot,-0.999039,1
4,Eric Ng,Hong Kong,"Hong Kong-based senior business reporter, Sout...",2015-02-23 12:28:27,1000,191,1,True,2020-01-27 00:37:26,Wuhan virus shot in the arm for health care st...,...,102,NEGATIVE,0.999039,0.0,0.0,-0.0516,1,health care stocks,-0.999039,1


In [3]:
def clean_text(text):
    """Return `text` reduced to its lemmatized word tokens, space-separated.

    Tokens are runs of two or more word characters; everything else
    (punctuation, single-character tokens) is dropped. Each token is passed
    through the module-level `lemmatizer`, which must be defined before this
    function is called.
    """
    words = re.findall('(?u)\\b\\w\\w+\\b', text)
    return ' '.join(lemmatizer.lemmatize(word) for word in words)

# Define constants for vectorizer
min_df = 3
max_df = 0.95
max_features = 20
ngram_range = (1, 2)

# Narrow down the range of topics
issue_list = ['evict', 'mental', 'depress', 'food', 'money', 'unemploy',
              'shut', 'bankrup', 'friend', 'credit', 'housing', 'medici', 'poverty']
# One alternation pattern, so each month needs a single pass over the topics
# instead of one pass per issue.
issue_pattern = '|'.join(map(re.escape, issue_list))

# clean_text() reads this module-level name; build it once rather than per month.
lemmatizer = WordNetLemmatizer()

# Gather info for each month. Keys are plain Python ints so that json.dump
# accepts them (NumPy integer keys are rejected).
month_dict = {int(m): 0 for m in df['month'].unique()}

# Get data for each month
for m in map(int, df['month'].unique()):
    # Select rows whose topic mentions at least one issue. The .copy() makes
    # df_m an independent frame, so adding a column below does not write into
    # a slice of df (SettingWithCopyWarning).
    df_m = df[df['month'] == m]
    df_m = df_m[df_m['topic'].str.contains(issue_pattern)].copy()
    print("Month:", m, ",Selected rows:", len(df_m))

    if df_m.empty:
        # Nothing to vectorize for this month.
        month_dict[m] = []
        continue

    # Lemmatize the texts
    df_m['clean_topic'] = df_m['topic'].apply(clean_text)

    # Apply Tfidf
    vec = TfidfVectorizer(
        stop_words='english',
        token_pattern='(?u)\\b\\w\\w+\\b',
        min_df=min_df,
        max_df=max_df,
        ngram_range=ngram_range,
        max_features=max_features,
        lowercase=False
    )

    # Calculate importance as the summed TF-IDF weight of each term.
    # `vocab` is ordered by matrix column, so position i describes column i.
    ct_matrix = vec.fit_transform(df_m['clean_topic'].tolist())
    vocab = list(pd.Series(vec.vocabulary_).sort_values().index)
    word_importance = np.asarray(ct_matrix.sum(axis=0)).ravel().tolist()
    import_df = pd.DataFrame({'vocab': vocab, 'importance': word_importance}).sort_values('importance', ascending=False)

    # Gather data for month
    month_data = []
    # sort_values keeps the original index labels, so `col` is still the
    # term's column in ct_matrix.
    for col, row in import_df.iterrows():
        term = row.vocab
        imp = float(row.importance)

        # Rows whose cleaned topic contains the term as a literal substring.
        matches = df_m[df_m['clean_topic'].str.contains(term, regex=False)]
        if matches.empty:
            # The vectorizer drops stop words *before* building bigrams, so a
            # bigram such as "food thought" (from "food for thought") never
            # occurs literally in the text. Fall back to the documents that
            # carry a non-zero weight for this term; min_df guarantees there
            # is at least one, so .sample() below cannot fail.
            doc_positions = ct_matrix[:, int(col)].nonzero()[0]
            matches = df_m.iloc[doc_positions]

        senti = float(matches['senti'].mean())
        sample_tweet = matches.sample(n=1)['full_text'].values[0]
        sample_tweet = sample_tweet.replace("\n", " ").replace("&amp;", " ").replace("\\", " ")
        month_data.append({'text': term, 'size': imp, 'senti': senti, 'sample_tweet': sample_tweet})
    month_dict[m] = month_data

Month: 1 ,Selected rows: 36
Month: 2 ,Selected rows: 109
Month: 3 ,Selected rows: 187
Month: 4 ,Selected rows: 186
Month: 5 ,Selected rows: 214
Month: 6 ,Selected rows: 602
Month: 7 ,Selected rows: 783
Month: 8 ,Selected rows: 803
Month: 9 ,Selected rows: 784


In [4]:
# Inspect the word-cloud entries gathered for month 9.
month_dict[9]

[{'text': 'money',
  'size': 127.0,
  'senti': -0.7555400570854545,
  'sample_tweet': 'It’s funny how instead of spending money to like actually combat COVID in local high risk areas by tighter lockdowns, they instead chose to spend it on these being put across the country... #CovidUK https://t.co/wFxDLKW8wi'},
 {'text': 'friend',
  'size': 86.18788025102144,
  'senti': -0.2993692050759609,
  'sample_tweet': "I feel outrage everyday for my American friends  but @senatemajldr stating Trump deserves applause for his COVID response is beyond anything I have heard next to Trump's Confession. It makes me shake.   Deaths equal to 911 every 4 days deserves Impeachment."},
 {'text': 'food',
  'size': 73.57409247262937,
  'senti': -0.4964671648001369,
  'sample_tweet': 'We have kept our food pantry doors open through out the pandemic, but food insecurity continues to rise. Some media outlets report that food insecurity has increased by over 120% in Chicago since COVID-19 hit. Unacceptable. #foo

In [5]:
# Save json to disk
save_path = config.data / 'word_cloud_data'
# Create the output folder (and any missing parents) in one call; exist_ok
# avoids the check-then-create race of a separate exists() test.
# NOTE(review): assumes config.data is a pathlib.Path — confirm.
save_path.mkdir(parents=True, exist_ok=True)
# json.dump writes the integer month keys of month_dict as strings.
with open(save_path / 'month_dict.json', 'w', encoding='utf-8') as outfile:
    json.dump(month_dict, outfile)