### Sentiment Analysis and Asian Americans

This analysis uses the [GetOldTweets3](https://github.com/Mottl/GetOldTweets3) package to search for and retrieve tweets that match given search parameters.

In [1]:
import re
import pickle
import pandas as pd
import GetOldTweets3 as got
from datetime import datetime
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# shared VADER analyzer instance; vader_compound() reads its polarity scores
analyzer = SentimentIntensityAnalyzer()

In [2]:
# helper functions
def clean_text(text):
    '''Remove links, user tags, and hashtag symbols from a tweet.

    Links are anything starting with http://, https://, ftp:// or "www."
    and running up to the next whitespace. User tags ("@name") are dropped
    entirely; for hashtags only the "#" symbol is removed and the word is
    kept. Leading and trailing whitespace is stripped from the result.
    '''
    # The dot after "www" is escaped so only a literal "www." starts a link;
    # unescaped it matched any character (e.g. the "wwwa" inside "awwward").
    cleaned = re.sub(r"(((http|https|ftp)://)|www\.)[^\s]+", "", text)
    cleaned = re.sub(r"((@[\w]+)|#)", "", cleaned).strip()
    return cleaned
        
def vader_compound(text):
    '''Score text with the shared VADER analyzer and return its "compound" value.'''
    scores = analyzer.polarity_scores(text)
    return scores['compound']


def covid_keywords(text):
    '''Tag a text with the first COVID-related keyword it contains.

    Matching is case-insensitive (the text is lower-cased first) and the
    patterns are tried in list order, so more specific patterns are listed
    before the shorter ones they contain (e.g. "covid-19" before "covid").

    Returns a dict of two keys of the form
    {"about_covid": 0 or 1, "covid_word": "NA" or keyword}.
    '''
    # (regex, label) pairs; "[-\s]*" allows an optional hyphen/space separator.
    keywords = [
        (r"kung[-\s]*flu", "kungflu"),
        # quantifier belongs on the separator, not on the trailing "s"
        (r"chink[-\s]*virus", "chinkvirus"),
        (r"china[-\s]*virus", "china-virus"),
        (r"chinese[-\s]*virus", "chinese-virus"),
        (r"sars[-\s]*cov[-\s]*2", "sars-cov-2"),
        (r"ncov", "ncov"),
        (r"covid[-\s]*19", "covid-19"),
        (r"covid", "covid"),
        (r"corona[-\s]*virus", "coronavirus"),
        (r"quarantin", "quarantine"),
        (r"corona", "corona"),
        (r"pandemic", "pandemic"),
        (r"social[-\s]*distanc", "social-distancing"),
        (r"flatten(\w)*[-\s]*the[-\s]*curve", "flatten-the-curve"),
    ]

    # lower-case once instead of on every loop iteration
    lowered = text.lower()
    for pattern, label in keywords:
        if re.search(pattern, lowered):
            return {"about_covid": 1, "covid_word": label}

    return {"about_covid": 0, "covid_word": "NA"}


def china_keywords(text):
    '''Tag a text with the first China/Asia-related keyword it contains.

    Matching is case-insensitive (the text is lower-cased first) and the
    patterns are tried in list order, so more specific patterns are listed
    before the shorter ones they contain (e.g. "chinatown" before "china").

    Returns a dict of two keys of the form
    {"about_china": 0 or 1, "china_word": "NA" or keyword}.
    '''
    # (regex, label) pairs; "[-\s]*" allows an optional hyphen/space separator.
    keywords = [
        (r"yellow[-\s]*peril", "yellow-peril"),
        (r"kung[-\s]*flu", "kungflu"),
        # quantifier belongs on the separator, not on the trailing "s";
        # otherwise the un-separated form falls through to the shorter keyword
        (r"chink[-\s]*virus", "chinkvirus"),
        (r"chink", "chink"),
        (r"china[-\s]*virus", "china-virus"),
        (r"chinese[-\s]*virus", "chinese-virus"),
        (r"wuhan", "wuhan"),
        (r"chinese[-\s]*america", "chinese-america"),
        (r"china[-\s]*town", "chinatown"),
        (r"china", "china"),
        (r"chinese", "chinese"),
        (r"asian[-\s]*america", "asian-america"),
        (r"asian", "asian"),
        (r"asia", "asia"),
    ]

    # lower-case once instead of on every loop iteration
    lowered = text.lower()
    for pattern, label in keywords:
        if re.search(pattern, lowered):
            return {"about_china": 1, "china_word": label}

    return {"about_china": 0, "china_word": "NA"}

In [4]:
# accumulator for collected tweets: a list with one info dict per tweet
twt_db = []
# search string and the maximum number of tweets requested per query
query = 'lang:en'
count = 100
# day stamps ("YYYY-MM-DD") for each month, running through the first day of
# the following month so that every day has a successor date
feb = pd.date_range(start="2020-02-01", end="2020-03-01").strftime("%Y-%m-%d").tolist()
march = pd.date_range(start="2020-03-01", end="2020-04-01").strftime("%Y-%m-%d").tolist()
april = pd.date_range(start="2020-04-01", end="2020-05-01").strftime("%Y-%m-%d").tolist()

def add_tweets(date_list):
    '''Query tweets for each adjacent pair of dates and append them to twt_db.

    date_list is a list of date strings (built above as "YYYY-MM-DD"); each
    date is used as the "since" bound with the next one as the "until" bound,
    so a list of n dates produces n-1 queries. Each query asks for at most
    `count` tweets matching `query`.

    For every tweet returned, one dict is appended to the module-level
    twt_db with the keys: id, tweet (cleaned text), date ("MM-DD-YYYY"),
    sentiment (VADER compound score), about_covid, covid_word, about_china,
    china_word, and link. Returns None.
    '''
    # Pair every date with its successor. The original body indexed an
    # undefined name `dates` here instead of the `date_list` parameter.
    for since, until in zip(date_list, date_list[1:]):
        tweetCriteria = (got.manager.TweetCriteria()
                         .setSince(since).setUntil(until)
                         .setMaxTweets(count).setQuerySearch(query))
        # list of tweets from this date window
        tweets = got.manager.TweetManager.getTweets(tweetCriteria)
        for twt in tweets:
            cleaned = clean_text(twt.text)
            entry = {
                "id": twt.id,
                "tweet": cleaned,
                "date": twt.date.strftime("%m-%d-%Y"),
                "sentiment": vader_compound(cleaned),
            }
            entry.update(covid_keywords(cleaned))
            entry.update(china_keywords(cleaned))
            entry["link"] = twt.permalink
            twt_db.append(entry)

In [5]:
# collect tweets for every day in the feb date list into twt_db
add_tweets(feb)

In [6]:
# collect tweets for every day in the march date list into twt_db
add_tweets(march)

In [7]:
# collect tweets for every day in the april date list into twt_db
add_tweets(april)

In [8]:
# save data into a tab-separated txt file for R processing
# column order of the output file; each name is also a key of the tweet dicts
tsv_columns = ["id", "tweet", "date", "sentiment", "about_covid", "covid_word",
               "about_china", "china_word", "link"]
with open("all_tweets.txt", "w", encoding='utf-8') as out:
    print("\t".join(tsv_columns), file=out)
    for twt in twt_db:
        # The original line had an unclosed `str(` that swallowed every
        # argument (a SyntaxError); print() already stringifies each field.
        # NOTE: fields are written verbatim, so a tweet that itself contains
        # a tab or newline will break its row in the output file.
        print(*(twt[col] for col in tsv_columns), sep="\t", end="\n", file=out)

# save list of dicts and the dataframe with pickle
with open('tweet_dict.pickle', 'wb') as p_out:
    pickle.dump(twt_db, p_out)

df = pd.DataFrame(twt_db)
df.to_pickle('tweet_df.pickle')

In [11]:
# check the type of a stored tweet id (the recorded cell output is `str`)
type(twt_db[1]['id'])

str

In [None]:
# with open('tweet_dict.pickle', 'rb') as p_in:
    #tweet_dict = pickle.load(p_in)
# with open('tweet_df.pickle', 'rb') as p_in:
    #tweet_df = pickle.load(p_in)

In [None]:
# NOTE: GetOldTweets3's TweetCriteria also has a setTopTweets(...) option; it is not used in the cells above.

# TESTER CODE!

In [None]:
# scratch check: write one line to a file opened with an explicit utf-8 encoding
with open("test.txt", mode="w", encoding='utf-8') as handle:
    handle.write("what is this utf-8 testing about?" + "\n")

In [None]:
# scratch check of the COVID keyword patterns against a sample string
# (regex, label) pairs, tried in order; the first match wins
keywords = [
    (r"kung[-\s]*flu", "kungflu"),
    # quantifier moved onto the separator (it was on the trailing "s"),
    # so the un-separated form now matches too
    (r"chink[-\s]*virus", "chinkvirus"),
    (r"china[-\s]*virus", "china-virus"),
    (r"chinese[-\s]*virus", "chinese-virus"),
    (r"sars[-\s]*cov[-\s]*2", "sars-cov-2"),
    (r"ncov", "ncov"),
    (r"covid", "covid"),
    (r"corona[-\s]*virus", "coronavirus"),
    (r"quarantin", "quarantine"),
    (r"corona", "corona"),
    (r"pandemic", "pandemic"),
    (r"social[-\s]*distanc", "social-distancing"),
    (r"flatten(\w)*[-\s]*the[-\s]*curve", "flatten-the-curve"),
]
text = "flattening the  curve"
lowered = text.lower()
for pattern, label in keywords:
    if re.search(pattern, lowered):
        print({"about_covid": 1, "covid_word": label})
        break

In [None]:
# search string and maximum number of tweets for this sample query
query = 'lang:en'
count = 100
# criteria object: tweets matching `query` from 2020-03-01 until 2020-03-02
tweetCriteria = (
    got.manager.TweetCriteria()
    .setSince("2020-03-01")
    .setUntil("2020-03-02")
    .setMaxTweets(count)
    .setQuerySearch(query)
)
# run the search and keep the returned tweets
tweets = got.manager.TweetManager.getTweets(tweetCriteria)

In [None]:
# build one info dict per fetched tweet and collect them in a list
twt_db = []
for twt in tweets:
    cleaned = clean_text(twt.text)
    # timestamp including the time of day
    date = twt.date.strftime("%m-%d-%Y %H:%M:%S")
    entry = {
        "id": twt.id,
        "tweet": cleaned,
        "date": date,
        "sentiment": vader_compound(cleaned),
        "link": twt.permalink,
    }
    twt_db.append(entry)
    
    

In [None]:
# dump text, date and sentiment of every stored tweet, tab-separated,
# with a trailing blank-line field to space the entries apart
for record in twt_db:
    fields = (record["tweet"], record['date'], record['sentiment'], "\n\n")
    print(*fields, sep="\t")

In [None]:
# write the sample tweets to a tab-separated file; utf-8 is requested
# explicitly (as in the main save cell) so non-ASCII tweet text does not
# depend on the platform's default encoding
with open("tweets.txt", "w", encoding='utf-8') as out:
    print("id\ttweet\tdate\tsentiment\tlink", file=out)
    for twt in twt_db:
        print(twt["id"], twt["tweet"], twt["date"], twt["sentiment"], twt["link"],
              sep="\t", end="\n", file=out)

# TODO: this cell originally ended with an unfinished `with open('tweet')`
# statement (a SyntaxError); restore it once the intended file name and body
# are known.


In [None]:
# scratch check: how DataFrame handles a list of dicts where one dict lacks a key
testdict = [
    dict(name="vanessa", school="harvard", age=19),
    dict(school="harvard", age=19),
]
df = pd.DataFrame(testdict)
df