In [4]:
import pandas as pd
import numpy as np

# location processing
from geotext import GeoText

# word processing
from textblob import TextBlob
import nltk

# misc
from collections import Counter
from datetime import datetime
import ast

# parallel processing
import dask.dataframe as dd
from dask.multiprocessing import get

### Merge data from all months

In [16]:
# merge all month data
data_path = 'tweet_data'
files = os.listdir(data_path)
all_dfs = []
for i,f in enumerate(files):
    path = os.path.join(data_path, f)
    curr_df = pd.read_csv(path, encoding = "ISO-8859-1")
    all_dfs.append(curr_df)
all_dfs
big_df = pd.concat(all_dfs)
big_df.to_csv('other_data/all_tweets.csv')

In [17]:
big_df.shape

(131187, 15)

### Adding location columns

In [60]:
data = pd.read_csv('tweet_data/tweets_2020_08.csv')
data.head()

Unnamed: 0,tweet_id,location,date,likes,retweets,text
0,1298433064687763457,"Brooklyn, NY",2020-08-26,1458,861,Wearing a mask in public is an important way t...
1,1294720321392541696,"Sonoma, CA",2020-08-15,12739,4537,Canada shares a 5500 mile border with the US. ...
2,1296797780179144704,"Oakland, NJ",2020-08-21,310,58,The Braves and Nats got rained out yesterday. ...
3,1296941201111879692,"Fort Wayne, IN",2020-08-21,2,0,Purdue is kicking out anyone who dont go by so...
4,1289823031599087616,"Los Angeles, CA",2020-08-02,1979,695,The day camp my son went to was limited to 60 ...


In [25]:
state_abbrs = [ 'AK', 'AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'GA',
           'HI', 'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'ME',
           'MI', 'MN', 'MO', 'MS', 'MT', 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM',
           'NV', 'NY', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX',
           'UT', 'VA', 'VT', 'WA', 'WI', 'WV', 'WY']
states = ["Alaska", "Alabama", "Arkansas", "American Samoa", 
            "Arizona", "California", "Colorado", "Connecticut", "District of Columbia",
            "Delaware", "Florida", "Georgia", "Guam", "Hawaii", "Iowa", "Idaho", 
            "Illinois", "Indiana", "Kansas", "Kentucky", "Louisiana", "Massachusetts", 
            "Maryland", "Maine", "Michigan", "Minnesota", "Missouri", "Mississippi", 
            "Montana", "North Carolina", "North Dakota", "Nebraska", "New Hampshire", "New Jersey", 
            "New Mexico", "Nevada", "New York", "Ohio", "Oklahoma", "Oregon", "Pennsylvania", 
            "Puerto Rico", "Rhode Island", "South Carolina", "South Dakota", "Tennessee", "Texas", 
            "Utah", "Virginia", "Virgin Islands", "Vermont", "Washington", "Wisconsin", "West Virginia", 
            "Wyoming"]
us_state_to_abbrev = {
    "Alabama": "AL",
    "Alaska": "AK",
    "Arizona": "AZ",
    "Arkansas": "AR",
    "California": "CA",
    "Colorado": "CO",
    "Connecticut": "CT",
    "Delaware": "DE",
    "Florida": "FL",
    "Georgia": "GA",
    "Hawaii": "HI",
    "Idaho": "ID",
    "Illinois": "IL",
    "Indiana": "IN",
    "Iowa": "IA",
    "Kansas": "KS",
    "Kentucky": "KY",
    "Louisiana": "LA",
    "Maine": "ME",
    "Maryland": "MD",
    "Massachusetts": "MA",
    "Michigan": "MI",
    "Minnesota": "MN",
    "Mississippi": "MS",
    "Missouri": "MO",
    "Montana": "MT",
    "Nebraska": "NE",
    "Nevada": "NV",
    "New Hampshire": "NH",
    "New Jersey": "NJ",
    "New Mexico": "NM",
    "New York": "NY",
    "North Carolina": "NC",
    "North Dakota": "ND",
    "Ohio": "OH",
    "Oklahoma": "OK",
    "Oregon": "OR",
    "Pennsylvania": "PA",
    "Rhode Island": "RI",
    "South Carolina": "SC",
    "South Dakota": "SD",
    "Tennessee": "TN",
    "Texas": "TX",
    "Utah": "UT",
    "Vermont": "VT",
    "Virginia": "VA",
    "Washington": "WA",
    "West Virginia": "WV",
    "Wisconsin": "WI",
    "Wyoming": "WY",
    "District of Columbia": "DC",
    "American Samoa": "AS",
    "Guam": "GU",
    "Northern Mariana Islands": "MP",
    "Puerto Rico": "PR",
    "United States Minor Outlying Islands": "UM",
    "U.S. Virgin Islands": "VI",
}
    
# invert the dictionary
abbrev_to_us_state = dict(map(reversed, us_state_to_abbrev.items()))
names_of_us = ['us','usa','united states','united states of america']

In [25]:
# testing geotext
places = GeoText('California, USA')
places.cities

[]

In [44]:
# function to add standardized location columns
def standardize_location(row):
      if row['index'] % 100 == 0:
            print('row ' + str(row['index']))
      # row['lower_location'] = row.location.lower()
      city = GeoText(row.location).cities # check if city in geotexts list of cities
      location_split = row.location.split(', ')
      if len(location_split) > 1:
            if location_split[1].lower() in names_of_us: # lower() for Usa, USA, etc.
                  row['city'] = 'Not City'
                  row['state'] = location_split[0]
                  row['is_state'] = True
            elif location_split[1].upper() in state_abbrs:
                  row['city'] = location_split[0]
                  row['state'] = abbrev_to_us_state.get(location_split[1].upper()) # upper() handles cases like: Ca, or tx
                  row['is_state'] = False
            elif city:
                  row['city'] = city[0]
                  row['state'] = location_split[1]
                  row['is_state'] = False
            else:
                  row['city'] = 'Not City'
                  row['state'] = location_split[0]
                  row['is_state'] = True
      else:
            row['city'] = 'Not City'
            row['state'] = location_split[0]
            row['is_state'] = False
      row['state_abbr'] = us_state_to_abbrev.get(row['state'])
      try:
            row['city_and_state'] = row['city'] + ', ' +  row['state']
      except:
            print(row, row['city'], row['state'], row['state_abbr'])
      return row

In [45]:
df_all = pd.read_csv('other_data/all_tweets.csv')
df_all['index'] = range(1, len(df_all) + 1)
df_all2 = df_all.apply(lambda row: standardize_location(row),axis=1)
df_all2.head()

row 100
row 200
row 300
row 400
row 500
row 600
row 700
row 800
row 900
row 1000
row 1100
row 1200
row 1300
row 1400
row 1500
row 1600
row 1700
row 1800
row 1900
row 2000
row 2100
row 2200
row 2300
row 2400
row 2500
row 2600
row 2700
row 2800
row 2900
row 3000
row 3100
row 3200
row 3300
row 3400
row 3500
row 3600
row 3700
row 3800
row 3900
row 4000
row 4100
row 4200
row 4300
row 4400
row 4500
row 4600
row 4700
row 4800
row 4900
row 5000
row 5100
row 5200
row 5300
row 5400
row 5500
row 5600
row 5700
row 5800
row 5900
row 6000
row 6100
row 6200
row 6300
row 6400
row 6500
row 6600
row 6700
row 6800
row 6900
row 7000
row 7100
row 7200
row 7300
row 7400
row 7500
row 7600
row 7700
row 7800
row 7900
row 8000
row 8100
row 8200
row 8300
row 8400
row 8500
row 8600
row 8700
row 8800
row 8900
row 9000
row 9100
row 9200
row 9300
row 9400
row 9500
row 9600
row 9700
row 9800
row 9900
row 10000
row 10100
row 10200
row 10300
row 10400
row 10500
row 10600
row 10700
row 10800
row 10900
row 11000
row 1110

Unnamed: 0,tweet_id,location,date,likes,retweets,text,index,city,state,is_state,state_abbr,city_and_state
0,1.25e+18,"Pomona, CA",4/4/2020,89921,34665,A day without Hispanics .... this country coul...,1,Pomona,California,False,CA,"Pomona, California"
1,1.25e+18,"Santa Barbara, CA",4/21/2020,34308,5245,XO face masks are back to raise money and Iâ...,2,Santa Barbara,California,False,CA,"Santa Barbara, California"
2,1.25e+18,"California, USA",4/21/2020,0,0,@ImKingGinger Well aren't you special? My rela...,3,Not City,California,True,CA,"Not City, California"
3,1.25e+18,"Arizona, USA",4/19/2020,4,0,@seanhannity If I was to stay home and eat ice...,4,Not City,Arizona,True,AZ,"Not City, Arizona"
4,1.25e+18,"Medford, MA",4/6/2020,0,0,"@MisagaLion @tedpanos Yup, CT. Governor spoke ...",5,Medford,Massachusetts,False,MA,"Medford, Massachusetts"


### Join mask data with twitter data

In [46]:
mask_data = pd.read_csv('mask_mandate_no_na_smaller.csv')
mask_data_sm = mask_data.iloc[:,[1,5,7]]
mask_data_sm = mask_data_sm.drop_duplicates()
mask_data_sm.head()

Unnamed: 0,State_Tribe_Territory,date,Face_Masks_Required_in_Public
0,AL,4/10/2020,No
1,AL,4/11/2020,No
2,AL,4/12/2020,No
3,AL,4/13/2020,No
4,AL,4/14/2020,No


In [32]:
mask_data_sm.loc[mask_data_sm.date == '8/17/2020',:].State_Tribe_Territory.unique()

array(['AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'DC', 'FL', 'GA',
       'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD', 'MA',
       'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ', 'NM', 'NY',
       'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX',
       'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY', 'AS', 'GU', 'MP', 'PR',
       'VI'], dtype=object)

In [47]:
# df_all = pd.read_csv('tweet_data/model_test_data.csv')
# FOUND ISSUE: tweet data's date column has different formats Y-M-D and M/D/Y
# change all to /
df_all2['date'] = pd.to_datetime(df_all2['date']).dt.strftime('%#m/%#d/%Y')

In [48]:
df_all2.head()

Unnamed: 0,tweet_id,location,date,likes,retweets,text,index,city,state,is_state,state_abbr,city_and_state
0,1.25e+18,"Pomona, CA",4/4/2020,89921,34665,A day without Hispanics .... this country coul...,1,Pomona,California,False,CA,"Pomona, California"
1,1.25e+18,"Santa Barbara, CA",4/21/2020,34308,5245,XO face masks are back to raise money and Iâ...,2,Santa Barbara,California,False,CA,"Santa Barbara, California"
2,1.25e+18,"California, USA",4/21/2020,0,0,@ImKingGinger Well aren't you special? My rela...,3,Not City,California,True,CA,"Not City, California"
3,1.25e+18,"Arizona, USA",4/19/2020,4,0,@seanhannity If I was to stay home and eat ice...,4,Not City,Arizona,True,AZ,"Not City, Arizona"
4,1.25e+18,"Medford, MA",4/6/2020,0,0,"@MisagaLion @tedpanos Yup, CT. Governor spoke ...",5,Medford,Massachusetts,False,MA,"Medford, Massachusetts"


In [49]:
# merge with mask mandate data
# mask_data_sm.iloc[:,0].unique() # no NA states
# mask_data_sm.iloc[:,2].unique() # no NA mandates, (all no, yes) 
# print(len(mask_data_sm.index),len(df_all.index)) # 27807 29053
# state_abbrs_freq = Counter(df_all['state_abbr']) # only 1171 none -> problem w join
# state_abbrs_freq
# mask_data_sm.head()
merged = df_all2.merge(mask_data_sm, how='left', 
                        left_on=['date','state_abbr'], 
                        right_on = ['date','State_Tribe_Territory'])
merged.head()

Unnamed: 0,tweet_id,location,date,likes,retweets,text,index,city,state,is_state,state_abbr,city_and_state,State_Tribe_Territory,Face_Masks_Required_in_Public
0,1.25e+18,"Pomona, CA",4/4/2020,89921,34665,A day without Hispanics .... this country coul...,1,Pomona,California,False,CA,"Pomona, California",,
1,1.25e+18,"Santa Barbara, CA",4/21/2020,34308,5245,XO face masks are back to raise money and Iâ...,2,Santa Barbara,California,False,CA,"Santa Barbara, California",CA,No
2,1.25e+18,"California, USA",4/21/2020,0,0,@ImKingGinger Well aren't you special? My rela...,3,Not City,California,True,CA,"Not City, California",CA,No
3,1.25e+18,"Arizona, USA",4/19/2020,4,0,@seanhannity If I was to stay home and eat ice...,4,Not City,Arizona,True,AZ,"Not City, Arizona",AZ,No
4,1.25e+18,"Medford, MA",4/6/2020,0,0,"@MisagaLion @tedpanos Yup, CT. Governor spoke ...",5,Medford,Massachusetts,False,MA,"Medford, Massachusetts",,


In [50]:
merged.to_csv('merged_test_data.csv',index=False)

In [8]:
# try text blob
data.text
for text in data.text[:5]:
    test_blob = TextBlob(text)
    print(test_blob.sentiment)

Sentiment(polarity=0.0, subjectivity=0.0)
Sentiment(polarity=0.0, subjectivity=0.0)
Sentiment(polarity=0.22809523809523807, subjectivity=0.6442857142857142)
Sentiment(polarity=0.26666666666666666, subjectivity=0.662962962962963)
Sentiment(polarity=-0.053787878787878794, subjectivity=0.5936868686868687)


### Add column to indicate whether mask mentioned

In [2]:
all_data = pd.read_csv('merged_data_with_sentiment_and_CA_LA.csv')
all_data.head()

Unnamed: 0,tweet_id,location,date,likes,retweets,text,index,city,state,is_state,state_abbr,city_and_state,State_Tribe_Territory,Face_Masks_Required_in_Public,Polarity,Subjectivity,CA,LA
0,1.25e+18,"Pomona, CA",4/4/2020,89921,34665,A day without Hispanics .... this country coul...,1,Pomona,California,False,CA,"Pomona, California",,,0.0,0.0,Yes,No
1,1.25e+18,"Santa Barbara, CA",4/21/2020,34308,5245,XO face masks are back to raise money and Iâ...,2,Santa Barbara,California,False,CA,"Santa Barbara, California",CA,No,0.0,0.0,Yes,No
2,1.25e+18,"California, USA",4/21/2020,0,0,@ImKingGinger Well aren't you special? My rela...,3,Not City,California,True,CA,"Not City, California",CA,No,0.228095,0.644286,Yes,No
3,1.25e+18,"Arizona, USA",4/19/2020,4,0,@seanhannity If I was to stay home and eat ice...,4,Not City,Arizona,True,AZ,"Not City, Arizona",AZ,No,0.266667,0.662963,No,No
4,1.25e+18,"Medford, MA",4/6/2020,0,0,"@MisagaLion @tedpanos Yup, CT. Governor spoke ...",5,Medford,Massachusetts,False,MA,"Medford, Massachusetts",,,-0.053788,0.593687,No,No


In [53]:
def tokenize_tweet(row):
    tokens = nltk.word_tokenize(row['text'])
    # tokens = nltk.word_tokenize(row)
    i_offset = 0
    for i, token in enumerate(tokens):
        i -= i_offset
        if token == '@' and i < len(tokens)-1: # if last token @, skip
            if i == 0:
                left = []
            else:
                left = tokens[:i-1] # words before @
            joined = [token + tokens[i + 1]] # @ plus handle
            right = tokens[i + 2:]
            tokens = left + joined + right
            i_offset += 1 # offset index by one from joining
    row['tokens'] = tokens
    return row
    # return tokens

In [65]:
# first tokenize text (with parallel processing)
all_ddata = dd.from_pandas(all_data, npartitions=15)
all_ddata = all_ddata.map_partitions(lambda df: df.apply((lambda row: tokenize_tweet(row)), axis=1)).compute(scheduler='threads')
all_ddata.head()

Unnamed: 0,tweet_id,location,date,likes,retweets,text,index,city,state,is_state,state_abbr,city_and_state,State_Tribe_Territory,Face_Masks_Required_in_Public,Polarity,Subjectivity,CA,LA,tokens
0,1.25e+18,"Pomona, CA",4/4/2020,89921,34665,A day without Hispanics .... this country coul...,1,Pomona,California,False,CA,"Pomona, California",,,0.0,0.0,Yes,No,"[A, day, without, Hispanics, ...., this, count..."
1,1.25e+18,"Santa Barbara, CA",4/21/2020,34308,5245,XO face masks are back to raise money and Iâ...,2,Santa Barbara,California,False,CA,"Santa Barbara, California",CA,No,0.0,0.0,Yes,No,"[XO, face, masks, are, back, to, raise, money,..."
2,1.25e+18,"California, USA",4/21/2020,0,0,@ImKingGinger Well aren't you special? My rela...,3,Not City,California,True,CA,"Not City, California",CA,No,0.228095,0.644286,Yes,No,"[@ImKingGinger, Well, are, n't, you, special, ..."
3,1.25e+18,"Arizona, USA",4/19/2020,4,0,@seanhannity If I was to stay home and eat ice...,4,Not City,Arizona,True,AZ,"Not City, Arizona",AZ,No,0.266667,0.662963,No,No,"[@seanhannity, If, I, was, to, stay, home, and..."
4,1.25e+18,"Medford, MA",4/6/2020,0,0,"@MisagaLion @tedpanos Yup, CT. Governor spoke ...",5,Medford,Massachusetts,False,MA,"Medford, Massachusetts",,,-0.053788,0.593687,No,No,"[@tedpanos, Yup, ,, CT, ., Governor, spoke, pr..."


In [117]:
has_mask(all_ddata2.head(1))

ValueError: malformed node or string: ['A', 'day', 'without', 'Hispanics', '....', 'this', 'country', 'could', 'never', '.', 'https', ':', '//t.co/Z5gvH1zku9']

In [127]:
def has_mask(row):
    mask_words = ['mask','masks','masked','cover','covers','covered','n95','mandate','face','nose','mouth']
    try:
        has_mask = any([keyword in [token.lower() for token in row['tokens']] for keyword in mask_words])
    except:
        print(row['tokens'])
        has_mask = False
    row['mentions_mask'] = has_mask
    return row
    # return has_mask

In [128]:
all_ddata2 = dd.from_pandas(all_ddata, npartitions=15)
# all_ddata2.head()
all_data2 = all_ddata2.map_partitions(lambda df: df.apply((lambda row: has_mask(row)), axis=1)).compute(scheduler='threads')

In [133]:
all_data2.head()

Unnamed: 0,tweet_id,location,date,likes,retweets,text,index,city,state,is_state,state_abbr,city_and_state,State_Tribe_Territory,Face_Masks_Required_in_Public,Polarity,Subjectivity,CA,LA,tokens,mentions_mask
0,1.25e+18,"Pomona, CA",4/4/2020,89921,34665,A day without Hispanics .... this country coul...,1,Pomona,California,False,CA,"Pomona, California",,,0.0,0.0,Yes,No,"[A, day, without, Hispanics, ...., this, count...",False
1,1.25e+18,"Santa Barbara, CA",4/21/2020,34308,5245,XO face masks are back to raise money and Iâ...,2,Santa Barbara,California,False,CA,"Santa Barbara, California",CA,No,0.0,0.0,Yes,No,"[XO, face, masks, are, back, to, raise, money,...",True
2,1.25e+18,"California, USA",4/21/2020,0,0,@ImKingGinger Well aren't you special? My rela...,3,Not City,California,True,CA,"Not City, California",CA,No,0.228095,0.644286,Yes,No,"[@ImKingGinger, Well, are, n't, you, special, ...",False
3,1.25e+18,"Arizona, USA",4/19/2020,4,0,@seanhannity If I was to stay home and eat ice...,4,Not City,Arizona,True,AZ,"Not City, Arizona",AZ,No,0.266667,0.662963,No,No,"[@seanhannity, If, I, was, to, stay, home, and...",False
4,1.25e+18,"Medford, MA",4/6/2020,0,0,"@MisagaLion @tedpanos Yup, CT. Governor spoke ...",5,Medford,Massachusetts,False,MA,"Medford, Massachusetts",,,-0.053788,0.593687,No,No,"[@tedpanos, Yup, ,, CT, ., Governor, spoke, pr...",False


In [132]:
# output to csv
print(all_data2.mentions_mask.sum(), all_data2.mentions_mask.sum()/len(all_data2.index)) # number and % of tweets with mask
all_data2.to_csv('complete_data.csv',index=False)

11954 0.09086970072443387


In [21]:
# add case data
data_folder = 'csse_covid_19_daily_reports_us'
case_data = []
for data_file in os.listdir(data_folder):
    try:
        curr_df = pd.read_csv(os.path.join(data_folder,data_file)).iloc[:,[0,5,6,7,8,9,10,11,13,16]]
    except:
        print(data_file)
    curr_df['date'] = datetime.strptime(data_file.split('.')[0],"%m-%d-%Y")
    # curr_df['date'] = data_file
    case_data.append(curr_df)
    # print(curr_df.shape)
all_case_data = pd.concat(case_data)
all_case_data.shape
# all_case_data.head()


(39168, 13)

In [22]:
all_case_data.to_csv('other_data/us_case_data.csv')

In [30]:
def add_state_abbr(row):
    row['state_abbr'] = us_state_to_abbrev.get(row['Province_State'])
    return row

In [31]:
all_case_data = pd.read_csv('other_data/us_case_data.csv')
all_case_data = all_case_data.apply (lambda row: add_state_abbr(row), axis=1)
all_case_data.head()

Unnamed: 0.1,Unnamed: 0,Province_State,Confirmed,Deaths,Recovered,Active,FIPS,Incident_Rate,Total_Test_Results,Case_Fatality_Ratio,Testing_Rate,date,People_Tested,Mortality_Rate,state_abbr
0,0,Alabama,365747,4872,202137.0,158738.0,1.0,7459.375895,3275341.0,1.332068,66800.2737,2021-01-01,,,AL
1,1,Alaska,47019,206,7165.0,39648.0,2.0,6427.355802,1275750.0,0.438121,174391.185778,2021-01-01,,,AK
2,2,American Samoa,0,0,,,60.0,0.0,2140.0,,3846.084722,2021-01-01,,,AS
3,3,Arizona,530267,9015,76934.0,444318.0,4.0,7285.171274,5155330.0,1.700087,39551.860582,2021-01-01,,,AZ
4,4,Arkansas,229442,3711,199247.0,26484.0,5.0,7602.945718,2051488.0,1.617402,67979.497674,2021-01-01,,,AR


In [58]:
all_case_data = all_case_data.drop(columns = ['FIPS'])
# all_case_data['date'] = pd.to_datetime(all_case_data['date']).dt.strftime('%#m/%#d/%Y')
all_case_data.head()

Unnamed: 0.1,Unnamed: 0,Province_State,Confirmed,Deaths,Recovered,Active,Incident_Rate,Total_Test_Results,Case_Fatality_Ratio,Testing_Rate,date,People_Tested,Mortality_Rate,state_abbr
0,0,Alabama,365747,4872,202137.0,158738.0,7459.375895,3275341.0,1.332068,66800.2737,1/1/2021,,,AL
1,1,Alaska,47019,206,7165.0,39648.0,6427.355802,1275750.0,0.438121,174391.185778,1/1/2021,,,AK
2,2,American Samoa,0,0,,,0.0,2140.0,,3846.084722,1/1/2021,,,AS
3,3,Arizona,530267,9015,76934.0,444318.0,7285.171274,5155330.0,1.700087,39551.860582,1/1/2021,,,AZ
4,4,Arkansas,229442,3711,199247.0,26484.0,7602.945718,2051488.0,1.617402,67979.497674,1/1/2021,,,AR


### Join case data w/ main data

In [53]:
# case data goes 4-12-2020 to 2-15-2022
all_case_data.loc[(all_case_data['state_abbr'] == 'CA') & (all_case_data['date'] == '4/12/2020'),]

Unnamed: 0.1,Unnamed: 0,Province_State,Confirmed,Deaths,Recovered,Active,FIPS,Incident_Rate,Total_Test_Results,Case_Fatality_Ratio,Testing_Rate,date,People_Tested,Mortality_Rate,state_abbr
8530,4,California,22200,631,,,6.0,56.13959,,,485.423869,4/12/2020,190328.0,2.844649,CA


In [61]:
all_data = pd.read_csv('complete_data.csv')
all_ddata = dd.from_pandas(all_data, npartitions=15)
# large.merge(small) is "embarassingly parrallel" -> 3 seconds???? huh??
all_ddata2 = all_ddata.merge(all_case_data, how='left', 
    left_on = ['date','state_abbr'], 
    right_on=['date','state_abbr'])

In [62]:
all_data2 = all_ddata2.compute()
all_data2

Unnamed: 0,tweet_id,location,date,likes,retweets,text,index,city,state,is_state,...,Confirmed,Deaths,Recovered,Active,Incident_Rate,Total_Test_Results,Case_Fatality_Ratio,Testing_Rate,People_Tested,Mortality_Rate
0,1.250000e+18,"Pomona, CA",4/4/2020,89921,34665,A day without Hispanics .... this country coul...,1,Pomona,California,False,...,,,,,,,,,,
1,1.250000e+18,"Santa Barbara, CA",4/21/2020,34308,5245,XO face masks are back to raise money and Iâ...,2,Santa Barbara,California,False,...,35750.0,1289.0,,,90.422652,,,765.392916,300100.0,3.607815
2,1.250000e+18,"California, USA",4/21/2020,0,0,@ImKingGinger Well aren't you special? My rela...,3,Not City,California,True,...,35750.0,1289.0,,,90.422652,,,765.392916,300100.0,3.607815
3,1.250000e+18,"Arizona, USA",4/19/2020,4,0,@seanhannity If I was to stay home and eat ice...,4,Not City,Arizona,True,...,4933.0,184.0,994.0,3755.0,67.772933,,,728.012918,65652.0,3.729982
4,1.250000e+18,"Medford, MA",4/6/2020,0,0,"@MisagaLion @tedpanos Yup, CT. Governor spoke ...",5,Medford,Massachusetts,False,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9514,1.430000e+18,"West Virginia, USA",8/31/2021,0,0,This is not a good sign https://t.co/xNJnb5uSCQ,131185,Not City,West Virginia,True,...,189690.0,3084.0,,,10584.511204,3428187.0,1.625811,191289.386418,,
9515,1.420000e+18,"Jacksonville, FL",8/8/2021,2,1,Iâm only just now hearing about this. \r\n\r...,131186,Jacksonville,Florida,False,...,2768985.0,39695.0,,,12892.349878,34269498.0,1.433558,159558.234650,,
9516,1.420000e+18,"Hitler, North Dakota",8/2/2021,0,0,@schmotdocker @Dennis_Ramen @ememesi @justinba...,131187,Not City,Hitler,True,...,49.0,0.0,,,,,0.000000,,,
9517,1.420000e+18,"Hitler, North Dakota",8/2/2021,0,0,@schmotdocker @Dennis_Ramen @ememesi @justinba...,131187,Not City,Hitler,True,...,103.0,3.0,,,,,2.912621,,,


In [63]:
all_data2.to_csv('complete_data_cases.csv')