In [None]:
import pandas as pd 
import numpy as np 
from datetime import datetime
import matplotlib.pyplot as plt

In [None]:
## covid statistics by jurisdiction; source:
## https://sdgis-sandag.opendata.arcgis.com/datasets/covid-19-statistics-by-jurisdiction?geometry=-120.348%2C32.566%2C-115.965%2C33.372
covid_csv = 'Datasets/COVID-19_Statistics_by_Jurisdiction.csv'
df_covid = pd.read_csv(covid_csv)

## population estimates; this file is read with the latin-1 encoding
population_csv = 'Datasets/co-est2019-alldata.csv'
df_population = pd.read_csv(population_csv, encoding='latin-1')

## Data Cleaning

In [None]:
## reduce covid's 'lastupdate' to just the date
## each value is expected to look like 'YYYY/MM/DD <time>': keep the text
## before the first space and parse the whole column in one vectorized call
## instead of looping over the rows in Python
date_text = df_covid['lastupdate'].str.split(' ').str[0]
df_covid['date'] = pd.to_datetime(date_text, format='%Y/%m/%d')

In [None]:
## ISO day of the week for every row of df_covid, stored as text
## (isoweekday: '1' = Monday ... '7' = Sunday)
weekday = [str(day.isoweekday()) for day in df_covid['date']]



In [None]:
## attach the weekday labels to df_covid as the 'dayOfWeek' column
df_covid = df_covid.assign(dayOfWeek=weekday)


In [None]:
## turn the 'date' timestamps into 'MM-DD' text
## NOTE(review): the year is dropped here, so sorting this column as text is
## only chronological while the data stays inside one calendar year
new_date = df_covid['date'].dt.strftime('%m-%d')
df_covid['date'] = new_date

In [None]:
## keep only the useful covid columns
cols = ['name', 'confirmedcases', 'date', 'dayOfWeek']
df_covid = df_covid.loc[:, cols]

In [None]:
## title-case the 'name' values, then expose that column as 'City'
df_covid = (
    df_covid
    .assign(name=df_covid['name'].str.title())
    .rename(columns={'name': 'City'})
)

In [None]:
## count the missing values in each column
df_covid.isna().sum()

## 1 null in confirmedcases

In [None]:
## look at the row(s) where confirmedcases is missing
missing_cases = df_covid['confirmedcases'].isna()
df_covid.loc[missing_cases]


In [None]:
## the null row has City == 'Unknown', most likely an input error:
## drop every row that contains a missing value
df_covid = df_covid.dropna(axis=0, how='any')

In [None]:
## how many rows have City == 'Unknown'
int((df_covid['City'] == 'Unknown').sum())

In [None]:
## 95 'Unknown' rows remain (the null row is already gone); 95 is the number of days in the data

In [None]:
## clean population dataset: keep three columns and give them shorter names
cols = ['STNAME', 'CTYNAME', 'POPESTIMATE2019']
new_names = {'STNAME': 'State', 'CTYNAME': 'City', 'POPESTIMATE2019': 'Population'}
df_population = df_population[cols].rename(columns=new_names)


In [None]:
## California rows of the population table
in_california = df_population['State'] == 'California'
df_CA = df_population[in_california]



In [None]:
##getting rid of 'County'
## Removing the word alone leaves a trailing space ('San Diego '), which can
## never equal the 'City' values used on the covid side, so the leftover
## whitespace is stripped as well.
## assign() builds a new frame instead of writing into a slice of
## df_population, and the reset_index() result is kept rather than discarded.
df_CA = (
    df_CA
    .assign(City=df_CA['City'].str.replace('County', '', regex=False).str.strip())
    .reset_index(drop=True)
)
df_CA.info()

In [None]:
#df_CA.head()

## Merging 

In [None]:
## right join on 'City': every df_CA row is kept, with the matching covid rows attached
df_covid_merged = df_covid.merge(df_CA, how='right', on='City')

## Analysis

In [None]:
## helper: report whether a city name appears in the covid data
def find_city(city):
    """Print True if ``city``, title-cased, is one of the values in df_covid['City'], otherwise False."""
    wanted = city.title()
    known_cities = df_covid['City'].unique()
    print(wanted in known_cities)
    

In [None]:
## mixed-case input on purpose: find_city title-cases the name before the lookup
find_city(city='SAn diego')

In [None]:
## San Diego rows of the covid data, plus per-date sums
is_san_diego = df_covid['City'] == 'San Diego'
df_sd = df_covid.loc[is_san_diego, :]
group_date = df_sd.groupby('date')
## NOTE(review): 'date' is both the grouping key and one of the summed columns,
## and sd_daily_count is not used by the cells below -- confirm it is still needed
summed_columns = ['confirmedcases', 'date']
sd_daily_count = group_date[summed_columns].sum()
df_sd.head()

In [None]:
## daily count in SD: difference between each confirmedcases value and the
## one on the row before it, once the rows are ordered by date
df_sd = df_sd.sort_values(by='date')
diff = df_sd['confirmedcases'].diff(periods=1)
df_sd = df_sd.assign(DailyCases=diff)


In [None]:
## order df_sd by date, then show it indexed by date
## (set_index returns a new frame that is only displayed here;
## df_sd itself keeps 'date' as a regular column)
df_sd = df_sd.sort_values(by='date')
df_sd.set_index(keys='date')


In [None]:
## mean DailyCases per date ('mean' is pivot_table's default aggregation)
sd_pv = pd.pivot_table(df_sd, index='date', values='DailyCases', aggfunc='mean')
sd_pv.tail()


In [None]:
## line chart of the last 14 rows of sd_pv, titled with the first and last date shown
last_14 = sd_pv.iloc[-14:]
first_day, last_day = sd_pv.index[-14], sd_pv.index[-1]
last_14.plot(kind='line', title='COVID in SD from {} to {}'.format(first_day, last_day))




In [None]:
## graphing last week compared to two weeks ago

In [None]:
## making dataframes for the last week and for the week before it

In [None]:
## daily cases for the two most recent 7-row windows of df_sd, drawn as two
## lines against the day of the week
## (original author's note: could fix this so the two graphs overlap,
## with the day of the week as x axis)
week_cols = ['dayOfWeek', 'DailyCases']
df_sd_twoweeks_before = df_sd[week_cols].iloc[-14:-7]
df_sd_oneweek_before = df_sd[week_cols].iloc[-7:]

## (window, position of its first row in df_sd, position of its last row)
windows = [
    (df_sd_twoweeks_before, -14, -8),
    (df_sd_oneweek_before, -7, -1),
]
for window, first_pos, last_pos in windows:
    window_label = '{} - {}'.format(df_sd['date'].iloc[first_pos], df_sd['date'].iloc[last_pos])
    plt.plot(window['dayOfWeek'], window['DailyCases'], label=window_label)

plt.title('San Diego Covid in the Past 2 Weeks')
plt.ylabel('Daily Confirmed Cases')
plt.xlabel('Day of the week')
plt.legend()





In [None]:
## find the mean daily cases for each day of the week and see whether some days differ from the others
## hypothesis: Tuesdays are the slowest?


In [None]:
## mean DailyCases for each day of the week ('mean' is pivot_table's default aggregation)
weekday_sd_pv = pd.pivot_table(df_sd, index='dayOfWeek', values='DailyCases', aggfunc='mean')



In [None]:
## bar chart of the per-weekday averages, with upright tick labels
chart_title = 'New Daily Cases on average by the day of the week in SD'
ax = weekday_sd_pv.plot.bar(rot=0, title=chart_title)
ax.set_xlabel('Day of Week')


In [None]:
## every distinct City value in df_covid
pd.unique(df_covid['City'])

In [None]:
## project: this week compared to the average week/last week???
## in terms of statistical analysis
