In [1]:
import pandas as pd
from datetime import datetime
import plotly.express as px
from sklearn import preprocessing
import numpy as np

In [4]:
from pytrends.request import TrendReq


In [5]:
# Create the pytrends client used by the query cells below.
pytrends = TrendReq(
    tz=360,
    hl='en-US',
)

In [6]:
# Query parameters passed to pytrends.build_payload in the next cell.
# NOTE(review): 'US-MI-563' looks like a Michigan metro-area code and 419 a
# Google Trends category id — confirm both against the Trends UI.
geo = 'US-MI-563'
timeframe = '2012-01-01 2023-08-31'
cat = 419
# Search terms to request; the commented-out entries are currently disabled.
kwords = [
          #'flu',
          #'fever',
          #'sore throat',
            'cough',
            'cold',
          #'symptoms',
          #'chills',
          ]

In [7]:
# Register the query with pytrends, then pull interest-over-time as a flat
# frame whose date column is named 'Date'.
pytrends.build_payload(
    kwords,
    cat=cat,
    timeframe=timeframe,
    geo=geo,
)

data = (
    pytrends.interest_over_time()
    .reset_index()
    .rename(columns={'date': 'Date'})
)

In [8]:
# Preview the first rows of the wide Google Trends frame.
data.head()

Unnamed: 0,Date,cough,cold,isPartial
0,2012-01-01,35,39,False
1,2012-02-01,35,27,False
2,2012-03-01,34,29,False
3,2012-04-01,34,33,False
4,2012-05-01,31,17,False


In [10]:
# Reshape wide -> long: one row per (Date, search term) pair.
# value_vars reuses `kwords` so the melted columns always match the terms that
# were requested, instead of a second hard-coded copy of the list.
data = pd.melt(
    data,
    id_vars='Date',
    value_vars=kwords,
    value_name='Normalized Hits',
    var_name='Search Terms',
)

In [12]:
# Line chart of normalized hits over time, one line per search term.
fig = px.line(
    data,
    x='Date',
    y='Normalized Hits',
    color='Search Terms',
    width=1200,
    height=400,
)
fig.show()

In [13]:
# Keep an independent copy of the long-format Trends data under its own name.
google_data = data.copy()

In [14]:
def preprocess(df):
    """Select the Kent County, Michigan rows of a daily AQI frame and add calendar columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'State Name', 'county Name' and 'Date'.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is left untouched) holding only the matching
        rows, with three added columns: 'time' (parsed 'Date'), 'year' and
        'month'.
    """
    is_kent = (df['State Name'] == 'Michigan') & (df['county Name'] == 'Kent')
    # Explicit copy: the columns below are added to an independent frame rather
    # than to a selection of the caller's data.
    kent = df.loc[is_kent].copy()
    kent['time'] = pd.to_datetime(kent['Date'])
    # Vectorised datetime accessors instead of a per-row Python lambda.
    kent['year'] = kent['time'].dt.year
    kent['month'] = kent['time'].dt.month
    return kent

def process(df_list=None):
    """Compute monthly mean AQI for each frame and stack the results.

    Each frame is passed through ``preprocess``, grouped by ('year', 'month'),
    and reduced to the mean of its 'AQI' column; the per-frame results are
    concatenated in input order.

    Parameters
    ----------
    df_list : list of pandas.DataFrame, optional
        Daily AQI frames. ``None`` (the default) or an empty list yields an
        empty result instead of the ValueError that ``pd.concat([])`` raises.

    Returns
    -------
    pandas.Series
        Series named 'AQI' indexed by (year, month).
    """
    if df_list is None or len(df_list) == 0:
        empty_index = pd.MultiIndex.from_arrays([[], []], names=['year', 'month'])
        return pd.Series([], index=empty_index, dtype='float64', name='AQI')
    monthly_means = [
        preprocess(frame).groupby(['year', 'month'])['AQI'].mean()
        for frame in df_list
    ]
    return pd.concat(monthly_means)

In [66]:
# Read one daily-AQI-by-county CSV per year, 2012 through 2022, into a list.
ldl = [
    pd.read_csv("c:\\Users\\ljens\\OneDrive\\Documents\\MSU MPH\\capstone\\AQI\\daily_aqi_by_county_"+str(year)+".csv", low_memory=False)
    for year in range(2012, 2023)
]

In [117]:
# Monthly mean AQI as a one-column frame indexed by (year, month).
aqi_data = process(ldl).to_frame()

In [57]:
# Preview the monthly means while they are still indexed by (year, month).
aqi_data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,AQI
year,month,Unnamed: 2_level_1
2012,1,40.580645
2012,2,40.793103
2012,3,42.580645
2012,4,45.566667
2012,5,59.774194


In [118]:
# Turn the (year, month) index back into columns and derive a first-of-month
# 'Date'. The timestamp is assembled from numeric year/month/day components,
# which avoids building "M-YYYY" strings and relying on format inference.
aqi_data = aqi_data.reset_index()
aqi_data['Date'] = pd.to_datetime(aqi_data[['year', 'month']].assign(day=1))

In [69]:
# Check that the new 'Date' column lines up with year/month.
aqi_data.head()

Unnamed: 0,year,month,AQI,Date
0,2012,1,40.580645,2012-01-01
1,2012,2,40.793103,2012-02-01
2,2012,3,42.580645,2012-03-01
3,2012,4,45.566667,2012-04-01
4,2012,5,59.774194,2012-05-01


In [21]:
# Monthly mean AQI over time.
fig = px.line(
    aqi_data,
    x='Date',
    y='AQI',
    width=1200,
    height=400,
)
fig.show()

In [70]:
# Load the separate Kent County AQI file (named "2023 thru aug").
# NOTE(review): absolute, user-specific Windows path — it will not resolve on
# another machine; consider a configurable data directory.
d = pd.read_csv("C:\\Users\\ljens\\OneDrive\\Documents\\MSU MPH\\capstone\\AQI\\kent_2023_thru_aug.csv")

In [27]:
# Preview the raw rows; 'Date' is still text at this point.
d.head()

Unnamed: 0,Date,AQI
0,1/1/2023,68
1,1/2/2023,76
2,1/3/2023,33
3,1/4/2023,44
4,1/5/2023,52


In [71]:
# Parse the text dates (e.g. 1/1/2023) into pandas datetimes.
d['Date'] = pd.to_datetime(d['Date'])

In [72]:
# Confirm the dates were parsed.
d.head()

Unnamed: 0,Date,AQI
0,2023-01-01,68
1,2023-01-02,76
2,2023-01-03,33
3,2023-01-04,44
4,2023-01-05,52


In [119]:
# The helper year/month columns are no longer needed once 'Date' exists.
aqi_data = aqi_data.drop(columns=['year', 'month'])

In [120]:
# Preview after dropping year/month.
aqi_data.head()

Unnamed: 0,AQI,Date
0,40.580645,2012-01-01
1,40.793103,2012-02-01
2,42.580645,2012-03-01
3,45.566667,2012-04-01
4,59.774194,2012-05-01


In [121]:
# Reset both frames to a fresh integer index. Without drop=True the previous
# index is kept as a column, which is where the 'index' column dropped by the
# later join cell comes from.
d = d.reset_index()
aqi_data = aqi_data.reset_index()

In [122]:
# Make sure both 'Date' columns are datetimes before combining the frames.
# When the cells run in order, both were already passed through pd.to_datetime
# in earlier cells, so this only re-asserts the dtype.
aqi_data['Date'] = pd.to_datetime(aqi_data['Date'])
d['Date'] = pd.to_datetime(d['Date'])

In [123]:
# Snapshot of aqi_data before `d` is appended (reassigned again in a later cell).
aqi_data_copy = aqi_data.copy()

In [124]:
# Stack `d` on top of the 2012-2022 monthly means.
# NOTE(review): when cells run top to bottom, `d` still holds one row per day
# here (it is only averaged per month in a later cell), so daily and monthly
# values end up mixed in `aqi_data` — confirm this is intended.
aqi_data = pd.concat([d,aqi_data])

In [78]:
# AQI over time after `d` has been appended.
fig = px.line(
    aqi_data,
    x='Date',
    y='AQI',
    width=1200,
    height=400,
)
fig.show()

In [125]:
# Calendar year and month of each reading, taken with the vectorised .dt
# accessor instead of a per-row Python lambda.
d['year'] = d['Date'].dt.year
d['month'] = d['Date'].dt.month

In [126]:
# Mean AQI per (year, month); selecting [['AQI']] keeps the result a DataFrame.
d = d.groupby(['year', 'month'])[['AQI']].mean()

In [127]:
# Preview the per-month means of `d`.
d.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,AQI
year,month,Unnamed: 2_level_1
2022,7,50.032258
2022,8,36.516129
2022,9,34.4
2022,10,41.16129
2022,11,45.7


In [128]:
# Restore year/month as columns and derive a first-of-month 'Date'. The
# timestamp is assembled from numeric components, which avoids building
# "M-YYYY" strings and relying on format inference.
d = d.reset_index()
d['Date'] = pd.to_datetime(d[['year', 'month']].assign(day=1))

In [129]:
# Display the aggregated frame.
d

Unnamed: 0,year,month,AQI,Date
0,2022,7,50.032258,2022-07-01
1,2022,8,36.516129,2022-08-01
2,2022,9,34.4,2022-09-01
3,2022,10,41.16129,2022-10-01
4,2022,11,45.7,2022-11-01
5,2022,12,46.870968,2022-12-01
6,2023,1,43.258065,2023-01-01
7,2023,2,41.892857,2023-02-01
8,2023,3,45.870968,2023-03-01
9,2023,4,49.5,2023-04-01


In [130]:
# Add the per-month means of `d` to `aqi_data`.
# NOTE(review): `aqi_data` already received `d` in an earlier concat cell, so
# the same period may now appear more than once — verify before relying on
# row counts downstream.
aqi_data = pd.concat([d,aqi_data])

In [92]:
# AQI over time, sorted by date so the line is drawn chronologically.
fig = px.line(
    aqi_data.sort_values(by='Date'),
    x='Date',
    y='AQI',
    width=1200,
    height=400,
)
fig.show()

In [93]:
# Search-term chart again, for comparison with the AQI chart above.
fig = px.line(
    data,
    x='Date',
    y='Normalized Hits',
    color='Search Terms',
    width=1200,
    height=400,
)
fig.show()

In [94]:
# Load the disease-count table that the flu series is extracted from.
# NOTE(review): absolute, user-specific Windows path — it will not resolve on
# another machine; consider a configurable data directory.
dd = pd.read_csv("C:\\Users\\ljens\\OneDrive\\Documents\\MSU MPH\\Practicum\\older dates\\data.csv")

In [116]:
# Influenza rows only, reduced to the three columns used below. Row and column
# selection happen in a single .loc call and the result is copied, so the new
# columns are added to an independent frame rather than to a slice of `dd`.
flu = dd.loc[dd['Disease Group'] == 'Influenza', ['Time', 'Count', 'Year']].copy()
flu['Date'] = pd.to_datetime(flu['Time'])
flu['Flu Cases'] = flu['Count'].astype('int')
# Vectorised month extraction instead of a per-row lambda.
flu['Month'] = flu['Date'].dt.month
flu.head()

Unnamed: 0,Time,Count,Year,Date,Flu Cases,Month
21,1/1/2012,6609,2012,2012-01-01,6609,1
54,2/1/2012,7066,2012,2012-02-01,7066,2
87,3/1/2012,9728,2012,2012-03-01,9728,3
120,4/1/2012,4698,2012,2012-04-01,4698,4
153,5/1/2012,3449,2012,2012-05-01,3449,5


In [132]:
# Snapshot of the flu frame before it is re-indexed.
flu_copy = flu.copy()

In [131]:
# Refresh the snapshot now that aqi_data includes the rows from `d`
# (this overwrites the earlier aqi_data_copy).
aqi_data_copy = aqi_data.copy()

In [97]:
# Preview the combined AQI frame.
aqi_data.head()

Unnamed: 0,year,month,AQI,Date
0,2022,7,50.032258,2022-07-01
1,2022,8,36.516129,2022-08-01
2,2022,9,34.4,2022-09-01
3,2022,10,41.16129,2022-10-01
4,2022,11,45.7,2022-11-01


In [98]:
# Preview the long-format search data.
google_data.head()

Unnamed: 0,Date,Search Terms,Normalized Hits
0,2012-01-01,cough,35
1,2012-02-01,cough,35
2,2012-03-01,cough,34
3,2012-04-01,cough,34
4,2012-05-01,cough,31


In [115]:
# Rows for the 'cough' term only. .copy() makes this an independent frame so
# later cells can modify it without acting on a slice of google_data.
cough_search_data = google_data.loc[google_data['Search Terms']=='cough'].copy()

In [114]:
# Rows for the 'cold' term only. .copy() makes this an independent frame so
# later cells can modify it without acting on a slice of google_data.
cold_search_data = google_data.loc[google_data['Search Terms']=='cold'].copy()

In [103]:
# Index every series by its 'Date' column so they can be aligned on dates.
# The guard keeps the cell re-runnable: once a frame is indexed by 'Date' the
# column is gone and a second set_index('Date') would raise KeyError.
for df in [cold_search_data, cough_search_data, aqi_data, flu]:
    if df.index.name != 'Date':
        df.set_index('Date', inplace=True)

In [133]:
# Start from an empty frame; the following cells add one column at a time.
df = pd.DataFrame()

In [134]:
# First column: the AQI values from aqi_data.
df['AQI'] = aqi_data['AQI']

In [135]:
# Add the flu case counts.
df['Flu Cases'] = flu['Flu Cases']


In [136]:
# Add the normalized hit counts of each search term under a descriptive name.
df["'Cold' Searches"] = cold_search_data['Normalized Hits']
df["'Cough' Searches"] = cough_search_data['Normalized Hits']

In [138]:
# Inspect the flu frame.
flu

Unnamed: 0,Time,Count,Year,Date,Flu Cases,Month
21,1/1/2012,6609,2012,2012-01-01,6609,1
54,2/1/2012,7066,2012,2012-02-01,7066,2
87,3/1/2012,9728,2012,2012-03-01,9728,3
120,4/1/2012,4698,2012,2012-04-01,4698,4
153,5/1/2012,3449,2012,2012-05-01,3449,5
...,...,...,...,...,...,...
5269,4/1/2023,1279,2023,2023-04-01,1279,4
5302,5/1/2023,1528,2023,2023-05-01,1528,5
5335,6/1/2023,573,2023,2023-06-01,573,6
5368,7/1/2023,9,2023,2023-07-01,9,7


In [143]:
# Index every series by 'Date' ahead of the joins below.
# An earlier cell performs the same operation, so on a top-to-bottom run the
# 'Date' column is already the index here; the guard skips those frames
# instead of failing with KeyError.
for f in [flu, cough_search_data, cold_search_data, aqi_data]:
    if f.index.name != 'Date':
        f.set_index('Date', inplace=True)

In [144]:
# Inspect the flu frame again, now indexed by Date.
flu

Unnamed: 0_level_0,Time,Count,Year,Flu Cases,Month
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2012-01-01,1/1/2012,6609,2012,6609,1
2012-02-01,2/1/2012,7066,2012,7066,2
2012-03-01,3/1/2012,9728,2012,9728,3
2012-04-01,4/1/2012,4698,2012,4698,4
2012-05-01,5/1/2012,3449,2012,3449,5
...,...,...,...,...,...
2023-04-01,4/1/2023,1279,2023,1279,4
2023-05-01,5/1/2023,1528,2023,1528,5
2023-06-01,6/1/2023,573,2023,573,6
2023-07-01,7/1/2023,9,2023,9,7


In [153]:
# Give each search series a descriptive column name before joining.
# Reassigning (instead of inplace=True) avoids the SettingWithCopyWarning that
# pandas emits when a frame derived from a .loc selection is modified in place.
cough_search_data = cough_search_data.rename(columns={'Normalized Hits': "'Cough' Searches"})
cold_search_data = cold_search_data.rename(columns={'Normalized Hits': "'Cold' Searches"})



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [157]:
# Inner join on the Date index: keeps only dates present in both the cough
# series and the flu series. This replaces the `df` built column by column in
# the earlier cells.
df = cough_search_data.join(flu, how='inner')

In [159]:
# Remove the label and raw flu helper columns from the joined frame.
df = df.drop(
    columns=['Search Terms', 'Time', 'Count', 'Year', 'Month'],
)

In [166]:
# Attach the cold-search and AQI columns by Date, discard their helper
# columns, and remove fully identical rows.
df = (
    df
    .join(cold_search_data)
    .join(aqi_data)
    .drop(columns=['Search Terms', 'year', 'month', 'index'])
    .drop_duplicates()
)

In [167]:
# Inspect the assembled analysis frame.
df

Unnamed: 0_level_0,'Cough' Searches,Flu Cases,'Cold' Searches,AQI
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2012-01-01,35,6609,39,40.580645
2012-02-01,35,7066,27,40.793103
2012-03-01,34,9728,29,42.580645
2012-04-01,34,4698,33,45.566667
2012-05-01,31,3449,17,59.774194
...,...,...,...,...
2023-04-01,59,1279,38,49.500000
2023-05-01,55,1528,45,57.064516
2023-06-01,39,573,34,89.333333
2023-07-01,39,9,29,61.387097


In [168]:
# Persist the assembled (unscaled) dataset to the working directory.
df.to_csv("dataset.csv")

In [169]:
# Snapshot of the unscaled values; `df` is standardised in place further down.
df_copy = df.copy()

In [None]:
# NOTE(review): `yay` is not defined anywhere in the visible notebook and this
# cell has no execution count, so it looks like a leftover that would raise
# NameError on a fresh Run All — confirm, then remove it or point it at the
# intended frame.
yay['Air Quality'] = preprocessing.StandardScaler().fit(np.array(yay['AQI']).reshape(-1,1)).transform(np.array(yay['AQI']).reshape(-1,1))

In [174]:
# Standardise every column of df with its own freshly fitted StandardScaler.
for col in df.columns:
    column_values = np.array(df[col]).reshape(-1, 1)
    df[col] = preprocessing.StandardScaler().fit_transform(column_values)

In [175]:
# Inspect the standardised frame.
df

Unnamed: 0_level_0,'Cough' Searches,Flu Cases,'Cold' Searches,AQI
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2012-01-01,-0.525590,1.249129,-0.106692,-0.390494
2012-02-01,-0.525590,1.407486,-0.889409,-0.367528
2012-03-01,-0.586351,2.329905,-0.758956,-0.174302
2012-04-01,-0.586351,0.586942,-0.498050,0.148476
2012-05-01,-0.768637,0.154147,-1.541674,1.684256
...,...,...,...,...
2023-04-01,0.932694,-0.597788,-0.171918,0.573655
2023-05-01,0.689647,-0.511506,0.284667,1.391350
2023-06-01,-0.282542,-0.842426,-0.432824,4.879489
2023-07-01,-0.282542,-1.037860,-0.758956,1.858605


In [177]:
# Long format for plotting: one row per (Date, variable) pair.
df_melted = pd.melt(
    df.reset_index(),
    id_vars='Date',
    value_vars=list(df.columns),
    var_name='variable',
    value_name='Normalized Value',
)

In [179]:
# Plot every standardised series except the flu case counts.
fig = px.line(
    df_melted.loc[df_melted['variable'] != 'Flu Cases'],
    x='Date',
    y='Normalized Value',
    color='variable',
    width=1200,
    height=400,
)
fig.show()

In [183]:
import statsmodels.api as sm
from statsmodels.formula.api import ols

In [184]:
# Inspect the frame that feeds the regression.
df

Unnamed: 0_level_0,'Cough' Searches,Flu Cases,'Cold' Searches,AQI
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2012-01-01,-0.525590,1.249129,-0.106692,-0.390494
2012-02-01,-0.525590,1.407486,-0.889409,-0.367528
2012-03-01,-0.586351,2.329905,-0.758956,-0.174302
2012-04-01,-0.586351,0.586942,-0.498050,0.148476
2012-05-01,-0.768637,0.154147,-1.541674,1.684256
...,...,...,...,...
2023-04-01,0.932694,-0.597788,-0.171918,0.573655
2023-05-01,0.689647,-0.511506,0.284667,1.391350
2023-06-01,-0.282542,-0.842426,-0.432824,4.879489
2023-07-01,-0.282542,-1.037860,-0.758956,1.858605


In [187]:
import statsmodels.api as sm

In [188]:
# Predictor columns for the regression on flu cases.
exog = [
    "'Cough' Searches",
    "'Cold' Searches",
    'AQI',
]

In [192]:
# Design matrix: the predictor columns plus an intercept term.
ex = sm.add_constant(df[list(exog)])

In [193]:
# Ordinary least squares of flu cases on the predictors; the summary table is
# the cell output. The saved output reports a Durbin-Watson statistic of 0.587,
# worth keeping in mind when reading the standard errors.
sm.OLS(endog=df['Flu Cases'], exog=ex).fit().summary()

0,1,2,3
Dep. Variable:,Flu Cases,R-squared:,0.174
Model:,OLS,Adj. R-squared:,0.156
Method:,Least Squares,F-statistic:,9.579
Date:,"Mon, 11 Sep 2023",Prob (F-statistic):,8.81e-06
Time:,15:28:30,Log-Likelihood:,-185.23
No. Observations:,140,AIC:,378.5
Df Residuals:,136,BIC:,390.2
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-2.776e-17,0.078,-3.56e-16,1.000,-0.154,0.154
'Cough' Searches,-0.1053,0.148,-0.713,0.477,-0.397,0.187
'Cold' Searches,0.2675,0.159,1.681,0.095,-0.047,0.582
AQI,-0.2888,0.091,-3.162,0.002,-0.469,-0.108

0,1,2,3
Omnibus:,5.095,Durbin-Watson:,0.587
Prob(Omnibus):,0.078,Jarque-Bera (JB):,5.105
Skew:,0.433,Prob(JB):,0.0779
Kurtosis:,2.645,Cond. No.,3.96
