In [None]:
import pandas as pd
import geopandas as gpd
import numpy as np

from shapely.ops import unary_union

import matplotlib.pyplot as plt
import seaborn as sns

from libpysal.weights import Kernel
from esda.moran import Moran

from scipy import stats

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_predict, cross_val_score, train_test_split
from sklearn.inspection import permutation_importance
from sklearn.preprocessing import MinMaxScaler

In [None]:
# set seaborn theme
sns.set_theme(style='darkgrid')

In [None]:
loc_gdf = gpd.read_file('data/AQMS_loc.shp')

# Initialise

In [None]:
# set buffer zones around each site (1km)
loc_gdf['buffer_1km'] = loc_gdf['geometry'].buffer(1000)

In [None]:
buffer_gdf = loc_gdf[['buffer_1km']]
buffer_gdf = gpd.GeoDataFrame(buffer_gdf, geometry='buffer_1km')
buffer_gdf.to_file('data/buffer.shp')

## gsp modify

In [None]:
loc_gdf

0575 - LH0

1065, 1070, 1565, 1570 - TD5

2565, 2570, 3065, 3070 - CR8

2565, 3065 - ST5

2080, 2580 - KC1

2580, 2585 - CD1

2580 - MY7

2580, 3080 - BL0, CD9

3080 - CT2, CT3

3570, 3575 - HP1, LW2

3575, 3580, 4075, 4080 - GN6

3580 - TH4

4070, 4075, 4570, 4575 - GB0

4070, 4075 - GR9, GR4

4075, 4575 - GN3

5075 - BX9

5080 - HV1

In [None]:
def readin_Gsp(file_name, path='data/OSMM Greenspaces/tq/TQ', suffix='_GreenspaceArea.shp'):
    """Read one or several green-space shapefile tiles.

    Parameters
    ----------
    file_name : str or iterable of str
        Tile code(s). Each code is placed between ``path`` and ``suffix``
        to build the file name that is handed to ``gpd.read_file``.
    path : str
        Text prepended to every tile code.
    suffix : str
        Text appended to every tile code.

    Returns
    -------
    The frame returned by ``gpd.read_file`` for a single code, or the
    ``pd.concat`` of the per-tile frames when several codes are given.
    """
    # isinstance (not an exact type comparison) so that str subclasses are
    # treated as ONE tile code instead of being iterated character by character.
    if isinstance(file_name, str):
        return gpd.read_file(path + file_name + suffix)
    return pd.concat([gpd.read_file(path + f + suffix) for f in file_name])

In [None]:
loc_gdf['Gsp'] = gpd.GeoSeries()

In [None]:
loc_gdf.columns.get_loc('Gsp')

In [None]:
def get_Gsp(file_name, index):
    """Clip the green space of the given tile(s) to the 1 km buffer of site(s).

    The tiles are read with ``readin_Gsp``, merged into a single geometry and
    intersected with the ``buffer_1km`` geometry of every requested row of the
    module-level ``loc_gdf``; the result is stored in its ``Gsp`` column.

    Parameters
    ----------
    file_name : str or list of str
        Tile code(s) forwarded to ``readin_Gsp``.
    index : int or list of int
        Row position(s) in ``loc_gdf``. With the default RangeIndex a row's
        position and its label are the same number.

    Raises
    ------
    TypeError
        If ``index`` is neither an integer nor a list/tuple of integers. The
        check runs before any file is read, so a bad call fails immediately.
    """
    if isinstance(index, (list, tuple)):
        rows = list(index)
    elif isinstance(index, (int, np.integer)):
        rows = [index]
    else:
        raise TypeError('invalid type! index must be an int or a list of ints')

    gdf = readin_Gsp(file_name)
    print('Finish reading in shapefile(s)')
    shp = gdf['geometry'].unary_union
    print('Finish unary union.')

    # Look the column positions up by name instead of hard-coding them, so the
    # function keeps working if columns are added to or reordered in loc_gdf.
    gsp_col = loc_gdf.columns.get_loc('Gsp')
    buffer_col = loc_gdf.columns.get_loc('buffer_1km')
    for i in rows:
        loc_gdf.iat[i, gsp_col] = shp.intersection(loc_gdf.iat[i, buffer_col])

In [None]:
get_Gsp('0575', 13)
loc_gdf.loc[13, 'Gsp']

In [None]:
get_Gsp(['1065','1070','1565','1570'], 17)
loc_gdf.loc[17, 'Gsp']

In [None]:
get_Gsp(['2565','2570','3065','3070'], 6)
loc_gdf.loc[6, 'Gsp']

In [None]:
get_Gsp(['2565','3065'], 18)
loc_gdf.loc[18, 'Gsp']

In [None]:
get_Gsp(['2080','2580'], 14)
loc_gdf.loc[14, 'Gsp']

In [None]:
get_Gsp(['2580','2585'], 3)
loc_gdf.loc[3, 'Gsp']

In [None]:
get_Gsp('2580', 20)
loc_gdf.loc[20, 'Gsp']

In [None]:
get_Gsp(['2580','3080'], [1,2])

In [None]:
loc_gdf.loc[1, 'Gsp']

In [None]:
loc_gdf.loc[2, 'Gsp']

In [None]:
get_Gsp('3080', [4,5])

In [None]:
loc_gdf.loc[4, 'Gsp']

In [None]:
loc_gdf.loc[5, 'Gsp']

In [None]:
get_Gsp(['3570','3575'], [15,16])

In [None]:
loc_gdf.loc[15, 'Gsp']

In [None]:
loc_gdf.loc[16, 'Gsp']

In [None]:
get_Gsp(['3575','3580','4075','4080'], 9)
loc_gdf.loc[9, 'Gsp']

In [None]:
get_Gsp('3580', 19)
loc_gdf.loc[19, 'Gsp']

In [None]:
get_Gsp(['4070','4075','4570','4575'], 8)
loc_gdf.loc[8, 'Gsp']

In [None]:
get_Gsp(['4075', '4575'], [7, 11])

In [None]:
loc_gdf.loc[7, 'Gsp']

In [None]:
loc_gdf.loc[11, 'Gsp']

In [None]:
get_Gsp(['4075', '4575'], 10)
loc_gdf.loc[10, 'Gsp']

In [None]:
get_Gsp('5075', 0)
loc_gdf.loc[0, 'Gsp']

In [None]:
get_Gsp('5080', 12)
loc_gdf.loc[12, 'Gsp']

In [None]:
Gsp_gdf = loc_gdf[['siteid','Gsp']]
Gsp_gdf = Gsp_gdf.set_geometry('Gsp')
Gsp_gdf = Gsp_gdf.set_crs(27700)
Gsp_gdf.crs

In [None]:
Gsp_gdf.to_file('data/gsp_buffer_1km.shp')

## nRd_gsp

In [None]:
Gsp_gdf = gpd.read_file('data/gsp_buffer_1km.shp')

In [None]:
Gsp_gdf.head()

In [None]:
loc_gdf['Gsp'] = Gsp_gdf['geometry']
loc_gdf.info()

In [None]:
del Gsp_gdf

In [None]:
# Read in all the data
AQMS_df = pd.read_csv('data/hourly.csv')
Rd_gdf = gpd.read_file('data/london_Road.shp')
cond = pd.read_csv('data/cond_hourly.csv')

In [None]:
Rd_gdf.head()

In [None]:
for c in Rd_gdf['class'].unique():
    print('Number of ' + c + ': ', Rd_gdf[Rd_gdf['class'] == c].shape[0])

In [None]:
# Get all types of roads
Rd = {}
for c in Rd_gdf['class'].unique():
    Rd[c] = Rd_gdf[Rd_gdf['class'] == c].loc[:, 'geometry'].unary_union
Rd

In [None]:
Rd['other'] = unary_union([Rd['Not Classified'], Rd['Unknown']])
Rd.pop('Not Classified')
Rd.pop('Unknown')
Rd

In [None]:
del Rd_gdf

In [None]:
for key in Rd.keys():
    loc_gdf[key] = loc_gdf['buffer_1km'].intersection(Rd[key])

loc_gdf.head()

In [None]:
del Rd

In [None]:
# Rename columns
loc_gdf.rename(columns={'Unclassified': 'UnC', 
                        'A Road': 'A',
                        'B Road': 'B',
                        'Classified Unnumbered': 'CUn',
                        'Motorway': 'Mt',
                        'other': 'Other'}, inplace=True)

Rd_type = loc_gdf.columns[-6:]
Rd_type

In [None]:
# Get all near-road green spaces
for col in Rd_type:
    loc_gdf['n'+col+'_Gsp'] = loc_gdf['Gsp'].intersection(loc_gdf[col].buffer(50))

loc_gdf.head()

In [None]:
# london boundary read in
london = gpd.read_file('data/london_boundary.shp')

In [None]:
# visualise all the sites on the map
fig,ax = plt.subplots(1, figsize=(15,13))

london.plot(color='lightgrey', ax=ax)
loc_gdf['buffer_1km'].plot(color='silver', ax=ax)
loc_gdf['geometry'].plot(markersize=10, marker='^', color='blue', 
                         label='Air quality monitoring site', ax=ax)

ax.axis('off')

legend=ax.legend(loc='best',shadow=True,fontsize=15)

#plt.savefig('sample1.png',facecolor='black',dpi=500)
plt.show()

There are some buffers that seem to be very close to each other.

In [None]:
# add a column that specifies the shortest distance of a site to its nearest neighbour
site_points = loc_gdf['geometry']
# For every site, measure the distance to all *other* sites (its own row is
# dropped so the zero self-distance is ignored) and keep the smallest value.
# Series.items() is used because Series.iteritems() was removed in pandas 2.0.
loc_gdf['min_dis'] = [
    site_points.drop(index=idx).distance(pt).min()
    for idx, pt in site_points.items()
]

In [None]:
# list sites that are close to each other (within 1.5km)
loc_gdf[loc_gdf['min_dis']<=1500]

In [None]:
# check their readings' descriptive statistics
AQMS_df[AQMS_df['Site'].isin(['BL0', 'CD9', 'GR4', 'GB0'])].groupby('Site').describe()

In [None]:
stats.ttest_rel(AQMS_df[AQMS_df['Site']=='BL0'].Value.values,
                AQMS_df[AQMS_df['Site']=='CD9'].Value.values)

In [None]:
stats.ttest_rel(AQMS_df[AQMS_df['Site']=='GR4'].Value.values,
                AQMS_df[AQMS_df['Site']=='GB0'].Value.values)

Both tests indicate that we should reject H0, meaning that the readings of the two sites in each pair are statistically significantly different.

In [None]:
# remove them from the list (currently disabled)
# NOTE(review): at this point 'siteid' is a column of loc_gdf, not its index,
# so dropping by these labels would need the index set first - confirm before re-enabling.
#loc_gdf.drop(['BL0','GR4'], inplace=True)

In [None]:
# get areas and edge lengths of green spaces 
loc_gdf['Gsp_area'] = loc_gdf['Gsp'].area
loc_gdf['Gsp_edge'] = loc_gdf['Gsp'].length

In [None]:
# get road lengths of each type and nRd gsp area percentages
for col in Rd_type:
    loc_gdf[col+'_len'] = loc_gdf[col].length
    loc_gdf['n'+col+'_Gsp_per_'+col+'_len'] = loc_gdf['n'+ col+'_Gsp'].area / loc_gdf[col+'_len']

In [None]:
loc_gdf['Gsp_per_tRd_len'] = loc_gdf['Gsp_area'] / loc_gdf[[col+'_len' for col in Rd_type]].sum(axis=1)

In [None]:
loc_gdf.info()

In [None]:
# merge PM readings with the site geometry data (inner join on the site id)
df = AQMS_df.merge(loc_gdf, left_on='Site', right_on='siteid')
df.info()

In [None]:
# drop irrelevant columns
df.drop(['siteid', 'sitename', 'geometry', 'buffer_1km', 'Gsp', 'min_dis'], axis=1, inplace=True)
df.drop(Rd_type , axis=1, inplace=True)
df.drop(['n'+rd+'_Gsp' for rd in Rd_type], axis=1, inplace=True)
df.drop([rd+'_len' for rd in Rd_type], axis=1, inplace=True)

df.info()

There are many null values in `nMt_Gsp_per_Mt_len`.

This is because only one site has a motorway nearby.

Removing the variable is the best option.

In [None]:
df.drop('nMt_Gsp_per_Mt_len', axis=1, inplace=True)

Some null values in `nB_Gsp_per_B_len` and `nCUn_Gsp_per_CUn_len`

In [None]:
# set the null values to zero
df.fillna(0, inplace=True)
df.info()

In [None]:
# merge with conditional variables
df = df.merge(cond, on='ReadingDateTime')
df.info()

In [None]:
exp_names = df.columns[5:10].tolist()
exp_names

In [None]:
var_names = exp_names + ['Gsp_edge', 'Gsp_per_tRd_len']
var_names

In [None]:
cond_names = df.columns[-3:].tolist()
cond_names

In [None]:
df[var_names + cond_names].describe()

In [None]:
df.to_csv('temp_data.csv', index=False)

# Temporarily save

In [None]:
df = pd.read_csv('temp_data.csv')
df.info()

In [None]:
# convert the DateTime column from text to pandas datetimes, then shorten its name
parsed_time = pd.to_datetime(df['ReadingDateTime'], format="%d/%m/%Y %H:%M")
df['ReadingDateTime'] = parsed_time
df = df.rename(columns={'ReadingDateTime':'DateTime'})
df.info()

In [None]:
df['Date'] = df['DateTime'].dt.date

# Daily mean reading over all rows of that date. 'Value' is selected before
# aggregating so the non-numeric columns are never averaged, and x and y come
# from the same grouped result so every date is paired with its own mean.
daily_mean = df.groupby('Date')['Value'].mean().reset_index()
sns.scatterplot(data=daily_mean, x='Date', y='Value')

# reference line at 15 (the threshold the next cell counts days above)
plt.axhline(y=15, color='red', linestyle='--')
plt.show()

In [None]:
# number of days whose mean reading is above the WHO guideline (15)
(df.groupby('Date')['Value'].mean() > 15).sum()

In [None]:
# annual mean for each site (select 'Value' first so only that column is averaged)
df.groupby('Site')['Value'].mean()

In [None]:
exp_names = df.columns[5:10].tolist()
var_names = exp_names + ['Gsp_edge', 'Gsp_per_tRd_len']
cond_names = df.columns[11:14].tolist()

In [None]:
# Site-level means of the explanatory variables; the columns are selected
# before aggregating so non-numeric columns are never averaged.
site_means = df.groupby('Site')[var_names].mean()

loc_gdf = loc_gdf.set_index('siteid')
# loc_gdf may already hold columns named like var_names (they are computed on
# loc_gdf earlier in this notebook). Drop them first, otherwise the merge
# renames both copies with _x/_y suffixes and loc_gdf[var] no longer exists.
loc_gdf = loc_gdf.drop(columns=[c for c in var_names if c in loc_gdf.columns])
loc_gdf = pd.merge(site_means, loc_gdf, left_index=True, right_index=True)

In [None]:
# kernel weight matrix for the sites
weight = Kernel.from_dataframe(loc_gdf, geom_col='geometry', function='gaussian')

In [None]:
for var in var_names:
    moran_temp = Moran(loc_gdf[var].values, weight)
    print("Global Moran's I for " + var + ' is ', round(moran_temp.I, 5), 
          ' p-value: ', round(moran_temp.p_norm, 5))

In [None]:
df['Value'].hist(bins=list(range(40)))

In [None]:
df['log_Value'] = np.log(df['Value'])
df['log_Value'].hist(bins=40)

In [None]:
df[var_names + cond_names].hist(bins=10)
plt.show()

In [None]:
t_exp_names = exp_names.copy()
t_var_names = var_names.copy()

In [None]:
plt.hist(np.log(df['nA_Gsp_per_A_len']))
plt.show()

In [None]:
# NOTE(review): np.log gives -inf for a ratio of exactly 0 - confirm none occur.
df['log_nA_Gsp_per_A_len'] = np.log(df['nA_Gsp_per_A_len'])
# Swap the raw variable for its transform by name rather than by list
# position, so the result does not depend on the column order of the data.
t_exp_names = ['log_nA_Gsp_per_A_len' if name == 'nA_Gsp_per_A_len' else name
               for name in t_exp_names]
t_var_names = ['log_nA_Gsp_per_A_len' if name == 'nA_Gsp_per_A_len' else name
               for name in t_var_names]

In [None]:
plt.hist(np.sqrt(df['nB_Gsp_per_B_len']))
plt.show()

In [None]:
df['sqrt_nB_Gsp_per_B_len'] = np.sqrt(df['nB_Gsp_per_B_len'])
# Swap the raw variable for its transform by name rather than by list
# position, so the result does not depend on the column order of the data.
t_exp_names = ['sqrt_nB_Gsp_per_B_len' if name == 'nB_Gsp_per_B_len' else name
               for name in t_exp_names]
t_var_names = ['sqrt_nB_Gsp_per_B_len' if name == 'nB_Gsp_per_B_len' else name
               for name in t_var_names]

In [None]:
plt.hist(np.log(df['nOther_Gsp_per_Other_len']))
plt.show()

In [None]:
# NOTE(review): np.log gives -inf for a ratio of exactly 0 - confirm none occur.
df['log_nOther_Gsp_per_Other_len'] = np.log(df['nOther_Gsp_per_Other_len'])
# Swap the raw variable for its transform by name rather than by list
# position, so the result does not depend on the column order of the data.
t_exp_names = ['log_nOther_Gsp_per_Other_len' if name == 'nOther_Gsp_per_Other_len' else name
               for name in t_exp_names]
t_var_names = ['log_nOther_Gsp_per_Other_len' if name == 'nOther_Gsp_per_Other_len' else name
               for name in t_var_names]

In [None]:
plt.hist(np.sqrt(df['Gsp_per_tRd_len']))
plt.show()

In [None]:
df['sqrt_Gsp_per_tRd_len'] = np.sqrt(df['Gsp_per_tRd_len'])
# Swap the raw variable for its transform by name rather than by list position.
t_var_names = ['sqrt_Gsp_per_tRd_len' if name == 'Gsp_per_tRd_len' else name
               for name in t_var_names]

In [None]:
# Rows with rain divided by the number of sites = number of rainy hours.
# The site count is taken from the data instead of a hard-coded 21.
# NOTE(review): assumes every site has one row for every hour - confirm.
(df['Prec_mean']>0).sum()/df['Site'].nunique()

There are 629 out of 8760 hours in 2019 recorded as raining, which is only around 7% of the time. Hence it would be better to use a categorical variable (0 for not raining and 1 for raining) to represent the weather.

In [None]:
# Turn precipitation into a 0/1 flag (1 = some rain recorded in that hour).
df['Prec_mean'] = (df['Prec_mean']>0).astype(int)
# Sanity check: rainy rows divided by the number of sites (taken from the
# data instead of a hard-coded 21) should match the count before the conversion.
df['Prec_mean'].sum()/df['Site'].nunique()

In [None]:
df['hour'] = df['DateTime'].dt.hour
# mean reading per hour of day; 'Value' is selected first so only it is averaged
df.groupby('hour')['Value'].mean().plot()

In [None]:
df['dayofweek'] = df['DateTime'].dt.dayofweek
# mean reading per day of week; 'Value' is selected first so only it is averaged
df.groupby('dayofweek')['Value'].mean().plot()

In [None]:
df['dayofmonth'] = df['DateTime'].dt.day
# mean reading per day of month; 'Value' is selected first so only it is averaged
df.groupby('dayofmonth')['Value'].mean().plot()

In [None]:
def get_importance(reg, features, target, feature_names, rep=50, method='r2'):
    """Permutation importance of each feature, rounded to 5 decimals.

    Calls ``permutation_importance`` on ``reg`` with ``rep`` repeats, a fixed
    ``random_state`` of 25 and ``method`` as the scoring.

    Returns
    -------
    (mean, std) : two lists with one entry per element of ``feature_names``,
        taken from ``importances_mean`` and ``importances_std`` of the result.
    """
    result = permutation_importance(reg, features, target, n_repeats=rep,
                                    random_state=25, scoring=method)
    # feature_names is only used for its length: one value per feature
    positions = range(len(feature_names))
    mean = [round(result.importances_mean[k], 5) for k in positions]
    std = [round(result.importances_std[k], 5) for k in positions]
    return mean, std

In [None]:
def get_cv_score(reg, features, target, iter=50, split=10, method='r2'):
    """Mean and standard deviation of repeated K-fold cross-validation scores.

    Runs ``iter`` rounds of shuffled ``split``-fold cross-validation, using the
    round number (0 .. iter-1) as the shuffle seed, and pools all fold scores.

    Returns
    -------
    (mean, std) of the pooled scores.
    """
    pooled = []
    for seed in range(iter):
        folds = KFold(n_splits=split, shuffle=True, random_state=seed)
        fold_scores = cross_val_score(reg, features, target, cv=folds, scoring=method)
        pooled.extend(fold_scores.tolist())

    return (np.mean(pooled), np.std(pooled))

In [None]:
scaler = MinMaxScaler()

In [None]:
reg = LinearRegression()
t_var = t_var_names + cond_names

ap_X = df[t_var].values
ap_y = df['log_Value'].values

ap_X_train, ap_X_test, ap_y_train, ap_y_test = train_test_split(ap_X, ap_y,
                                                                shuffle=True,
                                                                random_state=25)

# Learn the min/max scaling from the training split only and apply that same
# scaling to the test split. Re-fitting the scaler on the test data would put
# the two splits on different scales and leak test information.
ap_X_train = scaler.fit_transform(ap_X_train)
ap_X_test = scaler.transform(ap_X_test)

reg.fit(ap_X_train, ap_y_train)

get_importance(reg, ap_X_test, ap_y_test, t_var)

In [None]:
reg.score(ap_X_test, ap_y_test)

In [None]:
# Global Moran's I of log_Value for every timestamp.
# A single groupby pass replaces one full-frame boolean filter per timestamp;
# sort=False keeps the timestamps in order of first appearance.
# NOTE(review): assumes each timestamp has one row per site, in the same site
# order as `weight` - confirm.
moran = []
timestamps = []
for stamp, readings in df.groupby('DateTime', sort=False):
    moran_temp = Moran(readings['log_Value'].values, weight)
    timestamps.append(stamp)
    moran.append([round(moran_temp.I, 5), round(moran_temp.p_norm, 5)])
moran_df = pd.DataFrame({'DateTime': timestamps})
moran_df[['moran', 'p-value']] = moran
moran_df.head()

In [None]:
moran_df['hour'] = moran_df['DateTime'].dt.hour

In [None]:
fig,ax = plt.subplots(4, 6, figsize=(24,16))
i = 0
for hour in range(24):
    sns.lineplot(x=moran_df['DateTime'].dt.date.unique(), 
                 y=moran_df[moran_df['hour']==hour].moran.values, 
                 ax=ax[i//6, i%6], linewidth=1)
    i+=1
plt.show()

In [None]:
# One linear regression per hour of day: coefficients, test R2 and
# permutation importances are collected row by row.
h_fi = []
h_score = []
h_coef = []
# Hours are visited in ascending order so that row i of the result frames is
# the i-th smallest hour, which is how the plotting cell indexes them.
for hour in sorted(df['hour'].unique()):
    hour_rows = df[df['hour']==hour]
    X = hour_rows.loc[:,t_var].values
    y = hour_rows.loc[:,'log_Value'].values

    X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, random_state=25)
    # fit the scaler on the training split only, then re-use it for the test split
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    reg.fit(X_train, y_train)

    fi_mean, fi_std = get_importance(reg, X_test, y_test, feature_names=t_var)
    h_fi.append(fi_mean + fi_std)

    h_score.append(reg.score(X_test, y_test))

    coef = reg.coef_.tolist()
    coef.append(reg.intercept_)
    h_coef.append(coef)

h_fi = pd.DataFrame(h_fi, columns=['fi_' + elem for elem in t_var] + ['fi_std_' + elem for elem in t_var])
h_score = pd.DataFrame(h_score, columns=['r2'])
h_coef = pd.DataFrame(h_coef, columns=t_var+['intercept'])

In [None]:
h_reg = pd.concat([h_coef, h_score, h_fi], axis=1)
h_reg

In [None]:
h_reg['r2'].plot()

In [None]:
fig, ax = plt.subplots(4, 6, figsize=(24, 16))
i = 0
for hour in range(24):
    g = sns.barplot(x=['fi_' + elem for elem in t_var], y=h_reg.loc[hour, ['fi_' + elem for elem in t_var]],
                    ax=ax[i//6, i%6])
    g.set(xticklabels=[])
    i += 1
plt.legend()
plt.show()

In [None]:
# set up a new column for month information
df['month'] = df['DateTime'].dt.month

In [None]:
moran_df['month'] = moran_df['DateTime'].dt.month

In [None]:
fig,ax = plt.subplots(3, 4, figsize=(16,12))
i = 0
for month in range(1,13):
    sns.lineplot(x='DateTime', y='moran', data=moran_df[moran_df['month']==month],
                 ax=ax[i//4, i%4], linewidth=1)
    i+=1
plt.show()

In [None]:
# One linear regression per calendar month: coefficients, test R2 and
# permutation importances are collected row by row.
m_fi = []
m_score = []
m_coef = []
# Months are visited in ascending order so that row i of the result frames is
# the i-th smallest month, which is how the plotting cell (month-1) indexes them.
for month in sorted(df['month'].unique()):
    month_rows = df[df['month']==month]
    X = month_rows.loc[:, t_var].values
    y = month_rows.loc[:, 'log_Value'].values

    X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, random_state=25)
    # fit the scaler on the training split only, then re-use it for the test split
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    reg.fit(X_train, y_train)

    fi_mean, fi_std = get_importance(reg, X_test, y_test, feature_names=t_var)
    m_fi.append(fi_mean + fi_std)

    m_score.append(reg.score(X_test, y_test))

    coef = reg.coef_.tolist()
    coef.append(reg.intercept_)
    m_coef.append(coef)

m_fi = pd.DataFrame(m_fi, columns=['fi_' + elem for elem in t_var] + ['fi_std_' + elem for elem in t_var])
m_score = pd.DataFrame(m_score, columns=['r2'])
m_coef = pd.DataFrame(m_coef, columns=t_var+['intercept'])

In [None]:
m_reg = pd.concat([m_coef, m_score, m_fi], axis=1)
m_reg

In [None]:
m_reg['r2'].plot()

In [None]:
fig, ax = plt.subplots(3, 4, figsize=(16, 12))
i = 0
for month in range(1,13):
    g = sns.barplot(x=['fi_' + elem for elem in t_var], y=m_reg.loc[month-1, ['fi_' + elem for elem in t_var]],
                    ax=ax[i//4, i%4])
    g.set(xticklabels=[])
    i += 1
plt.legend()
plt.show()

In [None]:
# mean reading per month; 'Value' is selected first so only it is averaged
df.groupby('month')['Value'].mean().plot()

In [None]:
df.info()

In [None]:
high_period = df[df['month'].isin([1, 2, 4])].drop(['hour','dayofweek','dayofmonth','month'],axis=1)
low_period = df[~df['month'].isin([1, 2, 4])].drop(['hour','dayofweek','dayofmonth','month'],axis=1)

print('high period: '+str(high_period.shape)+'\nlow period: '+str(low_period.shape))

In [None]:
hp_X = high_period[t_var].values
hp_y = high_period['log_Value'].values

hp_X_train, hp_X_test, hp_y_train, hp_y_test = train_test_split(hp_X, hp_y, shuffle=True, random_state=25)

# fit the scaler on the training split only and apply the same scaling to the
# test split, so both splits share one scale and no test information leaks
hp_X_train = scaler.fit_transform(hp_X_train)
hp_X_test = scaler.transform(hp_X_test)

reg.fit(hp_X_train, hp_y_train)

get_importance(reg, hp_X_test, hp_y_test, feature_names=t_var)

In [None]:
reg.score(hp_X_test, hp_y_test)

In [None]:
reg.coef_.tolist() + [reg.intercept_]

In [None]:
lp_X = low_period[t_var].values
lp_y = low_period['log_Value'].values

lp_X_train, lp_X_test, lp_y_train, lp_y_test = train_test_split(lp_X, lp_y, shuffle=True, random_state=25)
# fit the scaler on the training split only and apply the same scaling to the
# test split, so both splits share one scale and no test information leaks
lp_X_train = scaler.fit_transform(lp_X_train)
lp_X_test = scaler.transform(lp_X_test)
reg.fit(lp_X_train, lp_y_train)

get_importance(reg, lp_X_test, lp_y_test, feature_names=t_var)

In [None]:
reg.score(lp_X_test, lp_y_test)

In [None]:
reg.coef_.tolist() + [reg.intercept_]

# Mean analysis

In [None]:
# Mean of every numeric column per (hour, site). numeric_only=True keeps the
# non-numeric columns out of the aggregation, which pandas >= 2.0 no longer
# drops silently.
hmean_df = df.groupby(['hour', 'Site']).mean(numeric_only=True)
hmean_df.info()

In [None]:
hmean_df.drop(['bp_mean', 'tmp_mean', 'Prec_mean', 'dayofweek', 'dayofmonth', 'month'], axis=1, inplace=True)

In [None]:
sns.heatmap(hmean_df[['log_Value']+var_names].corr().round(4),annot=True,fmt='.3f',cmap='magma')
plt.show()

In [None]:
def get_corr(df,iter_range,method='pearson',features=var_names,target='log_Value'):
    """Correlation between ``target`` and each feature, one row per group.

    For every key in ``iter_range`` the rows selected by ``df.loc[(key,)]`` are
    correlated with ``method`` and the ``features`` x ``target`` entries kept.

    Returns
    -------
    numpy array built from the per-key results (one entry per key).
    """
    per_key = [
        df.loc[(key,)].corr(method=method).loc[features, target]
        for key in iter_range
    ]
    return np.asarray(per_key)

In [None]:
def get_moran(df,iter_range,w=weight,target='log_Value'):
    """Global Moran's I of ``target`` for every key in ``iter_range``.

    Each key selects rows through ``df.loc[(key,), target]``; their values are
    passed to ``Moran`` together with the weights ``w``.

    Returns
    -------
    list with one Moran's I value per key.
    """
    return [Moran(df.loc[(key,), target].values, w).I for key in iter_range]

In [None]:
def get_cv(df, reg, features, target, iter=100, splits=3):
    """Repeated K-fold agreement between cross-validated predictions and target.

    For each of ``iter`` rounds (the round number is the shuffle seed) the
    out-of-fold predictions of ``reg`` are compared with ``target`` using the
    squared Pearson correlation and Kendall's tau. ``df`` is not used.

    Returns
    -------
    [mean r2, std r2, mean tau, std tau], each rounded to 5 decimals.
    """
    r2_runs, tau_runs = [], []

    for seed in range(iter):
        folds = KFold(n_splits=splits, shuffle=True, random_state=seed)
        predicted = cross_val_predict(reg, features, target, cv=folds)

        pearson_r = stats.pearsonr(target, predicted)[0]
        tau, _ = stats.kendalltau(target, predicted)

        r2_runs.append(pearson_r**2)
        tau_runs.append(tau)

    summary = (np.mean(r2_runs), np.std(r2_runs), np.mean(tau_runs), np.std(tau_runs))
    return [round(value, 5) for value in summary]

In [None]:
def get_reg_info(df, iter_range, features=var_names, target='log_Value', cv_split=3, reg=LinearRegression()):
    """Fit ``reg`` once per key in ``iter_range`` and tabulate the results.

    For every key the rows ``df.loc[(key,)]`` are used to fit ``reg``; the row
    of the output holds the coefficients, the four values from ``get_cv``, and
    the permutation-importance means and standard deviations from
    ``get_importance`` (computed on the same rows the model was fitted on).

    Returns
    -------
    DataFrame with one row per key and the columns
    features + cv_r2, r2_std, cv_tau, tau_std + fi_<feature> + std_fi_<feature>.
    """
    rows = []
    for key in iter_range:
        X = df.loc[(key,), features].values
        y = df.loc[(key,), target].values
        #X = scaler.fit_transform(X)

        reg.fit(X, y)
        coefficients = reg.coef_.tolist()
        cv_summary = get_cv(df, reg, X, y, splits=cv_split)
        fi_mean, fi_std = get_importance(reg, X, y, features)
        rows.append(coefficients + cv_summary + fi_mean + fi_std)

    column_names = (features + ['cv_r2','r2_std','cv_tau','tau_std']
                    + ['fi_'+var for var in features]
                    + ['std_fi_'+var for var in features])
    return pd.DataFrame(rows, columns=column_names)

In [None]:
hmean_corr=get_corr(hmean_df,range(24))
sns.lineplot(data=hmean_corr,legend=False)
plt.legend(labels=var_names)
plt.show()

In [None]:
hmean_corr_sp=get_corr(hmean_df,range(24),method='spearman')
sns.lineplot(data=hmean_corr_sp,legend=False)
plt.legend(labels=var_names)
plt.show()

In [None]:
hmean_moran=get_moran(hmean_df,range(24))
sns.lineplot(x=range(24),y=hmean_moran)

In [None]:
hmean_reg=get_reg_info(hmean_df,range(24), cv_split=4)
sns.lineplot(data=hmean_reg[['cv_r2','cv_tau']])

In [None]:
fig, ax = plt.subplots(4, 6, figsize=(24, 16))
i = 0
for hour in range(24):
    g = sns.barplot(x=['fi_' + elem for elem in var_names], 
                    y=hmean_reg.loc[hour, ['fi_' + elem for elem in var_names]],
                    ax=ax[i//6, i%6])
    g.set(xticklabels=[])
    i += 1
plt.legend()
plt.show()

In [None]:
# Mean of every numeric column per (month, site). numeric_only=True keeps the
# non-numeric columns out of the aggregation, which pandas >= 2.0 no longer
# drops silently.
mmean_df=df.groupby(['month','Site']).mean(numeric_only=True)
mmean_df.info()

In [None]:
mmean_df.drop(['bp_mean','tmp_mean','Prec_mean','hour','dayofweek','dayofmonth'], axis=1, inplace=True)

In [None]:
mmean_corr=get_corr(mmean_df,range(1,13))
sns.lineplot(data=mmean_corr,legend=False)
plt.legend(labels=var_names,loc='upper left')
plt.show()

In [None]:
mmean_corr_sp=get_corr(mmean_df,range(1,13),method='spearman')
sns.lineplot(data=mmean_corr_sp,legend=False)
plt.legend(labels=var_names,loc='upper left')
plt.show()

In [None]:
mmean_moran=get_moran(mmean_df,range(1,13))
sns.lineplot(x=range(1,13),y=mmean_moran)

In [None]:
mmean_reg = get_reg_info(mmean_df,range(1,13),cv_split=2)
sns.lineplot(data=mmean_reg[['cv_r2','cv_tau']])

In [None]:
fig, ax = plt.subplots(3, 4, figsize=(16, 12))
i = 0
for month in range(1,13):
    g = sns.barplot(x=['fi_' + elem for elem in exp_names], 
                    y=mmean_reg.loc[month-1, ['fi_' + elem for elem in exp_names]],
                    ax=ax[i//4, i%4])
    g.set(xticklabels=[])
    i += 1
plt.legend()
plt.show()

In [None]:
# identify high period (months 1, 2, 4) and low period (all other months),
# then average every numeric column per site; numeric_only=True keeps the
# non-numeric columns out of the aggregation on pandas >= 2.0
in_high_months = df['month'].isin([1,2,4])
high = df[in_high_months].groupby('Site').mean(numeric_only=True)
low = df[~in_high_months].groupby('Site').mean(numeric_only=True)

print('high: '+str(high.shape)+'\nlow: '+str(low.shape))

In [None]:
sns.heatmap(high[['log_Value']+var_names].corr().round(4),annot=True,fmt='.4f',cmap='magma')
plt.show()

In [None]:
high_moran=Moran(high['log_Value'].values,weight)
round(high_moran.I,5)

In [None]:
reg_high=LinearRegression()
y_high = high['log_Value'].values
x_high = high[var_names].values

x_high = scaler.fit_transform(x_high)
reg_high.fit(x_high, y_high)
reg_high.score(x_high,y_high)

In [None]:
prd_high = reg_high.predict(x_high)

r = stats.pearsonr(y_high, prd_high)[0]
r2 = r**2
t, p_value = stats.kendalltau(y_high, prd_high)
print('r2 (obs): ', round(r2, 5))
print('tau (obs): ', round(t, 5))

In [None]:
sns.heatmap(low[['Value']+exp_names].corr().round(4),annot=True,fmt='.4f',cmap='magma')
plt.show()

In [None]:
low_moran=Moran(low['Value'].values,weight)
round(low_moran.I,5)

In [None]:
reg_low = LinearRegression()
y_low = low['Value'].values
x_low = low[exp_names].values
reg_low.fit(x_low, y_low)
prd_low = reg_low.predict(x_low)

# Compare observations and predictions as plain arrays. Series.corr aligns on
# the index first: `low` is indexed by site while pd.Series(prd_low) would get
# 0..n-1, so no labels would match and the correlation would come out as NaN.
r = stats.pearsonr(y_low, prd_low)[0]
r2 = r**2
t, p_value = stats.kendalltau(y_low, prd_low)
# These are in-sample figures (the model is scored on the rows it was fitted
# on), so they are labelled "obs" like the high-period cell, not "cv".
print("r2 (obs): ", round(r2,3))
print("tau (obs): ", round(t,3))

In [None]:
# The notebook only assigns `all` in a later cell, so on a top-to-bottom run
# `all` is still the Python builtin here and cannot be subscripted. Compute
# the site-level means locally instead.
site_means_all = df.groupby('Site').mean(numeric_only=True)
sns.heatmap(site_means_all[['Value']+exp_names].corr().round(4),annot=True,fmt='.4f',cmap='magma')
plt.show()

In [None]:
# Site-level means over the whole period; numeric_only=True keeps the
# non-numeric columns out of the aggregation on pandas >= 2.0.
# NOTE: the name `all` shadows the Python builtin; it is kept because the
# following cell reads it.
all=df.groupby('Site').mean(numeric_only=True)
all_moran=Moran(all['Value'].values,weight)
round(all_moran.I,5)

In [None]:
# `all` is grouped by site once and both the target and the features are
# taken from that single result.
site_level = all.groupby('Site').mean()
y_all = site_level['Value'].values
x_all = site_level[exp_names].values

# in-sample fit and R2 of the shared `reg` estimator
reg.fit(x_all, y_all)
reg.score(x_all, y_all)