In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import descartes
import geopandas as gpd
from shapely.geometry import Point, Polygon
from shapely.ops import nearest_points

import seaborn as sns

from mpl_toolkits.axes_grid1 import make_axes_locatable

import math

from scipy.stats import pearsonr

import time


from scipy.stats import boxcox

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE


from sklearn.linear_model import RidgeCV, LassoCV, Ridge, Lasso


from patsy import dmatrices
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor


from matplotlib import cm

import matplotlib.lines as mlines

import warnings
warnings.filterwarnings("ignore")

sns.set(style = 'whitegrid')
sns.set_palette('bright')
%matplotlib inline

In [11]:
## Reading input data
BC_clustered = pd.read_csv("../Data/BC_input_clustered.csv")
NO2_clustered = pd.read_csv("../Data/NO2_input_clustered.csv")

In [13]:
#Convert columns to non-numeric 
BC = BC_clustered._get_numeric_data() #drop non-numeric cols
NO2 = NO2_clustered._get_numeric_data() #drop non-numeric cols

In [14]:
# Drop first column
BC.drop(BC.columns[0], axis=1, inplace=True)
NO2.drop(NO2.columns[0], axis=1, inplace=True)

<b> <font size = 5> Correlation coefficient and Testing the Significance of the Correlation Coefficient </b> </font>

We perform a hypothesis test of the “significance of the correlation coefficient” to decide whether the linear relationship in the data is strong enough to use to model the relationship. Since we have data for the entire population, we can use the population correlation coefficient. 

Null Hypothesis: H$_{0}$: ρ = 0

Alternate Hypothesis: H$_1$: ρ ≠ 0


ρ = population correlation coefficient

Null Hypothesis H$_0$: The population correlation coefficient <b>is not</b> significantly different from zero. There <b>is not</b> a significant linear relationship(correlation) between x and y in the population.

Alternate Hypothesis H$_1$: The population correlation coefficient is significantly different from zero. There <b>is a significant linear relationship</b> (correlation) between x and y in the population.


<font size = 4> <b> Estimating the Correlation Coefficient (r), 95% Confidence Interval for r,  R2 and p-value for the BC dataset </b> </font>

In [16]:
import pingouin as pg
BC_pearsonr = pg.pairwise_corr(BC, method='pearson', padjust = 'none' )

In [17]:
BC_pearsonr.head(100)

Unnamed: 0,X,Y,method,tail,n,r,CI95%,r2,adj_r2,z,p-unc,BF10,power
0,BC Value,10457411-Manufacturing-high_dist,pearson,two-sided,13379,-0.330193,"[-0.35, -0.32]",0.109027,0.108894,-0.343045,0.000000e+00,inf,1.0
1,BC Value,10457711-Foundaries-high_dist,pearson,two-sided,13379,0.336358,"[0.32, 0.35]",0.113136,0.113004,0.349980,0.000000e+00,inf,1.0
2,BC Value,10460511-Retail-Res-high_dist,pearson,two-sided,13379,-0.292290,"[-0.31, -0.28]",0.085433,0.085297,-0.301068,9.144344e-262,2.556e+257,1.0
3,BC Value,10469511-Retail-Res-high_dist,pearson,two-sided,13379,-0.364821,"[-0.38, -0.35]",0.133094,0.132965,-0.382436,0.000000e+00,inf,1.0
4,BC Value,10476711-Manufacturing-high_dist,pearson,two-sided,13379,0.077335,"[0.06, 0.09]",0.005981,0.005832,0.077490,3.313170e-19,2.865e+15,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,10457411-Manufacturing-high_dist,15941111-Retail-Res-high_dist,pearson,two-sided,13379,0.984717,"[0.98, 0.99]",0.969667,0.969662,2.433232,0.000000e+00,inf,1.0
96,10457411-Manufacturing-high_dist,168411-Waste-high_dist,pearson,two-sided,13379,0.989539,"[0.99, 0.99]",0.979188,0.979185,2.624017,0.000000e+00,inf,1.0
97,10457411-Manufacturing-high_dist,17244511-Retail-Res-high_dist,pearson,two-sided,13379,-0.992516,"[-0.99, -0.99]",0.985088,0.985085,-2.792179,0.000000e+00,inf,1.0
98,10457411-Manufacturing-high_dist,18134411-Misc-high_dist,pearson,two-sided,13379,0.999905,"[1.0, 1.0]",0.999811,0.999811,4.979440,0.000000e+00,inf,1.0


In [18]:
BC_pearsonr_BC = BC_pearsonr[BC_pearsonr.X == 'BC Value']

In [19]:
BC_pearsonr_BC.head(50)

Unnamed: 0,X,Y,method,tail,n,r,CI95%,r2,adj_r2,z,p-unc,BF10,power
0,BC Value,10457411-Manufacturing-high_dist,pearson,two-sided,13379,-0.330193,"[-0.35, -0.32]",0.109027,0.108894,-0.343045,0.0,inf,1.0
1,BC Value,10457711-Foundaries-high_dist,pearson,two-sided,13379,0.336358,"[0.32, 0.35]",0.113136,0.113004,0.34998,0.0,inf,1.0
2,BC Value,10460511-Retail-Res-high_dist,pearson,two-sided,13379,-0.29229,"[-0.31, -0.28]",0.085433,0.085297,-0.301068,9.144344e-262,2.556e+257,1.0
3,BC Value,10469511-Retail-Res-high_dist,pearson,two-sided,13379,-0.364821,"[-0.38, -0.35]",0.133094,0.132965,-0.382436,0.0,inf,1.0
4,BC Value,10476711-Manufacturing-high_dist,pearson,two-sided,13379,0.077335,"[0.06, 0.09]",0.005981,0.005832,0.07749,3.3131699999999995e-19,2865000000000000.0,1.0
5,BC Value,10510811-AsphaltPlant-high_dist,pearson,two-sided,13379,0.334598,"[0.32, 0.35]",0.111956,0.111823,0.347997,0.0,inf,1.0
6,BC Value,10517111-Retail-Res-high_dist,pearson,two-sided,13379,-0.293197,"[-0.31, -0.28]",0.085965,0.085828,-0.302061,1.871776e-263,1.244e+259,1.0
7,BC Value,10518911-ConcretePlant-high_dist,pearson,two-sided,13379,-0.347513,"[-0.36, -0.33]",0.120765,0.120634,-0.362612,0.0,inf,1.0
8,BC Value,10707511-EGen-high_dist,pearson,two-sided,13379,-0.283185,"[-0.3, -0.27]",0.080194,0.080056,-0.291142,3.708037e-245,6.542e+240,1.0
9,BC Value,10708011-Waste-high_dist,pearson,two-sided,13379,0.329491,"[0.31, 0.34]",0.108564,0.108431,0.342257,0.0,inf,1.0


<font size = 4> <b> Determine the features that have a p-value that is above the level of signifiance alpha of 0.05 </b> </font>

<font size = 2> We reject the alternative hypothesis and conclude that the null hypothesis is true in this case i.e. there is no statistical significance of the correlation coefficient </font>

In [20]:
BC_pearsonr_BC[BC_pearsonr_BC['p-unc'] >= 0.005 ]

Unnamed: 0,X,Y,method,tail,n,r,CI95%,r2,adj_r2,z,p-unc,BF10,power


<font size = 4> <b> Estimating the Correlation Coefficient (r), 95% Confidence Interval for r,  R2 and p-value for the NO2 dataset </b> </font>

In [21]:
NO2_pearsonr = pg.pairwise_corr(NO2_clustered, method='pearson', padjust = 'none' )

In [22]:
NO2_pearsonr.head(100)

Unnamed: 0,X,Y,method,tail,n,r,CI95%,r2,adj_r2,z,p-unc,BF10,power
0,Unnamed: 0,NO2 Value,pearson,two-sided,13379,-0.654154,"[-0.66, -0.64]",0.427917,0.427831,-0.782525,0.0,inf,1.0
1,Unnamed: 0,10457511-Manufacturing-high_dist,pearson,two-sided,13379,0.890677,"[0.89, 0.89]",0.793306,0.793275,1.425193,0.0,inf,1.0
2,Unnamed: 0,10460511-Retail-Res-high_dist,pearson,two-sided,13379,0.805265,"[0.8, 0.81]",0.648451,0.648399,1.113411,0.0,inf,1.0
3,Unnamed: 0,10510811-AsphaltPlant-high_dist,pearson,two-sided,13379,-0.770925,"[-0.78, -0.76]",0.594326,0.594265,-1.022605,0.0,inf,1.0
4,Unnamed: 0,10517111-Retail-Res-high_dist,pearson,two-sided,13379,0.799568,"[0.79, 0.81]",0.639310,0.639256,1.097415,0.0,inf,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,NO2 Value,375111-Eateries-high_dist,pearson,two-sided,13379,-0.575978,"[-0.59, -0.56]",0.331751,0.331651,-0.656424,0.0,inf,1.0
96,NO2 Value,379511-WWTP-high_dist,pearson,two-sided,13379,-0.591089,"[-0.6, -0.58]",0.349386,0.349289,-0.679338,0.0,inf,1.0
97,NO2 Value,382111-Foundaries-high_dist,pearson,two-sided,13379,0.594534,"[0.58, 0.61]",0.353471,0.353374,0.684650,0.0,inf,1.0
98,NO2 Value,382911-Misc-high_dist,pearson,two-sided,13379,0.627543,"[0.62, 0.64]",0.393811,0.393720,0.737353,0.0,inf,1.0


In [23]:
NO2_pearsonr_NO2 = NO2_pearsonr[NO2_pearsonr.X == 'NO2 Value']

In [24]:
NO2_pearsonr_NO2.head(100)

Unnamed: 0,X,Y,method,tail,n,r,CI95%,r2,adj_r2,z,p-unc,BF10,power
60,NO2 Value,10457511-Manufacturing-high_dist,pearson,two-sided,13379,-0.612269,"[-0.62, -0.6]",0.374873,0.37478,-0.712543,0.0,inf,1.0
61,NO2 Value,10460511-Retail-Res-high_dist,pearson,two-sided,13379,-0.544293,"[-0.56, -0.53]",0.296255,0.29615,-0.610236,0.0,inf,1.0
62,NO2 Value,10510811-AsphaltPlant-high_dist,pearson,two-sided,13379,0.558601,"[0.55, 0.57]",0.312035,0.311932,0.630797,0.0,inf,1.0
63,NO2 Value,10517111-Retail-Res-high_dist,pearson,two-sided,13379,-0.544155,"[-0.56, -0.53]",0.296105,0.296,-0.61004,0.0,inf,1.0
64,NO2 Value,10707511-EGen-high_dist,pearson,two-sided,13379,-0.535405,"[-0.55, -0.52]",0.286659,0.286552,-0.597692,0.0,inf,1.0
65,NO2 Value,10708011-Waste-high_dist,pearson,two-sided,13379,0.570939,"[0.56, 0.58]",0.325971,0.32587,0.648914,0.0,inf,1.0
66,NO2 Value,127511-Retail-Res-high_dist,pearson,two-sided,13379,-0.559557,"[-0.57, -0.55]",0.313104,0.313001,-0.632188,0.0,inf,1.0
67,NO2 Value,136511-EGen-high_dist,pearson,two-sided,13379,-0.612279,"[-0.62, -0.6]",0.374885,0.374792,-0.712559,0.0,inf,1.0
68,NO2 Value,13761011-Retail-Res-high_dist,pearson,two-sided,13379,-0.578073,"[-0.59, -0.57]",0.334168,0.334069,-0.659564,0.0,inf,1.0
69,NO2 Value,13825511-Retail-Res-high_dist,pearson,two-sided,13379,-0.521714,"[-0.53, -0.51]",0.272185,0.272076,-0.578691,0.0,inf,1.0


<font size = 4> <b> Determine the features that have a p-value that is above the level of signifiance alpha of 0.05 </b> </font>

<font size = 2> We reject the alternative hypothesis and conclude that the null hypothesis is true in this case i.e. there is no statistical significance of the correlation coefficient </font>

In [25]:
NO2_pearsonr_NO2[NO2_pearsonr_NO2['p-unc'] >= 0.005 ]

Unnamed: 0,X,Y,method,tail,n,r,CI95%,r2,adj_r2,z,p-unc,BF10,power


The above tests indicate that for the BC and NO2 datasets, all the coefficients have a p-value less than 0.05, indicating that we can reject the null hypothesis and conclude that there is a <b> statistical significant relationship </b> between each feature in the dataset and the target variable. Next, we fit a linear model 



 - Linear regression for stats
    - VIF scores - gives a score between all feeatures/predictor variables
    - Then do a linear regression 
    - Combine two facilities that are close to each other with similar VIF scores and take average distance
    - After combining, do correlation test to see significance, and drop columns that have a high p-value
    - Recalculate VIF scores again 
    - Do some type of linear regression for model - check linear regression model
    - what are the other ways of identifying and dropping highly correlated features in feature selection?
    

Potential questions: 
    
    1) What is the probability of observing a concentration that is higher than the ambient standards for Black Carbon and NO2? 

<b> T-test to determine whether is a significant difference between the means of two groups.</b>

1) Is there a significant difference in BC concentration for distance to highways > 4 km and less than 4 km?
1) Is there a significant difference in NO2 concentration for distance to highways > 4 km and less than 4 km?