In [1]:
# import modules
import os
#import sys

import pandas as pd
#import numpy as np
#from scipy import stats

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error
# from sklearn.metrics import r2_score
# from sklearn.metrics import classification_report
# from sklearn.metrics import accuracy_score
# from sklearn.metrics import confusion_matrix

from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import StandardScaler
# from sklearn.preprocessing import OneHotEncoder
# from sklearn.preprocessing import LabelEncoder
# from sklearn.ensemble import RandomForestClassifier

from scipy.stats import ttest_ind
from scipy.stats import t


In [2]:
current_dir = os.getcwd()
current_dir
os.chdir(os.path.dirname(current_dir))
print("You set a new current directory")
current_dir = os.getcwd()
current_dir
#set current working directory to H:\VScode\March Group\March_Team_Project\
os.chdir("H:\\VScode\\March Group\\March_Team_Project\\")

You set a new current directory


In [3]:
# load Outputs/DashBoardData.zip'
merged_data = pd.read_csv("Outputs/DashBoardData.zip", compression='zip', low_memory=False)
# produce list of numeric and bool columns
numeric_columns = merged_data.select_dtypes(include=['float64', 'int64', 'bool']).columns
# drop WS_Latitude, WS_Longitude from numeric_columns
numeric_columns = numeric_columns.drop(['WS_Latitude', 'WS_Longitude'])
# create a list of columns with skewness greater than 1
skewness = merged_data[numeric_columns].skew()
skewness = skewness[skewness > 1]
skewness

County Code            1.328799
Site Num               1.411485
NO2 Mean               1.002661
O3 AQI                 2.162399
SO2 Mean               2.693517
SO2 1st Max Value      3.765834
SO2 AQI                3.171309
CO Mean                2.053753
CO 1st Max Value       2.050279
CO AQI                 1.972371
AWND                   2.827331
PGTM                   2.646656
PRCP                   6.251311
TAVG                   4.680941
WDMV                  42.748820
WT01                   4.058455
WT02                  10.991683
WT03                   6.138038
WT04                  36.764360
WT05                  33.888430
WT06                  33.069562
WT08                   6.261057
WT09                  67.843202
WT11                  36.764360
WT13                   7.151200
WT16                   8.278349
WT18                  14.044666
WT22                  21.950226
O3_AQI_Group           4.031989
SO2_AQI_Group         10.665846
CO_AQI_Group         107.290493
NO2_AQI_

Transform data to standardise data ranges and correct skewness

In [4]:
# Use scikit learn to apply transformations to the data to standardise the data and correct skewness
merged_data_corrected=merged_data.copy()
# apply power transformer to columns with skewness greater than 1
power_transformer = PowerTransformer()
merged_data_corrected[skewness.index] = power_transformer.fit_transform(merged_data_corrected[skewness.index])

# apply standard scaler to all numeric columns
scaler = StandardScaler()
merged_data_corrected[numeric_columns] = scaler.fit_transform(merged_data_corrected[numeric_columns])


Compare Correlation models

In [None]:
# Compare the correlation matrix before and after the transformations
# create a new column transformer
column_transformer = ColumnTransformer(transformers=[('power_transformer', PowerTransformer(), skewness.index), ('standard_scaler', StandardScaler(), numeric_columns)])

# create a new pipeline
pipeline = Pipeline(steps=[('column_transformer', column_transformer)])

# fit and transform the pipeline
merged_data_transformed = pipeline.fit_transform(merged_data_corrected)

# convert transformed data to a dataframe
merged_data_transformed_df = pd.DataFrame(merged_data_transformed)
# perform correlation matrix on transformed data
correlation_matrix_transformed_corr = merged_data_transformed_df.corr()
correlation_matrix_transformed_corr = correlation_matrix_transformed_corr.round(2)

# perform correlation matrix on original data
correlation_matrix_original = merged_data[numeric_columns].copy().corr()
correlation_matrix_original = correlation_matrix_original.round(2)

# check that correlation_matrix_transformed_corr and correlation_matrix_original are the same shape
if correlation_matrix_transformed_corr.shape != correlation_matrix_original.shape:
    print('The correlation matrices have different shapes')
    # print the shapes of the correlation matrices
    print(f'The shape of the correlation matrix for the transformed data is {correlation_matrix_transformed_corr.shape}')
    print(f'The shape of the correlation matrix for the original data is {correlation_matrix_original.shape}')
    print('The program will stop')
    # stop the program
    exit()
else:
    print('The correlation matrices have the same shape')

# performa a statistical test to determine if the correlation matrices are significantly different
# calculate the difference between the two correlation matrices
# Flatten the correlation matrices to 1D arrays
original_corr_values = correlation_matrix_original.values.flatten()
transformed_corr_values = correlation_matrix_transformed_corr.values.flatten()

# Perform a t-test to compare the two correlation matrices
t_statistic, p_value = ttest_ind(original_corr_values, transformed_corr_values)

# Calculate degrees of freedom
degrees_of_freedom = len(original_corr_values) + len(transformed_corr_values) - 2

# Calculate the critical value for a 95% confidence level
critical_value = t.ppf(0.975, degrees_of_freedom)

# Determine if the null hypothesis is rejected
null_hypothesis_rejected = abs(t_statistic) > critical_value
# print the results
print(f'p-value: {p_value}')
print(f'degrees of freedom: {degrees_of_freedom}')
print(f't-statistic: {t_statistic}')
print(f'critical value: {critical_value}')
print(f'null hypothesis rejected: {null_hypothesis_rejected}')



The correlation matrices have different shapes
The shape of the correlation matrix for the transformed data is (80, 80)
The shape of the correlation matrix for the original data is (48, 48)
The program will stop
p-value: nan
degrees of freedom: 8702
t-statistic: nan
critical value: 1.9602366340016277
null hypothesis rejected: False


: 

Linear Regression Model for NO2 AGI

In [None]:
# preform a linear regression on the data to predict the pollution level
# create a list of numeric columns
numeric_columns = merged_data.select_dtypes(include=['float64', 'int64']).columns
# drop WS_Latitude, WS_Longitude from numeric_columns
numeric_columns = numeric_columns.drop(['WS_Latitude', 'WS_Longitude'])

# split data into features and target
X = merged_data[numeric_columns]
y = merged_data['NO2 AQI']

# split data into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# create a linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# predict the target using the model
y_pred = model.predict(X_test)

# calculate the mean squared error
mse = mean_squared_error(y_test, y_pred)
mse

# save the model
import joblib

joblib.dump(model, 'Outputs/Linear_Regression_Model.pkl')

mse



Load model and predict


In [None]:
# load the model
model = joblib.load('Outputs/Linear_Regression_Model.pkl')

# predict the target using the model
y_pred = model.predict(X_test)

# calculate the mean squared error
mse = mean_squared_error(y_test, y_pred)
mse

# create a dataframe of the coefficients
coefficients = pd.DataFrame(model.coef_, index=numeric_columns, columns=['Coefficient'])
coefficients

# create a dataframe of the intercept
intercept = pd.DataFrame([model.intercept_], index=['Intercept'], columns=['Value'])
intercept

# save the coefficients and intercept to csv
coefficients.to_csv('Outputs/Coefficients.csv')
intercept.to_csv('Outputs/Intercept.csv')