In [5]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

The datas used in this notebook were created by the previous notebook (Part2_Data_Cleaning).

## Import the Cleaned Dataset

In [6]:
pollution_o = pd.read_excel('pollution_outliers.xlsx')
pollution_No = pd.read_excel('pollution_NoOutliers.xlsx')

In [7]:
pollution_o.head()

Unnamed: 0.1,Unnamed: 0,month,dewTemp,temp,pres,windSpeed,cumSnow,cumRain,x0_NE,x0_NW,x0_SE,x0_SW,pm2.5
0,0,-1.596267,-1.229791,-1.347143,0.345329,-0.444944,-0.071057,-0.137408,0,0,1,0,129
1,1,-1.596267,-1.160508,-1.347143,0.345329,-0.427007,-0.071057,-0.137408,0,0,1,0,148
2,2,-1.596267,-0.883375,-1.429278,0.442411,-0.409069,-0.071057,-0.137408,0,0,1,0,159
3,3,-1.596267,-0.606241,-1.429278,0.539493,-0.372993,1.212862,-0.137408,0,0,1,0,181
4,4,-1.596267,-0.606241,-1.429278,0.539493,-0.355055,2.496781,-0.137408,0,0,1,0,138


In [8]:
pollution_No.head()

Unnamed: 0.1,Unnamed: 0,month,dewTemp,temp,pres,windSpeed,cumSnow,cumRain,x0_NE,x0_NW,x0_SE,x0_SW,pm2.5
0,0,-1.623543,-1.263633,-1.391146,0.379425,-0.552238,-0.049256,-0.153362,0,0,1,0,129
1,1,-1.623543,-1.194004,-1.391146,0.379425,-0.521094,-0.049256,-0.153362,0,0,1,0,148
2,2,-1.623543,-0.915487,-1.473478,0.477,-0.48995,-0.049256,-0.153362,0,0,1,0,159
3,3,-1.623543,-0.63697,-1.473478,0.574575,-0.427313,12.77728,-0.153362,0,0,1,0,181
4,4,-1.623543,-0.63697,-1.473478,0.574575,-0.396169,25.603817,-0.153362,0,0,1,0,138


Looking at both these tables, we can see that both the datasets have a column that reiterate the index. We will remove these columns.

In [9]:
pollution_o.drop(columns = ['Unnamed: 0'], inplace = True)
pollution_No.drop(columns = ['Unnamed: 0'], inplace = True)

In [10]:
pollution_o.head()

Unnamed: 0,month,dewTemp,temp,pres,windSpeed,cumSnow,cumRain,x0_NE,x0_NW,x0_SE,x0_SW,pm2.5
0,-1.596267,-1.229791,-1.347143,0.345329,-0.444944,-0.071057,-0.137408,0,0,1,0,129
1,-1.596267,-1.160508,-1.347143,0.345329,-0.427007,-0.071057,-0.137408,0,0,1,0,148
2,-1.596267,-0.883375,-1.429278,0.442411,-0.409069,-0.071057,-0.137408,0,0,1,0,159
3,-1.596267,-0.606241,-1.429278,0.539493,-0.372993,1.212862,-0.137408,0,0,1,0,181
4,-1.596267,-0.606241,-1.429278,0.539493,-0.355055,2.496781,-0.137408,0,0,1,0,138


In [11]:
pollution_No.head()

Unnamed: 0,month,dewTemp,temp,pres,windSpeed,cumSnow,cumRain,x0_NE,x0_NW,x0_SE,x0_SW,pm2.5
0,-1.623543,-1.263633,-1.391146,0.379425,-0.552238,-0.049256,-0.153362,0,0,1,0,129
1,-1.623543,-1.194004,-1.391146,0.379425,-0.521094,-0.049256,-0.153362,0,0,1,0,148
2,-1.623543,-0.915487,-1.473478,0.477,-0.48995,-0.049256,-0.153362,0,0,1,0,159
3,-1.623543,-0.63697,-1.473478,0.574575,-0.427313,12.77728,-0.153362,0,0,1,0,181
4,-1.623543,-0.63697,-1.473478,0.574575,-0.396169,25.603817,-0.153362,0,0,1,0,138


---

## Creating x and y parts

##### Dataset with Outliers

In [12]:
x_o = pollution_o.iloc[:, :-1].values
x_o

array([[-1.59626706, -1.22979123, -1.34714315, ...,  0.        ,
         1.        ,  0.        ],
       [-1.59626706, -1.16050789, -1.34714315, ...,  0.        ,
         1.        ,  0.        ],
       [-1.59626706, -0.88337455, -1.4292782 , ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [ 1.58830091, -1.64549124, -1.26500809, ...,  1.        ,
         0.        ,  0.        ],
       [ 1.58830091, -1.64549124, -1.34714315, ...,  1.        ,
         0.        ,  0.        ],
       [ 1.58830091, -1.57620791, -1.26500809, ...,  1.        ,
         0.        ,  0.        ]])

In [13]:
y_o = pollution_o.iloc[:, -1].values
y_o

array([129, 148, 159, ...,  10,   8,  12], dtype=int64)

In [14]:
x_o.shape

(41757, 11)

In [15]:
y_o.shape

(41757,)

We can see that the x-set has 11 columns (all the features) and 41,757 rows, while the y-set has the same number of rows but only 1 columns (the target variable of PM 2.5 levels).

---

##### Dataset without Outliers

In [16]:
x_no = pollution_No.iloc[:, :-1].values
x_no

array([[-1.62354295, -1.2636334 , -1.39114568, ...,  0.        ,
         1.        ,  0.        ],
       [-1.62354295, -1.19400415, -1.39114568, ...,  0.        ,
         1.        ,  0.        ],
       [-1.62354295, -0.91548714, -1.47347807, ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [ 1.61470256, -1.68140892, -1.30881329, ...,  1.        ,
         0.        ,  0.        ],
       [ 1.61470256, -1.68140892, -1.2264809 , ...,  1.        ,
         0.        ,  0.        ],
       [ 1.61470256, -1.68140892, -1.2264809 , ...,  1.        ,
         0.        ,  0.        ]])

In [17]:
y_no = pollution_No.iloc[:, -1].values
y_no

array([129, 148, 159, ...,   8,   7,  12], dtype=int64)

In [18]:
x_no.shape

(39061, 11)

In [19]:
y_no.shape

(39061,)

This is the dataset without outliers. The x-set has 11 columns for all the features and 39,061 rows (less than the previous dataset because there are no outliers in this data). The y-set has the same number of observations and 1 column for the target variable of pollution levels.

---

## Splitting Dataset

##### Dataset with Outliers

In [20]:
from sklearn.model_selection import train_test_split

x_o_train, x_o_test, y_o_train, y_o_test = train_test_split(x_o, y_o, test_size = 0.2, random_state = 4)
print('Train set:', x_o_train.shape, y_o_train.shape)
print('Test set:', x_o_test.shape, y_o_test.shape)

Train set: (33405, 11) (33405,)
Test set: (8352, 11) (8352,)


This is the dataset with outliers.

The training set has 33,405 observations and the test set has 8,352 rows. This is a 80:20 split of the full dataset with outliers.

---

##### Dataset without Outliers

In [21]:
x_no_train, x_no_test, y_no_train, y_no_test = train_test_split(x_no, y_no, test_size = 0.2, random_state = 4)
print('Train set:', x_no_train.shape, y_no_train.shape)
print('Test set:', x_no_test.shape, y_no_test.shape)

Train set: (31248, 11) (31248,)
Test set: (7813, 11) (7813,)


This is the dataset without outliers.

The 80:20 split creates fewer observations for the x and y sets because this dataset is smaller (since outliers have been removed). The training set has 31,248 observations and the test set has 7,813 rows.

---

## Regression

##### Data with Outliers

In [22]:
from sklearn.linear_model import LinearRegression

linear = LinearRegression()
linear.fit(x_o_train, y_o_train)

LinearRegression()

In [59]:
from sklearn.metrics import r2_score, mean_squared_error

y_o_train_pred = linear.predict(x_o_train)
mse = mean_squared_error(y_o_train, y_o_train_pred)
r2 = r2_score(y_o_train, y_o_train_pred)

print("The model performance for training set")
print('MSE is {}'.format(mse))
print('R2 score is {}'.format(r2))
print("\n")


y_o_test_pred = linear.predict(x_o_test)
mse_test = mean_squared_error(y_o_test, y_o_test_pred)
r2_test = r2_score(y_o_test, y_o_test_pred)

print("The model performance for test set")
print('MSE is {}'.format(mse_test))
print('R2 score is {}'.format(r2_test))
print("\n")

The model performance for training set
MSE is 6453.953280901834
R2 score is 0.2496197606456948


The model performance for test set
MSE is 5924.4931880379945
R2 score is 0.25562569777548494




The regression model using data with outliers, for the training set the r-square score is 0.249, which means that this model explains 24.9% of the total variation,only 24.5% of the data fit the regression model.
As for test set, it appears to be slightly higher, the r-square is 0.256, the model explains 25.6% of the total variation, only 25.6% of the data fit the regression model.

Now let see how the regression model will be after removing outliers

#### Data without outliers

In [43]:
from sklearn.linear_model import LinearRegression
linear = LinearRegression()
linear.fit(x_no_train, y_no_train)

LinearRegression()

In [57]:
from sklearn.metrics import r2_score, mean_squared_error

y_no_train_pred = linearn.predict(x_no_train)
mse = mean_squared_error(y_no_train, y_no_train_pred)
r2 = r2_score(y_no_train, y_no_train_pred)

print("The model performance for training set")
print('MSE is {}'.format(mse))
print('R2 score is {}'.format(r2))
print("\n")

y_no_test_pred = linear.predict(x_no_test)
mse_test = mean_squared_error(y_no_test, y_no_test_pred)
r2_test = r2_score(y_no_test, y_no_test_pred)

print("The model performance for training set")
print('MSE is {}'.format(mse_test))
print('R2 score is {}'.format(r2_test))
print("\n")

The model performance for training set
MSE is 4571.910008415173
R2 score is 0.26784903734005827


The model performance for training set
MSE is 4408.80036804968
R2 score is 0.27327609522590846




The regression model above is using data without outliers, for the training set the r-square score is 0.268, which means that this model explains 26.8% of the total variation, only 26.8% of the data fit the regression model.
As for test set, it appears to be slightly higher, the r-square is 0.273, the model explains 27.3% of the total variation, only 27.3% of the data fit the regression model.

Overall, the difference in the r-square value in the first model and the second is close to 2%. It appears that the second regression model without outliers seem to have a better r-square value than the first model with the outliers, the second model explains slight more variation than the first model

Since the r-square for both models are still very small, which is around 25% to 27% we will try plotting with the polynomial model to see if it can create a better prediction than the regression

## Polynomial Regression

#### Data with Outlier

In [49]:
from sklearn.preprocessing import PolynomialFeatures 
poly = PolynomialFeatures(degree= 2)
poly.fit(x_o_train,y_o_train)

PolynomialFeatures()

In [69]:
from sklearn.metrics import r2_score

y_train_predict = poly_regression.predict(poly.fit_transform(x_o_train))
rmse = (np.sqrt(mean_squared_error(y_o_train, y_train_predict)))
r2 = r2_score(y_o_train, y_train_predict)

print("The model performance for training set")
print('RMSE is {}'.format(rmse))
print('R2 score is {}'.format(r2))
print("\n")

y_test_predict = poly_regression.predict(poly.fit_transform(x_o_test))
rmse = (np.sqrt(mean_squared_error(y_o_test, y_test_predict)))
r2 = r2_score(y_o_test, y_test_predict)

print("The model performance for test set")
print('RMSE is {}'.format(rmse))
print('R2 score is {}'.format(r2))
print("\n")

The model performance for training set
RMSE is 71.5068061770958
R2 score is 0.40550208856831016


The model performance for test set
RMSE is 68.9216513672904
R2 score is 0.4031688024280067




The polynomial regression model is using data with outliers, for the training set the r-square score is 0.406, which means that this model explains 40.6% of the total variation, 40.6% of the data fit the regression model.
As for test set, it appears to be almost the same, the r-square is 0.403, the model explains 40.3% of the total variation, 40.3% of the data fit the polynomial regression model.

Compared with the regression model, the r-square in the polynomial model have a higher r-square value, it increases by 20% for both train and test set.

Now will see how the results are using the data without outlier

#### Data without outlier

In [67]:
from sklearn.preprocessing import PolynomialFeatures 
poly = PolynomialFeatures(degree= 2)
poly.fit(x_no_test,y_no_test)

PolynomialFeatures()

In [70]:
from sklearn.metrics import r2_score

y_train_predict = poly_regression.predict(poly.fit_transform(x_no_train))
rmse = (np.sqrt(mean_squared_error(y_no_train, y_train_predict)))
r2 = r2_score(y_no_train, y_train_predict)

print("The model performance for training set")
print('RMSE is {}'.format(rmse))
print('R2 score is {}'.format(r2))
print("\n")

y_test_predict = poly_regression.predict(poly.fit_transform(x_no_test))
rmse = (np.sqrt(mean_squared_error(y_no_test, y_test_predict)))
r2 = r2_score(y_no_test, y_test_predict)

print("The model performance for test set")
print('RMSE is {}'.format(rmse))
print('R2 score is {}'.format(r2))
print("\n")

The model performance for training set
RMSE is 722142891.0275102
R2 score is -83512069304389.3


The model performance for test set
RMSE is 754230416.4307845
R2 score is -93768527672740.66




The model above is polynomial regression without outliers. The r-square value seem to show an extremely high negative value, which is not common, it seems that this model will not be reliable. Additionally r-square is between 0 and 1. 

After trying four models, it appears that the more reliable model is to include the outliers.For this case, it does shows that outliers are necessary to keep, there might be other factors affecting the air quality features and the pollution model, which might cause some values to be higher than others. With the current climate change situation, it is possible that factors that are not recorded in the dataset can impact overall the dataset.

With all the results from the four models, it seems that the polynomial regression model with outliers are the better fit for the pollution level prediction.