In [1]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

The data used in this notebook was created by the previous notebook (Part2_Data_Cleaning).

## Import the Cleaned Dataset

In [2]:
# Load the two cleaned datasets: one that still contains outliers and one
# from which the outliers have been removed.
outliers_path = 'pollution_outliers.xlsx'
no_outliers_path = 'pollution_NoOutliers.xlsx'
pollution_o = pd.read_excel(outliers_path)
pollution_No = pd.read_excel(no_outliers_path)

In [3]:
# Preview the first rows of the dataset that still contains outliers.
pollution_o.head()

Unnamed: 0.1,Unnamed: 0,month,dewTemp,temp,pres,windSpeed,cumSnow,cumRain,x0_NE,x0_NW,x0_SE,x0_SW,pm2.5
0,0,-1.596267,-1.229791,-1.347143,0.345329,-0.444944,-0.071057,-0.137408,0,0,1,0,129
1,1,-1.596267,-1.160508,-1.347143,0.345329,-0.427007,-0.071057,-0.137408,0,0,1,0,148
2,2,-1.596267,-0.883375,-1.429278,0.442411,-0.409069,-0.071057,-0.137408,0,0,1,0,159
3,3,-1.596267,-0.606241,-1.429278,0.539493,-0.372993,1.212862,-0.137408,0,0,1,0,181
4,4,-1.596267,-0.606241,-1.429278,0.539493,-0.355055,2.496781,-0.137408,0,0,1,0,138


In [4]:
# Preview the first rows of the dataset with the outliers removed.
pollution_No.head()

Unnamed: 0.1,Unnamed: 0,month,dewTemp,temp,pres,windSpeed,cumSnow,cumRain,x0_NE,x0_NW,x0_SE,x0_SW,pm2.5
0,0,-1.623543,-1.263633,-1.391146,0.379425,-0.552238,-0.049256,-0.153362,0,0,1,0,129
1,1,-1.623543,-1.194004,-1.391146,0.379425,-0.521094,-0.049256,-0.153362,0,0,1,0,148
2,2,-1.623543,-0.915487,-1.473478,0.477,-0.48995,-0.049256,-0.153362,0,0,1,0,159
3,3,-1.623543,-0.63697,-1.473478,0.574575,-0.427313,12.77728,-0.153362,0,0,1,0,181
4,4,-1.623543,-0.63697,-1.473478,0.574575,-0.396169,25.603817,-0.153362,0,0,1,0,138


Looking at both of these tables, we can see that each dataset has a column that simply repeats the index. We will remove these columns.

In [5]:
# Drop the 'Unnamed: 0' column, which only repeats the row index.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps
# this cell idempotent: re-running it no longer raises a KeyError once the
# column has already been removed.
pollution_o = pollution_o.drop(columns=['Unnamed: 0'], errors='ignore')
pollution_No = pollution_No.drop(columns=['Unnamed: 0'], errors='ignore')

In [6]:
# Check the dataset with outliers again now that 'Unnamed: 0' has been dropped.
pollution_o.head()

Unnamed: 0,month,dewTemp,temp,pres,windSpeed,cumSnow,cumRain,x0_NE,x0_NW,x0_SE,x0_SW,pm2.5
0,-1.596267,-1.229791,-1.347143,0.345329,-0.444944,-0.071057,-0.137408,0,0,1,0,129
1,-1.596267,-1.160508,-1.347143,0.345329,-0.427007,-0.071057,-0.137408,0,0,1,0,148
2,-1.596267,-0.883375,-1.429278,0.442411,-0.409069,-0.071057,-0.137408,0,0,1,0,159
3,-1.596267,-0.606241,-1.429278,0.539493,-0.372993,1.212862,-0.137408,0,0,1,0,181
4,-1.596267,-0.606241,-1.429278,0.539493,-0.355055,2.496781,-0.137408,0,0,1,0,138


In [7]:
# Check the dataset without outliers again now that 'Unnamed: 0' has been dropped.
pollution_No.head()

Unnamed: 0,month,dewTemp,temp,pres,windSpeed,cumSnow,cumRain,x0_NE,x0_NW,x0_SE,x0_SW,pm2.5
0,-1.623543,-1.263633,-1.391146,0.379425,-0.552238,-0.049256,-0.153362,0,0,1,0,129
1,-1.623543,-1.194004,-1.391146,0.379425,-0.521094,-0.049256,-0.153362,0,0,1,0,148
2,-1.623543,-0.915487,-1.473478,0.477,-0.48995,-0.049256,-0.153362,0,0,1,0,159
3,-1.623543,-0.63697,-1.473478,0.574575,-0.427313,12.77728,-0.153362,0,0,1,0,181
4,-1.623543,-0.63697,-1.473478,0.574575,-0.396169,25.603817,-0.153362,0,0,1,0,138


---

## Creating x and y parts

##### Dataset with Outliers

In [8]:
# Feature matrix (data with outliers): every column except the last one,
# converted to a NumPy array.
features_o = pollution_o.iloc[:, :-1]
x_o = features_o.to_numpy()
x_o

array([[-1.59626706, -1.22979123, -1.34714315, ...,  0.        ,
         1.        ,  0.        ],
       [-1.59626706, -1.16050789, -1.34714315, ...,  0.        ,
         1.        ,  0.        ],
       [-1.59626706, -0.88337455, -1.4292782 , ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [ 1.58830091, -1.64549124, -1.26500809, ...,  1.        ,
         0.        ,  0.        ],
       [ 1.58830091, -1.64549124, -1.34714315, ...,  1.        ,
         0.        ,  0.        ],
       [ 1.58830091, -1.57620791, -1.26500809, ...,  1.        ,
         0.        ,  0.        ]])

In [9]:
# Target vector (data with outliers): the last column, as a 1-D NumPy array.
target_o = pollution_o.iloc[:, -1]
y_o = target_o.to_numpy()
y_o

array([129, 148, 159, ...,  10,   8,  12])

In [10]:
# (rows, columns) of the feature matrix for the data with outliers.
x_o.shape

(41757, 11)

In [11]:
# Length of the target vector; it should match the number of rows in x_o.
y_o.shape

(41757,)

We can see that the x-set has 11 columns (all the features) and 41,757 rows, while the y-set has the same number of rows but only 1 column (the target variable of PM 2.5 levels).

---

##### Dataset without Outliers

In [12]:
# Feature matrix (data without outliers): every column except the last one,
# converted to a NumPy array.
features_no = pollution_No.iloc[:, :-1]
x_no = features_no.to_numpy()
x_no

array([[-1.62354295, -1.2636334 , -1.39114568, ...,  0.        ,
         1.        ,  0.        ],
       [-1.62354295, -1.19400415, -1.39114568, ...,  0.        ,
         1.        ,  0.        ],
       [-1.62354295, -0.91548714, -1.47347807, ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [ 1.61470256, -1.68140892, -1.30881329, ...,  1.        ,
         0.        ,  0.        ],
       [ 1.61470256, -1.68140892, -1.2264809 , ...,  1.        ,
         0.        ,  0.        ],
       [ 1.61470256, -1.68140892, -1.2264809 , ...,  1.        ,
         0.        ,  0.        ]])

In [13]:
# Target vector (data without outliers): the last column, as a 1-D NumPy array.
target_no = pollution_No.iloc[:, -1]
y_no = target_no.to_numpy()
y_no

array([129, 148, 159, ...,   8,   7,  12])

In [14]:
# (rows, columns) of the feature matrix for the data without outliers.
x_no.shape

(39061, 11)

In [15]:
# Length of the target vector; it should match the number of rows in x_no.
y_no.shape

(39061,)

This is the dataset without outliers. The x-set has 11 columns for all the features and 39,061 rows (fewer than the previous dataset because the outliers have been removed). The y-set has the same number of observations and 1 column for the target variable of pollution levels.

---

## Splitting Dataset

##### Dataset with Outliers

In [17]:
from sklearn.model_selection import train_test_split

# 80:20 train/test split of the data with outliers; the fixed random_state
# makes the split reproducible across runs.
split_o = train_test_split(x_o, y_o, test_size=0.2, random_state=4)
x_o_train, x_o_test, y_o_train, y_o_test = split_o

print(f'Train set: {x_o_train.shape} {y_o_train.shape}')
print(f'Test set: {x_o_test.shape} {y_o_test.shape}')

Train set: (33405, 11) (33405,)
Test set: (8352, 11) (8352,)


This is the dataset with outliers.

The training set has 33,405 observations and the test set has 8,352 rows. This is an 80:20 split of the full dataset with outliers.

---

##### Dataset without Outliers

In [18]:
# 80:20 train/test split of the data without outliers, using the same
# test_size and random_state as the split of the data with outliers.
split_no = train_test_split(x_no, y_no, test_size=0.2, random_state=4)
x_no_train, x_no_test, y_no_train, y_no_test = split_no

print(f'Train set: {x_no_train.shape} {y_no_train.shape}')
print(f'Test set: {x_no_test.shape} {y_no_test.shape}')

Train set: (31248, 11) (31248,)
Test set: (7813, 11) (7813,)


This is the dataset without outliers.

The 80:20 split creates fewer observations for the x and y sets because this dataset is smaller (since outliers have been removed). The training set has 31,248 observations and the test set has 7,813 rows.

---

## Regression

##### Data with Outliers

In [19]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the training split of the data with outliers.
# fit() returns the estimator itself, so it can be chained onto construction.
linear = LinearRegression().fit(x_o_train, y_o_train)
linear

LinearRegression()

In [21]:
from sklearn.metrics import r2_score, mean_squared_error


def _report_performance(split_name, mse_value, r2_value):
    """Print the MSE and R2 score of the model for the named data split.

    Parameters
    ----------
    split_name : str
        Label of the evaluated split, e.g. "training" or "test".
    mse_value : float
        Mean squared error on that split.
    r2_value : float
        R2 score on that split.
    """
    print("The model performance for {} set".format(split_name))
    print("--------------------------------------")
    print('MSE is {}'.format(mse_value))
    print('R2 score is {}'.format(r2_value))
    print("\n")


# Metrics on the training split (data with outliers).
y_o_train_pred = linear.predict(x_o_train)
mse = mean_squared_error(y_o_train, y_o_train_pred)
r2 = r2_score(y_o_train, y_o_train_pred)
_report_performance("training", mse, r2)

# Metrics on the held-out test split. The original cell labelled this report
# "training set" as well, which made the two result blocks indistinguishable.
y_o_test_pred = linear.predict(x_o_test)
mse_test = mean_squared_error(y_o_test, y_o_test_pred)
r2_test = r2_score(y_o_test, y_o_test_pred)
_report_performance("test", mse_test, r2_test)

The model performance for training set
--------------------------------------
MSE is 6354.688849831677
R2 score is 0.26116091446318657


The model performance for training set
--------------------------------------
MSE is 5860.165397278776
R2 score is 0.26370806918519796




**TODO:** Explain the results above.

**TODO:** Try polynomial regression, since the R-squared of the linear model is low (about 0.26 on both the training and test sets).

**TODO:** Do the same for the data without outliers.