In [1]:
import numpy as np     #imported all library
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from math import sqrt
from sklearn.preprocessing import PolynomialFeatures 
from sklearn.pipeline import make_pipeline


In [2]:
# Load the processed air-pollution dataset directly from the GitHub repository.
DATA_URL = "https://raw.githubusercontent.com/vis2208/Air-Pollution-/main/Processed_data.csv"
df = pd.read_csv(DATA_URL)
df.head()  # preview the first five rows (head() defaults to n=5)

Unnamed: 0,Date,PM2.5,PM10,NO2,SO2,CO,RH,WS,WD,SR,AT
0,03/01/2018,287.71,488.92,86.46,20.49,1.82,67.28,0.32,248.7,167.41,12.41
1,15/01/2018,240.96,515.83,101.49,20.95,1.84,50.45,0.71,264.26,169.86,15.98
2,17/01/2018,294.38,402.21,72.23,17.61,1.2,64.78,1.19,88.52,166.5,14.02
3,23/01/2018,196.4,322.72,86.8,24.86,2.09,61.87,1.82,114.02,149.21,14.05
4,24/01/2018,162.97,239.81,72.06,25.29,2.08,66.09,0.71,240.56,198.31,13.42


In [3]:
# Hold out part of the data for evaluation: `train_size` of the rows are used to
# fit the model and the remainder (20%) becomes the validation set.
# NOTE: this cell performs a single two-way split; it does not carve out a
# separate test set.
train_size = 0.8

# Features are every column except the target (PM2.5) and the Date column.
X = df.drop(columns=['PM2.5', 'Date']).copy()
y = df['PM2.5']

# A fixed random_state makes the shuffle reproducible, so the same rows land in
# each split on every run.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=train_size, random_state=42
)

# Sanity check: the two splits together must account for every row exactly once.
assert len(X_train) + len(X_valid) == len(X)

print(X_train.shape)
print(y_train.shape)
print(X_valid.shape)
print(y_valid.shape)

(2581, 9)
(2581,)
(646, 9)
(646,)


In [4]:
X_valid  # the split uses random_state = 42, so the same validation rows are selected on every run (without a fixed seed they would differ per run)

Unnamed: 0,PM10,NO2,SO2,CO,RH,WS,WD,SR,AT
1429,307.12,40.66,16.45,1.02,43.19,0.97,227.15,175.89,29.17
346,383.42,59.47,7.86,2.43,70.63,0.68,233.12,179.80,10.27
1391,121.48,39.33,33.90,0.73,87.53,0.73,112.78,77.76,12.90
393,80.38,25.41,9.22,1.20,46.96,1.44,251.60,267.62,25.78
194,457.32,60.19,6.72,2.21,53.01,1.12,185.17,215.51,33.96
...,...,...,...,...,...,...,...,...,...
1992,90.30,24.79,7.78,0.53,80.65,0.95,179.79,108.22,30.20
361,116.17,44.11,10.94,1.61,74.08,2.38,92.39,176.15,12.01
135,245.83,81.45,11.02,2.33,55.37,1.03,286.85,202.99,18.96
3054,165.83,14.62,14.34,1.06,77.14,1.83,275.52,101.54,18.29


In [5]:
# Model: expand the features to all degree-2 polynomial terms, then fit an
# ordinary least-squares regression on the expanded features.
poly_regression_steps = (PolynomialFeatures(degree=2), LinearRegression())
reg = make_pipeline(*poly_regression_steps)
reg.fit(X_train, y_train)

In [6]:
# Run the fitted pipeline on the held-out validation features.
y_pred = reg.predict(X_valid)   
y_pred                # echo the predicted PM2.5 values as the cell output

array([112.87475109, 222.32951932,  66.09258055,  37.18035282,
       142.9382153 , 108.92726635,  87.67485051, 157.75970648,
        85.69779253,  30.76909138, 114.07863842,  41.98389669,
       144.10151035, 168.06760882,  87.09334555, 130.84179749,
        78.22084297, 112.05491996,  40.89950786,  98.83011957,
       101.17512615, 143.72873565, 157.39447436, 123.07667221,
        77.93547674, 106.56363263,  89.73315537,  76.03125915,
       265.27063002, 180.48185444, 119.35936986,  50.20723473,
       117.04657792, 111.02677747,  37.59869607, 116.12253197,
       104.61003303, 242.5103092 , 206.89378264, 187.8257341 ,
        46.68413815, 105.08966747,  35.2614817 ,  67.85314772,
        43.77949739,  65.04145056,  39.56030719,  43.87728899,
        15.16811032,  72.07412164,  29.23590379, 286.54663783,
        70.77849787,  54.45016359, 114.17117646, 173.83788393,
       151.3216085 , 108.32898379, 123.83038687,  60.96316036,
        72.52561655,  74.67754978, 118.99113991, 169.38

In [7]:
# Score the fitted pipeline on the same rows it was trained on.
print("Printing training stats")
y_pred_train = reg.predict(X_train)

train_mae = mean_absolute_error(y_train, y_pred_train)
train_mse = mean_squared_error(y_train, y_pred_train)
train_rmse = sqrt(train_mse)  # RMSE is the square root of the MSE
train_r2 = r2_score(y_train, y_pred_train)  # 1 is perfect prediction

train_lines = [
    "Mean absolute error: %.2f" % train_mae,
    "Mean squared error: %.2f" % train_mse,
    "Root mean square error: %.2f" % train_rmse,
    "Coefficient of determination: %.2f" % train_r2,
]
print("\n".join(train_lines))

Printing training stats
Mean absolute error: 17.65
Mean squared error: 615.01
Root mean square error: 24.80
Coefficient of determination: 0.85


In [8]:
# Score the fitted pipeline on the held-out rows. The figures below are
# computed from y_valid / y_pred, i.e. the validation split.
print("Printing testing stats")

valid_mae = mean_absolute_error(y_valid, y_pred)
valid_mse = mean_squared_error(y_valid, y_pred)
valid_rmse = sqrt(valid_mse)  # RMSE is the square root of the MSE
valid_r2 = r2_score(y_valid, y_pred)  # 1 is perfect prediction

print("Mean absolute error: %.2f" % valid_mae)
print("Mean squared error: %.2f" % valid_mse)
print("Root mean square error: %.2f" % valid_rmse)
print("Coefficient of determination: %.2f" % valid_r2)

Printing testing stats
Mean absolute error: 18.84
Mean squared error: 776.40
Root mean square error: 27.86
Coefficient of determination: 0.80


In [9]:
# Print actual vs. predicted PM2.5 side by side, rounded to whole numbers.
# The loop names are deliberately distinct from the notebook-level `y` (the
# target Series): using `y` as a loop variable would rebind that global to a
# single float and break any cell that uses `y` afterwards.
for actual, predicted in zip(y_valid, y_pred):
    print(round(actual), round(predicted))

56 113
178 222
61 66
95 37
81 143
117 109
86 88
183 158
69 86
39 31
135 114
94 42
179 144
99 168
104 87
153 131
58 78
143 112
39 41
102 99
86 101
176 144
117 157
250 123
105 78
106 107
132 90
72 76
190 265
191 180
105 119
44 50
149 117
103 111
78 38
112 116
70 105
272 243
193 207
144 188
32 47
136 105
54 35
37 68
92 44
79 65
25 40
27 44
9 15
52 72
40 29
63 287
81 71
60 54
87 114
242 174
178 151
75 108
109 124
53 61
38 73
66 75
150 119
211 169
107 49
155 130
148 107
40 23
26 46
20 30
72 67
39 39
69 53
84 42
38 44
77 91
110 114
54 58
159 154
52 53
70 53
85 110
58 55
61 43
111 93
80 77
145 104
44 69
103 101
82 74
129 105
159 159
109 116
221 246
78 91
48 52
166 142
49 67
36 45
61 51
77 109
69 59
141 117
28 45
259 169
102 112
115 63
126 107
138 119
42 41
33 40
40 21
267 234
178 55
77 91
52 99
132 102
121 97
40 45
106 121
192 189
121 95
58 48
18 45
32 32
201 175
104 112
31 41
155 179
15 15
74 69
100 75
57 27
51 62
51 55
68 96
70 106
16 22
64 58
206 149
91 115
89 65
40 50
98 91
141 136
149 21