In [1]:
import numpy as np     #imported all library
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from math import sqrt


In [2]:
# Load the pre-processed air-quality table straight from the project's GitHub repository.
DATA_URL = "https://raw.githubusercontent.com/vis2208/Air-Pollution-/main/Processed_data.csv"
df = pd.read_csv(DATA_URL)

# Preview the first five rows (head() defaults to 5).
df.head()

Unnamed: 0,Date,PM2.5,PM10,NO2,SO2,CO,RH,WS,WD,SR,AT
0,03/01/2018,287.71,488.92,86.46,20.49,1.82,67.28,0.32,248.7,167.41,12.41
1,15/01/2018,240.96,515.83,101.49,20.95,1.84,50.45,0.71,264.26,169.86,15.98
2,17/01/2018,294.38,402.21,72.23,17.61,1.2,64.78,1.19,88.52,166.5,14.02
3,23/01/2018,196.4,322.72,86.8,24.86,2.09,61.87,1.82,114.02,149.21,14.05
4,24/01/2018,162.97,239.81,72.06,25.29,2.08,66.09,0.71,240.56,198.31,13.42


In [3]:
# Split the data 80:20 into training and validation sets.
# This cell performs a single split only; it does not carve out a separate test set.
train_size = 0.8

# Features: every column except the target (PM2.5) and the raw date string.
# DataFrame.drop already returns a new frame, so no extra copy is needed.
X = df.drop(columns=['PM2.5', 'Date'])
y = df['PM2.5']

# Use the train_size declared above instead of repeating the literal;
# the fixed random_state makes the split reproducible across runs.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=train_size, random_state=42
)

# Sanity-check the resulting partition sizes.
print(X_train.shape)
print(y_train.shape)
print(X_valid.shape)
print(y_valid.shape)

(2581, 9)
(2581,)
(646, 9)
(646,)


In [4]:
# Display the validation features. Because the split uses a fixed random_state (42),
# the same rows land here on every run; without it the rows would differ on each run.
X_valid

Unnamed: 0,PM10,NO2,SO2,CO,RH,WS,WD,SR,AT
1429,307.12,40.66,16.45,1.02,43.19,0.97,227.15,175.89,29.17
346,383.42,59.47,7.86,2.43,70.63,0.68,233.12,179.80,10.27
1391,121.48,39.33,33.90,0.73,87.53,0.73,112.78,77.76,12.90
393,80.38,25.41,9.22,1.20,46.96,1.44,251.60,267.62,25.78
194,457.32,60.19,6.72,2.21,53.01,1.12,185.17,215.51,33.96
...,...,...,...,...,...,...,...,...,...
1992,90.30,24.79,7.78,0.53,80.65,0.95,179.79,108.22,30.20
361,116.17,44.11,10.94,1.61,74.08,2.38,92.39,176.15,12.01
135,245.83,81.45,11.02,2.33,55.37,1.03,286.85,202.99,18.96
3054,165.83,14.62,14.34,1.06,77.14,1.83,275.52,101.54,18.29


In [5]:
# L2 regularisation strength for the ridge regression.
RIDGE_ALPHA = 35

# Build the ridge regressor, then fit it on the training partition.
# NOTE(review): the features are passed in unscaled here, and the ridge penalty is
# scale-dependent — confirm whether standardising the inputs first was intended.
reg = linear_model.Ridge(alpha=RIDGE_ALPHA)
reg.fit(X_train, y_train)

In [6]:
# Predict PM2.5 for the held-out validation features with the fitted model.
y_pred = reg.predict(X_valid)

# Show the predicted values.
y_pred

array([115.69887908, 212.16482606,  98.55656287,  38.04010164,
       159.7767141 , 102.28943057, 100.9631146 , 147.66832942,
        95.06139522,  28.72008797, 105.15248018, 122.50343621,
       120.9191105 , 159.74417053, 115.79450459, 130.13922948,
        66.67926663, 118.81224208,  35.83157096, 123.73891794,
       101.18116911, 138.73664064, 149.36905478, 139.95126926,
        77.51141745, 117.52573876, 110.6181551 ,  81.74705742,
       239.47060853, 167.69055243, 121.65112433,  36.22990036,
       120.64749254, 109.3002087 ,  42.15625626, 103.85648385,
       109.6835231 , 217.61671333, 192.65591863, 186.43021673,
        18.1301413 , 110.10653282,  31.92637886,  71.68294453,
        16.62489741,  52.33405448,  40.23729465,  31.2829785 ,
         2.79218383,  70.14823939,  12.11432973, 244.73078191,
        78.05472292,  53.61404986, 119.5630707 , 171.72259933,
       149.32457968, 107.72556618, 135.74029493,  59.84783445,
        72.90484904,  80.04543555, 131.38398369, 164.93

In [7]:
# Report how well the fitted model reproduces the data it was trained on.
print("Printing training stats")
y_pred_train = reg.predict(X_train)

# Compute each metric once, then report it.
train_mae = mean_absolute_error(y_train, y_pred_train)
train_mse = mean_squared_error(y_train, y_pred_train)
train_rmse = sqrt(train_mse)  # RMSE is the square root of the MSE
train_r2 = r2_score(y_train, y_pred_train)  # 1 is a perfect prediction

print("Mean absolute error: %.2f" % train_mae)
print("Mean squared error: %.2f" % train_mse)
print("Root mean square error: %.2f" % train_rmse)
print("Coefficient of determination: %.2f" % train_r2)

Printing training stats
Mean absolute error: 20.61
Mean squared error: 796.04
Root mean square error: 28.21
Coefficient of determination: 0.80


In [8]:
# Report the same metrics on the held-out predictions.
# Although the banner says "testing", y_valid / y_pred come from the validation split.
print("Printing testing stats")

# Compute each metric once, then report it.
valid_mae = mean_absolute_error(y_valid, y_pred)
valid_mse = mean_squared_error(y_valid, y_pred)
valid_rmse = sqrt(valid_mse)  # RMSE is the square root of the MSE
valid_r2 = r2_score(y_valid, y_pred)  # 1 is a perfect prediction

print("Mean absolute error: %.2f" % valid_mae)
print("Mean squared error: %.2f" % valid_mse)
print("Root mean square error: %.2f" % valid_rmse)
print("Coefficient of determination: %.2f" % valid_r2)

Printing testing stats
Mean absolute error: 21.48
Mean squared error: 888.79
Root mean square error: 29.81
Coefficient of determination: 0.77


In [9]:
# Print each actual PM2.5 value next to its prediction, both rounded to integers.
# The loop variables are deliberately NOT named `y`: reusing that name would
# overwrite the notebook-level target Series `y` with a scalar once this cell runs.
for actual, predicted in zip(y_valid, y_pred):
    print(round(actual), round(predicted))

56 116
178 212
61 99
95 38
81 160
117 102
86 101
183 148
69 95
39 29
135 105
94 123
179 121
99 160
104 116
153 130
58 67
143 119
39 36
102 124
86 101
176 139
117 149
250 140
105 78
106 118
132 111
72 82
190 239
191 168
105 122
44 36
149 121
103 109
78 42
112 104
70 110
272 218
193 193
144 186
32 18
136 110
54 32
37 72
92 17
79 52
25 40
27 31
9 3
52 70
40 12
63 245
81 78
60 54
87 120
242 172
178 149
75 108
109 136
53 60
38 73
66 80
150 131
211 165
107 72
155 155
148 104
40 4
26 36
20 26
72 65
39 38
69 50
84 31
38 33
77 120
110 121
54 90
159 177
52 38
70 41
85 145
58 66
61 28
111 102
80 67
145 94
44 60
103 102
82 52
129 103
159 167
109 129
221 234
78 96
48 39
166 146
49 70
36 57
61 53
77 105
69 66
141 117
28 46
259 161
102 120
115 58
126 114
138 123
42 38
33 26
40 25
267 207
178 69
77 90
52 116
132 105
121 97
40 89
106 132
192 177
121 111
58 44
18 44
32 24
201 174
104 127
31 33
155 171
15 4
74 70
100 61
57 27
51 55
51 49
68 99
70 108
16 16
64 50
206 146
91 112
89 59
40 44
98 85
141 149
1