In [28]:
import numpy as np     #imported all library
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from math import sqrt


In [29]:
df = pd.read_csv("https://raw.githubusercontent.com/vis2208/Air-Pollution-/main/Processed_data.csv")   #read data
df.head(5)

Unnamed: 0,Date,PM2.5,PM10,NO2,SO2,CO,RH,WS,WD,SR,AT
0,03/01/2018,287.71,488.92,86.46,20.49,1.82,67.28,0.32,248.7,167.41,12.41
1,15/01/2018,240.96,515.83,101.49,20.95,1.84,50.45,0.71,264.26,169.86,15.98
2,17/01/2018,294.38,402.21,72.23,17.61,1.2,64.78,1.19,88.52,166.5,14.02
3,23/01/2018,196.4,322.72,86.8,24.86,2.09,61.87,1.82,114.02,149.21,14.05
4,24/01/2018,162.97,239.81,72.06,25.29,2.08,66.09,0.71,240.56,198.31,13.42


In [30]:
# Let's say we want to split the data in 80:10:10 for train:valid:test dataset
train_size = 0.8

X = df.drop(columns = ['PM2.5','Date']).copy()
y = df['PM2.5']

# In the first step we will split the data in training and validation dataset
X_train, X_valid, y_train, y_valid = train_test_split(X,y, train_size=0.8, random_state = 42)


print(X_train.shape)
print(y_train.shape)
print(X_valid.shape)
print(y_valid.shape)

(2581, 9)
(2581,)
(646, 9)
(646,)


In [31]:
X_valid  # due to random_state = 10 , always same data otherwise different data on each run

Unnamed: 0,PM10,NO2,SO2,CO,RH,WS,WD,SR,AT
1429,307.12,40.66,16.45,1.02,43.19,0.97,227.15,175.89,29.17
346,383.42,59.47,7.86,2.43,70.63,0.68,233.12,179.80,10.27
1391,121.48,39.33,33.90,0.73,87.53,0.73,112.78,77.76,12.90
393,80.38,25.41,9.22,1.20,46.96,1.44,251.60,267.62,25.78
194,457.32,60.19,6.72,2.21,53.01,1.12,185.17,215.51,33.96
...,...,...,...,...,...,...,...,...,...
1992,90.30,24.79,7.78,0.53,80.65,0.95,179.79,108.22,30.20
361,116.17,44.11,10.94,1.61,74.08,2.38,92.39,176.15,12.01
135,245.83,81.45,11.02,2.33,55.37,1.03,286.85,202.99,18.96
3054,165.83,14.62,14.34,1.06,77.14,1.83,275.52,101.54,18.29


In [32]:
reg = linear_model.Lasso(alpha = 0.55)           # fitted data to model
reg.fit(X_train,y_train)

In [33]:
y_pred = reg.predict(X_valid)   
y_pred                # predicted

array([115.9721361 , 210.10165323,  98.53721821,  37.20882799,
       159.83920756, 103.18931801, 100.26935203, 147.24187508,
        96.08246218,  28.97399455, 106.3455048 , 121.3575532 ,
       123.62679175, 161.13364887, 117.37691127, 128.98083623,
        67.60802583, 117.3890154 ,  34.99915842, 126.43529267,
       102.93848142, 140.06741135, 150.35564762, 139.90266095,
        76.87324241, 117.74749258, 109.45213188,  80.61669706,
       238.32886518, 167.86017517, 120.51749686,  33.66197965,
       120.26836391, 109.36249464,  42.02834728, 104.50757313,
       109.7943623 , 217.06568759, 191.78463289, 185.88350918,
        15.65450696, 110.17680767,  34.74014227,  71.63750541,
        15.80573252,  48.57005135,  40.19323095,  29.86342899,
         3.85166779,  70.63506204,  10.32745578, 244.93833239,
        76.19528789,  53.16192872, 119.11570496, 168.32417709,
       148.34405877, 110.10021061, 136.1721905 ,  60.78709132,
        72.92162741,  80.17140219, 130.98715403, 164.99

In [34]:
print("Printing training stats")
y_pred_train = reg.predict(X_train) 
print("Mean absolute error: %.2f" % mean_absolute_error(y_train, y_pred_train)) # mean absolute error 
print("Mean squared error: %.2f" % mean_squared_error(y_train, y_pred_train)) # The mean squared error
print("Root mean square error: %.2f" % sqrt(mean_squared_error(y_train, y_pred_train))) # RMSE
print("Coefficient of determination: %.2f" % r2_score(y_train, y_pred_train)) # The coefficient of determination: 1 is perfect prediction

Printing training stats
Mean absolute error: 20.62
Mean squared error: 798.32
Root mean square error: 28.25
Coefficient of determination: 0.80


In [35]:
print("Printing testing stats")
print("Mean absolute error: %.2f" % mean_absolute_error(y_valid, y_pred)) # mean absolute error 
print("Mean squared error: %.2f" % mean_squared_error(y_valid, y_pred)) # The mean squared error
print("Root mean square error: %.2f" % sqrt(mean_squared_error(y_valid, y_pred))) # RMSE
print("Coefficient of determination: %.2f" % r2_score(y_valid, y_pred)) # The coefficient of determination: 1 is perfect prediction

Printing testing stats
Mean absolute error: 21.47
Mean squared error: 893.81
Root mean square error: 29.90
Coefficient of determination: 0.77


In [36]:
for Y, y in zip(y_valid, y_pred):
  print(round(Y), round(y))

56 116
178 210
61 99
95 37
81 160
117 103
86 100
183 147
69 96
39 29
135 106
94 121
179 124
99 161
104 117
153 129
58 68
143 117
39 35
102 126
86 103
176 140
117 150
250 140
105 77
106 118
132 109
72 81
190 238
191 168
105 121
44 34
149 120
103 109
78 42
112 105
70 110
272 217
193 192
144 186
32 16
136 110
54 35
37 72
92 16
79 49
25 40
27 30
9 4
52 71
40 10
63 245
81 76
60 53
87 119
242 168
178 148
75 110
109 136
53 61
38 73
66 80
150 131
211 165
107 71
155 154
148 106
40 5
26 35
20 25
72 65
39 36
69 51
84 30
38 32
77 119
110 121
54 91
159 175
52 37
70 42
85 142
58 65
61 27
111 101
80 68
145 95
44 60
103 102
82 55
129 102
159 166
109 131
221 232
78 96
48 38
166 146
49 71
36 58
61 52
77 108
69 66
141 118
28 46
259 160
102 118
115 56
126 113
138 125
42 38
33 26
40 26
267 207
178 68
77 88
52 118
132 103
121 99
40 87
106 132
192 177
121 110
58 45
18 44
32 24
201 175
104 127
31 33
155 171
15 5
74 68
100 61
57 28
51 55
51 49
68 100
70 110
16 15
64 53
206 146
91 111
89 58
40 41
98 84
141 147
