In [1]:
import numpy as np     #imported all library
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from math import sqrt


In [2]:
# Load the processed air-pollution dataset directly from its raw GitHub URL.
df = pd.read_csv(
    "https://raw.githubusercontent.com/vis2208/Air-Pollution-/main/Processed_data.csv"
)
df.head()  # preview the first five rows (head() defaults to n=5)

Unnamed: 0,Date,PM2.5,PM10,NO2,SO2,CO,RH,WS,WD,SR,AT
0,03/01/2018,287.71,488.92,86.46,20.49,1.82,67.28,0.32,248.7,167.41,12.41
1,15/01/2018,240.96,515.83,101.49,20.95,1.84,50.45,0.71,264.26,169.86,15.98
2,17/01/2018,294.38,402.21,72.23,17.61,1.2,64.78,1.19,88.52,166.5,14.02
3,23/01/2018,196.4,322.72,86.8,24.86,2.09,61.87,1.82,114.02,149.21,14.05
4,24/01/2018,162.97,239.81,72.06,25.29,2.08,66.09,0.71,240.56,198.31,13.42


In [3]:
df.shape  # (number of rows, number of columns) of the loaded frame

(3227, 11)

In [4]:
# Fraction of rows assigned to the training set; the rest (20%) is held out
# for validation. This constant is the single source of truth for the ratio.
train_size = 0.8

# Features: every column except the target (PM2.5) and the Date column.
X = df.drop(columns=['PM2.5', 'Date']).copy()
# Target: the PM2.5 column.
y = df['PM2.5']

# Split once into training and validation sets. The ratio comes from
# `train_size` (previously the literal 0.8 was repeated here, leaving the
# variable unused), and the fixed random_state makes the shuffle reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=train_size, random_state=42
)

# Sanity-check the sizes of the four resulting pieces.
print(X_train.shape)
print(y_train.shape)
print(X_valid.shape)
print(y_valid.shape)

(2581, 9)
(2581,)
(646, 9)
(646,)


In [5]:
X_valid  # the split uses random_state = 42, so the same rows are selected on every run; without a fixed seed they would differ between runs

Unnamed: 0,PM10,NO2,SO2,CO,RH,WS,WD,SR,AT
1429,307.12,40.66,16.45,1.02,43.19,0.97,227.15,175.89,29.17
346,383.42,59.47,7.86,2.43,70.63,0.68,233.12,179.80,10.27
1391,121.48,39.33,33.90,0.73,87.53,0.73,112.78,77.76,12.90
393,80.38,25.41,9.22,1.20,46.96,1.44,251.60,267.62,25.78
194,457.32,60.19,6.72,2.21,53.01,1.12,185.17,215.51,33.96
...,...,...,...,...,...,...,...,...,...
1992,90.30,24.79,7.78,0.53,80.65,0.95,179.79,108.22,30.20
361,116.17,44.11,10.94,1.61,74.08,2.38,92.39,176.15,12.01
135,245.83,81.45,11.02,2.33,55.37,1.03,286.85,202.99,18.96
3054,165.83,14.62,14.34,1.06,77.14,1.83,275.52,101.54,18.29


In [6]:
# Build an ordinary least-squares linear regression and fit it on the training
# split in one step; fit() returns the estimator itself, so `reg` is the fitted model.
reg = linear_model.LinearRegression().fit(X_train, y_train)
reg  # echo the fitted estimator as the cell output

In [7]:
# Use the fitted model to predict the target for the validation features.
y_pred = reg.predict(X_valid)
y_pred  # display the predicted values as the cell output

array([115.78609944, 212.41322897,  98.8440869 ,  38.03596742,
       159.68464869, 102.15693881, 100.9474695 , 147.64949703,
        95.02195315,  28.81963318, 104.99488692, 122.37748226,
       120.08959964, 159.65959647, 115.67299386, 130.41504486,
        66.44183079, 118.99195312,  35.89445455, 123.21846882,
       100.85059683, 138.64249875, 149.32291257, 139.81298316,
        77.86366416, 117.6374709 , 110.56224188,  81.83491696,
       239.64812183, 167.2752433 , 121.57273243,  36.67106793,
       120.80238895, 109.38228069,  42.2838464 , 103.9225417 ,
       109.57951327, 217.65763058, 192.96570502, 186.57988826,
        18.54619183, 110.19339318,  31.34490228,  71.81281725,
        16.70781597,  52.67775985,  40.34451464,  31.9083648 ,
         2.31218504,  70.0717444 ,  12.50452219, 244.23855735,
        78.30569582,  53.5495784 , 119.89138382, 171.97451597,
       149.53746436, 106.99522522, 135.68977127,  59.48813361,
        73.03163415,  80.07682284, 131.45612604, 165.09

In [8]:
# Evaluate the fitted model on the same data it was trained on.
print("Printing training stats")
y_pred_train = reg.predict(X_train)

# Compute each metric once, then report it.
train_mae = mean_absolute_error(y_train, y_pred_train)
train_mse = mean_squared_error(y_train, y_pred_train)
train_rmse = sqrt(train_mse)  # RMSE is the square root of the MSE
train_r2 = r2_score(y_train, y_pred_train)  # 1.0 would be a perfect prediction

print("Mean absolute error: %.2f" % train_mae)
print("Mean squared error: %.2f" % train_mse)
print("Root mean square error: %.2f" % train_rmse)
print("Coefficient of determination: %.2f" % train_r2)

Printing training stats
Mean absolute error: 20.61
Mean squared error: 795.97
Root mean square error: 28.21
Coefficient of determination: 0.80


In [9]:
# Evaluate the model on the held-out validation split, using the predictions
# stored in `y_pred`.
print("Printing testing stats")

# Compute each metric once, then report it.
valid_mae = mean_absolute_error(y_valid, y_pred)
valid_mse = mean_squared_error(y_valid, y_pred)
valid_rmse = sqrt(valid_mse)  # RMSE is the square root of the MSE
valid_r2 = r2_score(y_valid, y_pred)  # 1.0 would be a perfect prediction

print("Mean absolute error: %.2f" % valid_mae)
print("Mean squared error: %.2f" % valid_mse)
print("Root mean square error: %.2f" % valid_rmse)
print("Coefficient of determination: %.2f" % valid_r2)

Printing testing stats
Mean absolute error: 21.49
Mean squared error: 888.40
Root mean square error: 29.81
Coefficient of determination: 0.77


In [10]:
# Print the actual and predicted PM2.5 values side by side, each rounded to
# the nearest integer, one validation row per line.
# The loop variables are deliberately not named `y`: that name holds the
# target Series defined in the split cell, and reusing it here would leave a
# single float in its place once the loop finishes.
for actual, predicted in zip(y_valid, y_pred):
    print(round(actual), round(predicted))

56 116
178 212
61 99
95 38
81 160
117 102
86 101
183 148
69 95
39 29
135 105
94 122
179 120
99 160
104 116
153 130
58 66
143 119
39 36
102 123
86 101
176 139
117 149
250 140
105 78
106 118
132 111
72 82
190 240
191 167
105 122
44 37
149 121
103 109
78 42
112 104
70 110
272 218
193 193
144 187
32 19
136 110
54 31
37 72
92 17
79 53
25 40
27 32
9 2
52 70
40 13
63 244
81 78
60 54
87 120
242 172
178 150
75 107
109 136
53 59
38 73
66 80
150 131
211 165
107 72
155 155
148 104
40 3
26 36
20 26
72 65
39 38
69 50
84 31
38 33
77 120
110 121
54 90
159 177
52 38
70 41
85 145
58 66
61 29
111 102
80 67
145 93
44 60
103 102
82 51
129 104
159 167
109 128
221 234
78 97
48 39
166 146
49 70
36 57
61 53
77 105
69 66
141 117
28 46
259 161
102 120
115 59
126 114
138 123
42 39
33 26
40 25
267 208
178 69
77 90
52 116
132 105
121 97
40 88
106 133
192 177
121 111
58 45
18 45
32 24
201 174
104 127
31 32
155 171
15 3
74 70
100 61
57 27
51 55
51 49
68 98
70 107
16 15
64 50
206 146
91 112
89 60
40 44
98 85
141 149
1