In [1]:
import numpy as np     #imported all library
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from math import sqrt

In [2]:
# Location of the pre-processed air-quality table (CSV hosted on GitHub).
DATA_URL = "https://raw.githubusercontent.com/vis2208/Air-Pollution-/main/Processed_data.csv"

# Load the whole table into memory, then preview the first five rows
# (five is the default for head()).
df = pd.read_csv(DATA_URL)
df.head()

Unnamed: 0,Date,PM2.5,PM10,NO2,SO2,CO,RH,WS,WD,SR,AT
0,03/01/2018,287.71,488.92,86.46,20.49,1.82,67.28,0.32,248.7,167.41,12.41
1,15/01/2018,240.96,515.83,101.49,20.95,1.84,50.45,0.71,264.26,169.86,15.98
2,17/01/2018,294.38,402.21,72.23,17.61,1.2,64.78,1.19,88.52,166.5,14.02
3,23/01/2018,196.4,322.72,86.8,24.86,2.09,61.87,1.82,114.02,149.21,14.05
4,24/01/2018,162.97,239.81,72.06,25.29,2.08,66.09,0.71,240.56,198.31,13.42


In [4]:
# Split the data 80:20 into training and validation sets.
# NOTE: only these two partitions are created; no separate test set is carved
# out, so the validation rows are the only held-out data in the cells shown.
train_size = 0.8

# Features: every column except the target ('PM2.5') and the 'Date' column.
X = df.drop(columns=['PM2.5', 'Date']).copy()
# Target: the PM2.5 reading.
y = df['PM2.5']

# A fixed random_state makes the row assignment identical on every run.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=train_size, random_state=10
)

# Sanity-check the partition sizes (rows, columns).
print(X_train.shape)
print(y_train.shape)
print(X_valid.shape)
print(y_valid.shape)

(2581, 9)
(2581,)
(646, 9)
(646,)


In [5]:
# Display the validation features. The split above was made with
# random_state = 10, so the same rows are selected on every run; without a
# fixed seed the selection would differ from run to run.
X_valid

Unnamed: 0,PM10,NO2,SO2,CO,RH,WS,WD,SR,AT
1246,140.73,24.42,12.25,0.87,79.17,1.21,273.12,187.49,31.92
820,301.24,47.91,25.00,1.79,75.10,1.27,220.29,91.27,17.72
2427,237.41,75.56,8.82,1.55,56.73,0.59,196.02,127.59,22.54
2580,116.12,23.65,12.98,0.53,34.98,1.35,129.51,236.43,29.44
2174,370.99,73.48,16.07,2.14,78.76,1.33,117.06,104.04,18.32
...,...,...,...,...,...,...,...,...,...
2621,221.91,25.99,4.34,1.51,53.48,1.61,81.07,163.73,35.24
1343,207.04,16.09,17.55,0.93,80.85,1.24,95.97,85.20,15.64
845,146.70,22.76,29.19,0.87,65.13,1.06,291.82,202.58,27.23
2616,218.53,30.04,8.10,1.58,54.93,0.82,148.78,161.47,33.43


In [6]:
# A single extremely-randomised regression tree, limited to depth 8 and seeded
# so that repeated runs build the same tree.
reg = tree.ExtraTreeRegressor(
    max_depth=8,
    random_state=10,
)

# Fit on the training partition; as the last expression of the cell, the fitted
# estimator is also echoed as the cell output.
reg.fit(X_train, y_train)

In [7]:
# Predict PM2.5 for the validation features with the fitted tree.
y_pred = reg.predict(X_valid)   
y_pred                # bare last expression: shows the array of predictions as the cell output

array([ 39.00020134, 145.20649123, 102.12034884,  51.31707916,
       211.34818182,  60.98166667,  97.88857807, 171.26277778,
        43.74583333,  56.53809091,  86.39964789, 123.802     ,
       224.12060606,  35.03348438, 224.12060606,  39.00020134,
        26.80933333,  82.91888889,  97.88857807, 104.19659574,
        97.88857807,  86.39964789,  97.88857807,  70.03      ,
        43.74583333,  97.88857807, 102.12034884,  39.00020134,
        35.03348438, 258.636     , 220.18      ,  97.88857807,
        86.39964789, 148.20833333,  46.76333333,  70.18846154,
        44.62034483,  26.2825    , 123.802     , 102.12034884,
        26.80933333, 147.770625  ,  39.00020134,  86.39964789,
        77.538     , 175.52192982, 108.85333333,  97.88857807,
        56.53809091,  39.00020134,  44.86814815, 148.20833333,
       145.20649123,  86.39964789, 102.12034884,  93.62071429,
        44.62034483,  64.195     , 141.859375  , 171.26277778,
        64.195     ,  50.39166667, 166.34466667,  56.53

In [8]:
# Score the model on the rows it was trained on, to compare against the
# held-out scores and gauge over-fitting.
print("Printing training stats")
y_pred_train = reg.predict(X_train)

train_mae = mean_absolute_error(y_train, y_pred_train)
train_mse = mean_squared_error(y_train, y_pred_train)
train_rmse = sqrt(train_mse)  # RMSE is the square root of the MSE computed above
train_r2 = r2_score(y_train, y_pred_train)  # 1.0 would be a perfect prediction

print("Mean absolute error: %.2f" % train_mae)
print("Mean squared error: %.2f" % train_mse)
print("Root mean square error: %.2f" % train_rmse)
print("Coefficient of determination: %.2f" % train_r2)

Printing training stats
Mean absolute error: 18.18
Mean squared error: 662.85
Root mean square error: 25.75
Coefficient of determination: 0.83


In [9]:
# Score the model on the held-out validation rows (y_valid vs. the predictions
# stored in y_pred).
print("Printing testing stats")

valid_mae = mean_absolute_error(y_valid, y_pred)
valid_mse = mean_squared_error(y_valid, y_pred)
valid_rmse = sqrt(valid_mse)  # RMSE is the square root of the MSE computed above
valid_r2 = r2_score(y_valid, y_pred)  # 1.0 would be a perfect prediction

print("Mean absolute error: %.2f" % valid_mae)
print("Mean squared error: %.2f" % valid_mse)
print("Root mean square error: %.2f" % valid_rmse)
print("Coefficient of determination: %.2f" % valid_r2)

Printing testing stats
Mean absolute error: 21.48
Mean squared error: 943.35
Root mean square error: 30.71
Coefficient of determination: 0.75


In [None]:
# Print the rounded actual and predicted PM2.5 side by side for every
# validation row.
# The loop variables are deliberately not named `y`: a cell-level loop variable
# stays in the notebook namespace, so `y` would rebind the target Series `y`
# (created for the train/validation split) to a single scalar.
for actual, predicted in zip(y_valid, y_pred):
  print(round(actual), round(predicted))

18 16
86 86
226 238
194 234
47 79
57 61
97 151
29 30
172 120
120 103
159 151
178 147
111 103
98 103
122 103
134 103
185 171
18 16
98 103
141 120
35 79
185 171
205 173
76 73
81 106
235 146
188 177
303 286
153 129
38 48
256 299
26 16
43 48
99 86
100 106
192 129
54 50
126 103
9 24
38 41
330 242
75 103
233 171
40 50
169 171
89 120
90 103
228 236
100 106
330 303
231 312
55 50
197 173
40 48
61 48
81 86
114 106
128 153
239 242
111 153
74 77
139 151
333 227
111 120
38 48
58 57
94 77
52 58
163 103
63 79
113 106
137 151
37 32
165 103
41 46
69 73
159 103
70 77
49 35
209 293
119 77
80 151
56 57
52 57
31 38
180 179
73 58
36 50
115 79
180 312
264 227
159 173
185 58
188 173
95 108
192 153
40 20
203 173
162 261
68 79
253 256
155 173
49 48
33 32
63 57
124 103
323 238
58 103
27 20
71 46
180 173
79 103
83 81
246 248
74 79
63 57
