In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score

  from pandas import MultiIndex, Int64Index


In [4]:
df = pd.read_csv("https://code.datasciencedojo.com/datasciencedojo/datasets/raw/master/Beijing%20PM2.5/PRSA_data_2010.1.1-2014.12.31.csv").set_index("No")
df.head()

Unnamed: 0_level_0,year,month,day,hour,pm2.5,DEWP,TEMP,PRES,cbwd,Iws,Is,Ir
No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,2010,1,1,0,,-21,-11.0,1021.0,NW,1.79,0,0
2,2010,1,1,1,,-21,-12.0,1020.0,NW,4.92,0,0
3,2010,1,1,2,,-21,-11.0,1019.0,NW,6.71,0,0
4,2010,1,1,3,,-21,-14.0,1019.0,NW,9.84,0,0
5,2010,1,1,4,,-20,-12.0,1018.0,NW,12.97,0,0


In [5]:
df["date"] = pd.to_datetime(df.year.astype(str) + '/' + df.month.astype(str) + '/' + df.day.astype(str))

In [6]:
df["date_time"] = pd.to_datetime(df.date) + df.hour.astype('timedelta64[h]')

In [7]:
df.head()

Unnamed: 0_level_0,year,month,day,hour,pm2.5,DEWP,TEMP,PRES,cbwd,Iws,Is,Ir,date,date_time
No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,2010,1,1,0,,-21,-11.0,1021.0,NW,1.79,0,0,2010-01-01,2010-01-01 00:00:00
2,2010,1,1,1,,-21,-12.0,1020.0,NW,4.92,0,0,2010-01-01,2010-01-01 01:00:00
3,2010,1,1,2,,-21,-11.0,1019.0,NW,6.71,0,0,2010-01-01,2010-01-01 02:00:00
4,2010,1,1,3,,-21,-14.0,1019.0,NW,9.84,0,0,2010-01-01,2010-01-01 03:00:00
5,2010,1,1,4,,-20,-12.0,1018.0,NW,12.97,0,0,2010-01-01,2010-01-01 04:00:00


In [8]:
df.drop(columns=["year","month","day","hour","date"],inplace=True)

In [9]:
df.head()

Unnamed: 0_level_0,pm2.5,DEWP,TEMP,PRES,cbwd,Iws,Is,Ir,date_time
No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,,-21,-11.0,1021.0,NW,1.79,0,0,2010-01-01 00:00:00
2,,-21,-12.0,1020.0,NW,4.92,0,0,2010-01-01 01:00:00
3,,-21,-11.0,1019.0,NW,6.71,0,0,2010-01-01 02:00:00
4,,-21,-14.0,1019.0,NW,9.84,0,0,2010-01-01 03:00:00
5,,-20,-12.0,1018.0,NW,12.97,0,0,2010-01-01 04:00:00


In [10]:
df["pm2.5"] = df["pm2.5"].fillna(method="bfill")

In [11]:
df.isna().sum()

pm2.5        0
DEWP         0
TEMP         0
PRES         0
cbwd         0
Iws          0
Is           0
Ir           0
date_time    0
dtype: int64

In [12]:
df.head()

Unnamed: 0_level_0,pm2.5,DEWP,TEMP,PRES,cbwd,Iws,Is,Ir,date_time
No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,129.0,-21,-11.0,1021.0,NW,1.79,0,0,2010-01-01 00:00:00
2,129.0,-21,-12.0,1020.0,NW,4.92,0,0,2010-01-01 01:00:00
3,129.0,-21,-11.0,1019.0,NW,6.71,0,0,2010-01-01 02:00:00
4,129.0,-21,-14.0,1019.0,NW,9.84,0,0,2010-01-01 03:00:00
5,129.0,-20,-12.0,1018.0,NW,12.97,0,0,2010-01-01 04:00:00


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 43824 entries, 1 to 43824
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   pm2.5      43824 non-null  float64       
 1   DEWP       43824 non-null  int64         
 2   TEMP       43824 non-null  float64       
 3   PRES       43824 non-null  float64       
 4   cbwd       43824 non-null  object        
 5   Iws        43824 non-null  float64       
 6   Is         43824 non-null  int64         
 7   Ir         43824 non-null  int64         
 8   date_time  43824 non-null  datetime64[ns]
dtypes: datetime64[ns](1), float64(4), int64(3), object(1)
memory usage: 3.3+ MB


In [14]:
label_encoder = LabelEncoder()

In [15]:
df.cbwd = label_encoder.fit_transform(df.cbwd)
df.cbwd = pd.Categorical(df.cbwd)

In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 43824 entries, 1 to 43824
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   pm2.5      43824 non-null  float64       
 1   DEWP       43824 non-null  int64         
 2   TEMP       43824 non-null  float64       
 3   PRES       43824 non-null  float64       
 4   cbwd       43824 non-null  category      
 5   Iws        43824 non-null  float64       
 6   Is         43824 non-null  int64         
 7   Ir         43824 non-null  int64         
 8   date_time  43824 non-null  datetime64[ns]
dtypes: category(1), datetime64[ns](1), float64(4), int64(3)
memory usage: 3.1 MB


In [17]:
sscaler = StandardScaler()

In [18]:
cols_standard = ["DEWP","TEMP","PRES","Iws","Ir","Is"]

In [19]:
df[cols_standard] = sscaler.fit_transform(df[cols_standard])

In [20]:
df.head()

Unnamed: 0_level_0,pm2.5,DEWP,TEMP,PRES,cbwd,Iws,Is,Ir,date_time
No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,129.0,-1.580878,-1.92225,0.443328,1,-0.441894,-0.069353,-0.137667,2010-01-01 00:00:00
2,129.0,-1.580878,-2.004228,0.345943,1,-0.379306,-0.069353,-0.137667,2010-01-01 01:00:00
3,129.0,-1.580878,-1.92225,0.248559,1,-0.343514,-0.069353,-0.137667,2010-01-01 02:00:00
4,129.0,-1.580878,-2.168183,0.248559,1,-0.280926,-0.069353,-0.137667,2010-01-01 03:00:00
5,129.0,-1.511594,-2.004228,0.151174,1,-0.218339,-0.069353,-0.137667,2010-01-01 04:00:00


In [21]:
df.set_index("date_time",inplace=True)

In [22]:
X = df.drop(columns="pm2.5")
y = df["pm2.5"]

In [23]:
X.head()

Unnamed: 0_level_0,DEWP,TEMP,PRES,cbwd,Iws,Is,Ir
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2010-01-01 00:00:00,-1.580878,-1.92225,0.443328,1,-0.441894,-0.069353,-0.137667
2010-01-01 01:00:00,-1.580878,-2.004228,0.345943,1,-0.379306,-0.069353,-0.137667
2010-01-01 02:00:00,-1.580878,-1.92225,0.248559,1,-0.343514,-0.069353,-0.137667
2010-01-01 03:00:00,-1.580878,-2.168183,0.248559,1,-0.280926,-0.069353,-0.137667
2010-01-01 04:00:00,-1.511594,-2.004228,0.151174,1,-0.218339,-0.069353,-0.137667


In [24]:
y.head()

date_time
2010-01-01 00:00:00    129.0
2010-01-01 01:00:00    129.0
2010-01-01 02:00:00    129.0
2010-01-01 03:00:00    129.0
2010-01-01 04:00:00    129.0
Name: pm2.5, dtype: float64

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.1)

In [31]:
list_of_model = [SVR(), DecisionTreeRegressor(), RandomForestRegressor(), GradientBoostingRegressor(), MLPRegressor(), LGBMRegressor()]

In [32]:
def model_comp():
    score_list = list()
    for model in list_of_model:
        estimator = model.fit(X_train,y_train)
        y_pred = estimator.predict(X_test)
        mse_score = mean_squared_error(y_test,y_pred)
        #cv_score = cross_val_score(model,X,y,cv=5)
        score_list.append({
            "model" : model,
            "mean_squared_error" : mse_score,
        })
    result_df = pd.DataFrame(score_list,columns=["model","mean_squared_error"])
    return result_df

In [33]:
result_df = model_comp()



In [40]:
result_df["mean_squared_error"]**0.5

0    76.181218
1    91.624280
2    67.893045
3    70.318720
4    71.093317
5    67.325132
Name: mean_squared_error, dtype: float64