In [143]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
import warnings

In [144]:
# Import dataset 
df = pd.read_csv("data/train.csv")

In [145]:
df.head()

Unnamed: 0,date_time,is_holiday,humidity,wind_speed,wind_direction,visibility_in_miles,dew_point,temperature,rain_p_h,snow_p_h,clouds_all,weather_type,air_pollution_index,traffic_volume
0,2012-10-02 09:00:00,,89,2,329,1,1,288.28,0.0,0.0,40,Clouds,121,5545
1,2012-10-02 10:00:00,,67,3,330,1,1,289.36,0.0,0.0,75,Clouds,178,4516
2,2012-10-02 11:00:00,,66,3,329,2,2,289.58,0.0,0.0,90,Clouds,113,4767
3,2012-10-02 12:00:00,,66,3,329,5,5,290.13,0.0,0.0,90,Clouds,20,5026
4,2012-10-02 13:00:00,,65,3,329,7,7,291.14,0.0,0.0,75,Clouds,281,4918


In [146]:
# Detecting outliers in column by z_score method as it close to normal distribution
def detect_outliers_by_z_score(data):
    outliers = []
    threshold = 3
    std_val  = data.std()
    mean_val = data.mean()

    for value in data:
        z_score = (value-mean_val)/std_val
        if abs(z_score) > 3:
            outliers.append(value)
    return outliers


# Detecting outliers in column by IQR method as it is not normally distributed
def outliers_by_iqr(data):
    outliers = []
    
    percentile_25 = np.quantile(data, 0.25)
    percentile_75 = np.quantile(data, 0.75)
    IQR           = percentile_75 - percentile_25
    upper_limit   = percentile_75 + (1.5 * IQR)
    lower_limit   = percentile_25 - (1.5 * IQR)
    print("Upper Limit: ", upper_limit)
    print("Lower Limit: ", lower_limit)
    
    for item in data:
        if (item > upper_limit) or (item < lower_limit):
            outliers.append(item)
            
    return outliers, upper_limit, lower_limit

In [147]:
#------------------------- Handling "humidity" columns outliers --------------------------
out = detect_outliers_by_z_score(df['humidity'])
# Capping "humidity" column outliers by upper_limit and lower_limit
upper_limit = df['humidity'].mean() + (3*df['humidity'].std())
lower_limit = df['humidity'].mean() - (3*df['humidity'].std())
df['humidity'] = np.where( df['humidity']>upper_limit, upper_limit, np.where( df['humidity']<lower_limit, lower_limit, df['humidity'] ) )

#------------------------- Handling "wind_speed" columns outliers --------------------------
outlier_data, iqr_upper_limit, iqr_lower_limit = outliers_by_iqr(df['wind_speed'])
# Capping outliers of "wind_speed" column
df['wind_speed'] = np.where( (df['wind_speed']>iqr_upper_limit), iqr_upper_limit, np.where(df['wind_speed']<iqr_lower_limit, iqr_lower_limit, df['wind_speed']))


#------------------------- Handling "temperature" columns outliers --------------------------
outlier_data, iqr_upper_limit, iqr_lower_limit = outliers_by_iqr(df['temperature'])
# Capping outliers of "temperature" column
df['temperature'] = np.where( (df['temperature']>iqr_upper_limit), iqr_upper_limit, np.where(df['temperature']<iqr_lower_limit, iqr_lower_limit, df['temperature']))


Upper Limit:  9.5
Lower Limit:  -2.5
Upper Limit:  318.96999999999997
Lower Limit:  243.37000000000006


In [148]:
# Dropping columns -> "rain_p_h", "snow_p_h", "dew_point"
df.drop(columns=['rain_p_h', 'snow_p_h', 'dew_point'], inplace=True)

In [149]:
# Creating new columns from "date_time"
df['date_time'] = pd.to_datetime(df['date_time'])

In [150]:
# Createing 3 new columns "date_year", "date_month", "date_day"
df['date_year']  = df['date_time'].dt.year
df['date_month'] = df['date_time'].dt.month
df['date_day']   = df['date_time'].dt.day

In [151]:
# Dropping date_time column
df.drop(columns=['date_time'], inplace=True)

In [152]:
df['is_holiday'] = np.where(df['is_holiday'] == "None", 0, 1)

In [153]:
# train test split 
X = df.drop("air_pollution_index", axis=1)
y = df['air_pollution_index']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [154]:
X_train.shape, X_test.shape

((27000, 12), (6750, 12))

In [155]:
# num_columns = [ col for col in X_train.columns if X_train[col].dtype != "O"]
# cat_columns = [ col for col in X_train.columns if X_train[col].dtype == "O"]

num_columns = X_train.select_dtypes(exclude="object").columns
cat_columns = X_train.select_dtypes(include="object").columns
num_columns, cat_columns

(Index(['is_holiday', 'humidity', 'wind_speed', 'wind_direction',
        'visibility_in_miles', 'temperature', 'clouds_all', 'traffic_volume',
        'date_year', 'date_month', 'date_day'],
       dtype='object'),
 Index(['weather_type'], dtype='object'))

In [156]:
df.head()

Unnamed: 0,is_holiday,humidity,wind_speed,wind_direction,visibility_in_miles,temperature,clouds_all,weather_type,air_pollution_index,traffic_volume,date_year,date_month,date_day
0,0,89.0,2.0,329,1,288.28,40,Clouds,121,5545,2012,10,2
1,0,67.0,3.0,330,1,289.36,75,Clouds,178,4516,2012,10,2
2,0,66.0,3.0,329,2,289.58,90,Clouds,113,4767,2012,10,2
3,0,66.0,3.0,329,5,290.13,90,Clouds,20,5026,2012,10,2
4,0,65.0,3.0,329,7,291.14,75,Clouds,281,4918,2012,10,2


# Without pipeline and column transformer

In [157]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler

ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
X_train_ohe = ohe.fit_transform(X_train[cat_columns])
X_test_ohe  = ohe.transform(X_test[cat_columns])

In [158]:
# pd.DataFrame(X_train_ohe)

In [159]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train[num_columns])
X_test_scaled  = scaler.transform(X_test[num_columns])

In [160]:
#  pd.DataFrame(X_train_scaled)
X_train_ohe.shape, X_train_scaled.shape

((27000, 11), (27000, 11))

In [161]:
# Concatenating the arrays(X_train_ohe, X_train_scaled) along columns
X_train_final_arr = np.concatenate( (X_train_ohe, X_train_scaled), axis=1)
X_test_final_arr  = np.concatenate( (X_test_ohe, X_test_scaled), axis=1)

In [162]:
# pd.DataFrame(X_train_final_arr)
X_train_final_arr.shape, X_test_final_arr.shape

((27000, 22), (6750, 22))

# With Pipeline and Column Transformer

In [163]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

pipeline_1 = Pipeline([ ("ohe", ohe) ])

pipeline_2 = Pipeline([ ("scaler", scaler) ])

ct = ColumnTransformer([
    ("pipeline_1", pipeline_1, cat_columns),
    ("pipeline_2", pipeline_2, num_columns),
])

In [164]:
X_train_transformed = ct.fit_transform(X_train)
X_test_transformed  = ct.transform(X_test)

In [165]:
# pd.DataFrame(X_train_transformed)
X_train_transformed.shape, X_test_transformed.shape

((27000, 22), (6750, 22))

# Creating Model

In [166]:
models = {
    "KNN": KNeighborsRegressor(),
    "DTR": DecisionTreeRegressor(),
    "RFR": RandomForestRegressor(),
    "ABR": AdaBoostRegressor(),
    "SVR": SVR(),
    "LR" : LinearRegression(),
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    "CB": CatBoostRegressor(),
    "XGB": XGBRegressor()
}

In [167]:
def evalute_model(y_test, y_pred):
    mse  = mean_squared_error(y_test, y_pred)
    mae  = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2_scor = r2_score(y_test, y_pred)
    
    return r2_scor, rmse, mae

In [168]:
model_list = []
r2_list    = []
rmse_list  = []
mae_list   = []

for key in models.keys():
    
    model = models[key]
    # Training model
    model.fit(X_train_transformed, y_train)
    y_pred = model.predict(X_test_transformed)
    r2_scr, rmse, mae = evalute_model(y_test, y_pred)
    
    print(f"---------------------- Model Name: {key} ----------------------------")
    print("R2 Score: ", r2_scr)
    print("RMSE: ", rmse)
    print("MAE: ", mae)
    print()
        
    model_list.append(key)
    r2_list.append(r2_scr)
    rmse_list.append(rmse)
    mae_list.append(mae)


---------------------- Model Name: KNN ----------------------------
R2 Score:  -0.21192166485150876
RMSE:  92.91451567814205
MAE:  78.32005925925927

---------------------- Model Name: DTR ----------------------------
R2 Score:  -1.0187859048142083
RMSE:  119.9199461369187
MAE:  98.00237037037037

---------------------- Model Name: RFR ----------------------------
R2 Score:  -0.05445469136019754
RMSE:  86.66829532854304
MAE:  74.78498222222223

---------------------- Model Name: ABR ----------------------------
R2 Score:  -0.0008716003127751115
RMSE:  84.43752132145907
MAE:  73.3920324175419

---------------------- Model Name: SVR ----------------------------
R2 Score:  -0.002773064053630092
RMSE:  84.5176907965454
MAE:  73.41457142150371

---------------------- Model Name: LR ----------------------------
R2 Score:  -0.0012320737644238022
RMSE:  84.45272544187557
MAE:  73.38715451388889

---------------------- Model Name: Ridge ----------------------------
R2 Score:  -0.001180781732092

144:	learn: 82.1979124	total: 1.25s	remaining: 7.39s
145:	learn: 82.1896018	total: 1.26s	remaining: 7.38s
146:	learn: 82.1800774	total: 1.27s	remaining: 7.37s
147:	learn: 82.1693145	total: 1.28s	remaining: 7.36s
148:	learn: 82.1622139	total: 1.29s	remaining: 7.35s
149:	learn: 82.1477315	total: 1.3s	remaining: 7.34s
150:	learn: 82.1374338	total: 1.3s	remaining: 7.33s
151:	learn: 82.1278569	total: 1.31s	remaining: 7.33s
152:	learn: 82.1185933	total: 1.32s	remaining: 7.32s
153:	learn: 82.1102314	total: 1.33s	remaining: 7.31s
154:	learn: 82.1021755	total: 1.34s	remaining: 7.3s
155:	learn: 82.0949642	total: 1.35s	remaining: 7.29s
156:	learn: 82.0866716	total: 1.35s	remaining: 7.28s
157:	learn: 82.0770785	total: 1.36s	remaining: 7.27s
158:	learn: 82.0684711	total: 1.37s	remaining: 7.26s
159:	learn: 82.0587204	total: 1.38s	remaining: 7.25s
160:	learn: 82.0471383	total: 1.39s	remaining: 7.24s
161:	learn: 82.0360812	total: 1.4s	remaining: 7.23s
162:	learn: 82.0268932	total: 1.41s	remaining: 7.2

320:	learn: 80.4106495	total: 2.75s	remaining: 5.82s
321:	learn: 80.4002264	total: 2.76s	remaining: 5.81s
322:	learn: 80.3942694	total: 2.77s	remaining: 5.8s
323:	learn: 80.3886885	total: 2.77s	remaining: 5.79s
324:	learn: 80.3790109	total: 2.78s	remaining: 5.78s
325:	learn: 80.3706081	total: 2.79s	remaining: 5.77s
326:	learn: 80.3655411	total: 2.8s	remaining: 5.76s
327:	learn: 80.3551062	total: 2.81s	remaining: 5.75s
328:	learn: 80.3473950	total: 2.82s	remaining: 5.75s
329:	learn: 80.3363014	total: 2.83s	remaining: 5.74s
330:	learn: 80.3235182	total: 2.83s	remaining: 5.73s
331:	learn: 80.3119652	total: 2.84s	remaining: 5.72s
332:	learn: 80.3006710	total: 2.85s	remaining: 5.71s
333:	learn: 80.2917037	total: 2.86s	remaining: 5.7s
334:	learn: 80.2843459	total: 2.87s	remaining: 5.69s
335:	learn: 80.2735566	total: 2.88s	remaining: 5.68s
336:	learn: 80.2593356	total: 2.88s	remaining: 5.68s
337:	learn: 80.2502843	total: 2.89s	remaining: 5.67s
338:	learn: 80.2408234	total: 2.9s	remaining: 5.6

494:	learn: 78.8440956	total: 4.24s	remaining: 4.32s
495:	learn: 78.8364323	total: 4.24s	remaining: 4.31s
496:	learn: 78.8287860	total: 4.25s	remaining: 4.3s
497:	learn: 78.8204293	total: 4.26s	remaining: 4.29s
498:	learn: 78.8118657	total: 4.27s	remaining: 4.29s
499:	learn: 78.8066213	total: 4.28s	remaining: 4.28s
500:	learn: 78.7949653	total: 4.29s	remaining: 4.27s
501:	learn: 78.7895488	total: 4.3s	remaining: 4.26s
502:	learn: 78.7841771	total: 4.3s	remaining: 4.25s
503:	learn: 78.7778733	total: 4.31s	remaining: 4.24s
504:	learn: 78.7686466	total: 4.32s	remaining: 4.24s
505:	learn: 78.7602770	total: 4.33s	remaining: 4.23s
506:	learn: 78.7503815	total: 4.34s	remaining: 4.22s
507:	learn: 78.7413807	total: 4.35s	remaining: 4.21s
508:	learn: 78.7367365	total: 4.36s	remaining: 4.2s
509:	learn: 78.7297861	total: 4.36s	remaining: 4.19s
510:	learn: 78.7168072	total: 4.37s	remaining: 4.18s
511:	learn: 78.7074657	total: 4.38s	remaining: 4.17s
512:	learn: 78.6971096	total: 4.39s	remaining: 4.1

669:	learn: 77.4875124	total: 5.73s	remaining: 2.82s
670:	learn: 77.4813073	total: 5.74s	remaining: 2.81s
671:	learn: 77.4723430	total: 5.75s	remaining: 2.8s
672:	learn: 77.4648674	total: 5.75s	remaining: 2.79s
673:	learn: 77.4610014	total: 5.76s	remaining: 2.79s
674:	learn: 77.4537124	total: 5.77s	remaining: 2.78s
675:	learn: 77.4430042	total: 5.78s	remaining: 2.77s
676:	learn: 77.4335840	total: 5.79s	remaining: 2.76s
677:	learn: 77.4247032	total: 5.8s	remaining: 2.75s
678:	learn: 77.4165669	total: 5.8s	remaining: 2.74s
679:	learn: 77.4062021	total: 5.81s	remaining: 2.74s
680:	learn: 77.4027721	total: 5.82s	remaining: 2.73s
681:	learn: 77.3924407	total: 5.83s	remaining: 2.72s
682:	learn: 77.3832711	total: 5.84s	remaining: 2.71s
683:	learn: 77.3730460	total: 5.85s	remaining: 2.7s
684:	learn: 77.3651321	total: 5.86s	remaining: 2.69s
685:	learn: 77.3602580	total: 5.87s	remaining: 2.68s
686:	learn: 77.3519326	total: 5.87s	remaining: 2.68s
687:	learn: 77.3439860	total: 5.88s	remaining: 2.6

840:	learn: 76.2273114	total: 7.2s	remaining: 1.36s
841:	learn: 76.2177232	total: 7.21s	remaining: 1.35s
842:	learn: 76.2065733	total: 7.21s	remaining: 1.34s
843:	learn: 76.1951166	total: 7.22s	remaining: 1.33s
844:	learn: 76.1841323	total: 7.23s	remaining: 1.33s
845:	learn: 76.1774541	total: 7.24s	remaining: 1.32s
846:	learn: 76.1681882	total: 7.25s	remaining: 1.31s
847:	learn: 76.1622118	total: 7.26s	remaining: 1.3s
848:	learn: 76.1578848	total: 7.26s	remaining: 1.29s
849:	learn: 76.1492799	total: 7.27s	remaining: 1.28s
850:	learn: 76.1412163	total: 7.28s	remaining: 1.27s
851:	learn: 76.1342123	total: 7.29s	remaining: 1.27s
852:	learn: 76.1301440	total: 7.3s	remaining: 1.26s
853:	learn: 76.1231666	total: 7.31s	remaining: 1.25s
854:	learn: 76.1163476	total: 7.32s	remaining: 1.24s
855:	learn: 76.1127240	total: 7.32s	remaining: 1.23s
856:	learn: 76.1049353	total: 7.33s	remaining: 1.22s
857:	learn: 76.1001164	total: 7.34s	remaining: 1.21s
858:	learn: 76.0930557	total: 7.35s	remaining: 1.

---------------------- Model Name: CB ----------------------------
R2 Score:  -0.03149538850690581
RMSE:  85.71956100396436
MAE:  74.28574172698389

---------------------- Model Name: XGB ----------------------------
R2 Score:  -0.06910544236556282
RMSE:  87.26830946788738
MAE:  75.17179770575629



In [173]:
result_df = pd.DataFrame({'Model': model_list, "R2 Score": r2_list, "RMSE": rmse_list, "MAE": mae_list}).sort_values(by='R2 Score', ascending=False)

In [174]:
result_df

Unnamed: 0,Model,R2 Score,RMSE,MAE
7,Lasso,-1.5e-05,84.401371,73.344634
3,ABR,-0.000872,84.437521,73.392032
6,Ridge,-0.001181,84.450562,73.385342
5,LR,-0.001232,84.452725,73.387155
4,SVR,-0.002773,84.517691,73.414571
8,CB,-0.031495,85.719561,74.285742
2,RFR,-0.054455,86.668295,74.784982
9,XGB,-0.069105,87.268309,75.171798
0,KNN,-0.211922,92.914516,78.320059
1,DTR,-1.018786,119.919946,98.00237
