In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
import warnings

In [2]:
# Load the training dataset; the path is relative to the notebook's working directory
df = pd.read_csv("data/train.csv")

In [3]:
# Preview the first rows of the freshly loaded dataset
df.head()

Unnamed: 0,date_time,is_holiday,humidity,wind_speed,wind_direction,visibility_in_miles,dew_point,temperature,rain_p_h,snow_p_h,clouds_all,weather_type,air_pollution_index,traffic_volume
0,2012-10-02 09:00:00,,89,2,329,1,1,288.28,0.0,0.0,40,Clouds,121,5545
1,2012-10-02 10:00:00,,67,3,330,1,1,289.36,0.0,0.0,75,Clouds,178,4516
2,2012-10-02 11:00:00,,66,3,329,2,2,289.58,0.0,0.0,90,Clouds,113,4767
3,2012-10-02 12:00:00,,66,3,329,5,5,290.13,0.0,0.0,90,Clouds,20,5026
4,2012-10-02 13:00:00,,65,3,329,7,7,291.14,0.0,0.0,75,Clouds,281,4918


In [4]:
# Detect outliers with the z-score method (suited to columns that are close to normally distributed)
def detect_outliers_by_z_score(data, threshold=3):
    """Return the values of ``data`` whose absolute z-score exceeds ``threshold``.

    Parameters
    ----------
    data : iterable exposing ``.mean()`` and ``.std()``
        For example a pandas Series or a NumPy array. The spread estimate is
        whatever the container's own ``.std()`` returns.
    threshold : float, default 3
        Cut-off on ``abs(z_score)``; values strictly above it are reported.

    Returns
    -------
    list
        Outlying values in their original order. Empty when the standard
        deviation is zero or NaN, because no z-score is defined in that case.
    """
    mean_val = data.mean()
    std_val = data.std()

    # A constant column (std == 0) or a single observation (std is NaN) has no
    # meaningful z-score; bail out instead of dividing by zero.
    if not std_val > 0:
        return []

    return [
        value
        for value in data
        if abs((value - mean_val) / std_val) > threshold
    ]


# Detect outliers with the IQR method (suited to columns that are not normally distributed)
def outliers_by_iqr(data):
    """Find values lying outside the 1.5 * IQR fences of ``data``.

    The two fences are printed as a side effect.

    Returns
    -------
    tuple
        ``(outliers, upper_limit, lower_limit)`` where ``outliers`` is a list
        of the offending values in their original order.
    """
    q1 = np.quantile(data, 0.25)
    q3 = np.quantile(data, 0.75)
    spread = q3 - q1

    # Tukey fences: 1.5 inter-quartile ranges beyond each quartile
    upper_limit = q3 + (1.5 * spread)
    lower_limit = q1 - (1.5 * spread)

    print("Upper Limit: ", upper_limit)
    print("Lower Limit: ", lower_limit)

    flagged = [
        item
        for item in data
        if (item > upper_limit) or (item < lower_limit)
    ]
    return flagged, upper_limit, lower_limit

In [5]:
#------------------------- Handling "humidity" columns outliers --------------------------
out = detect_outliers_by_z_score(df['humidity'])
# Cap "humidity" to mean +/- 3 standard deviations; both bounds are computed
# before the column is overwritten.
humidity_mean, humidity_std = df['humidity'].mean(), df['humidity'].std()
upper_limit = humidity_mean + (3*humidity_std)
lower_limit = humidity_mean - (3*humidity_std)
df['humidity'] = np.where(
    df['humidity'] > upper_limit,
    upper_limit,
    np.where(df['humidity'] < lower_limit, lower_limit, df['humidity']),
)

#------------------------- Handling "wind_speed" and "temperature" columns outliers --------------------------
# Both columns get the same treatment: compute the IQR fences, then cap values to them.
for column in ('wind_speed', 'temperature'):
    outlier_data, iqr_upper_limit, iqr_lower_limit = outliers_by_iqr(df[column])
    df[column] = np.where(
        df[column] > iqr_upper_limit,
        iqr_upper_limit,
        np.where(df[column] < iqr_lower_limit, iqr_lower_limit, df[column]),
    )


Upper Limit:  9.5
Lower Limit:  -2.5
Upper Limit:  318.96999999999997
Lower Limit:  243.37000000000006


In [6]:
# Remove the "rain_p_h", "snow_p_h" and "dew_point" columns from the frame (in place)
df.drop(['rain_p_h', 'snow_p_h', 'dew_point'], axis=1, inplace=True)

In [7]:
# Parse "date_time" into pandas datetimes (no new column is created here);
# this is what makes the `.dt` accessor available on the column.
df['date_time'] = pd.to_datetime(df['date_time'])

In [8]:
# Derive three calendar features from "date_time": "date_year", "date_month", "date_day"
date_parts = df['date_time'].dt
df['date_year'] = date_parts.year
df['date_month'] = date_parts.month
df['date_day'] = date_parts.day

In [9]:
# The raw timestamp is no longer needed once its parts have been extracted
df.drop('date_time', axis=1, inplace=True)

In [10]:
# Encode "is_holiday" as 0 (regular day) / 1 (holiday).
# The original check compared against the string "None" only. pandas >= 2.0
# treats the CSV token "None" as a missing value by default, so those rows
# arrive as NaN and the string comparison alone would mark every row as a
# holiday. Accept both representations of "no holiday".
# NOTE: run this cell once per load; re-running on the already-encoded 0/1
# column would map every row to 1.
is_not_holiday = df['is_holiday'].isna() | (df['is_holiday'] == "None")
df['is_holiday'] = np.where(is_not_holiday, 0, 1)

In [11]:
# Separate the target ("air_pollution_index") from the predictors,
# then hold out 20% of the rows for testing with a fixed seed.
y = df['air_pollution_index']
X = df.drop(columns=["air_pollution_index"])

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [12]:
# Row/column counts of the train and test feature sets
X_train.shape, X_test.shape

((27000, 12), (6750, 12))

In [13]:
# Partition the feature names by dtype: object columns are treated as
# categorical, everything else as numeric.
cat_columns = X_train.select_dtypes(include=["object"]).columns
num_columns = X_train.select_dtypes(exclude=["object"]).columns
num_columns, cat_columns

(Index(['is_holiday', 'humidity', 'wind_speed', 'wind_direction',
        'visibility_in_miles', 'temperature', 'clouds_all', 'traffic_volume',
        'date_year', 'date_month', 'date_day'],
       dtype='object'),
 Index(['weather_type'], dtype='object'))

In [14]:
# Preview the preprocessed frame
df.head()

Unnamed: 0,is_holiday,humidity,wind_speed,wind_direction,visibility_in_miles,temperature,clouds_all,weather_type,air_pollution_index,traffic_volume,date_year,date_month,date_day
0,0,89.0,2.0,329,1,288.28,40,Clouds,121,5545,2012,10,2
1,0,67.0,3.0,330,1,289.36,75,Clouds,178,4516,2012,10,2
2,0,66.0,3.0,329,2,289.58,90,Clouds,113,4767,2012,10,2
3,0,66.0,3.0,329,5,290.13,90,Clouds,20,5026,2012,10,2
4,0,65.0,3.0,329,7,291.14,75,Clouds,281,4918,2012,10,2


# Without pipeline and column transformer

In [15]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# One-hot encode the categorical columns as a dense array. The encoder learns
# its categories from the training split only; categories first seen in the
# test split are ignored rather than raising.
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
X_train_ohe = ohe.fit(X_train[cat_columns]).transform(X_train[cat_columns])
X_test_ohe = ohe.transform(X_test[cat_columns])

In [16]:
# pd.DataFrame(X_train_ohe)

In [17]:
# Standardise the numeric columns using statistics learned from the training split only
scaler = StandardScaler()
X_train_scaled = scaler.fit(X_train[num_columns]).transform(X_train[num_columns])
X_test_scaled = scaler.transform(X_test[num_columns])

In [18]:
# Shapes of the one-hot encoded and the scaled training arrays
X_train_ohe.shape, X_train_scaled.shape

((27000, 11), (27000, 11))

In [19]:
# Join the encoded categorical block and the scaled numeric block side by side
# (column-wise), keeping the same block order for train and test.
X_train_final_arr = np.concatenate([X_train_ohe, X_train_scaled], axis=1)
X_test_final_arr = np.concatenate([X_test_ohe, X_test_scaled], axis=1)

In [20]:
# Shapes of the manually assembled train and test feature matrices
X_train_final_arr.shape, X_test_final_arr.shape

((27000, 22), (6750, 22))

# With Pipeline and Column Transformer

In [21]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# One single-step pipeline per column group, wrapping the encoder and scaler
# objects created earlier in the notebook.
pipeline_1 = Pipeline(steps=[("ohe", ohe)])
pipeline_2 = Pipeline(steps=[("scaler", scaler)])

# Route categorical columns through the encoder and numeric columns through
# the scaler; the listing order fixes the column order of the output.
ct = ColumnTransformer(
    transformers=[
        ("pipeline_1", pipeline_1, cat_columns),
        ("pipeline_2", pipeline_2, num_columns),
    ]
)

In [22]:
# Fit the column transformer on the training split, then apply the fitted
# transformer (no refitting) to the test split.
X_train_transformed = ct.fit_transform(X_train)
X_test_transformed  = ct.transform(X_test)

In [23]:
# Shapes of the train and test matrices produced by the column transformer
X_train_transformed.shape, X_test_transformed.shape

((27000, 22), (6750, 22))

# Creating Model

In [None]:
# Hyperparameter tuning for Random Forest via exhaustive grid search.
# The grid has 10 * 4 * 3 = 120 combinations, each fitted on 3 CV folds
# (360 fits plus the final refit), so this cell can take a long time.
model = RandomForestRegressor()
params = {
    "n_estimators": [10,20,30,40,50,60,70,80,90,100],
    "criterion":    ["squared_error", "absolute_error", "friedman_mse", "poisson"],
    "max_features": ["sqrt", "log2", None]
}

gird_rf = GridSearchCV(estimator=model, param_grid=params, cv=3, scoring='r2')
gird_rf.fit(X_train_transformed, y_train)
print("Best Params: ",gird_rf.best_params_)
# `best_score_` is an attribute of the fitted search object; the bare name
# used previously raised NameError after the whole search had already run.
print("Best Score: ", gird_rf.best_score_)

In [None]:
# Candidate regressors, all with library-default hyperparameters.
# Dict order determines the order in which they are trained and reported.
models = {
    "KNN": KNeighborsRegressor(),
    "DTR": DecisionTreeRegressor(),
    "RFR": RandomForestRegressor(),
    "ABR": AdaBoostRegressor(),
    "SVR": SVR(),
    "LR" : LinearRegression(),
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    # verbose=0 silences CatBoost's per-iteration training log, which would
    # otherwise flood the notebook output; it does not affect the fitted model.
    "CB": CatBoostRegressor(verbose=0),
    "XGB": XGBRegressor()
}

In [None]:
def evalute_model(y_test, y_pred):
    """Score predictions against the true targets.

    Parameters
    ----------
    y_test : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted target values.

    Returns
    -------
    tuple
        ``(r2, rmse, mae)`` in that order.
    """
    squared_error = mean_squared_error(y_test, y_pred)
    return (
        r2_score(y_test, y_pred),
        np.sqrt(squared_error),  # RMSE is the square root of the MSE
        mean_absolute_error(y_test, y_pred),
    )

In [None]:
# Parallel lists collecting one entry per trained model
model_list, r2_list, rmse_list, mae_list = [], [], [], []

for key, model in models.items():
    # Fit on the training split, then score on the held-out test split
    model.fit(X_train_transformed, y_train)
    y_pred = model.predict(X_test_transformed)
    r2_scr, rmse, mae = evalute_model(y_test, y_pred)

    print(f"---------------------- Model Name: {key} ----------------------------")
    print("R2 Score: ", r2_scr)
    print("RMSE: ", rmse)
    print("MAE: ", mae)
    print()

    # Record the metrics so they can be tabulated afterwards
    model_list.append(key)
    r2_list.append(r2_scr)
    rmse_list.append(rmse)
    mae_list.append(mae)


In [None]:
# Tabulate the collected metrics, best R2 score first
result_df = pd.DataFrame(
    {
        'Model': model_list,
        "R2 Score": r2_list,
        "RMSE": rmse_list,
        "MAE": mae_list,
    }
).sort_values('R2 Score', ascending=False)

In [None]:
# Display the model comparison table
result_df