In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBRegressor

from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import Pipeline, make_pipeline


In [2]:
# Read the cleaned training CSV (path is relative to the notebook's folder)
data = pd.read_csv(filepath_or_buffer='../data/Train_cleaned_Aug29.csv')

In [3]:
# Display the column index of the loaded frame to see which fields are available
data.columns

Index(['Place_ID', 'Date_DT', 'Place_ID X Date', 'target', 'target_min',
       'target_max', 'target_variance', 'target_count',
       'precipitable_water_entire_atmosphere',
       'relative_humidity_2m_above_ground',
       'specific_humidity_2m_above_ground', 'temperature_2m_above_ground',
       'u_component_of_wind_10m_above_ground',
       'v_component_of_wind_10m_above_ground',
       'L3_NO2_NO2_column_number_density',
       'L3_NO2_NO2_slant_column_number_density',
       'L3_NO2_absorbing_aerosol_index', 'L3_NO2_cloud_fraction',
       'L3_NO2_stratospheric_NO2_column_number_density',
       'L3_NO2_tropopause_pressure', 'L3_O3_O3_column_number_density',
       'L3_O3_O3_effective_temperature', 'L3_O3_cloud_fraction',
       'L3_CO_CO_column_number_density', 'L3_CO_H2O_column_number_density',
       'L3_CO_cloud_height', 'L3_HCHO_HCHO_slant_column_number_density',
       'L3_HCHO_cloud_fraction',
       'L3_HCHO_tropospheric_HCHO_column_number_density',
       'L3_HCHO_trop

In [4]:
# Parse Date_DT into pandas datetimes so the .dt accessor can be used on it afterwards
data['Date_DT'] = pd.to_datetime(data['Date_DT'])

In [5]:
# Calendar features derived from the parsed date column.
data['DayOfWeek'] = data['Date_DT'].dt.dayofweek  # Monday=0, Sunday=6
# Vectorized comparison instead of a per-row Python lambda:
# 1 for Saturday/Sunday (DayOfWeek 5 or 6), 0 for weekdays.
data['IsWeekend'] = (data['DayOfWeek'] >= 5).astype(int)

In [6]:
# Columns excluded from the feature matrix: the raw date, the combined
# 'Place_ID X Date' column, and the target together with the other target_* columns.
non_feature_columns = [
    'Date_DT',
    'Place_ID X Date',
    'target',
    'target_min',
    'target_max',
    'target_variance',
    'target_count',
]

# Feature matrix and regression target
X = data.drop(columns=non_feature_columns)
y = data['target']

## Feature Engineering

In [7]:
# Display the columns remaining in the feature frame after the drop
X.columns

Index(['Place_ID', 'precipitable_water_entire_atmosphere',
       'relative_humidity_2m_above_ground',
       'specific_humidity_2m_above_ground', 'temperature_2m_above_ground',
       'u_component_of_wind_10m_above_ground',
       'v_component_of_wind_10m_above_ground',
       'L3_NO2_NO2_column_number_density',
       'L3_NO2_NO2_slant_column_number_density',
       'L3_NO2_absorbing_aerosol_index', 'L3_NO2_cloud_fraction',
       'L3_NO2_stratospheric_NO2_column_number_density',
       'L3_NO2_tropopause_pressure', 'L3_O3_O3_column_number_density',
       'L3_O3_O3_effective_temperature', 'L3_O3_cloud_fraction',
       'L3_CO_CO_column_number_density', 'L3_CO_H2O_column_number_density',
       'L3_CO_cloud_height', 'L3_HCHO_HCHO_slant_column_number_density',
       'L3_HCHO_cloud_fraction',
       'L3_HCHO_tropospheric_HCHO_column_number_density',
       'L3_HCHO_tropospheric_HCHO_column_number_density_amf',
       'L3_CLOUD_cloud_base_height', 'L3_CLOUD_cloud_base_pressure',
      

In [8]:
# Continuous numeric columns routed to the numeric branch of the preprocessor.
# Order is preserved from the original list; comments only group related columns.
num_features = [
    # Weather fields
    'precipitable_water_entire_atmosphere',
    'relative_humidity_2m_above_ground',
    'specific_humidity_2m_above_ground',
    'temperature_2m_above_ground',
    'u_component_of_wind_10m_above_ground',
    'v_component_of_wind_10m_above_ground',
    # L3_NO2_*
    'L3_NO2_NO2_column_number_density',
    'L3_NO2_NO2_slant_column_number_density',
    'L3_NO2_absorbing_aerosol_index',
    'L3_NO2_cloud_fraction',
    'L3_NO2_stratospheric_NO2_column_number_density',
    'L3_NO2_tropopause_pressure',
    # L3_O3_*
    'L3_O3_O3_column_number_density',
    'L3_O3_O3_effective_temperature',
    'L3_O3_cloud_fraction',
    # L3_CO_*
    'L3_CO_CO_column_number_density',
    'L3_CO_H2O_column_number_density',
    'L3_CO_cloud_height',
    # L3_HCHO_*
    'L3_HCHO_HCHO_slant_column_number_density',
    'L3_HCHO_cloud_fraction',
    'L3_HCHO_tropospheric_HCHO_column_number_density',
    'L3_HCHO_tropospheric_HCHO_column_number_density_amf',
    # L3_CLOUD_*
    'L3_CLOUD_cloud_base_height',
    'L3_CLOUD_cloud_base_pressure',
    'L3_CLOUD_cloud_fraction',
    'L3_CLOUD_cloud_optical_depth',
    'L3_CLOUD_cloud_top_height',
    'L3_CLOUD_cloud_top_pressure',
    'L3_CLOUD_surface_albedo',
    # L3_AER_AI_*
    'L3_AER_AI_absorbing_aerosol_index',
    'L3_AER_AI_sensor_altitude',
    # L3_SO2_*
    'L3_SO2_SO2_column_number_density',
    'L3_SO2_SO2_column_number_density_amf',
    'L3_SO2_SO2_slant_column_number_density',
    'L3_SO2_absorbing_aerosol_index',
    'L3_SO2_cloud_fraction',
]

In [9]:
# Columns treated as nominal categories and one-hot encoded by the preprocessor
cat_features = [
    'Place_ID',
    'DayOfWeek',
    'IsWeekend',
]

In [10]:
# Place_ID is used as a categorical column below, so make sure it is stored as text
X['Place_ID'] = X['Place_ID'].astype(str)

# Numeric branch: fill gaps with the column mean, then rescale with MinMaxScaler
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', MinMaxScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot encode.
# Categories below the 1% frequency threshold are pooled by the encoder, and
# unseen categories at predict time fall into that pooled bucket when it exists.
nominal_cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(
        min_frequency=0.01,
        handle_unknown='infrequent_if_exist',
        drop='first',
    )),
])

# Route each column group to its branch; the two outputs are concatenated
column_branches = [
    ('num', num_transformer, num_features),
    ('nom', nominal_cat_transformer, cat_features),
]
preprocessor = ColumnTransformer(transformers=column_branches)

## Models (one section per model type)

In [11]:
# Random forest baseline: a depth-limited forest with a fixed seed for repeatable results
rf_regressor = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    random_state=42,
    n_jobs=-1,
)

# Chain preprocessing and the regressor so both are fit together
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', rf_regressor),
])

In [12]:
# 10-fold cross-validation of the whole pipeline; no `scoring` is passed, so the
# pipeline's own default score is used for each fold
scores = cross_val_score(estimator=model_pipeline, X=X, y=y, cv=10)



In [13]:
# Report the per-fold scores, then their average
print("Cross-Validation Scores: ", scores)
print("Mean CV Score: ", np.mean(scores))

Cross-Validation Scores:  [0.45572043 0.41485391 0.35066303 0.44641541 0.4935291  0.32309717
 0.51721558 0.42430249 0.57072742 0.43370773]
Mean CV Score:  0.44302322851479536


## Explore Hyperparameter Tuning with scikit-learn

In [12]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Define the full pipeline combining preprocessing and the model.
# The forest itself runs single-threaded (n_jobs=1): parallelism is applied once,
# at the GridSearchCV level below. Setting n_jobs=-1 on both would let every
# search worker start its own full pool of threads and oversubscribe the CPU.
# random_state is fixed, so the fitted models are the same either way.
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(random_state=42, n_jobs=1))
])

# Define the parameter grid (keys use the '<step>__<param>' pipeline syntax).
# 4 * 3 * 3 * 3 * 2 = 216 candidates.
# NOTE(review): with bootstrap=False and the regressor's default max_features,
# every tree sees the same rows and all features, so the trees are presumably
# near-identical -- consider tuning model__max_features as well; confirm.
param_grid = {
    'model__max_depth': [10, 15, 20, None],
    'model__n_estimators': [80, 100, 120],
    'model__min_samples_split': [2, 3, 5],
    'model__min_samples_leaf': [1, 2, 4],
    'model__bootstrap': [True, False]
}

# Initialize GridSearchCV
grid_search = GridSearchCV(
    estimator=model_pipeline,
    param_grid=param_grid,
    cv=5,  # 5-fold cross-validation
    scoring='r2',  # or 'neg_mean_squared_error', depending on your objective
    n_jobs=-1,  # Use all available cores (the only level that runs in parallel)
    verbose=2  # Increase output verbosity
)

# Fit GridSearchCV to the data (216 candidates x 5 folds = 1080 pipeline fits)
grid_search.fit(X, y)

# Get the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)

# Get the best score (mean R^2 across the 5 folds for the best candidate)
print("Best cross-validation score:", grid_search.best_score_)

# The best model is automatically refitted on the entire dataset (refit=True is the default)
best_model = grid_search.best_estimator_

# If you want to make predictions using the best model:
# predictions = best_model.predict(X_test)

Fitting 5 folds for each of 216 candidates, totalling 1080 fits




## Using XGBoost

In [11]:
# Gradient-boosted alternative: same preprocessing, XGBoost regressor as the final step
xgb_regressor = XGBRegressor(
    n_estimators=100,
    max_depth=10,
    learning_rate=0.1,
    random_state=42,
    n_jobs=-1,
)
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', xgb_regressor),
])

# Score the pipeline with 10-fold cross-validation
scores = cross_val_score(estimator=model_pipeline, X=X, y=y, cv=10)

print("Mean CV score:", np.mean(scores))



Mean CV score: 0.5073014650055013




In [12]:
# 10-fold cross-validation of the current `model_pipeline`.
# NOTE(review): this is the same call the XGBoost cell just above already makes,
# so on a top-to-bottom run it recomputes identical scores.
scores = cross_val_score(estimator=model_pipeline, X=X, y=y, cv=10)



In [13]:
# Report the per-fold scores, then their average
print("Cross-Validation Scores: ", scores)
print("Mean CV Score: ", np.mean(scores))

Cross-Validation Scores:  [0.5209047  0.4315613  0.43295634 0.51928086 0.58112608 0.35155163
 0.56991206 0.53776414 0.61074559 0.51721195]
Mean CV Score:  0.5073014650055013
