In [1]:
import numpy as np
import pandas as pd
import os

# Root directory of the Kaggle dataset that holds the input CSV file(s)
INPUT_DIR = '/kaggle/input/filtered-pm2-5pm10'


def find_csv_files(root):
    """Return the paths of every ``.csv`` file below ``root``, sorted.

    ``os.walk`` reports directory entries in arbitrary order, so the result is
    sorted to make "the first CSV file" the same file on every run.

    Parameters
    ----------
    root : str
        Directory that is searched recursively.

    Returns
    -------
    list of str
        Sorted file paths; empty when ``root`` does not exist or contains no
        file whose name ends with ``.csv``.
    """
    return sorted(
        os.path.join(dirname, filename)
        for dirname, _, filenames in os.walk(root)
        for filename in filenames
        if filename.endswith('.csv')
    )


# List all CSV files in the dataset directory (deterministic order)
csv_files = find_csv_files(INPUT_DIR)

# Print all CSV file paths
for file_path in csv_files:
    print(file_path)

# Import the first CSV file into a DataFrame
if csv_files:
    # Select the first CSV file from the sorted list
    first_csv_file = csv_files[0]
    # Read the CSV file into a DataFrame
    df = pd.read_csv(first_csv_file)
    # Display the first few rows of the DataFrame
    print(df.head())
else:
    # `df` is not defined on this path, so the cells that use `df` below will
    # fail with a NameError until the dataset is attached.
    print("No CSV files found in the directory.")


/kaggle/input/filtered-pm2-5pm10/filtered_df.csv
             From Date              To Date  PM2.5 (ug/m3)  PM10 (ug/m3)  \
0  2018-02-01 11:00:00  2018-02-01 12:00:00     165.985286        385.75   
1  2018-02-01 12:00:00  2018-02-01 13:00:00     165.985286        368.83   
2  2018-02-01 13:00:00  2018-02-01 14:00:00     149.000000        333.75   
3  2018-02-01 14:00:00  2018-02-01 15:00:00     113.080000        273.25   
4  2018-02-01 15:00:00  2018-02-01 16:00:00      93.500000        239.58   

   NO (ug/m3)  NO2 (ug/m3)  NOx (ppb)  NH3 (ug/m3)  SO2 (ug/m3)  \
0        5.96        26.08      32.14        37.77        20.26   
1        2.70        15.93      18.62        38.67        12.48   
2        1.33        11.37      23.08        24.69         4.28   
3        1.22        15.52      33.15         7.96         0.53   
4        0.84        36.40      27.24         5.14         0.87   

   Ozone (ug/m3)  Benzene (ug/m3)  RH (%)  WS (m/s)  WD (degree)  SR (W/mt2)  \
0      33.1

In [2]:
# Remove the 'From Date' and 'To Date' columns from the frame.
# Note: `df` is rebound here, so running this cell a second time raises a
# KeyError because the two columns are already gone.
df = df.drop(columns=['From Date', 'To Date'])

In [3]:
# Keep only the first 2747 rows (positional slice from the top of the frame).
# NOTE(review): the reason for the 2747-row cutoff is not recorded in this
# notebook; confirm it and document it here.
df = df.iloc[:2747]

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [5]:
# Target is the PM10 column; every remaining column is used as a predictor
y = df['PM10 (ug/m3)']
X = df.drop(columns=['PM10 (ug/m3)'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
# NOTE(review): the rows look like consecutive hourly readings and this split
# shuffles them, which may give optimistic scores; consider a chronological
# split if the model is meant to forecast.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardize the features: the scaling statistics are learned from the
# training split only and then applied unchanged to both splits.
# From here on X_train / X_test hold the scaler's output, not the original frames.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Baseline gradient-boosting regressor with fixed hyperparameters
model = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
).fit(X_train, y_train)

# Score the baseline on the held-out split
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")

Mean Squared Error: 2218.8140915327
R^2 Score: 0.7852308319508267


In [6]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter candidates: 3 x 3 x 3 = 27 combinations
param_grid = {
    # Number of boosting stages
    'n_estimators': [50, 100, 200],
    # Shrinkage applied to each stage's contribution
    'learning_rate': [0.01, 0.1, 0.2],
    # Maximum depth of each individual tree
    'max_depth': [3, 4, 5],
}

# Fresh, unfitted estimator that the search clones for every candidate.
# Note: this rebinds `model`, replacing the baseline fitted in the previous cell.
model = GradientBoostingRegressor(random_state=42)

# Exhaustive search with 5-fold cross-validation on the training split.
# GridSearchCV maximizes its score, hence the negated MSE; n_jobs=-1 uses all
# available cores and verbose=1 prints a progress summary.
grid_search = GridSearchCV(
    model,
    param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Winning configuration and its mean cross-validated score
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Negative MSE: {grid_search.best_score_}")

# The best estimator, as exposed by the fitted search
best_model = grid_search.best_estimator_

# Score the tuned model on the held-out test split
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")


Fitting 5 folds for each of 27 candidates, totalling 135 fits
Best Parameters: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200}
Best Negative MSE: -1800.4222199870644
Mean Squared Error: 1794.1124332541913
R^2 Score: 0.8263396486676756


In [7]:
import joblib

# Persist the tuned estimator selected by GridSearchCV
joblib.dump(value=best_model, filename='/kaggle/working/best_gradient_boosting_model.pkl')

# Persist the fitted scaler as well, so new inputs can be standardized with
# the same statistics before they are passed to the saved model
joblib.dump(value=scaler, filename='/kaggle/working/scaler.pkl')


['/kaggle/working/scaler.pkl']