In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# Load the processed table. The code below relies on three columns by name:
# "Location" (station code), "Dates" and "PM2.5" (regression target). Every other
# column is passed to the model as a feature.
df = pd.read_csv("Processed_AOD_PM25.csv")

# Mapping location codes to actual names
location_names = {
    0: "Bollaram Industrial Area, Hyderabad - TSPCB",
    1: "Central University, Hyderabad - TSPCB",
    2: "ICRISAT Patancheru, Hyderabad - TSPCB",
    3: "IDA Pashamylaram, Hyderabad - TSPCB",
    4: "Sanathnagar, Hyderabad - TSPCB",
    5: "Zoo Park, Hyderabad - TSPCB"
}

# Per-location test metrics, keyed by the plain station name so that the index of
# the saved CSV matches the names in `location_names` (no trailing punctuation).
results = {}

# Get unique locations
locations = df["Location"].unique()

# Train and evaluate one independent model per location
for loc in locations:
    # Codes missing from the mapping still get a readable label
    location_name = location_names.get(loc, f"Unknown Location {loc}")

    # Filter data for the current location
    df_loc = df[df["Location"] == loc]

    # train_test_split raises ValueError when either side of the split would be
    # empty. That happens for a single-row location, and for a NaN location code
    # (NaN == NaN is False, so the filter above matches nothing). Skip those
    # locations instead of aborting the whole run.
    if len(df_loc) < 2:
        print(f"Location {location_name}: skipped, only {len(df_loc)} row(s) available\n")
        continue

    # Features are everything except the target, the date and the station code
    X = df_loc.drop(columns=["PM2.5", "Dates", "Location"])
    y = df_loc["PM2.5"]

    # 80/20 hold-out with a fixed seed so the metrics are reproducible.
    # NOTE(review): the rows carry a "Dates" column and this split is random, so
    # train and test dates are interleaved - confirm that a chronological
    # hold-out is not what is wanted here.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train model
    model = GradientBoostingRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # Predict on the held-out rows
    y_pred = model.predict(X_test)

    # Evaluate model performance; RMSE is taken as the square root of the MSE
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)

    # Store results
    results[location_name] = {"R2 Score": r2, "RMSE": rmse, "MAE": mae}

    print(f"Location {location_name}: R2 = {r2:.4f}, RMSE = {rmse:.4f}, MAE = {mae:.4f}\n")

# One row per location, one column per metric; written with the station name as index
results_df = pd.DataFrame.from_dict(results, orient="index")
results_df.to_csv("Model_Performance_GradientBoost_by_Location.csv")

print("Training completed. Model performance saved in 'Model_Performance_GradientBoost_by_Location.csv'.")


Location Bollaram Industrial Area, Hyderabad - TSPCB: R2 = 0.5130, RMSE = 13.4737, MAE = 11.4802

Location Central University, Hyderabad - TSPCB: R2 = 0.6868, RMSE = 10.6359, MAE = 8.9959

Location ICRISAT Patancheru, Hyderabad - TSPCB: R2 = 0.4086, RMSE = 15.6366, MAE = 11.7682

Location IDA Pashamylaram, Hyderabad - TSPCB: R2 = 0.4483, RMSE = 15.1284, MAE = 12.9606

Location Sanathnagar, Hyderabad - TSPCB: R2 = -0.9774, RMSE = 35.9470, MAE = 23.6901

Location Zoo Park, Hyderabad - TSPCB: R2 = 0.5635, RMSE = 11.6999, MAE = 8.1391

Training completed. Model performance saved in 'Model_Performance_GradientBoost_by_Location.csv'.


In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# Load the processed table. The code below relies on three columns by name:
# "Location" (station code), "Dates" and "PM2.5" (regression target). Every other
# column is passed to the model as a feature.
df = pd.read_csv("Processed_AOD_PM25.csv")

# Mapping location codes to actual names
location_names = {
    0: "Bollaram Industrial Area, Hyderabad - TSPCB",
    1: "Central University, Hyderabad - TSPCB",
    2: "ICRISAT Patancheru, Hyderabad - TSPCB",
    3: "IDA Pashamylaram, Hyderabad - TSPCB",
    4: "Sanathnagar, Hyderabad - TSPCB",
    5: "Zoo Park, Hyderabad - TSPCB"
}

# Per-location train and test metrics, keyed by station name
results = {}

# Get unique locations
locations = df["Location"].unique()

# Train and evaluate one independent model per location
for loc in locations:
    # Codes missing from the mapping still get a readable label
    location_name = location_names.get(loc, f"Unknown Location {loc}")

    # Filter data for the current location
    df_loc = df[df["Location"] == loc]

    # train_test_split raises ValueError when either side of the split would be
    # empty. That happens for a single-row location, and for a NaN location code
    # (NaN == NaN is False, so the filter above matches nothing). Skip those
    # locations instead of aborting the whole run.
    if len(df_loc) < 2:
        print(f"Location {location_name}: skipped, only {len(df_loc)} row(s) available\n")
        continue

    # Features are everything except the target, the date and the station code
    X = df_loc.drop(columns=["PM2.5", "Dates", "Location"])
    y = df_loc["PM2.5"]

    # 80/20 hold-out with a fixed seed so the metrics are reproducible.
    # NOTE(review): the rows carry a "Dates" column and this split is random, so
    # train and test dates are interleaved - confirm that a chronological
    # hold-out is not what is wanted here.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train model
    model = GradientBoostingRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # Predict on both partitions so the fit on seen data can be compared with
    # the fit on held-out data
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Metrics on the rows the model was fitted on
    r2_train = r2_score(y_train, y_train_pred)
    rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
    mae_train = mean_absolute_error(y_train, y_train_pred)

    # Metrics on the held-out rows
    r2_test = r2_score(y_test, y_test_pred)
    rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))
    mae_test = mean_absolute_error(y_test, y_test_pred)

    # Store results
    results[location_name] = {
        "R2 (Train)": r2_train, "RMSE (Train)": rmse_train, "MAE (Train)": mae_train,
        "R2 (Test)": r2_test, "RMSE (Test)": rmse_test, "MAE (Test)": mae_test
    }

    # Print results
    print(f"Location {location_name}:\n"
          f"  R2 (Train) = {r2_train:.4f}, R2 (Test) = {r2_test:.4f}\n"
          f"  RMSE (Train) = {rmse_train:.4f}, RMSE (Test) = {rmse_test:.4f}\n"
          f"  MAE (Train) = {mae_train:.4f}, MAE (Test) = {mae_test:.4f}\n")

# One row per location, one column per metric; written with the station name as index
results_df = pd.DataFrame.from_dict(results, orient="index")
results_df.to_csv("Model_Performance_GradientBoost_by_Location.csv")

print("Training completed. Model performance saved in 'Model_Performance_GradientBoost_by_Location.csv'.")


Location Bollaram Industrial Area, Hyderabad - TSPCB:
  R2 (Train) = 0.9704, R2 (Test) = 0.5130
  RMSE (Train) = 2.8747, RMSE (Test) = 13.4737
  MAE (Train) = 2.3119, MAE (Test) = 11.4802

Location Central University, Hyderabad - TSPCB:
  R2 (Train) = 0.9759, R2 (Test) = 0.6868
  RMSE (Train) = 2.9095, RMSE (Test) = 10.6359
  MAE (Train) = 2.2566, MAE (Test) = 8.9959

Location ICRISAT Patancheru, Hyderabad - TSPCB:
  R2 (Train) = 0.9601, R2 (Test) = 0.4088
  RMSE (Train) = 3.8373, RMSE (Test) = 15.6335
  MAE (Train) = 3.0607, MAE (Test) = 11.6876

Location IDA Pashamylaram, Hyderabad - TSPCB:
  R2 (Train) = 0.9737, R2 (Test) = 0.4483
  RMSE (Train) = 3.0373, RMSE (Test) = 15.1284
  MAE (Train) = 2.4935, MAE (Test) = 12.9606

Location Sanathnagar, Hyderabad - TSPCB:
  R2 (Train) = 0.9909, R2 (Test) = -0.9774
  RMSE (Train) = 6.9179, RMSE (Test) = 35.9470
  MAE (Train) = 5.4427, MAE (Test) = 23.6901

Location Zoo Park, Hyderabad - TSPCB:
  R2 (Train) = 0.9816, R2 (Test) = 0.5638
  RMSE (