In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# Read the pre-processed table; it must provide "Location", "Dates" and
# "PM2.5" columns, and every remaining column is used as a model feature.
df = pd.read_csv("Processed_AOD_PM25.csv")

# Display names for the integer station codes stored in the "Location" column.
location_names = {
    0: "Bollaram Industrial Area, Hyderabad - TSPCB",
    1: "Central University, Hyderabad - TSPCB",
    2: "ICRISAT Patancheru, Hyderabad - TSPCB",
    3: "IDA Pashamylaram, Hyderabad - TSPCB",
    4: "Sanathnagar, Hyderabad - TSPCB",
    5: "Zoo Park, Hyderabad - TSPCB"
}

# Held-out metrics per station, keyed by the station's display name.
results = {}

# Station codes, in the order they first appear in the file.
locations = df["Location"].unique()

# One independent regressor is fitted and scored for each station.
for loc in locations:
    # Codes missing from the mapping get a generic placeholder label.
    location_name = location_names.get(loc, f"Unknown Location {loc}")

    # Keep only the rows recorded at this station.
    station_mask = df["Location"] == loc
    df_loc = df[station_mask]

    # PM2.5 is the target; the date and the station code are not features.
    y = df_loc["PM2.5"]
    X = df_loc.drop(columns=["PM2.5", "Dates", "Location"])

    # Shuffled 80/20 split with a fixed seed so reruns give the same partition.
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.2,
        random_state=42,
    )

    # Gradient-boosted trees: 100 boosting rounds, all other settings default.
    model = XGBRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # Score the fitted model on the held-out 20 % only.
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    # Record this station's row of the summary table.
    results[location_name] = {"R2 Score": r2, "RMSE": rmse, "MAE": mae}

    print(f"Location {location_name}: R2 = {r2:.4f}, RMSE = {rmse:.4f}, MAE = {mae:.4f}\n")

# One row per station, one column per metric; persisted next to the notebook.
results_df = pd.DataFrame.from_dict(results, orient="index")
results_df.to_csv("Model_Performance_XGBoost_by_Location.csv")

print("Training completed. Model performance saved in 'Model_Performance_XGBoost_by_Location.csv'.")


Location Bollaram Industrial Area, Hyderabad - TSPCB: R2 = 0.4987, RMSE = 13.6693, MAE = 11.2743

Location Central University, Hyderabad - TSPCB: R2 = 0.5560, RMSE = 12.6636, MAE = 10.6405

Location ICRISAT Patancheru, Hyderabad - TSPCB: R2 = 0.3230, RMSE = 16.7298, MAE = 12.7413

Location IDA Pashamylaram, Hyderabad - TSPCB: R2 = 0.4338, RMSE = 15.3248, MAE = 11.9369

Location Sanathnagar, Hyderabad - TSPCB: R2 = -2.4881, RMSE = 47.7434, MAE = 23.0234

Location Zoo Park, Hyderabad - TSPCB: R2 = 0.4618, RMSE = 12.9916, MAE = 9.5319

Training completed. Model performance saved in 'Model_Performance_XGBoost_by_Location.csv'.


In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# Load processed data. The table must provide "Location", "Dates" and "PM2.5"
# columns; every remaining column is fed to the model as a feature.
df = pd.read_csv("Processed_AOD_PM25.csv")

# Mapping location codes to actual names
location_names = {
    0: "Bollaram Industrial Area, Hyderabad - TSPCB",
    1: "Central University, Hyderabad - TSPCB",
    2: "ICRISAT Patancheru, Hyderabad - TSPCB",
    3: "IDA Pashamylaram, Hyderabad - TSPCB",
    4: "Sanathnagar, Hyderabad - TSPCB",
    5: "Zoo Park, Hyderabad - TSPCB"
}

# Fraction of each location's rows held out for testing.
TEST_SIZE = 0.2


def _regression_metrics(y_true, y_pred):
    """Score one set of predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Model predictions aligned with ``y_true``.

    Returns
    -------
    tuple
        ``(r2, rmse, mae)`` computed with scikit-learn's ``r2_score``,
        ``mean_squared_error`` (square-rooted) and ``mean_absolute_error``.
    """
    r2 = r2_score(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    return r2, rmse, mae


# Prepare results dictionary (location name -> train/test metrics)
results = {}

# Get unique locations, in order of first appearance
locations = df["Location"].unique()

# Train and evaluate one independent model per location
for loc in locations:
    # Codes missing from the mapping get a generic placeholder label.
    location_name = location_names.get(loc, f"Unknown Location {loc}")

    # Filter data for the current location
    df_loc = df[df["Location"] == loc]

    # Skip groups that cannot be evaluated instead of aborting the whole run.
    # A NaN location code never equals itself, so its group is empty and
    # train_test_split would raise; with fewer than two held-out rows R2 is
    # undefined. The ceil mirrors how train_test_split sizes the test set for
    # a fractional test_size.
    n_rows = len(df_loc)
    n_test = int(np.ceil(TEST_SIZE * n_rows))
    if n_test < 2:
        print(f"Skipping {location_name}: only {n_rows} rows, "
              f"too few for a train/test evaluation.\n")
        continue

    # Define features and target variable
    X = df_loc.drop(columns=["PM2.5", "Dates", "Location"])
    y = df_loc["PM2.5"]

    # Split data into train and test sets (shuffled, fixed seed).
    # NOTE(review): rows carry a "Dates" column, so a shuffled split may place
    # neighbouring days in both sets - consider a chronological split; confirm
    # with the data owner.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=TEST_SIZE, random_state=42
    )

    # Train model
    # NOTE(review): the recorded run reports R2 (Train) = 1.0000 for every
    # location against R2 (Test) of 0.56 or lower (negative for Sanathnagar),
    # i.e. the default-depth trees memorise the training rows. Regularisation
    # or cross-validated tuning is worth trying - TODO.
    model = XGBRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # Predict on both train and test sets
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Evaluate model performance on training and testing data
    r2_train, rmse_train, mae_train = _regression_metrics(y_train, y_train_pred)
    r2_test, rmse_test, mae_test = _regression_metrics(y_test, y_test_pred)

    # Store results for both train and test
    results[location_name] = {
        "R2 (Train)": r2_train, "RMSE (Train)": rmse_train, "MAE (Train)": mae_train,
        "R2 (Test)": r2_test, "RMSE (Test)": rmse_test, "MAE (Test)": mae_test
    }

    # Print performance metrics
    print(f"Location {location_name}:\n"
          f"  R2 (Train) = {r2_train:.4f}, R2 (Test) = {r2_test:.4f}\n"
          f"  RMSE (Train) = {rmse_train:.4f}, RMSE (Test) = {rmse_test:.4f}\n"
          f"  MAE (Train) = {mae_train:.4f}, MAE (Test) = {mae_test:.4f}\n")

# Convert results to DataFrame (one row per location) and save as CSV.
# This is the same path the earlier test-only cell writes to, so running this
# cell replaces that file with the six-column train/test table.
results_df = pd.DataFrame.from_dict(results, orient="index")
results_df.to_csv("Model_Performance_XGBoost_by_Location.csv")

print("Training completed. Model performance saved in 'Model_Performance_XGBoost_by_Location.csv'.")


Location Bollaram Industrial Area, Hyderabad - TSPCB:
  R2 (Train) = 1.0000, R2 (Test) = 0.4987
  RMSE (Train) = 0.0008, RMSE (Test) = 13.6693
  MAE (Train) = 0.0006, MAE (Test) = 11.2743

Location Central University, Hyderabad - TSPCB:
  R2 (Train) = 1.0000, R2 (Test) = 0.5560
  RMSE (Train) = 0.0007, RMSE (Test) = 12.6636
  MAE (Train) = 0.0005, MAE (Test) = 10.6405

Location ICRISAT Patancheru, Hyderabad - TSPCB:
  R2 (Train) = 1.0000, R2 (Test) = 0.3230
  RMSE (Train) = 0.0006, RMSE (Test) = 16.7298
  MAE (Train) = 0.0004, MAE (Test) = 12.7413

Location IDA Pashamylaram, Hyderabad - TSPCB:
  R2 (Train) = 1.0000, R2 (Test) = 0.4338
  RMSE (Train) = 0.0007, RMSE (Test) = 15.3248
  MAE (Train) = 0.0005, MAE (Test) = 11.9369

Location Sanathnagar, Hyderabad - TSPCB:
  R2 (Train) = 1.0000, R2 (Test) = -2.4881
  RMSE (Train) = 0.0007, RMSE (Test) = 47.7434
  MAE (Train) = 0.0005, MAE (Test) = 23.0234

Location Zoo Park, Hyderabad - TSPCB:
  R2 (Train) = 1.0000, R2 (Test) = 0.4618
  RMSE 