In [2]:
# Install packages if needed
!pip install -q scikit-learn xgboost

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import TransformedTargetRegressor
from xgboost import XGBRegressor

In [4]:
# Read the AQ_Data_TN CSV into `df` and print its schema / memory footprint.
csv_path = '../Datas/AQ_Data_TN.csv'
df = pd.read_csv(csv_path)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4186568 entries, 0 to 4186567
Data columns (total 9 columns):
 #   Column       Dtype  
---  ------       -----  
 0   location_id  int64  
 1   sensors_id   int64  
 2   location     object 
 3   datetime     object 
 4   lat          float64
 5   lon          float64
 6   parameter    object 
 7   units        object 
 8   value        float64
dtypes: float64(3), int64(2), object(4)
memory usage: 287.5+ MB


In [5]:
# List the distinct values found in the `parameter` column.
df['parameter'].unique()

array(['pm10', 'pm25', 'o3', 'no2', 'no', 'relativehumidity',
       'temperature', 'so2', 'co'], dtype=object)

In [6]:
# Number of distinct values in each column.
df.nunique()

location_id       38
sensors_id       319
location          39
datetime       21038
lat               38
lon               39
parameter          9
units              4
value          40174
dtype: int64

In [7]:
# Parse the timestamp strings; errors='coerce' turns unparseable entries into NaT
# instead of raising.
df['datetime'] = pd.to_datetime(df['datetime'], errors='coerce')

# Derive calendar parts as separate columns (added in this order).
stamp = df['datetime'].dt
for part in ('date', 'month', 'year', 'day', 'hour'):
    df[part] = getattr(stamp, part)

In [8]:
# Re-check the schema now that `datetime` is parsed and the calendar-part columns exist.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4186568 entries, 0 to 4186567
Data columns (total 14 columns):
 #   Column       Dtype                    
---  ------       -----                    
 0   location_id  int64                    
 1   sensors_id   int64                    
 2   location     object                   
 3   datetime     datetime64[ns, UTC+05:30]
 4   lat          float64                  
 5   lon          float64                  
 6   parameter    object                   
 7   units        object                   
 8   value        float64                  
 9   date         object                   
 10  month        int32                    
 11  year         int32                    
 12  day          int32                    
 13  hour         int32                    
dtypes: datetime64[ns, UTC+05:30](1), float64(3), int32(4), int64(2), object(4)
memory usage: 383.3+ MB


In [9]:
# Summary statistics for every column; include='all' also covers non-numeric columns.
df.describe(include='all')

Unnamed: 0,location_id,sensors_id,location,datetime,lat,lon,parameter,units,value,date,month,year,day,hour
count,4186568.0,4186568.0,4186568,4186568,4186568.0,4186568.0,4186568,4186568,4186568.0,4186568,4186568.0,4186568.0,4186568.0,4186568.0
unique,,,39,,,,9,4,,271,,,,
top,,,"Manali Village, Chennai - TNPCB-3379598",,,,pm10,ppb,,2025-03-21,,,,
freq,,,166842,,,,514998,1827592,,24251,,,,
mean,1746580.0,12246550.0,,2025-06-10 09:47:49.386858752+05:30,11.66197,78.9859,,,4.42105e+24,,5.801348,2025.0,16.00886,11.88537
min,2586.0,5077812.0,,2025-01-01 01:00:00+05:30,8.728442,76.7139,,,-329363.5,,1.0,2025.0,1.0,0.0
25%,11581.0,12236290.0,,2025-04-13 22:45:00+05:30,10.83016,78.02267,,,3.91,,4.0,2025.0,8.0,6.0
50%,358627.0,12237200.0,,2025-06-12 15:00:00+05:30,11.67911,79.13186,,,14.43,,6.0,2025.0,16.0,12.0
75%,3409350.0,12237300.0,,2025-08-07 09:15:00+05:30,12.9533,80.1081,,,33.0,,8.0,2025.0,23.0,18.0
max,3774726.0,14018150.0,,2025-09-28 23:45:00+05:30,13.4127,80.2909,,,9.46017e+30,,9.0,2025.0,31.0,23.0


In [10]:
# EDA on the regression target: use `value_clipped` when that column already exists,
# otherwise fall back to the raw `value` column.
# NOTE(review): `value_clipped` is only created by the later clipping cell, so a
# top-to-bottom run takes the raw branch here.
target_col = 'value_clipped' if 'value_clipped' in df.columns else 'value'
target_series = df[target_col]
print("Using value_clipped for EDA" if target_col == 'value_clipped' else "Using raw value for EDA")

print('dtype:', target_series.dtype)
print(target_series.describe())

quantile_levels = [0, 0.01, 0.05, 0.5, 0.95, 0.99, 1]
print('Quantiles:', target_series.quantile(quantile_levels).to_dict())

# Index labels ordered by descending magnitude; the first five are the largest |value| rows.
by_magnitude = target_series.abs().sort_values(ascending=False).index
print('Top 5 |value|:', target_series.reindex(by_magnitude).head().to_dict())

Using raw value for EDA
dtype: float64
count    4.186568e+06
mean     4.421050e+24
std      6.398036e+27
min     -3.293635e+05
25%      3.910000e+00
50%      1.443000e+01
75%      3.300000e+01
max      9.460170e+30
Name: value, dtype: float64
Quantiles: {0.0: -329363.5, 0.01: -0.44, 0.05: 0.05, 0.5: 14.43, 0.95: 80.0, 0.99: 136.3, 1.0: 9.460169849432296e+30}
Top 5 |value|: {857781: 9.460169849432296e+30, 857782: 9.048858116848286e+30, 858071: 5.980286195931758e+21, 858070: 2.7301306546644986e+21, 19717: 907744.38}


In [11]:
# Count rows that repeat an earlier row on all four of these columns.
dup_key_cols = ["location_id", "parameter", "datetime", "value"]
df.duplicated(subset=dup_key_cols).sum()

np.int64(0)

In [12]:
# Preview the first five rows.
df.head()

Unnamed: 0,location_id,sensors_id,location,datetime,lat,lon,parameter,units,value,date,month,year,day,hour
0,2586,12235652,"Manali, Chennai - CPCB-3379580",2025-02-19 01:45:00+05:30,13.164544,80.26285,pm10,µg/m³,75.92,2025-02-19,2,2025,19,1
1,2586,12235652,"Manali, Chennai - CPCB-3379580",2025-02-19 02:00:00+05:30,13.164544,80.26285,pm10,µg/m³,75.92,2025-02-19,2,2025,19,2
2,2586,12235652,"Manali, Chennai - CPCB-3379580",2025-02-19 02:15:00+05:30,13.164544,80.26285,pm10,µg/m³,75.92,2025-02-19,2,2025,19,2
3,2586,12235652,"Manali, Chennai - CPCB-3379580",2025-02-19 02:30:00+05:30,13.164544,80.26285,pm10,µg/m³,76.9,2025-02-19,2,2025,19,2
4,2586,12235652,"Manali, Chennai - CPCB-3379580",2025-02-19 02:45:00+05:30,13.164544,80.26285,pm10,µg/m³,79.19,2025-02-19,2,2025,19,2


In [13]:
# Non-null counts of every column, one output row per `parameter` value.
df.groupby('parameter', as_index=False).count()

Unnamed: 0,parameter,location_id,sensors_id,location,datetime,lat,lon,units,value,date,month,year,day,hour
0,co,392344,392344,392344,392344,392344,392344,392344,392344,392344,392344,392344,392344,392344
1,no,482320,482320,482320,482320,482320,482320,482320,482320,482320,482320,482320,482320,482320
2,no2,486985,486985,486985,486985,486985,486985,486985,486985,486985,486985,486985,486985,486985
3,o3,433761,433761,433761,433761,433761,433761,433761,433761,433761,433761,433761,433761,433761
4,pm10,514998,514998,514998,514998,514998,514998,514998,514998,514998,514998,514998,514998,514998
5,pm25,483422,483422,483422,483422,483422,483422,483422,483422,483422,483422,483422,483422,483422
6,relativehumidity,461702,461702,461702,461702,461702,461702,461702,461702,461702,461702,461702,461702,461702
7,so2,465943,465943,465943,465943,465943,465943,465943,465943,465943,465943,465943,465943,465943
8,temperature,465093,465093,465093,465093,465093,465093,465093,465093,465093,465093,465093,465093,465093


In [14]:
# One-hot encode `parameter` into `parameter_*` indicator columns.
# drop_first=True omits the first category, which then acts as the implicit baseline.
# The guard keeps the cell re-runnable: once `parameter` has been replaced by its
# dummies, calling get_dummies on it again would raise KeyError.
if 'parameter' in df.columns:
    df = pd.get_dummies(df, columns=['parameter'], drop_first=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4186568 entries, 0 to 4186567
Data columns (total 21 columns):
 #   Column                      Dtype                    
---  ------                      -----                    
 0   location_id                 int64                    
 1   sensors_id                  int64                    
 2   location                    object                   
 3   datetime                    datetime64[ns, UTC+05:30]
 4   lat                         float64                  
 5   lon                         float64                  
 6   units                       object                   
 7   value                       float64                  
 8   date                        object                   
 9   month                       int32                    
 10  year                        int32                    
 11  day                         int32                    
 12  hour                        int32                    
 1

In [15]:
# Cap the target at its 1st / 99th percentiles so extreme readings do not dominate training.
# NOTE(review): the thresholds are computed over all parameters pooled together, although
# the data carries several different units — confirm that a single global range is intended.
bounds = df['value'].quantile([0.01, 0.99])
q_low, q_high = bounds.tolist()
df['value_clipped'] = df['value'].clip(lower=q_low, upper=q_high)

print('Clipping thresholds:', q_low, q_high)
print(df['value_clipped'].describe())

Clipping thresholds: -0.44 136.3
count    4.186568e+06
mean     2.397260e+01
std      2.684835e+01
min     -4.400000e-01
25%      3.910000e+00
50%      1.443000e+01
75%      3.300000e+01
max      1.363000e+02
Name: value_clipped, dtype: float64


In [16]:
# Assemble the feature matrix (location id, calendar parts, one-hot parameter flags)
# and use the clipped value as the target.
base_features = ['location_id', 'year', 'month', 'day', 'hour']
parameter_flags = [name for name in df.columns if name.startswith('parameter_')]
X = df[base_features + parameter_flags]
Y = df['value_clipped']
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4186568 entries, 0 to 4186567
Data columns (total 13 columns):
 #   Column                      Dtype
---  ------                      -----
 0   location_id                 int64
 1   year                        int32
 2   month                       int32
 3   day                         int32
 4   hour                        int32
 5   parameter_no                bool 
 6   parameter_no2               bool 
 7   parameter_o3                bool 
 8   parameter_pm10              bool 
 9   parameter_pm25              bool 
 10  parameter_relativehumidity  bool 
 11  parameter_so2               bool 
 12  parameter_temperature       bool 
dtypes: bool(8), int32(4), int64(1)
memory usage: 127.8 MB


In [17]:
# Random 80/20 hold-out with a fixed seed (no stratification: the target is continuous).
# NOTE(review): X_train / X_test / Y_train / Y_test are reassigned by the later training
# cell, which splits the outlier-filtered data, so this split appears to be unused — confirm.
split_parts = train_test_split(X, Y, test_size=0.2, random_state=42)
X_train, X_test, Y_train, Y_test = split_parts

In [18]:
# Train XGBoost (no early stopping) on rows whose raw `value` lies inside the
# 1st–99th percentile range, i.e. outliers are dropped here rather than clipped.
q_low, q_high = df['value'].quantile([0.01, 0.99]).tolist()
in_range = df['value'].between(q_low, q_high)  # inclusive on both ends
df_filt = df.loc[in_range].copy()
print('Filtered rows:', len(df_filt), 'of', len(df))

# Same feature set as before: location id, calendar parts, one-hot parameter flags.
feature_names = ['location_id', 'year', 'month', 'day', 'hour']
feature_names += [name for name in df_filt.columns if name.startswith('parameter_')]
X_f = df_filt[feature_names].copy()

# Cast the boolean indicator columns to uint8 before handing them to XGBoost.
flag_cols = X_f.select_dtypes(include='bool').columns
X_f = X_f.astype({name: 'uint8' for name in flag_cols})
Y_f = df_filt['value']

# This reassigns the train/test names with the filtered data.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_f, Y_f, test_size=0.2, random_state=42
)

xgb_params = dict(
    n_estimators=600,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='reg:squarederror',
    tree_method='hist',
    random_state=42,
    n_jobs=-1,
)
base_reg = XGBRegressor(**xgb_params)

# Standardize the target for fitting; predictions are mapped back to the original scale.
model = TransformedTargetRegressor(regressor=base_reg, transformer=StandardScaler())

# Fit on the training split (the fitted estimator is the cell's displayed output).
model.fit(X_train, Y_train)

Filtered rows: 4102847 of 4186568


0,1,2
,regressor,"XGBRegressor(...ree=None, ...)"
,transformer,StandardScaler()
,func,
,inverse_func,
,check_inverse,True

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False

0,1,2
,copy,True
,with_mean,True
,with_std,True


In [19]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# Score the fitted model on the held-out split.
y_pred = model.predict(X_test)
r2 = r2_score(Y_test, y_pred)
mae = mean_absolute_error(Y_test, y_pred)
# RMSE is taken as sqrt(MSE) by hand so it does not rely on the `squared=False`
# keyword, which not every sklearn version supports.
mse = mean_squared_error(Y_test, y_pred)
rmse = np.sqrt(mse)

for label, score in (("R²:", r2), ("MAE:", mae), ("RMSE:", rmse)):
    print(label, score)

R²: 0.8332065427030287
MAE: 5.592882532269302
RMSE: 9.991066497103846


In [20]:
# Save the fitted model to HDF5 (.h5): the joblib-serialized bytes go into an opaque
# dataset and a JSON metadata string goes into a file attribute.
# NOTE: the embedded blob is a joblib (pickle-based) payload — loading it executes
# arbitrary code, so only load .h5 files that come from a trusted source.
import io
import json
import os
from datetime import datetime

import h5py
import joblib

out_dir = os.path.join('..', 'Models')
os.makedirs(out_dir, exist_ok=True)

# Take a single timestamp so the file name and the `created_at` field always agree.
saved_at = datetime.now()

# Serialize the model into memory. getvalue() returns the whole buffer regardless of
# the stream position, so no seek is required.
buf = io.BytesIO()
joblib.dump(model, buf)

# Build the metadata before touching the file, so a failure here leaves no partial .h5 behind.
meta = {
    'created_at': saved_at.isoformat(),
    'feature_columns': list(X_train.columns),
    'dtypes': {c: str(X_train[c].dtype) for c in X_train.columns},
    'target': 'value',
    'note': 'XGBRegressor wrapped with TransformedTargetRegressor; trained on outlier-filtered data (1st–99th percentiles).',
}
# The filter thresholds are optional (best effort): record both or neither, and say so
# when they are unavailable instead of silently swallowing every exception.
try:
    meta.update(q_low=float(q_low), q_high=float(q_high))
except (NameError, TypeError, ValueError) as exc:
    print('Quantile thresholds not recorded in metadata:', exc)

h5_path = os.path.join(out_dir, f"xgb_ttr_model_{saved_at.strftime('%Y%m%d_%H%M%S')}.h5")
with h5py.File(h5_path, 'w') as h5f:
    # np.void wraps the raw bytes so they are stored as opaque binary data.
    h5f.create_dataset('model_joblib', data=np.void(buf.getvalue()))
    h5f.attrs['metadata'] = json.dumps(meta)

print('Saved H5 model to:', h5_path)

Saved H5 model to: ..\Models\xgb_ttr_model_20251003_221235.h5
