<a href="https://colab.research.google.com/github/tlysenko/ML-notebooks/blob/master/Rain_in_Australia_XGboost_with_sklearn_Pipelines.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Dataset

https://www.kaggle.com/datasets/jsphyg/weather-dataset-rattle-package

Reference 
https://medium.com/towards-data-science/beginners-guide-to-xgboost-for-classification-problems-50f75aac5390

### Installing the libraries

In [1]:
!pip install xgboost



### Importing the libraries

In [2]:
import xgboost as xgb

In [3]:
import numpy as np 
import pandas as pd
import seaborn as sns
%matplotlib inline

In [4]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

In [5]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [6]:
from sklearn.model_selection import GridSearchCV

### Data

In [9]:
df = pd.read_csv("weatherAUS.csv")

### EDA

In [10]:
df.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129804 entries, 0 to 129803
Data columns (total 23 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Date           129804 non-null  object 
 1   Location       129804 non-null  object 
 2   MinTemp        128411 non-null  float64
 3   MaxTemp        128600 non-null  float64
 4   Rainfall       126643 non-null  float64
 5   Evaporation    71894 non-null   float64
 6   Sunshine       66737 non-null   float64
 7   WindGustDir    119716 non-null  object 
 8   WindGustSpeed  119774 non-null  float64
 9   WindDir9am     120111 non-null  object 
 10  WindDir3pm     125714 non-null  object 
 11  WindSpeed9am   128083 non-null  float64
 12  WindSpeed3pm   126795 non-null  float64
 13  Humidity9am    127266 non-null  float64
 14  Humidity3pm    126167 non-null  float64
 15  Pressure9am    115906 non-null  float64
 16  Pressure3pm    115944 non-null  float64
 17  Cloud9am       79034 non-null

## Data Cleaning

### Dropping columns

In [12]:
cols_to_drop = ["Date", "Location", "RainTomorrow", "Rainfall"]

df.drop(cols_to_drop, axis=1, inplace=True)

In [13]:
missing_props = df.isna().mean(axis=0)
missing_props

MinTemp          0.010732
MaxTemp          0.009276
Evaporation      0.446134
Sunshine         0.485863
WindGustDir      0.077717
WindGustSpeed    0.077270
WindDir9am       0.074674
WindDir3pm       0.031509
WindSpeed9am     0.013258
WindSpeed3pm     0.023181
Humidity9am      0.019553
Humidity3pm      0.028019
Pressure9am      0.107069
Pressure3pm      0.106776
Cloud9am         0.391128
Cloud3pm         0.413000
Temp9am          0.013120
Temp3pm          0.022079
RainToday        0.024360
dtype: float64

In [14]:
# setting the threshold to 40%
over_threshold = missing_props[missing_props >= 0.4]
over_threshold

Evaporation    0.446134
Sunshine       0.485863
Cloud3pm       0.413000
dtype: float64

In [15]:
over_threshold.index

Index(['Evaporation', 'Sunshine', 'Cloud3pm'], dtype='object')

In [16]:
df.drop(over_threshold.index, 
          axis=1, 
          inplace=True)

In [17]:
# Splitting the data 
X = df.drop("RainToday", axis=1)
y = df.RainToday

### Transforming categorical and numercial columns using sklearn Pipeline

In [36]:
categorical_pipeline = Pipeline(
    steps=[
        ("impute", SimpleImputer(strategy="most_frequent")),
        ("oh-encode", OneHotEncoder(handle_unknown="ignore", sparse=False)),
    ]
)

In [37]:
numeric_pipeline = Pipeline(
    steps=[("impute", SimpleImputer(strategy="mean")), 
           ("scale", StandardScaler())]
)

In [38]:
cat_cols = X.select_dtypes(exclude="number").columns
num_cols = X.select_dtypes(include="number").columns

In [39]:
#check 
print(len(cat_cols) + len(num_cols))
print(len(X.columns))

15
15


In [40]:
full_processor = ColumnTransformer(
    transformers=[
        ("numeric", numeric_pipeline, num_cols),
        ("categorical", categorical_pipeline, cat_cols),
    ]
)

In [23]:
xgb_cl = xgb.XGBClassifier()

In [24]:
# Apply preprocessing
X_processed = full_processor.fit_transform(X)
y_processed = SimpleImputer(strategy="most_frequent").fit_transform(
    y.values.reshape(-1, 1)
)

In [25]:
X_train, X_test, y_train, y_test = train_test_split(
    X_processed, y_processed, stratify=y_processed, random_state=9999)

### Training the baseline model

In [None]:
# Init classifier
xgb_cl = xgb.XGBClassifier()

# Fit
xgb_cl.fit(X_train, y_train)

# Predict
preds = xgb_cl.predict(X_test)

In [27]:
accuracy_score(y_test, preds)

0.833934239314659

### Hyperparameters tuning

In [28]:
param_grid = {
    "max_depth": [3, 4, 5, 7],
    "learning_rate": [0.1, 0.01, 0.05],
    "gamma": [0, 0.25, 1],
    "reg_lambda": [0, 1, 10],
    "scale_pos_weight": [1, 3, 5],
    "subsample": [0.8],
    "colsample_bytree": [0.5],
}

In [29]:
# Init classifier
xgb_cl = xgb.XGBClassifier(objective="binary:logistic")

# Init Grid Search
grid_cv = GridSearchCV(xgb_cl, param_grid, n_jobs=-1, cv=3, scoring="roc_auc")

# Fit
_ = grid_cv.fit(X_processed, y_processed)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [30]:
grid_cv.best_params_

{'colsample_bytree': 0.5,
 'gamma': 1,
 'learning_rate': 0.1,
 'max_depth': 7,
 'reg_lambda': 1,
 'scale_pos_weight': 1,
 'subsample': 0.8}

In [41]:
final_cl = xgb.XGBClassifier(
    **grid_cv.best_params_,
    objective="binary:logistic",
)

In [None]:
final_cl.fit(X_test, y_test)

In [45]:
preds = final_cl.predict(X_test)

In [46]:
accuracy_score(y_test, preds)

0.885211549721118