# **Model Building and Data Preprocessing**

## **1.0 Import Libraries**

In [131]:
# For Data Manipulation
import pandas as pd
# For Numerical Manipulation
import numpy as np
# For Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns
# Skewness
from scipy.stats import skew
# For Data Preprocessing
from sklearn.preprocessing import PowerTransformer, RobustScaler, MinMaxScaler
# Train-Test Split
from sklearn.model_selection import train_test_split
# Column and Function Transformer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
# Models 
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
# Metrics
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
# For Hyperparameter Tuning
from sklearn.model_selection import RandomizedSearchCV
# Filter Warnings
import warnings
warnings.filterwarnings("ignore")
# to store model and preprocessor in pkl
import pickle
# Inline representa
%matplotlib inline

## **2.0 Load Dataset**

In [154]:
# Load Dataset
x_train=pd.read_csv('../Artifacts/train.csv')
x_test=pd.read_csv('../Artifacts/test.csv')

In [157]:
x_train.head()

Unnamed: 0,id,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed,rainfall
0,0,1,1017.4,21.2,20.6,19.9,19.4,87.0,88.0,1.1,60.0,17.2,1
1,1,2,1019.5,16.2,16.9,15.8,15.4,95.0,91.0,0.0,50.0,21.9,1
2,2,3,1024.1,19.4,16.1,14.6,9.3,75.0,47.0,8.3,70.0,18.1,1
3,3,4,1013.4,18.1,17.8,16.9,16.8,95.0,95.0,0.0,60.0,35.6,1
4,4,5,1021.8,21.3,18.4,15.2,9.6,52.0,45.0,3.6,40.0,24.8,0


In [156]:
x_test.head()

Unnamed: 0,id,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed
0,2190,1,1019.5,17.5,15.8,12.7,14.9,96.0,99.0,0.0,50.0,24.3
1,2191,2,1016.5,17.5,16.5,15.8,15.1,97.0,99.0,0.0,50.0,35.3
2,2192,3,1023.9,11.2,10.4,9.4,8.9,86.0,96.0,0.0,40.0,16.9
3,2193,4,1022.9,20.6,17.3,15.2,9.5,75.0,45.0,7.1,20.0,50.6
4,2194,5,1022.2,16.1,13.8,6.4,4.3,68.0,49.0,9.2,20.0,19.4


## **3.0 Feature Engineering**

In [None]:
# Dropping Irrelevant Features
x_train.drop(columns=['id','maxtemp'], inplace=True)
x_test.drop(columns=['id','maxtemp'], inplace=True)

## **4.0 Split into X and y**

In [None]:
# Splitting Features and Target
X=x_train.drop(columns=['rainfall'])
y=x_train['rainfall']

## **5.0 Train-Test Split**

In [135]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [136]:
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((1752, 10), (438, 10), (1752,), (438,))

## **6.0 Data Preprocessing**

In [137]:
# Separating positive and negative skewed for transformation
positive_skewed=[]
negative_skewed=[]
for col in x_train.columns:
    if skew(x_train[col]) > 0.5:
        positive_skewed.append(col)
    elif (skew(x_train[col]) < -0.5):
        negative_skewed.append(col)
    else:
        pass
all_transformed=negative_skewed+positive_skewed
not_transformed=list(set(x_train.columns)-set(all_transformed))

In [138]:
print(f"positive skewed Features are: {positive_skewed}")
print(f"negative skewed Features are: {negative_skewed}")
print(f"all transformed Features are: {all_transformed}")
print(f"not transformed Features are: {not_transformed}")

positive skewed Features are: ['sunshine', 'winddirection', 'windspeed']
negative skewed Features are: ['temparature', 'mintemp', 'dewpoint', 'humidity', 'cloud']
all transformed Features are: ['temparature', 'mintemp', 'dewpoint', 'humidity', 'cloud', 'sunshine', 'winddirection', 'windspeed']
not transformed Features are: ['pressure', 'day']


In [139]:
preprocessor = ColumnTransformer(transformers=[
    ('Power_Transformer', PowerTransformer(method='yeo-johnson', standardize=True, copy=True), negative_skewed),
    ('Log_Transformation', FunctionTransformer(np.log1p, validate=True), positive_skewed),
    ('Robust_Scaler', RobustScaler(), all_transformed),
    ('Not_Transformed', MinMaxScaler(), not_transformed)
], remainder='passthrough')

In [146]:
x_train=preprocessor.fit_transform(x_train)
x_test=preprocessor.transform(x_test)

ValueError: Specifying the columns using strings is only supported for dataframes.

## **7.0 Model Building**

In [151]:
def evaluate_models_with_bestparams(x_train, y_train, x_test, y_test):
    best_score=0
    models = { 
    'logistic regression': (LogisticRegression(random_state=42), {
        'C': [0.01, 0.1, 1, 10, 100],
        'penalty': ['l2'],
        'solver': ['lbfgs', 'liblinear']
    }),

    'svc': (SVC(random_state=42), {
        'C': [0.1, 1, 10, 100],
        'kernel': ['linear', 'rbf', 'poly'],
        'gamma': ['scale', 'auto']
    }),

    'naive bayes': (GaussianNB(), {
        'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5]
    }),

    'decision tree': (DecisionTreeClassifier(random_state=42),{
        'criterion': ['gini', 'entropy'],
        'max_depth': [None, 10, 20, 30, 50],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 5]
    }),

    'random forest': (RandomForestClassifier(random_state=42),{
    'n_estimators': [100, 300],
    'max_depth': [None, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'bootstrap': [True]
    }),

    'gradient boosting': (GradientBoostingClassifier(random_state=42), {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 10],
        'min_samples_split': [2, 5, 10]
    }),

    'adaboost': (AdaBoostClassifier(random_state=42), {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 1]
    }),

    'extra trees': (ExtraTreesClassifier(random_state=42), {
        'n_estimators': [100, 200, 300, 500],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 5]
    }),

    'xgboost': (XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'), {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [ 3, 5, 10],
        'subsample': [0.7, 0.8, 1.0],
        'colsample_bytree': [0.7, 0.8, 1.0]
    })
    }
    try:
        for model_name, (model, hyperparameters) in models.items():
            grid_search = RandomizedSearchCV(model, hyperparameters, cv=5, scoring='accuracy', n_jobs=-1)
            grid_search.fit(x_train, y_train)
            # Best hyperparameters
            print(f"Best parameters for {model_name}: {grid_search.best_params_}")
            print(f"Best score: {grid_search.best_score_:.4f}")
            y_pred=grid_search.predict(x_test)
            accuracy=accuracy_score(y_test, y_pred)
            print(f"Accuracy: {accuracy:.4f}")
            if grid_search.best_score_ > best_score:
                best_score=grid_search.best_score_
                best_model_name=model_name
                best_model=grid_search.best_estimator_
    except Exception as e:
        print(f"An error occurred: {e}")
    print(f"Best Model: {best_model_name}")
    print(f"Best Score: {best_score:.4f}")

In [152]:
evaluate_models_with_bestparams(x_train, y_train, x_test, y_test)

Best parameters for logistic regression: {'solver': 'lbfgs', 'penalty': 'l2', 'C': 0.1}
Best score: 0.8704
Accuracy: 0.8493
Best parameters for svc: {'kernel': 'rbf', 'gamma': 'auto', 'C': 10}
Best score: 0.8693
Accuracy: 0.8493
Best parameters for naive bayes: {'var_smoothing': 1e-09}
Best score: 0.8413
Accuracy: 0.8196
Best parameters for decision tree: {'min_samples_split': 10, 'min_samples_leaf': 5, 'max_depth': 50, 'criterion': 'gini'}
Best score: 0.8213
Accuracy: 0.7968
Best parameters for random forest: {'n_estimators': 300, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': None, 'bootstrap': True}
Best score: 0.8744
Accuracy: 0.8470
Best parameters for gradient boosting: {'n_estimators': 100, 'min_samples_split': 2, 'max_depth': 3, 'learning_rate': 0.1}
Best score: 0.8647
Accuracy: 0.8539
Best parameters for adaboost: {'n_estimators': 100, 'learning_rate': 0.1}
Best score: 0.8630
Accuracy: 0.8493
Best parameters for extra trees: {'n_estimators': 500, 'min_samples_spli

**Saving Preprocessor**

In [153]:
with open('../Artifacts/preprocessor.pkl', 'wb') as f:
    pickle.dump(preprocessor,f)