# Importing Libraries
In this part we install all the necessary libraries from the command prompt and then import the necessary functions from those libraries.

In [None]:
# importing all the necessary libraries
import pandas as pd

from numpy import mean
import numpy as np
import time
from imblearn.over_sampling import SMOTE

# step 1: preprocessing
from sklearn.impute import SimpleImputer # import some strategic imputer to fill in any missing values using mean
from sklearn.preprocessing import MinMaxScaler, StandardScaler, MaxAbsScaler, RobustScaler, Normalizer # scale all the values to one range to avoid any biasness (this bias is seen in mostly naive bayes and knn etc)

from sklearn.impute import KNNImputer # import some strategic imputer to fill missing values using KNN (finds the nearest neighbour and fills it with that value)

from sklearn.feature_selection import SequentialFeatureSelector, SelectKBest, f_classif, VarianceThreshold

# step 2: data division
from sklearn.model_selection import train_test_split, RepeatedKFold, cross_val_score, GridSearchCV, ParameterGrid # to divide the code into train/test using a specific percentage or with/without replacement

# step 3: model
from sklearn.tree import DecisionTreeClassifier                                                        
from sklearn.naive_bayes import GaussianNB                                                              
from sklearn.neighbors import KNeighborsClassifier                                                       
from sklearn.ensemble import BaggingClassifier, VotingClassifier, ExtraTreesClassifier, RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, StackingClassifier
import lightgbm as lgb 
import xgboost as xgb
from catboost import CatBoostClassifier 

# step 4: displaying accuracy
from sklearn.metrics import roc_auc_score, accuracy_score # to display the accuracy of our tree

# step 5: warning filter
import warnings
warnings.filterwarnings('ignore')

In [None]:
# use this block to install any libraries not on the system
# !pip install pandas
# !pip install sklearn
# python -m pip install scikit-learn lightgbm xgboost catboost

# Data Loading
data shall be loaded into variables as data sets using pandas and csv readers. they will be checked to see if they are loaded properly and will be loaded as 2 sets: train and test as per given in the kaggle data

In [None]:
# Load the training set from the Kaggle input directory into a DataFrame.
train_data = pd.read_csv(r"/kaggle/input/imlchallenger1/train_set.csv")

# Preview the first rows to confirm the load; the expected columns are the features X1 - X78 plus the target Y.
train_data.head() 

In [None]:
# Load the test set from the Kaggle input directory into a DataFrame.
test_data = pd.read_csv(r"/kaggle/input/imlchallenger1/test_set.csv")

# Preview the first rows to confirm the load; the expected columns are the features X1 - X78 with no target Y.
test_data.head() 

# Data Preprocessing
before we start processing this data and using algorithms, we will fix this data first, this is called data preprocessing

In [None]:
# Cast the listed categorical feature columns to strings in both frames.
# NOTE(review): `df1` and `df2` are not defined anywhere in the visible notebook
# (the frames loaded above are `train_data` and `test_data`), so unless they are
# created elsewhere this cell raises NameError. Confirm which frames were intended;
# note that casting these columns to strings would also make the one-hot encoding
# step below expand them into dummy columns.
categorical_columns = ['X5', 'X8', 'X11']
for col in categorical_columns:
    df1[col] = df1[col].astype('str')
    df2[col] = df2[col].astype('str')

## Conversion of Categorical to Numerical
First we will convert categorical data to numerical data by doing one hot encoding, which turns it into binary variables

In [None]:
# One-hot encode the training frame for display only: the result is not assigned,
# so train_data itself is left unchanged.
pd.get_dummies(train_data)

In [None]:
# The preview above is expected to show the same number of columns as the raw data
# (i.e. no categorical columns), but the encoding is still applied and saved so the
# same preprocessing rule holds for all data.
train_data_processed = pd.get_dummies(train_data)
test_data_processed = pd.get_dummies(test_data)

# get_dummies encodes each frame independently, so a category that appears in only
# one of them would give train and test different columns (or a different order).
# Re-index the test frame onto the training feature columns (everything except the
# target 'Y'); dummy columns missing from the test data are filled with 0.
test_data_processed = test_data_processed.reindex(
    columns=train_data_processed.columns.drop('Y'), fill_value=0
)

## Data Splitting - features and targets
The train_data set has 79 columns: the 78 feature columns X1 - X78 and one target variable (Y). We must split that data so that we can perform data preprocessing on the feature variables only (these will be referred to as X).

In [None]:
# Feature matrix: every column of the encoded training frame apart from the target 'Y'
# (the last column, as seen in the train_data.head() preview).
X = train_data_processed.drop('Y', axis=1)
X  # display the feature frame to check it

In [None]:
# The X output above should show one column fewer than train_data (78 instead of 79),
# which confirms that the target variable was removed from the features.
# Now extract the target column on its own.
Y = train_data_processed['Y']
Y # display the target to check it
# The Y output should be a single column with one entry per training row
# (about 246k rows according to the original run).

## Data Imputation 
many cells in our data may be empty - we must fill these cells with data. we have multiple options to deal with them:
- we remove the entire rows (Case 1)
- we fill the cells with the average of the column (Case 2)
- we fill the cells based on KNN imputation (nearest neighbour) (Case 3)

In [None]:
# Average Mean Imputation
# ----------------------------- case -----------------------------
# Active imputation strategy: every missing cell is replaced by the mean of its column.
imputer = SimpleImputer(strategy='mean')

In [None]:
# KNN Imputation
# ----------------------------- case -----------------------------
# this fills them in using k-nearest neighbours of all the spaces
# imputer = KNNImputer(n_neighbors=7)

In [None]:
# Fit the imputer on the training features and fill their missing cells, then fill the
# test features with the same fitted imputer (transform only, no refit).
X = imputer.fit_transform(X)                                        # learn fill values from X and apply them
test_data_processed = imputer.transform(test_data_processed)    # reuse the fill values learned from X

## Data Scaling
Some columns may hold much larger values than other columns. This would not affect us at the moment, as we are using decision trees, but to maintain a fair environment we shall perform scaling on every run.
There are several types of scaling:
- min max scaling (also known as normalization)
- standardisation (z-score normalization)
- max abs scaler
- robust scaler
- normalizer

In [None]:
# ----------------------------- case  -----------------------------
# in this case we shall perform min max scaling. to do that, we must use our MinMaxScaler that we have imported above
# scaler = MinMaxScaler()
# # now we must use this scaler to scale X
# scaler.fit_transform(X)

In [None]:
# ----------------------------- case -----------------------------
# scaler = MaxAbsScaler()
# # now we must use this scaler to scale X
# scaler.fit_transform(X)

In [None]:
# Active scaling strategy: standardisation (z-score) with StandardScaler.
scaler = StandardScaler()
# Preview only: the scaled array is displayed but not assigned, so X is unchanged here.
scaler.fit_transform(X)

In [None]:
# The preview above shows the standardised features. With StandardScaler the values are
# z-scores, so they are NOT confined to the 0 - 1 range. Save the scaled result on X.
X = scaler.fit_transform(X)

# Apply the scaler fitted on X to the test data (transform only, no refit).
test_data_processed = scaler.transform(test_data_processed)

# Filters
there are two types of filters to filter out columns/features:
- variance filter (a column which has same values throughout the column like all are sunny)
- correlation filter (two columns which are same like weight in kg and weight in pounds)

In [None]:
# Print the shapes of the training features and the test data before any filter is
# applied; both should have the same number of columns.
print("X : ", X.shape)
print("test data : ", test_data_processed.shape)

In [None]:
# variance filter
# ----------------------------- case  -----------------------------
# Disabled by default. When enabled it drops columns whose variance is below the threshold.
# The filter is fitted on X only and applied to the test data with transform(), so that
# both keep exactly the same columns (refitting on the test data could drop different ones).
# variance_filter = VarianceThreshold(threshold=0.001)  # Adjust the threshold if needed
# X = variance_filter.fit_transform(X)
# test_data_processed = variance_filter.transform(test_data_processed)
X.shape

In [None]:
# Display the test data shape; its column count should match X.shape above.
test_data_processed.shape

In [None]:
# # correlation filter
# # ----------------------------- case  -----------------------------
# Disabled by default. When enabled it computes absolute pairwise correlations on X,
# looks only at the upper triangle (so each pair is considered once) and drops every
# column that is correlated above 0.9 with an earlier column. The same columns are
# then dropped from the test data.
# corr_matrix = pd.DataFrame(X).corr().abs()
# upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
# to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > 0.9)]
# X = pd.DataFrame(X).drop(columns=to_drop)
# test_data_processed = pd.DataFrame(test_data_processed).drop(columns=to_drop)
X.shape

In [None]:
# Display the test data shape; its column count should match X.shape above.
test_data_processed.shape

## Data Splitting - train and validate
Now our test_data set is made of rows with NO target variable, whereas the train_data set comes WITH the target variable.
The rule in machine learning is that we train on one part of the data (for example half, 70% or 80%) and then check the accuracy on the remaining part - we can only check accuracy IF we have the answers, i.e. the target variable.
So what we need to do is split the train_data set in two. The code below uses an 80% / 20% ratio (test_size=0.2): we train the model on the 80%, validate it on the 20%, and then use that model to predict the test_data set.

In [None]:
# Hold-out method: 20% of the rows are set aside for validation (test_size=0.2) and the
# remaining 80% are used for training; random_state=42 makes the split reproducible.
trainX, testX, trainY, testY = train_test_split(X, Y, test_size=0.2, random_state=42)

# smote

In [None]:
# Oversample with SMOTE, using only the training part of the hold-out split so that
# the validation rows stay untouched.
smote = SMOTE(random_state=42)
# The split above names its outputs trainX / trainY; X_train / y_train were never defined.
# NOTE: the later cells fit on trainX / trainY, so X_resampled / y_resampled only take
# effect if they are passed to model.fit explicitly.
X_resampled, y_resampled = smote.fit_resample(trainX, trainY)

# functions

In [None]:
def fbselection(direction, sample_model, features, X, trainX, trainY, testX, test_data_processed):
    """Run sequential (forward/backward) feature selection and reduce every matrix.

    Builds a SequentialFeatureSelector around ``sample_model`` that keeps
    ``features`` features, searching in ``direction`` and scoring with ROC AUC,
    then hands it to ``modelSelector``, whose return value is passed through.
    """
    print("starting")
    sequential_selector = SequentialFeatureSelector(
        sample_model,
        n_features_to_select=features,
        direction=direction,
        scoring='roc_auc',
    )
    datasets = (X, trainX, trainY, testX, test_data_processed)
    return modelSelector(sample_model, sequential_selector, *datasets)

def modelSelector(sample_model, selection, X, trainX, trainY, testX, test_data_processed):
    """Fit a feature selector on the training split and apply it to every matrix.

    ``selection`` is fitted on ``trainX`` / ``trainY`` only; the validation split,
    the test data and the full feature matrix are then transformed with it.

    Returns the tuple
    ``(sample_model, X, trainX, trainY, testX, test_data_processed)`` where the
    four feature matrices are the reduced versions; ``sample_model`` and
    ``trainY`` are returned unchanged.
    """
    print("start extracting")
    reduced_train = selection.fit_transform(trainX, trainY)
    print("extracted, transforming")
    # Transform order: validation split, test data, then the full feature matrix.
    reduced_valid, reduced_test, reduced_full = (
        selection.transform(matrix) for matrix in (testX, test_data_processed, X)
    )
    print("transformed")
    return sample_model, reduced_full, reduced_train, trainY, reduced_valid, reduced_test

def kbest(sample_model, features, X, trainX, trainY, testX, test_data_processed):
    """Keep the ``features`` best columns according to the ANOVA F-test.

    Builds a SelectKBest selector scored with ``f_classif`` and hands it to
    ``modelSelector``, whose return value is passed through.
    """
    print("starting")
    univariate_selector = SelectKBest(k=features, score_func=f_classif)
    datasets = (X, trainX, trainY, testX, test_data_processed)
    return modelSelector(sample_model, univariate_selector, *datasets)

In [None]:
def featureImportance(sample_model, features, X, trainX, trainY, testX, test_data_processed):
    """Keep only the ``features`` most important columns according to the model.

    The model is fitted on the training split, its ``feature_importances_`` are
    ranked, every feature matrix is reduced to the top-ranked columns, and the
    model is refitted on the reduced training split.

    Parameters
    ----------
    sample_model : estimator exposing ``fit`` and ``feature_importances_``.
    features : number of top-ranked features to keep.
    X, trainX, testX, test_data_processed : feature matrices (DataFrames or arrays)
        that all share the same columns.
    trainY : target values for ``trainX``.

    Returns
    -------
    ``(sample_model, X, trainX, trainY, testX, test_data_processed)`` where the
    four feature matrices are DataFrames restricted to the selected columns.

    Raises
    ------
    ValueError
        If the matrices are plain arrays and no matching column names can be found.
    """
    print("fitting")

    # fit the model
    sample_model.fit(trainX, trainY)

    print("extracting features")

    # resolve the names of the columns the model was actually fitted on
    importances = sample_model.feature_importances_
    feature_names = _feature_names_for(trainX, len(importances))
    print(feature_names)

    # sort with respect to importance
    feature_importance_df = pd.DataFrame({
        'Feature': feature_names,
        'Importance': importances
    }).sort_values(by='Importance', ascending=False)

    # extract the top ones
    top_features = feature_importance_df['Feature'].head(features).values
    print(top_features)

    # change all data according to the top ones we have selected
    trainX = pd.DataFrame(trainX, columns=feature_names)[top_features]
    testX = pd.DataFrame(testX, columns=feature_names)[top_features]
    X = pd.DataFrame(X, columns=feature_names)[top_features]
    test_data_processed = pd.DataFrame(test_data_processed, columns=feature_names)[top_features]

    print("features extracted")

    # retrain the model
    sample_model.fit(trainX, trainY)

    print("features trained")

    return sample_model, X, trainX, trainY, testX, test_data_processed


def _feature_names_for(trainX, n_columns):
    """Return the column labels describing a feature matrix with ``n_columns`` columns."""
    # A DataFrame already carries the labels of exactly the columns it holds, which
    # stays correct even after an earlier filter or selection step removed columns.
    if hasattr(trainX, "columns"):
        return pd.Index(trainX.columns)

    # Plain arrays carry no labels: fall back to the encoded training frame, but only
    # while its feature count still matches the matrix the model was fitted on.
    frame = globals().get("train_data_processed")
    if frame is not None:
        names = frame.drop(columns=['Y']).columns
        if len(names) == n_columns:
            return names

    raise ValueError(
        "cannot determine feature names: the model was fitted on "
        f"{n_columns} columns, which does not match train_data_processed; "
        "pass DataFrames with named columns instead of arrays"
    )

## model initialization
Here the model is initialized.

In [None]:
### SAMPLE ###
# -------------------- case X (add the case number here) --------------------
# # intialize models here as model_1, model_2, perform feature selection and feature importance BEFORE they are inserted in stacking
# model_1 = 
# model_2 = 
# # intialize estimators here
# estimators = [('model_1', model_1), ('model_2', model_2)]
# # intialize stacking
# model = StackingClassifier(estimators=estimators, final_estimator=model_name, verbose=2)

In [None]:
# # forward selection
# model_1, X, trainX, trainY, testX, test_data_processed = fbselection('forward', model_1, 15, X, trainX, trainY, testX, test_data_processed)

In [None]:
# # kbest selection
# model_1, X, trainX, trainY, testX, test_data_processed = kbest(model_1, 5, X, trainX, trainY, testX, test_data_processed)

In [None]:
# # feature importance
# # (the classifier is named xgb_model so that it does not overwrite the imported `xgb` module)
# xgb_model = xgb.XGBClassifier(max_depth=5, n_estimators=250, learning_rate=0.1, eval_metric='auc', random_state=42)
# xgb_model, X, trainX, trainY, testX, test_data_processed = featureImportance( xgb_model, 45, X, trainX, trainY, testX, test_data_processed )

In [None]:
# # case 202
# # (the base classifier is named xgb_model: rebinding `xgb` would shadow the xgboost module
# #  and break the xgb.XGBClassifier call used for the final estimator below)
# lgbm = lgb.LGBMClassifier(learning_rate=0.02, max_depth=3, n_estimators=3000, random_state=42)
# xgb_model = xgb.XGBClassifier(max_depth=4, n_estimators=300, learning_rate=0.07, random_state=42)

# # Stacking ensemble with XGBoost as the final estimator
# model = StackingClassifier(
#     estimators=[('lgbm', lgbm), ('xgb', xgb_model)],
#     final_estimator=xgb.XGBClassifier(n_estimators=150, learning_rate=0.05, random_state=42),
#     cv=5
# )

In [None]:
# # case 203
# estimators = [ 
#     ('adaboost', AdaBoostClassifier( n_estimators=1000, learning_rate=0.07, algorithm='SAMME.R', random_state=42 )),
#     ('rf', RandomForestClassifier( n_estimators=512, criterion='gini', max_depth=930, min_samples_split=29, min_samples_leaf=40, max_features='sqrt', bootstrap=True, n_jobs=-1 )),
#     ('LightGBM', lgb.LGBMClassifier( boosting_type='dart', n_estimators=1000, learning_rate=0.07, num_leaves=40, max_depth=7, min_child_samples=25, subsample=0.85, colsample_bytree=0.75, random_state=42, n_jobs=-1 ))
# ]
# model = StackingClassifier(
#     estimators=estimators, final_estimator=xgb.XGBClassifier(), cv=5, n_jobs=-1
# )

In [None]:
# case 204
# Three base learners, each configured with its library's class-weighting option,
# stacked under a Gaussian naive Bayes final estimator.
XGBoost = xgb.XGBClassifier(scale_pos_weight=99, use_label_encoder=False, eval_metric='logloss', random_state=42)
LightGBM = lgb.LGBMClassifier(is_unbalance=True, random_state=42)
Random_Forest = RandomForestClassifier(class_weight='balanced', random_state=42)
stacking_clf = StackingClassifier(estimators=[('rf', Random_Forest), ('xgb', XGBoost), ('lgbm', LightGBM)], final_estimator=GaussianNB())

# The fit / predict / submission cells below all operate on `model`, and every other
# case that assigns it is commented out, so bind the active ensemble to that name.
model = stacking_clf

In [None]:
# # case 205
# lgbm1 = lgb.LGBMClassifier( boosting_type='dart', n_estimators=1000, learning_rate=0.07, num_leaves=40, max_depth=7, min_child_samples=25, subsample=0.85, colsample_bytree=0.75, random_state=42, n_jobs=-1 )

# lgbm2 = lgb.LGBMClassifier( boosting_type='dart', n_estimators=1500, learning_rate=0.07, num_leaves=40, max_depth=7, min_child_samples=25, subsample=0.85, colsample_bytree=0.75, random_state=42, n_jobs=-1 )

# lgbm3 = lgb.LGBMClassifier(learning_rate=0.02, max_depth=2, n_estimators=3500, random_state=42)

# meta_learner = AdaBoostClassifier(n_estimators=1000, learning_rate=0.1, algorithm='SAMME.R', random_state=42)

# model = StackingClassifier(
#     estimators=[ ('lgbm1', lgbm1), ('lgbm2', lgbm2), ('lgbm3', lgbm3) ],
#     final_estimator=meta_learner,
#     cv=5,  # Number of cross-validation folds for stacking
#     stack_method='predict_proba',  # Using probabilities from base models as inputs to meta-learner
#     passthrough=False  # Set to True if you want to pass original features to the meta-learner
# )

In [None]:
# Print the shapes of all four feature matrices before fitting; after any feature
# selection step they should all have the same number of columns.
print("X shape -> ", X.shape)
print("trainX shape -> ", trainX.shape)
print("testX shape -> ", testX.shape)
print("test_data_processed shape -> ", test_data_processed.shape)

## Bagging initialization
Here we introduce and initialize bagging.

In [None]:
# model = BaggingClassifier(estimator=model, n_estimators=10, verbose=2)
# -- 

## model running
here we run the model

In [None]:
# Fit the model on the training part of the hold-out split only.
model.fit(trainX, trainY)

In [None]:
# Predict class labels for the validation part of the hold-out split.
y_pred = model.predict(testX)

In [None]:
# Accuracy compares the hard class labels from model.predict with the true labels.
accuracy = accuracy_score(testY, y_pred)
print("model accuracy = ", accuracy, "   ")

# ROC AUC measures how well the model ranks samples, so it must be given the
# positive-class probability. Thresholded 0/1 labels collapse the curve to a single
# operating point and misstate the score (the submission itself uses probabilities).
y_score = model.predict_proba(testX)[:, 1]
roc_score = roc_auc_score(testY, y_score)
print("roc score = ", roc_score, "   ")

## predict for test dataset
fit the model and predict for test dataset

In [None]:
# Refit the model on the full training data (all rows of X and Y) before predicting the test set.
model.fit(X, Y)

In [None]:
# Score the test set: keep column 1 of predict_proba (the second class) for every row.
test_prediction = model.predict_proba(test_data_processed)[:, 1]

print(test_prediction)

## write into csv
now we write the predictions into the csv file

In [None]:
# Load the sample submission, replace its 'Y' column with the predicted probabilities
# and write the result to the Kaggle working directory without the index column.
sample_data = pd.read_csv(r"/kaggle/input/sample-sub/sample_submission.csv").assign(Y=test_prediction)
sample_data.to_csv(r"/kaggle/working/stacking2.csv", index=False)

# display the frame that was written
sample_data

In [None]:
# Display the fitted model and its configuration.
model