# Importing Libraries
In this part we install all the necessary libraries from the command prompt and then import the functions we need from those libraries.

In [None]:
# importing all the necessary libraries
import pandas as pd

from numpy import mean
import numpy as np
import time

# step 1: preprocessing
from sklearn.impute import SimpleImputer # import some strategic imputer to fill in any missing values using mean
from sklearn.preprocessing import MinMaxScaler, StandardScaler, MaxAbsScaler, RobustScaler, Normalizer # scale all the values to one range to avoid any biasness (this bias is seen in mostly naive bayes and knn etc)

from sklearn.impute import KNNImputer # import some strategic imputer to fill missing values using KNN (finds the nearest neighbour and fills it with that value)

from sklearn.feature_selection import SequentialFeatureSelector, SelectKBest, f_classif, VarianceThreshold
from sklearn.feature_selection import f_regression
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import PolynomialFeatures, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge, Lasso

# step 2: data division
from sklearn.model_selection import train_test_split, RepeatedKFold, cross_val_score, GridSearchCV, ParameterGrid # to divide the code into train/test using a specific percentage or with/without replacement

# step 3: model
from xgboost import XGBRegressor

# step 4: displaying accuracy
from sklearn.metrics import roc_auc_score, accuracy_score # to display the accuracy of our tree
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, make_scorer

# step 5: warning filter
import warnings
warnings.filterwarnings('ignore')

In [None]:
from datetime import datetime

# Function to get current date and time as a string
def get_current_datetime():
    """Return the current local date and time as a ``YYYY-MM-DD HH:MM:SS`` string."""
    timestamp_layout = "%Y-%m-%d %H:%M:%S"
    moment = datetime.now()
    return moment.strftime(timestamp_layout)

# Data Loading
The data is loaded into pandas DataFrames with the CSV reader. Two sets are loaded, train and test, as provided in the Kaggle data, and each one is previewed to check that it was loaded properly.

In [None]:
# Read the training CSV into a DataFrame.
train_data = pd.read_csv(filepath_or_buffer=r"D:\Users\DELL\OneDrive - Institute of Business Administration\IBA\sem5\machine learning\ipynb notebooks\challenger2\iml-fall-2024-challenge-2\train\train.csv")

# Preview the first five rows to confirm the file was parsed; later cells use the 'price_doc' column as the target.
train_data.head(n=5)

In [None]:
# Read the test CSV into a DataFrame.
test_data = pd.read_csv(filepath_or_buffer=r"D:\Users\DELL\OneDrive - Institute of Business Administration\IBA\sem5\machine learning\ipynb notebooks\challenger2\iml-fall-2024-challenge-2\test\test.csv")

# Preview the first five rows to confirm the file was parsed (presumably this split has no 'price_doc' column - verify against the data).
test_data.head(n=5)

# Data Preprocessing
Before we feed this data to any algorithm we first clean it up; this step is called data preprocessing.

## Split data into categorical and numerical
Categorical columns get a most-frequent simple imputer followed by one-hot encoding, while numerical columns get a mean simple imputer followed by a standard scaler.

In [None]:
# Object-dtype columns are treated as categorical features.
categorical_cols = train_data.select_dtypes(include="object").columns
# Every other dtype counts as numerical; the target 'price_doc' is removed so it is never used as a feature.
numerical_cols = train_data.select_dtypes(exclude="object").columns.drop('price_doc')

In [None]:
# Numerical features: fill missing values with the column mean, then standardise.
num_transformer = Pipeline(
    [("imputer", SimpleImputer(strategy="mean")), ("scaler", StandardScaler())]
)

# Categorical features: fill missing values with the most frequent category, then one-hot encode.
# handle_unknown="ignore" lets categories unseen during fitting pass through transform without an error.
cat_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

In [None]:
# Column transformer for preprocessing
# Route each group of columns through its own pipeline and concatenate the results.
preprocessor = ColumnTransformer(
    [
        ("num", num_transformer, numerical_cols),
        ("cat", cat_transformer, categorical_cols),
    ]
)

## correlation matrix
I tried computing the correlation matrix, but for a matrix of about 2000 columns this is very computationally expensive because a correlation is computed for every pair of columns. So don't run it: it takes too long and then fails (I let it run for 5 minutes).

In [None]:
# # DONT RUN
# corr_matrix = train_data.corr()
# print(corr_matrix)

# PCA
principal component analysis is applied

In [None]:
# # -------------------------- case  --------------------------
# pca = PCA(n_components=33)                                 
# X = pca.fit_transform(X)
# test_data_processed = pca.transform(test_data_processed)

## Data Splitting - features and targets
The train_data set contains the feature columns plus one target column (`price_doc`). We must separate the two so that data preprocessing is performed only on the feature variables (referred to as X).

In [None]:
# X holds every column of train_data except the target column 'price_doc'.
X = train_data.drop('price_doc', axis=1)
X  # display the feature frame

In [None]:
# Y is the target column 'price_doc' taken on its own from train_data.
Y = train_data.loc[:, 'price_doc']
Y  # display the target series

# Filters
there are two types of filters to filter out columns/features:
- variance filter (a column which has same values throughout the column like all are sunny)
- correlation filter (two columns which are same like weight in kg and weight in pounds)

In [None]:
# print("X : ", X.shape)
# print("test data : ", test_data_processed.shape)

In [None]:
# variance filter
# ----------------------------- case  -----------------------------
# variance_filter = VarianceThreshold(threshold=0.001)  # Adjust the threshold if needed
# X = variance_filter.fit_transform(X)
# test_data_processed = variance_filter.transform(test_data_processed)  # only transform: the filter must be fitted on the training features alone
X.shape

In [None]:
# test_data_processed.shape

In [None]:
# # correlation filter
# # ----------------------------- case  -----------------------------
# corr_matrix = pd.DataFrame(X).corr().abs()
# upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
# to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > 0.9)]
# X = pd.DataFrame(X).drop(columns=to_drop)
# test_data_processed = pd.DataFrame(test_data_processed).drop(columns=to_drop)
X.shape

In [None]:
# test_data_processed.shape

## Data Splitting - train and validate
Our test_data set has rows with NO target variable, whereas the train_data set comes WITH the target variable.
The rule in machine learning is that we train on one part of the labelled data and check accuracy on the remaining part - we can only check accuracy IF we have the answers, i.e. the target variable.
So we split the train_data set in two, using an 80% / 20% ratio (`test_size=0.2`). We train the model on the 80%, validate it on the 20%, and then use that model to predict the test_data set.

In [None]:
# Hold-out split: 20% of the rows are kept for validation; the fixed seed makes the split reproducible.
trainX, testX, trainY, testY = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

# functions
here we have defined functions like forward-backward selection, kbest selection & algorithm feature importance

In [None]:
# forward backward selection
def fbselection(direction, sample_model, features, X, trainX, trainY, testX, test_data_processed,
                scoring='neg_root_mean_squared_error'):
    """Run sequential (forward or backward) feature selection and reduce every data set.

    Parameters
    ----------
    direction : str
        Passed to ``SequentialFeatureSelector`` as ``direction``.
    sample_model : estimator
        Estimator used to score candidate feature subsets; returned unchanged.
    features : int or float
        Passed to ``SequentialFeatureSelector`` as ``n_features_to_select``.
    X, trainX, testX, test_data_processed : array-like
        Data sets that are reduced to the selected columns.
    trainY : array-like
        Target values used while fitting the selector.
    scoring : str or callable, optional
        Cross-validation scorer used by the selector. The default is a
        regression scorer because this notebook trains a regressor; the
        previous hard-coded ``'roc_auc'`` only applies to classifiers.

    Returns
    -------
    tuple
        ``(sample_model, X, trainX, trainY, testX, test_data_processed)`` as
        produced by ``modelSelector``.
    """
    print("starting")
    selection = SequentialFeatureSelector(
        sample_model,
        direction=direction,
        n_features_to_select=features,
        scoring=scoring,
    )
    return modelSelector(sample_model, selection, X, trainX, trainY, testX, test_data_processed)

def modelSelector(sample_model, selection, X, trainX, trainY, testX, test_data_processed):
    """Fit ``selection`` on the training split and apply it to every data set.

    The selector is fitted with ``trainX``/``trainY`` only; ``testX``,
    ``test_data_processed`` and ``X`` are then transformed with the fitted
    selector, in that order. ``sample_model`` and ``trainY`` are passed
    through untouched.

    Returns ``(sample_model, X, trainX, trainY, testX, test_data_processed)``
    with the four data sets replaced by their reduced versions.
    """
    print("start extracting")
    reduced_train = selection.fit_transform(trainX, trainY)
    print("extracted, transforming")
    # Same call order as before: validation split, test data, then the full feature set.
    reduced_holdout, reduced_test, reduced_full = (
        selection.transform(part) for part in (testX, test_data_processed, X)
    )
    print("transformed")
    return sample_model, reduced_full, reduced_train, trainY, reduced_holdout, reduced_test

# kbest selection
def kbest(sample_model, features, X, trainX, trainY, testX, test_data_processed):
    """Keep the ``features`` best columns ranked by the ``f_regression`` score.

    Builds a ``SelectKBest`` selector and delegates fitting and transforming
    of all data sets to ``modelSelector``, whose return value is returned
    unchanged.
    """
    print("starting")
    univariate_selector = SelectKBest(f_regression, k=features)
    return modelSelector(
        sample_model, univariate_selector, X, trainX, trainY, testX, test_data_processed
    )

In [None]:
# feature importance function
def featureImportance(sample_model, features, X, trainX, trainY, testX, test_data_processed):
    """Keep only the ``features`` most important columns according to ``sample_model``.

    The model is fitted on ``trainX``/``trainY``, its ``feature_importances_``
    are ranked in descending order, every data set is reduced to the
    top-ranked columns, and the model is refitted on the reduced training data.

    Columns are selected by position rather than by the raw ``train_data``
    column names, so the function also works on preprocessed data whose column
    count may differ from the raw frame (for example after one-hot encoding).
    Inputs may be DataFrames or 2-D containers that support
    ``data[:, indices]``; all of them must share the column layout of
    ``trainX``.

    Parameters
    ----------
    sample_model : estimator
        Must expose ``fit`` and, once fitted, ``feature_importances_``.
    features : int
        Number of top-ranked columns to keep.
    X, trainX, testX, test_data_processed : DataFrame or 2-D array-like
        Data sets to reduce.
    trainY : array-like
        Training targets.

    Returns
    -------
    tuple
        ``(sample_model, X, trainX, trainY, testX, test_data_processed)`` with
        the data sets reduced and the model refitted on the reduced ``trainX``.
    """
    print("fitting")

    # fit the model on all columns to obtain the importances
    sample_model.fit(trainX, trainY)

    print("extracting features")

    importances = np.asarray(sample_model.feature_importances_)

    # use the real column labels when available, otherwise positional indices
    if hasattr(trainX, "columns"):
        feature_names = np.asarray(trainX.columns)
    else:
        feature_names = np.arange(importances.shape[0])
    print(feature_names)

    # positions of the most important columns, most important first
    # (stable sort keeps the original column order between ties)
    top_positions = np.argsort(-importances, kind="stable")[:features]
    top_features = feature_names[top_positions]
    print(top_features)

    def _take_columns(data):
        """Return ``data`` restricted to the selected column positions."""
        if hasattr(data, "iloc"):
            return data.iloc[:, top_positions]
        # NOTE(review): assumes arrays / sparse matrices that support column fancy-indexing.
        return data[:, top_positions]

    # change all data according to the top columns we have selected
    trainX = _take_columns(trainX)
    testX = _take_columns(testX)
    X = _take_columns(X)
    test_data_processed = _take_columns(test_data_processed)

    print("features extracted")

    # retrain the model on the reduced training data
    sample_model.fit(trainX, trainY)

    print("features trained")

    return sample_model, X, trainX, trainY, testX, test_data_processed

## Model initialization
Here the fitted preprocessor is applied to every data set and the model is initialized.

In [None]:
# Fit the preprocessing pipeline on the training split only, then reuse the fitted preprocessor for everything else.
trainX = preprocessor.fit_transform(trainX)
print("trainX completed")
# The validation split is only transformed, never fitted on.
testX = preprocessor.transform(testX)
print("testX completed")
# NOTE: test_data is overwritten with its transformed version, so this cell is not
# expected to be re-runnable without reloading the raw data first.
test_data = preprocessor.transform(test_data)
print("test data completed")
# The full feature set is transformed with the same fitted preprocessor; X is overwritten as well.
X = preprocessor.transform(X)
print(X.shape)

In [None]:
# declare the model here
# model = XGBRegressor(max_depth=10, learning_rate=0.01, n_estimators=1600, subsample=0.85, reg_lambda=0.2, reg_alpha=0.8)

In [None]:
# Report the current dimensions of every data set.
for label, dataset in (
    ("X shape -> ", X),
    ("trainX shape -> ", trainX),
    ("testX shape -> ", testX),
    ("test_data_processed shape -> ", test_data),
):
    print(label, dataset.shape)

# feature selection
here we will apply feature selection and feature importance

In [None]:
# Feature selection
# model, X, trainX, trainY, testX, test_data = featureImportance(
#     model, 40, X, trainX, trainY, testX, test_data
# )

In [None]:
# Report the dimensions again so they can be compared with the cell before feature selection.
for label, dataset in (
    ("X shape -> ", X),
    ("trainX shape -> ", trainX),
    ("testX shape -> ", testX),
    ("test_data_processed shape -> ", test_data),
):
    print(label, dataset.shape)

# grid search

In [None]:
def gridsearch(param_grid, model, scorer, trainX, trainY):
    """Tune ``model`` with an exhaustive grid search and return the best estimator.

    A 3-fold ``GridSearchCV`` over ``param_grid`` is fitted on
    ``trainX``/``trainY`` using ``scorer``. The best estimator, its
    parameters and its cross-validated score are printed along the way.

    Returns the ``best_estimator_`` found by the search.
    """
    print("starting grid search")

    # set up the search
    searcher = GridSearchCV(model, param_grid, scoring=scorer, cv=3, verbose=3)
    print("grid search is intialized")

    # run the search
    searcher.fit(trainX, trainY)
    print("grid search fitting completed")

    # show the winning estimator
    winner = searcher.best_estimator_
    print(winner)

    # show the parameters of the winning estimator
    print(searcher.best_params_)

    # show the score of the winning estimator
    print("Best cross-validated score:", searcher.best_score_)

    print("model assigned, grid search completed")
    return winner

In [None]:
# # Define a scoring metric (e.g., negative mean squared error)
# scorer = make_scorer(mean_squared_error, greater_is_better=False)

In [None]:
# # define hyper parameters of grid
# param_grid = {
#     'n_estimators': [10, 100, 200, 500, 1000, 2000, 3000],  # XGBRegressor names this parameter n_estimators
#     'learning_rate': [0.0001, 0.001, 0.01, 0.05, 0.1, 0.5],
#     'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
# }

In [None]:
# sample from train_data
# sample_train = train_data.sample(frac=0.5)
# sample_X = sample_train.drop('price_doc', axis=1) 
# sample_Y = sample_train['price_doc']

In [None]:
# model = gridsearch(param_grid, model, scorer, trainX, trainY)

## model running
here we run the model

In [None]:
from datetime import datetime

def get_current_datetime():
    """Return the current local date and time as a ``YYYY-MM-DD HH:MM:SS`` string."""
    timestamp_layout = "%Y-%m-%d %H:%M:%S"
    moment = datetime.now()
    return moment.strftime(timestamp_layout)

In [None]:
# compute this predictions metrics
def metrics(y_pred, testY):
    """Print regression metrics for a prediction and return its RMSE.

    Prints, in order, the mean squared error, its square root, the mean
    absolute error and the coefficient of determination of ``y_pred``
    against the true values ``testY``.

    Returns the root mean squared error.
    """
    def _report(template, value, trailer):
        # one metric per line, formatted to two decimals by the template
        print(template % value, trailer)

    print("starting to compute metrics")

    # mean squared error
    squared_error = mean_squared_error(testY, y_pred)
    _report("Mean squared error: %.2f", squared_error, "   ")

    # root mean squared error
    root_error = np.sqrt(squared_error)
    _report("Root Mean squared error: %.2f", root_error, "   ")

    # mean absolute error
    absolute_error = mean_absolute_error(testY, y_pred)
    _report("Mean absolute error: %.2f", absolute_error, "   ")

    # coefficient of determination
    determination = r2_score(testY, y_pred)
    _report("Coefficient of determination: %.2f", determination, "    ")

    return root_error

In [None]:
def run_model(model, trainX, trainY, testX, testY):
    """Fit ``model`` on the training split and evaluate it on the validation split.

    Prints the model's ``score`` on the training data, then predicts
    ``testX`` and reports the metrics of that prediction via ``metrics``.

    Returns ``(model, rmse)`` where ``rmse`` is the value returned by ``metrics``.
    """
    print("training model")
    model.fit(trainX, trainY)
    print("computing score")
    train_score = model.score(trainX, trainY)
    print("model score: ", train_score)
    holdout_predictions = model.predict(testX)
    return model, metrics(holdout_predictions, testY)

In [None]:
def createFile(model, X, Y, test_data, file_name, base_dir=None):
    """Refit ``model`` on all labelled data and write a submission CSV.

    The model is fitted on ``X``/``Y``, its score on that same data is
    printed, ``test_data`` is predicted, and the predictions are written to
    the ``price_doc`` column of ``sample_submission.csv`` before saving the
    result as ``file_name``.

    Parameters
    ----------
    model : estimator
        Must expose ``fit``, ``score`` and ``predict``.
    X, Y : array-like
        Full feature set and targets used for the final fit.
    test_data : array-like
        Features to predict for the submission.
    file_name : str
        Name of the CSV file to create inside ``base_dir``.
    base_dir : str or path-like, optional
        Directory that contains ``sample_submission.csv`` and receives the
        output file. Defaults to the challenge directory this notebook was
        written for.
    """
    from pathlib import Path  # local import so the cell does not depend on another import cell

    if base_dir is None:
        base_dir = r"D:\Users\DELL\OneDrive - Institute of Business Administration\IBA\sem5\machine learning\ipynb notebooks\challenger2\iml-fall-2024-challenge-2"
    base_dir = Path(base_dir)

    print("fitting on X Y ", get_current_datetime())
    model.fit(X, Y)

    print("scoring on X Y ", get_current_datetime())
    # NOTE: despite the label, this is the score on the data the model was just fitted on.
    score = model.score(X, Y)
    print("model test score: ", score, "    ")

    print("predicting on test ", get_current_datetime())
    test_prediction = model.predict(test_data)
    print(test_prediction)

    print("getting sample submission ", get_current_datetime())
    sample_data = pd.read_csv(base_dir / "sample_submission.csv")
    sample_data['price_doc'] = test_prediction

    print("Saving submission ", get_current_datetime())
    # Path joining inserts exactly one separator between directory and file name.
    full_path = base_dir / file_name
    sample_data.to_csv(full_path, index=False)
    print(f"File saved at: {full_path}")

In [None]:
def create_submission(model, trainX, trainY, testX, testY, X, Y, test_data, file_name):
    """Validate ``model`` on the hold-out split, then build the submission file.

    ``run_model`` fits and evaluates the model on the train/validation split;
    the model it returns is handed to ``createFile``, which refits it on
    ``X``/``Y`` and writes the predictions for ``test_data`` to ``file_name``.
    """
    fitted_model, _holdout_rmse = run_model(model, trainX, trainY, testX, testY)
    createFile(fitted_model, X, Y, test_data, file_name)

In [None]:
# Final XGBoost regressor configuration, trained on the GPU.
model = XGBRegressor(
    max_depth=18,               # Control tree depth
    learning_rate=0.009,        # Step size
    n_estimators=2000,          # Number of boosting rounds
    subsample=0.85,             # Fraction of training data to grow trees
    colsample_bytree=0.8,       # Fraction of features for each tree
    reg_lambda=0.2,             # L2 regularization
    reg_alpha=0.8,              # L1 regularization
    device='cuda',              # run training and prediction on the GPU
    tree_method='hist',         # with device='cuda' this replaces the legacy 'gpu_hist' / 'gpu_predictor' switches
    verbosity=2,                # XGBoost's logging level parameter is 'verbosity', not 'verbose'
)
create_submission(model, trainX, trainY, testX, testY, X, Y, test_data, "xgb1.csv")

In [None]:
# model = XGBRegressor(max_depth=10, learning_rate=0.01, n_estimators=1600, subsample=0.85, reg_lambda=0.2, reg_alpha=0.8)
# create_submission(model, trainX, trainY, testX, testY, X, Y, test_data, "xgb1.cxv")

In [None]:
# model = XGBRegressor(max_depth=10, learning_rate=0.001, n_estimators=1600, subsample=0.85, reg_lambda=0.2, reg_alpha=0.8)
# create_submission(model, trainX, trainY, testX, testY, X, Y, test_data, "xgb2.cxv")

In [None]:
# model = XGBRegressor(max_depth=10, learning_rate=0.01, n_estimators=1700, subsample=0.85, reg_lambda=0.2, reg_alpha=0.8)
# create_submission(model, trainX, trainY, testX, testY, X, Y, test_data, "xgb3.cxv")

In [None]:
# model = XGBRegressor(max_depth=11, learning_rate=0.01, n_estimators=1600, subsample=0.85, reg_lambda=0.2, reg_alpha=0.8)
# create_submission(model, trainX, trainY, testX, testY, X, Y, test_data, "xgb4.cxv")

## predict for test dataset
fit the model and predict for test dataset

In [None]:
# model.fit(X, Y)

In [None]:
# # display information regarding the regression
# print("model score: ", model.score(X, Y), "    ")
# # print("model coefficient: ", model.coef_)
# # print("model intercept: ", model.intercept_)

In [None]:
# test_prediction = model.predict(test_data)

# # test_prediction=test_prediction[:, 1]

# print(test_prediction)

## write into csv
now we write the predictions into the csv file

In [None]:
# sample_data = pd.read_csv(r"D:\Users\DELL\OneDrive - Institute of Business Administration\IBA\sem5\machine learning\ipynb notebooks\challenger2\iml-fall-2024-challenge-2\sample_submission.csv")

# sample_data['price_doc'] = test_prediction
# sample_data

# sample_data.to_csv(r"D:\Users\DELL\OneDrive - Institute of Business Administration\IBA\sem5\machine learning\ipynb notebooks\challenger2\iml-fall-2024-challenge-2\xgb1.csv", index=False)
# sample_data

In [None]:
model