# Importing Libraries







In this part we install all the necessary libraries from the command prompt and then import the required functions from those libraries.

In [None]:
# -------------------- Data Handling --------------------
import pandas as pd
import numpy as np
from numpy import mean
import time
import matplotlib.pyplot as plt
from datetime import datetime

# -------------------- Preprocessing --------------------
from sklearn.impute import SimpleImputer, KNNImputer  # Handle missing values
from sklearn.preprocessing import MinMaxScaler, StandardScaler, MaxAbsScaler, RobustScaler, Normalizer  # Feature scaling
from sklearn.feature_selection import SequentialFeatureSelector, SelectKBest, f_classif, VarianceThreshold, f_regression  # Feature selection
from sklearn.decomposition import PCA  # Dimensionality reduction
from sklearn.compose import ColumnTransformer  # Preprocessing for different feature types
from sklearn.preprocessing import PolynomialFeatures, OneHotEncoder  # Feature transformation
from sklearn.pipeline import Pipeline  # Create machine learning pipelines
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from category_encoders import TargetEncoder, BinaryEncoder

# -------------------- Model --------------------
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor, StackingRegressor
from xgboost import XGBRegressor
from sklearn.neural_network import MLPRegressor

# -------------------- Model Evaluation --------------------
from sklearn.model_selection import train_test_split, RepeatedKFold, cross_val_score, GridSearchCV, ParameterGrid  # Train-test split, cross-validation, grid search
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, make_scorer  # Regression metrics

# -------------------- Warning Handling --------------------
import warnings
warnings.filterwarnings('ignore')  # Suppress warnings

# Functions

In [None]:
def get_current_datetime():
    """Return the current local date and time formatted as ``YYYY-MM-DD HH:MM:SS``."""
    now = datetime.now()
    return format(now, "%Y-%m-%d %H:%M:%S")

In [None]:
# Correlation Filter
def correlationFilter(X, test_data_processed, threshold=0.9):
    """
    Drop one feature out of every highly correlated pair.

    The absolute pairwise correlation matrix of X is computed and only its
    strict upper triangle is inspected, so for each correlated pair the
    later column is the one that gets removed.

    Parameters:
        X: Feature DataFrame used to compute the correlations.
        test_data_processed: DataFrame from which the same columns are removed.
        threshold: Absolute correlation above which a column is dropped.

    Returns:
        Tuple (X, test_data_processed) without the dropped columns.
    """
    print("Calculating correlation matrix...")

    # Absolute correlations: strong negative correlation counts as redundancy too
    abs_corr = X.corr().abs()

    # Keep only the cells strictly above the diagonal; the rest become NaN
    above_diagonal = np.triu(np.ones(abs_corr.shape), k=1).astype(bool)
    upper = abs_corr.where(above_diagonal)

    # A column is dropped when it correlates too strongly with any earlier column
    to_drop = []
    for name in upper.columns:
        if (upper[name] > threshold).any():
            to_drop.append(name)

    print(f"Features to drop due to high correlation (> {threshold}):", to_drop)

    # Remove the same columns from both frames so they stay aligned
    X = X.drop(columns=to_drop)
    test_data_processed = test_data_processed.drop(columns=to_drop)

    print("Highly correlated features removed.")
    print("New dataset shape:", X.shape)

    return X, test_data_processed

In [None]:
# Variance Filter
def varianceFilter(X, test_data_processed, threshold=0.001):
    """
    Drop features whose variance in X is below a threshold.

    Parameters:
        X: Feature DataFrame used to compute the variances.
        test_data_processed: DataFrame from which the same columns are removed.
        threshold: Columns with variance strictly below this value are dropped.

    Returns:
        Tuple (X, test_data_processed) without the low-variance columns.
    """
    print("Calculating feature variances...")

    # Per-column variance
    variances = X.var(axis=0)

    # Summary statistics help with picking a sensible threshold
    print(variances.describe())

    # Collect the names of the columns that fall below the threshold
    low_variance_columns = [
        name for name, value in variances.items() if value < threshold
    ]

    print(f"Features with variance below {threshold}: {low_variance_columns}")

    # Remove the same columns from both frames so they stay aligned
    X = X.drop(columns=low_variance_columns)
    test_data_processed = test_data_processed.drop(columns=low_variance_columns)

    print("Low variance features removed.")
    print("New dataset shape:", X.shape)

    return X, test_data_processed

In [None]:
def apply_pca(train_data, test_data, target_column='price_doc', variance_threshold=0.95):
    """
    Apply PCA to reduce dimensionality of the dataset while preserving the specified variance threshold.

    PCA is fitted on the training features only and then applied to the test
    features. A full PCA is fitted first purely to plot the cumulative
    explained variance; a second PCA with ``n_components=variance_threshold``
    produces the transformed data.

    Parameters:
        train_data (DataFrame): The training data, including features and target.
        test_data (DataFrame): The test data. It may or may not contain
            ``target_column``; a missing target no longer raises KeyError.
        target_column (str): The name of the target column in the data (default is 'price_doc').
        variance_threshold (float): The threshold for the cumulative variance to retain (default is 0.95).

    Returns:
        train_data (DataFrame): PCA components (integer column labels) plus the target column.
        test_data (DataFrame): PCA components, plus the target column only if
            the incoming test data contained it.
    """

    print("Starting PCA process...")

    # Separate features and target variable
    print("Separating features and target variable...")
    train_features = train_data.drop(columns=[target_column])
    train_target = train_data[target_column]

    # A real test set has no target column, so only strip / re-attach it when present.
    test_has_target = target_column in test_data.columns
    test_features = test_data.drop(columns=[target_column], errors="ignore")

    print(f"Train features shape: {train_features.shape}")
    print(f"Test features shape: {test_features.shape}")

    # Fit an unrestricted PCA only to visualise how variance accumulates
    print("Fitting PCA to training data...")
    pca = PCA()
    pca.fit(train_features)  # Fit PCA on training data

    # Plot explained variance ratio to decide on optimal components
    plt.plot(np.cumsum(pca.explained_variance_ratio_))
    plt.xlabel('Number of Components')
    plt.ylabel('Cumulative Explained Variance')
    plt.title('PCA: Explained Variance vs Number of Components')
    plt.show()

    # A float n_components asks PCA to keep enough components to explain that share of variance
    print(f"Applying PCA with {variance_threshold*100}% explained variance...")
    pca = PCA(n_components=variance_threshold)
    train_features_pca = pca.fit_transform(train_features)
    test_features_pca = pca.transform(test_features)

    print(f"Train features shape after PCA: {train_features_pca.shape}")
    print(f"Test features shape after PCA: {test_features_pca.shape}")

    # Reconstruct the train_data and test_data with PCA-reduced features and target
    print("Reconstructing train and test datasets with PCA-transformed features...")
    train_data_pca = pd.DataFrame(train_features_pca)
    # reset_index so the target lines up with the fresh RangeIndex of the PCA frame
    train_data_pca[target_column] = train_target.reset_index(drop=True)

    test_data_pca = pd.DataFrame(test_features_pca)
    if test_has_target:
        test_data_pca[target_column] = test_data[target_column].reset_index(drop=True)

    print(f"Train data shape after PCA: {train_data_pca.shape}")
    print(f"Test data shape after PCA: {test_data_pca.shape}")

    return train_data_pca, test_data_pca

In [None]:
# Forward and Backward Selection
def fbselection(direction, sample_model, features, X, trainX, trainY, testX, test_data_processed):
    """
    Performs forward or backward sequential feature selection for a regression model.

    Candidate feature subsets are scored with negative mean squared error,
    a regression metric. The previous 'roc_auc' scoring is a classification
    metric and is not applicable to a continuous target.

    Parameters:
        direction: 'forward' or 'backward' for the selection method.
        sample_model: The model to use for feature selection.
        features: Number of features to select.
        X: Full dataset.
        trainX: Training feature dataset.
        trainY: Training target dataset.
        testX: Testing feature dataset.
        test_data_processed: Processed test data.

    Returns:
        Tuple (sample_model, X, trainX, trainY, testX, test_data_processed)
        as returned by modelSelector, with the feature sets reduced.
    """
    print("Starting forward/backward selection...")

    # Define Sequential Feature Selector
    selection = SequentialFeatureSelector(
        sample_model,
        direction=direction,
        n_features_to_select=features,
        scoring='neg_mean_squared_error'
    )

    return modelSelector(sample_model, selection, X, trainX, trainY, testX, test_data_processed)


# Model Selector for applying the transformation
def modelSelector(sample_model, selection, X, trainX, trainY, testX, test_data_processed):
    """
    Applies the given selection method to transform datasets.

    The selector is fitted on the training split only and then used to
    transform every other dataset, so all of them end up with the same
    selected features.

    Parameters:
        sample_model: The model associated with the selection; returned unchanged.
        selection: Feature selection object exposing fit_transform/transform
            (e.g., SequentialFeatureSelector, SelectKBest).
        X: Full dataset.
        trainX: Training feature dataset.
        trainY: Training target dataset.
        testX: Testing feature dataset.
        test_data_processed: Processed test data.

    Returns:
        Tuple (sample_model, X, trainX, trainY, testX, test_data_processed)
        where the feature sets are the selector's transformed outputs.
    """
    print("Extracting features using the selection method...")

    # Fit and transform training data
    trainX = selection.fit_transform(trainX, trainY)

    print("Features extracted, transforming other datasets...")

    # Transform other datasets using the fitted selection object
    testX = selection.transform(testX)
    test_data_processed = selection.transform(test_data_processed)
    X = selection.transform(X)

    print("All datasets transformed.")
    print("X shape -> ", X.shape)
    print("trainX shape -> ", trainX.shape)
    print("testX shape -> ", testX.shape)
    # Report the transformed parameter, not the notebook-level `test_data` global
    print("test_data_processed shape -> ", test_data_processed.shape)

    return sample_model, X, trainX, trainY, testX, test_data_processed


# K-Best Selection
def kbest(sample_model, features, X, trainX, trainY, testX, test_data_processed):
    """
    Keep the K best features according to the univariate f_regression score.

    Parameters:
        sample_model: Model passed through to modelSelector.
        features: Number of top-scoring features to keep (K).
        X: Full dataset.
        trainX: Training feature dataset.
        trainY: Training target dataset.
        testX: Testing feature dataset.
        test_data_processed: Processed test data.

    Returns:
        Whatever modelSelector returns: the model followed by the datasets.
    """
    print("Starting K-Best feature selection...")

    # Univariate selector; the fitting and transforming happens in modelSelector
    selector = SelectKBest(k=features, score_func=f_regression)
    datasets = (X, trainX, trainY, testX, test_data_processed)

    return modelSelector(sample_model, selector, *datasets)

In [None]:
# Feature Importance Function
def featureImportance(sample_model, features, X, trainX, trainY, testX, test_data_processed):
    """
    Keep only the most important features according to the fitted model.

    The model is fitted on the training split, its ``feature_importances_``
    are ranked, every dataset is reduced to the top-ranked columns, and the
    model is fitted again on the reduced training split.

    Parameters:
        sample_model: Model exposing ``fit`` and ``feature_importances_``.
        features: Number of top features to keep.
        X: The full dataset (DataFrame).
        trainX: Training feature dataset (DataFrame; its columns give the feature names).
        trainY: Training target dataset.
        testX: Testing feature dataset (DataFrame).
        test_data_processed: Processed test data (DataFrame).

    Returns:
        Tuple (sample_model, X, trainX, trainY, testX, test_data_processed)
        with the model refitted and the frames reduced to the top features.
    """
    print("Fitting the model...")

    # First fit: needed only to obtain the importances
    sample_model.fit(trainX, trainY)

    print("Extracting feature importances...")

    importances = sample_model.feature_importances_
    feature_names = trainX.columns

    print("Feature names:", feature_names)

    # Rank the features from most to least important
    ranking = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
    ranking = ranking.sort_values(by='Importance', ascending=False)

    # Names of the best `features` columns
    top_features = ranking['Feature'].head(features).values

    print("Top features:", top_features)

    # Reduce every dataset to the same selected columns
    trainX, testX, X, test_data_processed = (
        frame[top_features]
        for frame in (trainX, testX, X, test_data_processed)
    )

    print("Top features extracted and datasets updated.")

    # Second fit: the returned model must match the reduced feature set
    print("Retraining the model with selected features...")
    sample_model.fit(trainX, trainY)

    print("Model retrained with top features.")

    return sample_model, X, trainX, trainY, testX, test_data_processed

In [None]:
def gridsearch(param_grid, model, trainX, trainY):
    """
    Tune a model's hyperparameters with an exhaustive 3-fold grid search.

    Candidates are scored with mean squared error wrapped by make_scorer
    with ``greater_is_better=False``, so the reported best score is the
    negated MSE (closer to zero is better).

    Parameters:
        param_grid: Dictionary describing the parameter grid to explore.
        model: The estimator to tune.
        trainX: Training feature dataset.
        trainY: Training target dataset.

    Returns:
        The best estimator found by the search.
    """
    print("Starting grid search...")

    # Search configuration: 3 folds, MSE-based scorer, all CPU cores
    search_options = {
        "estimator": model,
        "param_grid": param_grid,
        "cv": 3,
        "scoring": make_scorer(mean_squared_error, greater_is_better=False),
        "verbose": 3,
        "n_jobs": -1,
    }
    searcher = GridSearchCV(**search_options)

    print("Grid search initialized.")

    # Run the search on the training data
    searcher.fit(trainX, trainY)

    print("Grid search fitting completed.")

    # Report the winning estimator, its parameters and its score
    tuned_model = searcher.best_estimator_
    print("Best model found:", tuned_model)
    print("Best parameters:", searcher.best_params_)
    print("Best cross-validated score:", searcher.best_score_)

    print("Model assigned. Grid search completed.")

    return tuned_model

In [None]:
def metrics(y_pred, testY):
    """
    Print MSE, RMSE, MAE and R² for a set of predictions.

    Parameters:
        y_pred: Predicted values.
        testY: Actual target values.

    Returns:
        The Root Mean Squared Error (RMSE) of the predictions.
    """
    print("Starting to compute metrics...")

    # MSE first; RMSE is derived from it
    squared_error = mean_squared_error(testY, y_pred)
    print(f"Mean Squared Error (MSE): {squared_error:.2f}")

    root_error = np.sqrt(squared_error)
    print(f"Root Mean Squared Error (RMSE): {root_error:.2f}")

    absolute_error = mean_absolute_error(testY, y_pred)
    print(f"Mean Absolute Error (MAE): {absolute_error:.2f}")

    determination = r2_score(testY, y_pred)
    print(f"Coefficient of Determination (R² Score): {determination:.2f}")

    return root_error

In [None]:
def run_model(model, trainX, trainY, testX, testY):
    """
    Fit the model on the training split and evaluate it on the hold-out split.

    Parameters:
        model: The estimator to train and evaluate.
        trainX: Training feature dataset.
        trainY: Training target dataset.
        testX: Hold-out feature dataset.
        testY: Hold-out target dataset.

    Returns:
        Tuple (model, rmse): the fitted model and its RMSE on the hold-out split.
    """
    print("Training model", get_current_datetime())
    model.fit(trainX, trainY)

    print("Computing score", get_current_datetime())
    train_score = model.score(trainX, trainY)
    print("Model score (training set):", train_score)

    # Evaluate on the hold-out split; metrics() prints the details and returns the RMSE
    holdout_predictions = model.predict(testX)
    return model, metrics(holdout_predictions, testY)

In [None]:
def createFile(model, X, Y, test_data, file_name, sample_path=None, output_dir=None):
    """
    Fits the model on the provided dataset, predicts on test data, and saves the predictions to a CSV file.

    The predictions are written into the 'price_doc' column of the sample
    submission file, which is then saved as ``file_name`` inside ``output_dir``.

    Parameters:
        model: The machine learning model to use for training and prediction.
        X: Full feature dataset.
        Y: Target dataset.
        test_data: Test dataset (should include all features except the target column).
        file_name: Name of the output CSV file. Leading path separators are
            ignored, so "submission.csv" and "\\submission.csv" both work.
        sample_path: Path to sample_submission.csv. Defaults to the local
            project path; on Kaggle use
            "/kaggle/input/challenge2/sample_submission.csv".
        output_dir: Directory for the submission file. Defaults to the local
            project directory; on Kaggle use "/kaggle/working/".

    Returns:
        None
    """
    import os

    if sample_path is None:
        sample_path = r"D:\Users\DELL\OneDrive - Institute of Business Administration\IBA\sem5\machine learning\ipynb notebooks\challenger2\iml-fall-2024-challenge-2\sample_submission.csv"
    if output_dir is None:
        output_dir = r"D:\Users\DELL\OneDrive - Institute of Business Administration\IBA\sem5\machine learning\ipynb notebooks\challenger2\iml-fall-2024-challenge-2"

    print("Fitting model on X and Y", get_current_datetime())
    model.fit(X, Y)

    print("Scoring model on X and Y", get_current_datetime())
    score = model.score(X, Y)
    print("Model training score:", score)

    print("Predicting on test data", get_current_datetime())
    # test_data must already exclude the target column
    test_prediction = model.predict(test_data)
    print("Predictions:", test_prediction)

    print("Preparing sample submission file", get_current_datetime())
    sample_data = pd.read_csv(sample_path)
    sample_data['price_doc'] = test_prediction

    print("Saving submission file", get_current_datetime())
    # Join with a real separator: plain string concatenation glued the file
    # name onto the directory name and wrote the file next to the directory.
    full_path = os.path.join(output_dir, file_name.lstrip("\\/"))

    sample_data.to_csv(full_path, index=False)
    print(f"File saved at: {full_path}")

In [None]:
def create_submission(model, trainX, trainY, testX, testY, X, Y, test_data, file_name):
    """
    Evaluate the model on the hold-out split, then refit on all data and write a submission.

    run_model fits and scores the model on the train/hold-out split;
    createFile then fits it on the full X/Y, predicts test_data and saves
    the predictions under file_name. Returns None.
    """
    # The hold-out RMSE is printed by run_model; its value is not needed here
    fitted_model, _ = run_model(model, trainX, trainY, testX, testY)
    createFile(fitted_model, X, Y, test_data, file_name)

# Data Loading







The data is loaded into DataFrames with the pandas CSV reader. It is loaded as two sets, train and test, as provided in the Kaggle data, and each set is previewed to check that it loaded properly.

In [None]:
# Load the training set from the local copy of the Kaggle data.
# On Kaggle, use instead: train_data = pd.read_csv(r"/kaggle/input/challenge2/train.csv")
train_data = pd.read_csv(r"D:\Users\DELL\OneDrive - Institute of Business Administration\IBA\sem5\machine learning\ipynb notebooks\challenger2\iml-fall-2024-challenge-2\train\train.csv")

# Preview the first five rows to confirm the load; the target column `price_doc` should be present.
train_data.head() 

Unnamed: 0,full_sq,life_sq,floor,product_type,sub_area,area_m,raion_popul,green_zone_part,indust_part,children_preschool,...,cafe_count_5000_price_2500,cafe_count_5000_price_4000,cafe_count_5000_price_high,big_church_count_5000,church_count_5000,mosque_count_5000,leisure_count_5000,sport_count_5000,market_count_5000,price_doc
0,43.0,27.0,4.0,Investment,Bibirevo,6407578.1,155572.0,0.189727,7e-05,9576.0,...,9.0,4.0,0.0,13.0,22.0,1.0,0.0,52.0,4.0,5850000.0
1,34.0,19.0,3.0,Investment,Nagatinskij Zaton,9589336.912,115352.0,0.372602,0.049637,6880.0,...,15.0,3.0,0.0,15.0,29.0,1.0,10.0,66.0,14.0,6000000.0
2,43.0,29.0,2.0,Investment,Tekstil'shhiki,4808269.831,101708.0,0.11256,0.118537,5879.0,...,10.0,3.0,0.0,11.0,27.0,0.0,4.0,67.0,10.0,5700000.0
3,77.0,77.0,4.0,Investment,Basmannoe,8398460.622,108171.0,0.015234,0.037316,5706.0,...,319.0,108.0,17.0,135.0,236.0,2.0,91.0,195.0,14.0,16331452.0
4,67.0,46.0,14.0,Investment,Nizhegorodskoe,7506452.02,43795.0,0.00767,0.486246,2418.0,...,62.0,14.0,1.0,53.0,78.0,1.0,20.0,113.0,17.0,9100000.0


In [None]:
# Load the test set from the local copy of the Kaggle data.
# On Kaggle, use instead: test_data = pd.read_csv(r"/kaggle/input/challenge2/test.csv")
test_data = pd.read_csv(r"D:\Users\DELL\OneDrive - Institute of Business Administration\IBA\sem5\machine learning\ipynb notebooks\challenger2\iml-fall-2024-challenge-2\test\test.csv")

# Preview the first five rows to confirm the load; there should be no target column because this is test data.
test_data.head() 

Unnamed: 0,row ID,full_sq,life_sq,floor,product_type,sub_area,area_m,raion_popul,green_zone_part,indust_part,...,cafe_count_5000_price_1500,cafe_count_5000_price_2500,cafe_count_5000_price_4000,cafe_count_5000_price_high,big_church_count_5000,church_count_5000,mosque_count_5000,leisure_count_5000,sport_count_5000,market_count_5000
0,Row3,89.0,50.0,9.0,Investment,Mitino,12583540.0,178473.0,0.194703,0.069753,...,15.0,11.0,2.0,1.0,4.0,4.0,0.0,0.0,26.0,3.0
1,Row6,25.0,14.0,10.0,Investment,Sokol'niki,10320470.0,57405.0,0.523439,0.042307,...,144.0,81.0,16.0,3.0,38.0,80.0,1.0,27.0,127.0,8.0
2,Row11,38.0,19.0,11.0,Investment,Zapadnoe Degunino,7632940.0,78810.0,0.051844,0.437885,...,39.0,8.0,3.0,0.0,10.0,9.0,0.0,0.0,35.0,4.0
3,Row12,43.0,28.0,4.0,Investment,Kuncevo,52351770.0,142462.0,0.070662,0.035145,...,21.0,13.0,9.0,1.0,7.0,15.0,0.0,2.0,47.0,0.0
4,Row14,31.0,21.0,3.0,Investment,Lefortovo,8993640.0,89971.0,0.066941,0.306977,...,205.0,88.0,19.0,2.0,63.0,100.0,0.0,28.0,132.0,14.0


# Data Preprocessing







Before we apply any algorithms, we first clean and transform the data; this step is called data preprocessing.

## split data into categorical and numerical







Categorical columns get a most-frequent simple imputer and one-hot encoding, while numerical columns get a mean simple imputer and a min-max scaler.

In [10]:
# Categorical features: every object-dtype column of the training set.
categorical_cols = train_data.select_dtypes(include=["object"]).columns
# Numerical features: every non-object column except the target `price_doc`.
numerical_cols = train_data.select_dtypes(exclude=["object"]).drop(columns=['price_doc']).columns

## Pipelines and Column Transformers

Pipelines in machine learning allow for chaining multiple preprocessing steps and modeling into a single object, ensuring that all transformations are applied consistently. ColumnTransformers enable column-specific transformations, allowing different preprocessing techniques for different features in the dataset.

In [None]:
# num_transformer = Pipeline(steps=[
#     ("imputer", SimpleImputer(strategy="median")),
#     ("scaler", MinMaxScaler())
# ])

# cat_transformer = Pipeline(steps=[
#     ("imputer", SimpleImputer(strategy="most_frequent")),
#     ("onehot", OneHotEncoder(handle_unknown="ignore"))
# ])

In [None]:
# # Column transformer for preprocessing
# preprocessor = ColumnTransformer(
#     transformers=[
#         ("num", num_transformer, numerical_cols),
#         ("cat", cat_transformer, categorical_cols)
#     ]
# )

In [None]:
# train_data = preprocessor.fit_transform(train_data)
# print("train completed")
# test_data = preprocessor.transform(test_data)
# print("test data completed")

## Imputers
Imputers are used to handle missing data in a dataset by filling in missing values with estimated ones. Common strategies include using the mean, median, or most frequent value for numerical data, and the most frequent value for categorical data. Imputation helps ensure that models can be trained without the issue of missing values disrupting the learning process.

In [None]:
# ------------------ Numerical Imputers ------------------
# Exactly one `num_imputer` assignment should be active; the others are alternatives to try.

# Mean Imputer (fills missing values with the mean of each column) - active choice
num_imputer = SimpleImputer(strategy="mean")

# Median Imputer (fills missing values with the median of each column)
# num_imputer = SimpleImputer(strategy="median")

# Most Frequent Imputer (fills missing values with the most frequent value of each column)
# num_imputer = SimpleImputer(strategy="most_frequent")

# Constant Imputer (fills missing values with a constant value, e.g., 0)
# num_imputer = SimpleImputer(strategy="constant", fill_value=0)

# KNN Imputer (fills missing values based on nearest neighbors)
# num_imputer = KNNImputer(n_neighbors=5)

# ------------------ Categorical Imputers ------------------

# Most Frequent Imputer (fills missing values with the most frequent value in each column) - active choice
cat_imputer = SimpleImputer(strategy="most_frequent")

# Constant Imputer (fills missing values with a constant value, e.g., 'Unknown')
# cat_imputer = SimpleImputer(strategy="constant", fill_value="Unknown")

# ------------------ Apply Imputers ------------------
# Each imputer is fitted on the training set only and then reused on the test set,
# so the fill values come from the training data.

# Impute numerical columns
train_data[numerical_cols] = num_imputer.fit_transform(train_data[numerical_cols])
test_data[numerical_cols] = num_imputer.transform(test_data[numerical_cols])

# Impute categorical columns
train_data[categorical_cols] = cat_imputer.fit_transform(train_data[categorical_cols])
test_data[categorical_cols] = cat_imputer.transform(test_data[categorical_cols])

## Scalers
Scalers are used to normalize or standardize numerical features in a dataset to ensure they are on a similar scale. This is crucial for algorithms that are sensitive to the magnitude of features, such as KNN or gradient-based models. Common scalers include MinMaxScaler, StandardScaler, and RobustScaler.

In [None]:
# ------------------ Scalers ------------------
# Exactly one `scaler` assignment should be active; the others are alternatives to try.

# MinMaxScaler (scales the features to the range [0, 1]) - active choice
scaler = MinMaxScaler()

# StandardScaler (standardizes the features by removing the mean and scaling to unit variance)
# scaler = StandardScaler()

# MaxAbsScaler (scales the features by their maximum absolute value, for data that is already centered at zero)
# scaler = MaxAbsScaler()

# RobustScaler (scales the features using the median and interquartile range, less sensitive to outliers)
# scaler = RobustScaler()

# Normalizer (rescales each sample, i.e. each row, to unit norm)
# scaler = Normalizer()

# ------------------ Apply Scaler ------------------

# Fit the scaler on the training set only, then apply the same scaling to the test set
train_data[numerical_cols] = scaler.fit_transform(train_data[numerical_cols])
test_data[numerical_cols] = scaler.transform(test_data[numerical_cols])

## Encoding
Encoding is the process of converting categorical variables into numerical representations so that machine learning models can process them. Techniques like One-Hot Encoding, Label Encoding, and Target Encoding are commonly used to convert categorical data into a format suitable for modeling.

In [None]:
# ------------------ Encoding Methods ------------------

# One-Hot Encoding (creates one indicator column per category; drop_first=False keeps every category)
# Train and test are encoded independently, so their dummy columns can differ until they are aligned below.
train_data = pd.get_dummies(train_data, columns=categorical_cols, drop_first=False)
test_data = pd.get_dummies(test_data, columns=categorical_cols, drop_first=False)

# Label Encoding (converts each category into a unique integer value)
# label_encoder = LabelEncoder()
# for col in categorical_cols:
#     train_data[col] = label_encoder.fit_transform(train_data[col])
#     test_data[col] = label_encoder.transform(test_data[col])

# Ordinal Encoding (maps categories to ordered integers, requires predefined order)
# ordinal_encoder = OrdinalEncoder()
# train_data[categorical_cols] = ordinal_encoder.fit_transform(train_data[categorical_cols])
# test_data[categorical_cols] = ordinal_encoder.transform(test_data[categorical_cols])

# Target Encoding (encodes categories based on the mean of the target variable, here 'price_doc')
# target_encoder = TargetEncoder()
# train_data[categorical_cols] = target_encoder.fit_transform(train_data[categorical_cols], train_data['price_doc'])
# test_data[categorical_cols] = target_encoder.transform(test_data[categorical_cols])

# Binary Encoding (encodes categories as binary digits)
# binary_encoder = BinaryEncoder()
# train_data[categorical_cols] = binary_encoder.fit_transform(train_data[categorical_cols])
# test_data[categorical_cols] = binary_encoder.transform(test_data[categorical_cols])

# ------------------ Align Test Data with Training Data ------------------

# Give test_data exactly the training columns, in the same order:
#   - columns missing from the test set are created and filled with 0;
#   - columns that exist only in the test set are dropped.
# train_data still contains 'price_doc' here, so test_data gains a placeholder
# 'price_doc' column of zeros; apply_pca later separates it from the test features.
test_data = test_data.reindex(columns=train_data.columns, fill_value=0)

# ------------------ Optional: Drop 'price_doc' Column ------------------

# Remove the placeholder target column from test_data if it should not be carried forward
# test_data = test_data.drop(columns=['price_doc'], errors="ignore")


## correlation matrix







I tried computing the full correlation matrix, but with roughly 2,000 columns it is very computationally expensive because a correlation is calculated for every pair of columns. Do not run the cell below: it ran for about 5 minutes and then failed.

In [None]:
# # DONT RUN
# corr_matrix = train_data.corr()
# print(corr_matrix)

## Variance Filter
The variance filter is used to remove features with low variance, which provide little information for predictive modeling. Features with very similar values across all observations are considered redundant and can be safely excluded from the dataset.

In [None]:
# # Save the target column separately
# target_column = train_data['price_doc']

# # Drop the target column from the features
# train_data_copy = train_data.drop(columns=['price_doc'])

# # Call the variance filter function to remove low-variance features
# train_data, test_data = varianceFilter(train_data_copy, test_data, 0.01)

# # Append the target column back to the filtered data
# train_data['price_doc'] = target_column

## Correlation Filter
The correlation filter helps to remove highly correlated features. When two features are highly correlated, they convey similar information, and removing one can help reduce redundancy and improve the model's performance.

In [None]:
# # Save the target column separately
# target_column = train_data['price_doc']

# # Drop the target column from the features
# train_data_copy = train_data.drop(columns=['price_doc'])

# # Call the correlation filter function to filter out highly correlated features
# train_data, test_data = correlationFilter(train_data_copy, test_data, 0.9)

# # Append the target column back to the filtered data
# train_data['price_doc'] = target_column

## PCA (Principal Component Analysis)
PCA is a dimensionality reduction technique that transforms the data into a new coordinate system, where the greatest variances lie along the first axes (principal components). It helps reduce the number of features while retaining most of the data's variance, improving model performance and interpretability.

In [None]:
# Replace the feature columns of both sets with PCA components (default target 'price_doc', 95% variance kept)
train_data, test_data = apply_pca(train_data, test_data)

## Data Splitting - features and targets
The train_data set consists of feature columns x1 - x271 (271 variables) and one target variable (Y). We split it into the feature variables (referred to as X) and the target so that each can be handled separately.

In [None]:
# Drop the 'price_doc' column from train_data to get the features (X)
X = train_data.drop(columns=['price_doc'])

# Display X to confirm the result
X

Unnamed: 0,full_sq,life_sq,floor,area_m,raion_popul,green_zone_part,indust_part,children_preschool,preschool_education_centers_raion,children_school,...,water_1line_yes,big_road1_1line_no,big_road1_1line_yes,railroad_1line_no,railroad_1line_yes,ecology_excellent,ecology_good,ecology_no data,ecology_poor,ecology_satisfactory
0,0.008074,0.003611,0.051949,0.021207,0.624792,0.220726,0.000134,0.493543,0.384615,0.536135,...,False,True,False,True,False,False,True,False,False,False
1,0.006384,0.002541,0.038962,0.036804,0.460577,0.435610,0.095115,0.352005,0.384615,0.401322,...,False,True,False,True,False,True,False,False,False,False
2,0.008074,0.003878,0.025974,0.013367,0.404870,0.130052,0.227141,0.299454,0.307692,0.319270,...,False,True,False,True,False,False,False,False,True,False
3,0.014457,0.010297,0.051949,0.030966,0.431258,0.015692,0.071506,0.290372,0.538462,0.347872,...,False,True,False,False,True,True,False,False,False,False
4,0.012580,0.006151,0.181821,0.026594,0.168416,0.006804,0.931742,0.117755,0.153846,0.124029,...,False,True,False,True,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
181502,0.009012,0.004413,0.038962,0.306263,0.009811,0.686564,0.011151,0.008977,0.000000,0.009199,...,False,True,False,True,False,False,False,True,False,False
181503,0.009012,0.004413,0.038962,0.306263,0.009811,0.686564,0.011151,0.008977,0.000000,0.009199,...,False,True,False,True,False,False,False,True,False,False
181504,0.009012,0.004413,0.038962,0.306263,0.009811,0.686564,0.011151,0.008977,0.000000,0.009199,...,False,True,False,True,False,False,False,True,False,False
181505,0.009012,0.004413,0.038962,0.306263,0.009811,0.686564,0.011151,0.008977,0.000000,0.009199,...,False,True,False,True,False,False,False,True,False,False


In [None]:
# Target (Y): only the 'price_doc' column of train_data
Y = train_data.loc[:, 'price_doc']

# Echo Y as the cell output to check that it holds just the target
Y

0          5850000.0
1          6000000.0
2          5700000.0
3         16331452.0
4          9100000.0
             ...    
181502     3480000.0
181503     3480000.0
181504     3480000.0
181505     3480000.0
181506     3480000.0
Name: price_doc, Length: 181507, dtype: float64

## Data Splitting - train and validate







Our test_data set contains rows with NO target variable, whereas the train_data set contains rows WITH the target variable.







A common rule in machine learning is to train the model on part of the data (half, or 70%) and then check its accuracy on the remaining part (the other half, or 30%). We can only check accuracy IF we have the answers, i.e. the target variable.







So, what we need to do is split the train_data set in two, using a 70% / 30% ratio. We train the model on the 70%, test it on the 30%, and then use that model to predict the test_data set.

In [27]:
# Hold-out method: 30% of the rows are set aside as testX/testY and the
# remainder becomes trainX/trainY; random_state fixes the shuffle.
trainX, testX, trainY, testY = train_test_split(
    X,
    Y,
    random_state=2,
    test_size=0.3,
)

## Model initialization







Here the model is initialized.

In [30]:
# Gradient-boosting regressor for the main run; it is only configured here,
# not fitted.
model = GradientBoostingRegressor(
    n_estimators=600,
    learning_rate=0.01,
    max_depth=12,
    max_features='log2',
    min_samples_split=3,
    min_samples_leaf=5,
    random_state=2,
    verbose=2,
)

In [31]:
# Print the (rows, columns) shape of the full feature frame, both hold-out
# parts and the unlabeled test frame so their column counts can be compared.
print("X shape -> ", X.shape)
print("trainX shape -> ", trainX.shape)
print("testX shape -> ", testX.shape)
# NOTE(review): the label reads "test_data_processed" but the object printed is test_data.
print("test_data_processed shape -> ", test_data.shape)

X shape ->  (181507, 2214)
trainX shape ->  (127054, 2214)
testX shape ->  (54453, 2214)
test_data_processed shape ->  (77789, 2215)


# feature selection







here we will apply feature selection and feature importance

In [32]:
# Show the current model configuration as the cell output.
model

In [None]:
# test_data must not carry the target: remove 'price_doc' when it is present
# and report which of the two cases occurred.
if 'price_doc' not in test_data.columns:
    print("'price_doc' column was not found, nothing to drop.")
else:
    test_data = test_data.drop(columns=['price_doc'])
    print("'price_doc' column has been dropped.")


In [35]:
# Run the notebook's kbest helper and rebind the model and every dataset to
# whatever it returns.
# NOTE(review): 200 is presumably the number of features to keep - confirm in kbest.
(model, X, trainX, trainY, testX, test_data) = kbest(
    model, 200, X, trainX, trainY, testX, test_data
)

Starting K-Best feature selection...
Extracting features using the selection method...
Features extracted, transforming other datasets...
All datasets transformed.


In [36]:
# Show the model object that kbest handed back.
model

## Grid Search

Grid Search is a technique used for hyperparameter tuning in machine learning models. It systematically tests different combinations of hyperparameters to find the best-performing set, based on a specified performance metric (such as accuracy or mean squared error). GridSearchCV from scikit-learn automates this process by performing cross-validation on each combination to identify the optimal model configuration.

In [38]:
# define hyper parameters of grid
# param_grid = {
#     'max_depth': [ 1, 2, 3, 4, 5 ]
# }

In [39]:
# model = gridsearch(param_grid, model, scorer, trainX, trainY)

In [None]:
# Show the model again; the grid-search call in this section is commented out,
# so this cell does not reflect any tuning from it.
model

## model running







here we run the model

In [None]:
# Pass the model, both hold-out parts, the full X/Y and the unlabeled
# test_data to the notebook's create_submission helper.
# NOTE(review): "gb1.csv" is presumably the output file name - confirm in create_submission.
create_submission(
    model,
    trainX, trainY,
    testX, testY,
    X, Y,
    test_data,
    "gb1.csv",
)
# Show the model as the cell output
model

Training model 2024-12-01 18:38:17
      Iter       Train Loss   Remaining Time 
         1 469341275895858.5625           20.82m
         2 462961142471899.8125           20.15m
         3 456739367489211.2500           20.51m
         4 450613685305417.9375           19.17m
         5 444656510086319.1250           19.19m
         6 438812775316429.2500           18.89m
         7 433035461955445.0625           18.80m
         8 427288698837566.8125           18.41m
         9 421783568985588.8125           18.57m
        10 416427589000325.0625           18.21m
        11 411151514430763.1250           17.90m
        12 405944341324282.0625           17.67m
        13 400821733245209.3125           17.51m
        14 395652407549520.0625           17.37m
        15 390655430984203.8750           17.18m
        16 385818391466834.6250           16.99m
        17 381128526310775.3750           16.82m
        18 376514260588036.0000           16.72m
        19 371889361011765.8750      

# Multi-model running
In this one file we test several different models, each of which creates its own output file.

In [None]:
# case 1: 100 trees of depth 3, learning rate 0.1, each tree fitted on all rows
# (subsample=1.0).
# NOTE: this rebinds `model`, replacing the estimator configured earlier in the
# notebook.
model = GradientBoostingRegressor(
    n_estimators=100,
    max_depth=3,
    learning_rate=0.1,
    subsample=1.0,
    random_state=2,  # fixed seed (same value as the hold-out split) so reruns are reproducible
    verbose=3,  # values above 1 print progress for every tree
)
# create_submission(model, trainX, trainY, testX, testY, X, Y, test_data, "gboost1.csv")
model

In [None]:
# case 2: 50 shallow trees (depth 2) with a larger learning rate of 0.2; each
# tree is fitted on a random 80% of the rows.
model2 = GradientBoostingRegressor(
    n_estimators=50,
    max_depth=2,
    learning_rate=0.2,
    subsample=0.8,
    random_state=2,  # fixed seed so the row subsampling is reproducible across reruns
    verbose=3,  # values above 1 print progress for every tree
)
# create_submission(model2, trainX, trainY, testX, testY, X, Y, test_data, "gboost2.csv")
model2

In [None]:
# case 3: 300 trees of depth 4 with a small learning rate of 0.05; each tree
# is fitted on a random 90% of the rows.
model3 = GradientBoostingRegressor(
    n_estimators=300,
    max_depth=4,
    learning_rate=0.05,
    subsample=0.9,
    random_state=2,  # fixed seed so the row subsampling is reproducible across reruns
    verbose=3,  # values above 1 print progress for every tree
)
# create_submission(model3, trainX, trainY, testX, testY, X, Y, test_data, "gboost3.csv")
model3

In [None]:
# case 4: 200 trees of depth 5, learning rate 0.1; each tree is fitted on a
# random 85% of the rows.
model4 = GradientBoostingRegressor(
    n_estimators=200,
    max_depth=5,
    learning_rate=0.1,
    subsample=0.85,
    random_state=2,  # fixed seed so the row subsampling is reproducible across reruns
    verbose=3,  # values above 1 print progress for every tree
)
# create_submission(model4, trainX, trainY, testX, testY, X, Y, test_data, "gboost4.csv")
model4

In [None]:
# case 5: 150 shallow trees (depth 2), learning rate 0.15; each tree is fitted
# on a random 70% of the rows.
model5 = GradientBoostingRegressor(
    n_estimators=150,
    max_depth=2,
    learning_rate=0.15,
    subsample=0.7,
    random_state=2,  # fixed seed so the row subsampling is reproducible across reruns
    verbose=3,  # values above 1 print progress for every tree
)
# create_submission(model5, trainX, trainY, testX, testY, X, Y, test_data, "gboost5.csv")
model5

In [None]:
# case 6: 100 trees of depth 4, learning rate 0.1; each tree is fitted on a
# random 80% of the rows and each split looks at a 0.3 fraction of the features.
model6 = GradientBoostingRegressor(
    n_estimators=100,
    max_depth=4,
    learning_rate=0.1,
    subsample=0.8,
    max_features=0.3,
    random_state=2,  # fixed seed so row and feature sampling are reproducible across reruns
    verbose=3,  # values above 1 print progress for every tree
)
# create_submission(model6, trainX, trainY, testX, testY, X, Y, test_data, "gboost6.csv")
model6

In [None]:
# case 7: 100 trees of depth 3, learning rate 0.1; each tree is fitted on a
# random 90% of the rows and a node needs at least 10 samples to be split.
model7 = GradientBoostingRegressor(
    n_estimators=100,
    max_depth=3,
    learning_rate=0.1,
    subsample=0.9,
    min_samples_split=10,
    random_state=2,  # fixed seed so the row subsampling is reproducible across reruns
    verbose=3,  # values above 1 print progress for every tree
)
# create_submission(model7, trainX, trainY, testX, testY, X, Y, test_data, "gboost7.csv")
model7

In [None]:
# case 8: 50 trees of depth 3 with an aggressive learning rate of 0.3; each
# tree is fitted on a random 75% of the rows.
model8 = GradientBoostingRegressor(
    n_estimators=50,
    max_depth=3,
    learning_rate=0.3,
    subsample=0.75,
    random_state=2,  # fixed seed so the row subsampling is reproducible across reruns
    verbose=3,  # values above 1 print progress for every tree
)
# create_submission(model8, trainX, trainY, testX, testY, X, Y, test_data, "gboost8.csv")
model8

In [None]:
# case 9: 120 trees of depth 4, learning rate 0.08; each tree is fitted on a
# random 85% of the rows and each split considers sqrt(n_features) features.
model9 = GradientBoostingRegressor(
    n_estimators=120,
    max_depth=4,
    learning_rate=0.08,
    subsample=0.85,
    max_features='sqrt',
    random_state=2,  # fixed seed so row and feature sampling are reproducible across reruns
    verbose=3,  # values above 1 print progress for every tree
)
# create_submission(model9, trainX, trainY, testX, testY, X, Y, test_data, "gboost9.csv")
model9

In [None]:
# case 10: 200 trees of depth 3, learning rate 0.1; each tree is fitted on a
# random 60% of the rows.
# NOTE: this tenth model is stored as `model0` (not `model10`); the name is kept
# so any later reference to it keeps working.
model0 = GradientBoostingRegressor(
    n_estimators=200,
    max_depth=3,
    learning_rate=0.1,
    subsample=0.6,
    random_state=2,  # fixed seed so the row subsampling is reproducible across reruns
    verbose=3,  # values above 1 print progress for every tree
)
# create_submission(model0, trainX, trainY, testX, testY, X, Y, test_data, "gboost10.csv")
model0