# Imports

In [120]:
# perform necessary imports
import pandas as pd
from numpy import mean
import numpy as np
import time
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, MaxAbsScaler, RobustScaler, Normalizer
from sklearn.impute import KNNImputer
from sklearn.feature_selection import SequentialFeatureSelector, SelectKBest, f_classif, VarianceThreshold, f_regression
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import PolynomialFeatures, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import train_test_split, RepeatedKFold, cross_val_score, GridSearchCV, ParameterGrid
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, make_scorer
import warnings
warnings.filterwarnings('ignore')
from datetime import datetime

# Functions

In [121]:
# Function to get current date and time as a string
def get_current_datetime():
    return datetime.now().strftime("%Y-%m-%d %H:%M:%S")

In [122]:
# function to compute this predictions metrics
def metrics(y_pred, testY):
    print("starting to compute metrics")

    # display the mean squared error of this prediction
    mse = mean_squared_error(testY, y_pred)
    print("Mean squared error: %.2f" % mse, "   ")

    # display the root mean squared error
    rmse = np.sqrt(mse)  # Root Mean Squared Error
    print("Root Mean squared error: %.2f" % rmse, "   ")

    # display the mean absolute error of this prediction
    mae = mean_absolute_error(testY, y_pred)
    print("Mean absolute error: %.2f" % mae, "   ")

    # display the coeffeicient of determination of this preduction
    r2_Score = r2_score(testY, y_pred)
    print("Coefficient of determination: %.2f" % r2_Score, "    ")

In [123]:
# forward backward selection
def fbselection(direction, sample_model, features, X, trainX, trainY, testX, test_data_processed, preprocessor):
    print("starting")
    selection = SequentialFeatureSelector(sample_model, direction=direction, n_features_to_select=features, scoring='roc_auc')
    return modelSelector(sample_model, selection, X, trainX, trainY, testX, test_data_processed, preprocessor)

def modelSelector(sample_model, selection, X, trainX, trainY, testX, test_data_processed, preprocessor):
    print("start extracting")
    trainX = selection.fit_transform(trainX, trainY)
    print("extracted, transforming")
    testX = selection.transform(testX)                                  # Ensure the test set is transformed similarly
    test_data_processed = selection.transform(test_data_processed)      # test data is also transformed
    X = selection.transform(X)                                          # full data transforming
    print("transformed")

    # Get selected feature names
    feature_names = preprocessor.get_feature_names_out()  # Get all feature names from preprocessor
    selected_features = [feature_names[i] for i in range(len(feature_names)) if selection.get_support()[i]]
    print("Selected Features:")
    print(selected_features)

    return sample_model, X, trainX, trainY, testX, test_data_processed

# kbest selection
def kbest(sample_model, features, X, trainX, trainY, testX, test_data_processed, preprocessor):
    print("starting")
    selection = SelectKBest(score_func=f_regression, k=features)
    # Fit the selector to training data
    selection.fit(trainX, trainY)
    # Get selected feature names
    feature_names = preprocessor.get_feature_names_out()  # Get all feature names from preprocessor
    selected_features = [feature_names[i] for i in range(len(feature_names)) if selection.get_support()[i]]
    print("Selected Features:")
    print(selected_features)
    return modelSelector(sample_model, selection, X, trainX, trainY, testX, test_data_processed, preprocessor)

In [124]:
# feature importance function
def featureImportance(sample_model, features, X, trainX, trainY, testX, test_data_processed):
    print("fitting")
    
    # fit the model
    sample_model.fit(trainX, trainY)

    print("extracting features")

    # extract all the feature names from data
    importances = sample_model.feature_importances_
    feature_names = test_data_processed.columns
    print(feature_names)

    # sort with respect to importance
    feature_importance_df = pd.DataFrame({
        'Feature': feature_names,
        'Importance': importances
    }).sort_values(by='Importance', ascending=False)

    # extract the top ones
    top_features = feature_importance_df['Feature'].head(features).values
    print(top_features)

    # change all data according to the top ones we have selected
    trainX = pd.DataFrame(trainX, columns=feature_names)[top_features]
    testX = pd.DataFrame(testX, columns=feature_names)[top_features]
    X = pd.DataFrame(X, columns=feature_names)[top_features]
    test_data_processed = pd.DataFrame(test_data_processed, columns=feature_names)[top_features]

    print("features extracted")
    
    # retrain the model
    sample_model.fit(trainX, trainY)

    print("features trained")
    
    return sample_model, X, trainX, trainY, testX, test_data_processed

In [125]:
def gridsearch(param_grid, model, trainX, trainY):
    print("starting grid search")

    # intialize a scorer metric
    scorer = make_scorer(mean_squared_error, greater_is_better=False)

    # intialize grid search
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, scoring=scorer, verbose=3)
    print("grid search is intialized")

    # fit the model
    grid_search.fit(trainX, trainY)
    print("grid search fitting completed")

    # display the best model grid search found
    best_model = grid_search.best_estimator_
    print(best_model)

    # display the best parameters of the best model
    best_parameters = grid_search.best_params_
    print(best_parameters)

    # display the best score of the best model
    print("Best cross-validated score:", grid_search.best_score_)

    # assign the best model our model
    model = best_model
    print("model assigned, grid search completed")

    return model

In [126]:
def createFile(model, X, Y, test_data):
    print("fitting on X Y ", get_current_datetime())
    model.fit(X, Y)

    print("scoring on X Y ", get_current_datetime())
    score = model.score(X, Y)
    print("model test score: ", score, "    ")

    print("predicting on test ", get_current_datetime())
    test_prediction = model.predict(test_data)
    print(test_prediction)

    print("getting sample submission ", get_current_datetime())
    sample_data = pd.read_csv(r"D:\Users\DELL\OneDrive - Institute of Business Administration\IBA\sem5\machine learning\ipynb notebooks\challenger2\iml-fall-2024-challenge-2\sample_submission.csv")
    sample_data['price_doc'] = test_prediction

    print("saving submission ", get_current_datetime())
    sample_data.to_csv(r"D:\Users\DELL\OneDrive - Institute of Business Administration\IBA\sem5\machine learning\ipynb notebooks\challenger2\iml-fall-2024-challenge-2\knn1.csv", index=False)
    print(sample_data)

# Code

In [127]:
# Importing data
print("-> Started Importing Data ", get_current_datetime())
train_data = pd.read_csv(r"D:\Users\DELL\OneDrive - Institute of Business Administration\IBA\sem5\machine learning\ipynb notebooks\challenger2\iml-fall-2024-challenge-2\train\train.csv")
test_data = pd.read_csv(r"D:\Users\DELL\OneDrive - Institute of Business Administration\IBA\sem5\machine learning\ipynb notebooks\challenger2\iml-fall-2024-challenge-2\test\test.csv")

print("train_data shape ", train_data.shape)
print("test_data shape ", test_data.shape)

# Split data into categorical and numerical
print("-> Started splitting columns ", get_current_datetime())
categorical_cols = train_data.select_dtypes(include=["object"]).columns
numerical_cols = train_data.select_dtypes(exclude=["object"]).drop(columns=['price_doc']).columns

print(categorical_cols)
print(numerical_cols)

# Data preprocessing: scalers and imputers on columns
print("-> Defining scalers and imputers ", get_current_datetime())
num_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", MinMaxScaler())
])
cat_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])
# Column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_transformer, numerical_cols),
        ("cat", cat_transformer, categorical_cols)
    ]
)

# Data fitting: fit the preprocessing on the data
print("-> Data Fitting ", get_current_datetime())
train_transformed = preprocessor.fit_transform(train_data)

# Convert the transformed data back to a DataFrame
train_data_processed = pd.DataFrame(train_transformed, columns=numerical_cols + list(preprocessor.transformers_[1][1].named_steps['onehot'].get_feature_names_out(categorical_cols)))

# Apply preprocessing to test data (without fitting)
test_transformed = preprocessor.transform(test_data)

# Convert the transformed test data back to a DataFrame
test_data_processed = pd.DataFrame(test_transformed, columns=numerical_cols + list(preprocessor.transformers_[1][1].named_steps['onehot'].get_feature_names_out(categorical_cols)))

# Now you have the processed DataFrames for both train and test data
train_data=train_data_processed
test_data=test_data_processed

# Data splitting: features and targets
print("-> Data splitting X Y ", get_current_datetime())
X = train_data.drop(columns=['price_doc'])
Y = train_data['price_doc']

print(X.shape)
print(Y.shape)

# Data splitting: train, validate, test
print("-> Data splitting tvt ", get_current_datetime())
trainX, testX, trainY, testY = train_test_split(X, Y, test_size=0.3, random_state=2)

# Model declaration
print("-> model declaration ", get_current_datetime())
model = KNeighborsRegressor(n_neighbors=5)
print(model)

# Feature Selection
print("-> feature selection ", get_current_datetime())
model, X, trainX, trainY, testX, test_data= kbest(model, 20, X, trainX, trainY, testX, test_data, preprocessor)

# Grid Search
print("-> Grid Search ", get_current_datetime())
# param_grid = {
#     'model__tol': [1e-1, 1e-2, 1e-3, 1e-4, 1e-5]
# }
# model = gridsearch(param_grid, model, trainX, trainY)

# Model train: full dataset
print("-> Model train ", get_current_datetime())
# model.fit(trainX, trainY)

# Model train: sample dataset
sample_train = train_data.sample(frac=0.1)
sample_X = sample_train.drop(columns=['price_doc'])
sample_Y = sample_train['price_doc']

print("sample taken")

model.fit(sample_X, sample_Y)
print("model trained")

# Model Score: compute score of model
print("-> Model Score ", get_current_datetime())
# score = model.score(trainX, trainY)
score = model.score(sample_X, sample_Y)
print("model score: ", score)

# Prediction: predict and display its metrics
print("-> Prediction ", get_current_datetime)
y_pred = model.predict(testX)
print("successfully predicted")
metrics(y_pred, testY)
    
print("-> File Creation ", get_current_datetime())
createFile(model, X, Y, test_data)
print(model)

-> Started Importing Data  2024-11-27 01:25:45
train_data shape  (181507, 272)
test_data shape  (77789, 272)
-> Started splitting columns  2024-11-27 01:26:04
Index(['product_type', 'sub_area', 'culture_objects_top_25',
       'thermal_power_plant_raion', 'incineration_raion',
       'oil_chemistry_raion', 'radiation_raion', 'railroad_terminal_raion',
       'big_market_raion', 'nuclear_reactor_raion', 'detention_facility_raion',
       'water_1line', 'big_road1_1line', 'railroad_1line', 'ecology'],
      dtype='object')
Index(['full_sq', 'life_sq', 'floor', 'area_m', 'raion_popul',
       'green_zone_part', 'indust_part', 'children_preschool',
       'preschool_education_centers_raion', 'children_school',
       ...
       'cafe_count_5000_price_1500', 'cafe_count_5000_price_2500',
       'cafe_count_5000_price_4000', 'cafe_count_5000_price_high',
       'big_church_count_5000', 'church_count_5000', 'mosque_count_5000',
       'leisure_count_5000', 'sport_count_5000', 'market_count_50

ValueError: operands could not be broadcast together with shapes (256,) (1958,) 