# Importing Libraries
In this part we install all the necessary libraries from the command prompt and then import the functions we need from those libraries.

In [None]:
# importing all the necessary libraries
import pandas as pd

from numpy import mean
import numpy as np
import time

# step 1: preprocessing
from sklearn.impute import SimpleImputer # import some strategic imputer to fill in any missing values using mean
from sklearn.preprocessing import MinMaxScaler, StandardScaler, MaxAbsScaler, RobustScaler, Normalizer # scale all the values to one range to avoid any biasness (this bias is seen in mostly naive bayes and knn etc)

from sklearn.impute import KNNImputer # import some strategic imputer to fill missing values using KNN (finds the nearest neighbour and fills it with that value)

from sklearn.feature_selection import SequentialFeatureSelector, SelectKBest, f_classif, VarianceThreshold

from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import PolynomialFeatures, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge, Lasso

# step 2: data division
from sklearn.model_selection import train_test_split, RepeatedKFold, cross_val_score, GridSearchCV, ParameterGrid # to divide the code into train/test using a specific percentage or with/without replacement

# step 3: model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, BatchNormalization, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping

# step 4: displaying accuracy
from sklearn.metrics import roc_auc_score, accuracy_score # to display the accuracy of our tree
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, make_scorer

# step 5: warning filter
import warnings
warnings.filterwarnings('ignore')

# Data Loading
The data is loaded into pandas DataFrames using `pd.read_csv`. Two sets are loaded, train and test, as provided in the Kaggle data, and each one is previewed to check that it loaded properly.

In [None]:
# Read the training set from disk into a DataFrame
train_csv_path = r"D:\Users\DELL\OneDrive - Institute of Business Administration\IBA\sem5\machine learning\ipynb notebooks\challenger2\iml-fall-2024-challenge-2\train\train.csv"
train_data = pd.read_csv(train_csv_path)

# Preview the first rows to confirm the load: the feature columns should be followed by the target column price_doc
train_data.head()

Unnamed: 0,full_sq,life_sq,floor,product_type,sub_area,area_m,raion_popul,green_zone_part,indust_part,children_preschool,...,cafe_count_5000_price_2500,cafe_count_5000_price_4000,cafe_count_5000_price_high,big_church_count_5000,church_count_5000,mosque_count_5000,leisure_count_5000,sport_count_5000,market_count_5000,price_doc
0,43.0,27.0,4.0,Investment,Bibirevo,6407578.1,155572.0,0.189727,7e-05,9576.0,...,9.0,4.0,0.0,13.0,22.0,1.0,0.0,52.0,4.0,5850000.0
1,34.0,19.0,3.0,Investment,Nagatinskij Zaton,9589336.912,115352.0,0.372602,0.049637,6880.0,...,15.0,3.0,0.0,15.0,29.0,1.0,10.0,66.0,14.0,6000000.0
2,43.0,29.0,2.0,Investment,Tekstil'shhiki,4808269.831,101708.0,0.11256,0.118537,5879.0,...,10.0,3.0,0.0,11.0,27.0,0.0,4.0,67.0,10.0,5700000.0
3,77.0,77.0,4.0,Investment,Basmannoe,8398460.622,108171.0,0.015234,0.037316,5706.0,...,319.0,108.0,17.0,135.0,236.0,2.0,91.0,195.0,14.0,16331452.0
4,67.0,46.0,14.0,Investment,Nizhegorodskoe,7506452.02,43795.0,0.00767,0.486246,2418.0,...,62.0,14.0,1.0,53.0,78.0,1.0,20.0,113.0,17.0,9100000.0


In [None]:
# Read the test set from disk into a DataFrame
test_csv_path = r"D:\Users\DELL\OneDrive - Institute of Business Administration\IBA\sem5\machine learning\ipynb notebooks\challenger2\iml-fall-2024-challenge-2\test\test.csv"
test_data = pd.read_csv(test_csv_path)

# Preview the first rows to confirm the load: a `row ID` column plus the feature columns, and no price_doc target
test_data.head()

Unnamed: 0,row ID,full_sq,life_sq,floor,product_type,sub_area,area_m,raion_popul,green_zone_part,indust_part,...,cafe_count_5000_price_1500,cafe_count_5000_price_2500,cafe_count_5000_price_4000,cafe_count_5000_price_high,big_church_count_5000,church_count_5000,mosque_count_5000,leisure_count_5000,sport_count_5000,market_count_5000
0,Row3,89.0,50.0,9.0,Investment,Mitino,12583540.0,178473.0,0.194703,0.069753,...,15.0,11.0,2.0,1.0,4.0,4.0,0.0,0.0,26.0,3.0
1,Row6,25.0,14.0,10.0,Investment,Sokol'niki,10320470.0,57405.0,0.523439,0.042307,...,144.0,81.0,16.0,3.0,38.0,80.0,1.0,27.0,127.0,8.0
2,Row11,38.0,19.0,11.0,Investment,Zapadnoe Degunino,7632940.0,78810.0,0.051844,0.437885,...,39.0,8.0,3.0,0.0,10.0,9.0,0.0,0.0,35.0,4.0
3,Row12,43.0,28.0,4.0,Investment,Kuncevo,52351770.0,142462.0,0.070662,0.035145,...,21.0,13.0,9.0,1.0,7.0,15.0,0.0,2.0,47.0,0.0
4,Row14,31.0,21.0,3.0,Investment,Lefortovo,8993640.0,89971.0,0.066941,0.306977,...,205.0,88.0,19.0,2.0,63.0,100.0,0.0,28.0,132.0,14.0


# Data Preprocessing
Before we start running algorithms on this data, we first clean and transform it; this step is called data preprocessing.

## split data into categorical and numerical
Categorical columns get a most-frequent `SimpleImputer` followed by one-hot encoding, while numerical columns get a median `SimpleImputer` followed by a `MinMaxScaler`.

In [None]:
# Categorical features: every column stored with the object dtype
object_frame = train_data.select_dtypes(include=["object"])
categorical_cols = object_frame.columns

# Numerical features: every non-object column except the target price_doc
non_object_frame = train_data.select_dtypes(exclude=["object"])
numerical_cols = non_object_frame.columns.drop('price_doc')

In [None]:
# Numerical branch: fill missing values with the column median, then rescale with MinMaxScaler
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", MinMaxScaler()),
]
num_transformer = Pipeline(steps=numeric_steps)

# Stand-alone scaler instance; it is not part of either pipeline built in this cell
numerical_scaler = MinMaxScaler()

# Categorical branch: fill missing values with the most frequent category, then one-hot encode.
# handle_unknown="ignore" keeps transform from failing on categories not seen during fit.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
]
cat_transformer = Pipeline(steps=categorical_steps)

In [None]:
# Route each column group through its own pipeline: numerical columns to
# num_transformer and categorical columns to cat_transformer
column_branches = [
    ("num", num_transformer, numerical_cols),
    ("cat", cat_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_branches)

## correlation matrix
I tried computing the correlation matrix, but for a matrix with about 2000 columns this is very computationally expensive because a correlation is computed for every pair of columns. Do not run it: it takes too long and then fails (I let it run for 5 minutes).

In [None]:
# # DONT RUN
# corr_matrix = train_data.corr()
# print(corr_matrix)

# PCA
Principal component analysis (PCA) can be applied here; the cell below is currently disabled (commented out).

In [None]:
# # -------------------------- case  --------------------------
# pca = PCA(n_components=33)                                 
# X = pca.fit_transform(X)
# test_data_processed = pca.transform(test_data_processed)

## Data Splitting - features and targets
The train_data set holds the feature columns plus one target variable (`price_doc`). We must split them so that data preprocessing is performed on the feature variables only (referred to as X).

In [None]:
# X is every column of train_data EXCEPT the target 'price_doc'
# (train_data.head() above shows price_doc as the last column), so we drop just that one column
X = train_data.drop('price_doc', axis=1)
X  # display X to check the result

Unnamed: 0,full_sq,life_sq,floor,product_type,sub_area,area_m,raion_popul,green_zone_part,indust_part,children_preschool,...,cafe_count_5000_price_1500,cafe_count_5000_price_2500,cafe_count_5000_price_4000,cafe_count_5000_price_high,big_church_count_5000,church_count_5000,mosque_count_5000,leisure_count_5000,sport_count_5000,market_count_5000
0,43.0,27.0,4.0,Investment,Bibirevo,6.407578e+06,155572.0,0.189727,0.000070,9576.0,...,40.0,9.0,4.0,0.0,13.0,22.0,1.0,0.0,52.0,4.0
1,34.0,19.0,3.0,Investment,Nagatinskij Zaton,9.589337e+06,115352.0,0.372602,0.049637,6880.0,...,36.0,15.0,3.0,0.0,15.0,29.0,1.0,10.0,66.0,14.0
2,43.0,29.0,2.0,Investment,Tekstil'shhiki,4.808270e+06,101708.0,0.112560,0.118537,5879.0,...,25.0,10.0,3.0,0.0,11.0,27.0,0.0,4.0,67.0,10.0
3,77.0,77.0,4.0,Investment,Basmannoe,8.398461e+06,108171.0,0.015234,0.037316,5706.0,...,552.0,319.0,108.0,17.0,135.0,236.0,2.0,91.0,195.0,14.0
4,67.0,46.0,14.0,Investment,Nizhegorodskoe,7.506452e+06,43795.0,0.007670,0.486246,2418.0,...,155.0,62.0,14.0,1.0,53.0,78.0,1.0,20.0,113.0,17.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
181502,48.0,33.0,3.0,Investment,Poselenie Mihajlovo-Jarcevskoe,6.455617e+07,4949.0,0.586175,0.005819,346.0,...,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0
181503,48.0,33.0,3.0,Investment,Poselenie Mihajlovo-Jarcevskoe,6.455617e+07,4949.0,0.586175,0.005819,346.0,...,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0
181504,48.0,33.0,3.0,Investment,Poselenie Mihajlovo-Jarcevskoe,6.455617e+07,4949.0,0.586175,0.005819,346.0,...,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0
181505,48.0,33.0,3.0,Investment,Poselenie Mihajlovo-Jarcevskoe,6.455617e+07,4949.0,0.586175,0.005819,346.0,...,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0


In [None]:
# The X output above no longer contains price_doc, so the target column was removed successfully.
# The target itself is simply that single column taken on its own.
Y = train_data.loc[:, 'price_doc']
Y  # display Y to check the result
# The output shows one float64 column named price_doc with 181507 rows, so the target was extracted successfully

0          5850000.0
1          6000000.0
2          5700000.0
3         16331452.0
4          9100000.0
             ...    
181502     3480000.0
181503     3480000.0
181504     3480000.0
181505     3480000.0
181506     3480000.0
Name: price_doc, Length: 181507, dtype: float64

# Filters
there are two types of filters to filter out columns/features:
- variance filter (a column which has same values throughout the column like all are sunny)
- correlation filter (two columns which are same like weight in kg and weight in pounds)

In [None]:
# print("X : ", X.shape)
# print("test data : ", test_data_processed.shape)

In [None]:
# variance filter (currently disabled: every step below is commented out)
# ----------------------------- case  -----------------------------
# variance_filter = VarianceThreshold(threshold=0.001)  # Adjust the threshold if needed
# X = variance_filter.fit_transform(X)
# NOTE(review): the next line re-fits the filter on the test data; if this is re-enabled it should
# presumably call variance_filter.transform(...) so train and test keep the same columns.
# test_data_processed = variance_filter.fit_transform(test_data_processed)
# With the filter disabled, this only reports the unchanged shape of X
X.shape

(181507, 271)

In [None]:
# test_data_processed.shape

In [None]:
# # correlation filter (currently disabled: every step below is commented out)
# # ----------------------------- case  -----------------------------
# # Idea: take absolute pairwise correlations, keep only the upper triangle so each pair is
# # seen once, and drop any column correlated above 0.9 with an earlier column.
# corr_matrix = pd.DataFrame(X).corr().abs()
# upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
# to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > 0.9)]
# X = pd.DataFrame(X).drop(columns=to_drop)
# test_data_processed = pd.DataFrame(test_data_processed).drop(columns=to_drop)
# With the filter disabled, this only reports the unchanged shape of X
X.shape

(181507, 271)

In [None]:
# test_data_processed.shape

## Data Splitting - train and validate
Our test_data set contains rows with NO target variable, whereas the train_data set includes the target variable.
The rule in machine learning is that we train on one part of the data (here 70%) and then check the model's accuracy on the remaining part (30%) - we can only check accuracy IF we have the answers, i.e. the target variable.
So we split the train_data set in two with a 70% / 30% ratio: we train the model on the 70%, validate it on the 30%, and then use that model to predict the test_data set.

In [None]:
# Holdout method: 30% of the rows are set aside for validation; random_state=2 pins the shuffle
holdout_parts = train_test_split(X, Y, test_size=0.3, random_state=2)
trainX, testX, trainY, testY = holdout_parts

# functions
Here we define helper functions: forward/backward selection, k-best selection and model-based feature importance.

In [None]:
# forward backward selection
def fbselection(direction, sample_model, features, X, trainX, trainY, testX, test_data_processed, scoring='roc_auc'):
    """Run sequential (forward or backward) feature selection and apply it to every dataset.

    Parameters:
        direction: passed to SequentialFeatureSelector as `direction`
            ("forward" or "backward").
        sample_model: estimator used by the selector to score candidate feature sets.
        features: passed to SequentialFeatureSelector as `n_features_to_select`.
        X: full training feature matrix.
        trainX, trainY: training split used to fit the selector.
        testX: validation split features.
        test_data_processed: features of the unlabeled test set.
        scoring: scoring passed to the selector. The default 'roc_auc' is kept for
            backward compatibility, but it is a classification metric; for a continuous
            target such as price_doc pass a regression scorer instead, e.g.
            'neg_root_mean_squared_error' or 'r2'.

    Returns:
        The tuple produced by modelSelector:
        (sample_model, X, trainX, trainY, testX, test_data_processed) with every feature
        matrix reduced to the selected columns.
    """
    print("starting")
    selection = SequentialFeatureSelector(
        sample_model,
        direction=direction,
        n_features_to_select=features,
        scoring=scoring,
    )
    return modelSelector(sample_model, selection, X, trainX, trainY, testX, test_data_processed)

def modelSelector(sample_model, selection, X, trainX, trainY, testX, test_data_processed):
    """Fit a feature selector on the training split and apply it to all datasets.

    `selection` is fitted with fit_transform(trainX, trainY); the fitted selector then
    transforms testX, test_data_processed and X (in that order) so that every matrix
    keeps the same columns.

    Returns:
        (sample_model, X, trainX, trainY, testX, test_data_processed), where the four
        feature matrices are the transformed versions and sample_model / trainY are
        passed through unchanged.
    """
    print("start extracting")
    train_selected = selection.fit_transform(trainX, trainY)
    print("extracted, transforming")
    # Reuse the fitted selector: validation split, then unlabeled test set, then the full data
    holdout_selected, submission_selected, full_selected = [
        selection.transform(part) for part in (testX, test_data_processed, X)
    ]
    print("transformed")
    return sample_model, full_selected, train_selected, trainY, holdout_selected, submission_selected

# kbest selection
def kbest(sample_model, features, X, trainX, trainY, testX, test_data_processed, score_func=None):
    """Keep the `features` best-scoring columns using SelectKBest and apply it to every dataset.

    Parameters:
        sample_model: estimator passed through unchanged to the result.
        features: passed to SelectKBest as `k`.
        X: full training feature matrix.
        trainX, trainY: training split used to fit the selector.
        testX: validation split features.
        test_data_processed: features of the unlabeled test set.
        score_func: univariate scoring function for SelectKBest. When None (the default)
            f_classif is used, as before. f_classif is a classification score; for a
            continuous target such as price_doc pass a regression score function
            (e.g. sklearn.feature_selection.f_regression) instead.

    Returns:
        The tuple produced by modelSelector:
        (sample_model, X, trainX, trainY, testX, test_data_processed) with every feature
        matrix reduced to the selected columns.
    """
    print("starting")
    # Resolve the default at call time so the historical behaviour is unchanged
    if score_func is None:
        score_func = f_classif
    selection = SelectKBest(score_func=score_func, k=features)
    return modelSelector(sample_model, selection, X, trainX, trainY, testX, test_data_processed)

In [None]:
# feature importance function
def featureImportance(sample_model, features, X, trainX, trainY, testX, test_data_processed):
    """Keep the `features` most important columns according to the model, then refit.

    The model is fitted on (trainX, trainY), its `feature_importances_` are ranked, and
    trainX, testX, X and test_data_processed are all reduced to the top-ranked columns.
    The model is then fitted again on the reduced trainX.

    Parameters:
        sample_model: estimator exposing `fit` and, after fitting, `feature_importances_`.
        features: number of top-ranked features to keep.
        X, trainX, testX, test_data_processed: feature matrices with the same columns in
            the same order (DataFrames or array-likes accepted by pd.DataFrame).
        trainY: training target.

    Returns:
        (sample_model, X, trainX, trainY, testX, test_data_processed) where the feature
        matrices are DataFrames restricted to the selected columns.
    """
    print("fitting")

    # fit the model
    sample_model.fit(trainX, trainY)

    print("extracting features")

    # Take the feature names from the data that was actually fitted, instead of relying on
    # a notebook-level frame (the previous `train_data_processed` / 'Y' lookup is not
    # defined anywhere in this notebook). Arrays without column labels get positional names.
    importances = sample_model.feature_importances_
    if hasattr(trainX, "columns"):
        feature_names = pd.Index(trainX.columns)
    else:
        feature_names = pd.RangeIndex(len(importances))
    print(feature_names)

    # sort with respect to importance
    feature_importance_df = pd.DataFrame({
        'Feature': feature_names,
        'Importance': importances
    }).sort_values(by='Importance', ascending=False)

    # extract the top ones
    top_features = feature_importance_df['Feature'].head(features).values
    print(top_features)

    # change all data according to the top ones we have selected
    # NOTE(review): pd.DataFrame is used to rebuild each input; sparse matrices (such as a
    # one-hot encoder may produce) presumably need to be densified first — confirm.
    trainX = pd.DataFrame(trainX, columns=feature_names)[top_features]
    testX = pd.DataFrame(testX, columns=feature_names)[top_features]
    X = pd.DataFrame(X, columns=feature_names)[top_features]
    test_data_processed = pd.DataFrame(test_data_processed, columns=feature_names)[top_features]

    print("features extracted")

    # retrain the model on the reduced feature set
    sample_model.fit(trainX, trainY)

    print("features trained")

    return sample_model, X, trainX, trainY, testX, test_data_processed

## model initialization
Here the data is run through the preprocessor and the neural network model is defined and trained.

In [None]:
# Fit the preprocessing pipeline on the training split only, then reuse the fitted
# transformer on every other dataset so they all share the same imputation, scaling and encoding.
trainX = preprocessor.fit_transform(trainX)
print("trainX completed")
testX = preprocessor.transform(testX)
print("testX completed")
# NOTE(review): test_data and X are rebound to their transformed versions below, so this cell
# presumably cannot be re-run without first reloading the raw data — confirm.
test_data = preprocessor.transform(test_data)
print("test data completed")
X = preprocessor.transform(X)
print(X.shape)

trainX completed
testX completed
test data completed
(181507, 2214)


In [None]:
def build_nn(input_dim):
    """Build and compile the feed-forward regression network.

    Architecture: Dense(128, relu) -> BatchNormalization -> Dropout(0.2) ->
    Dense(64, relu) -> BatchNormalization -> Dropout(0.2) -> Dense(32, relu) ->
    Dense(1) output. Compiled with Adam (learning rate 0.001) and MSE loss.

    Parameters:
        input_dim: number of input features, given to the first Dense layer.

    Returns:
        The compiled Sequential model.
    """
    layer_stack = [
        Dense(128, activation="relu", input_dim=input_dim),
        BatchNormalization(),
        Dropout(0.2),
        Dense(64, activation="relu"),
        BatchNormalization(),
        Dropout(0.2),
        Dense(32, activation="relu"),
        Dense(1),  # single linear output unit
    ]
    network = Sequential(layer_stack)
    network.compile(loss="mse", optimizer=Adam(learning_rate=0.001))
    return network

In [None]:
# Build the network, sized to the number of columns in the preprocessed training matrix
nn_model = build_nn(input_dim=trainX.shape[1])

# Callbacks, both watching the validation loss:
# - stop after 10 epochs without improvement and restore the best weights
# - halve the learning rate after 5 epochs without improvement
early_stopping = EarlyStopping(monitor="val_loss", patience=10, restore_best_weights=True, verbose=1)
lr_scheduler = ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=5, verbose=1)
training_callbacks = [lr_scheduler, early_stopping]

# Train on the training split; the holdout split (testX, testY) supplies the validation loss
nn_model.fit(
    x=trainX,
    y=trainY,
    batch_size=32,
    epochs=100,
    validation_data=(testX, testY),
    callbacks=training_callbacks,
    verbose=1,
)

In [None]:
# Report the shape of every preprocessed matrix; the last entry is labelled
# test_data_processed but shows the `test_data` variable
shape_report = (
    ("X shape -> ", X),
    ("trainX shape -> ", trainX),
    ("testX shape -> ", testX),
    ("test_data_processed shape -> ", test_data),
)
for label, matrix in shape_report:
    print(label, matrix.shape)

X shape ->  (181507, 2214)
trainX shape ->  (127054, 2214)
testX shape ->  (54453, 2214)
test_data_processed shape ->  (77789, 2214)


# feature selection
here we will apply feature selection and feature importance

In [None]:
# apply feature selection here
# features_selected = SelectFromModel(model)

## model running
here we run the model

In [None]:
# Predict on the validation split with the neural network.
# The network was trained directly on the unscaled trainY, so its outputs are already in
# price_doc units and no inverse scaling is applied. (The previous call invoked
# StandardScaler.inverse_transform on the class itself rather than on a fitted scaler, which fails.)
y_pred_scaled = nn_model.predict(testX).flatten()
# The name y_pred_scaled is kept for compatibility; the values are not scaled
y_pred = y_pred_scaled

In [None]:
# fit the model on the training split
# NOTE(review): `model` is not assigned in any cell visible in this notebook, so on a fresh
# kernel this presumably raises NameError — confirm which estimator it is meant to be.
model.fit(trainX, trainY)

In [None]:
# display information regarding the regression: the model's score on the training split
# NOTE(review): relies on `model`, which is not assigned in any visible cell — confirm.
print("model score: ", model.score(trainX, trainY))
# print("model coefficient: ", model.coef_)
# print("model intercept: ", model.intercept_)

model score:  0.6360634217245433


In [None]:
# compute this predictions metrics
def metrics(y_pred, testY):
    """Print regression error metrics for a set of predictions.

    Prints, in order: mean squared error, root mean squared error, mean absolute
    error and the coefficient of determination (R^2), each formatted to two decimals.
    Classification metrics (accuracy, ROC AUC) are not computed here.

    Parameters:
        y_pred: predicted target values.
        testY: true target values.

    Returns:
        None; the results are only printed.
    """
    print("starting to compute metrics")

    # mean squared error
    squared_error = mean_squared_error(testY, y_pred)
    print("Mean squared error: %.2f" % squared_error, "   ")

    # root mean squared error, derived from the MSE above
    root_error = np.sqrt(squared_error)
    print("Root Mean squared error: %.2f" % root_error, "   ")

    # mean absolute error
    absolute_error = mean_absolute_error(testY, y_pred)
    print("Mean absolute error: %.2f" % absolute_error, "   ")

    # coefficient of determination
    determination = r2_score(testY, y_pred)
    print("Coefficient of determination: %.2f" % determination, "    ")

In [None]:
# predict on the validation split with model.predict and print the regression metrics
# NOTE(review): relies on `model`, which is not assigned in any visible cell — confirm.
y_pred = model.predict(testX)
print("successfully predicted")
metrics(y_pred, testY)

successfully predicted
starting to compute metrics
Mean squared error: 178441627053489.91    
Root Mean squared error: 13358204.48    
Mean absolute error: 6738000.85    
Coefficient of determination: 0.63     


In [None]:
# # predict using thus model USING PREDICTPROBA
# y_pred_proba = model.predict_proba(testX)[:, 1]
# print("successfully predicted")
# metrics(y_pred_proba, testY)

## predict for test dataset
Refit the model on the full training data and predict for the test dataset.

In [None]:
# Refit on the full preprocessed training data (X) and the full target (Y)
# NOTE(review): relies on `model`, which is not assigned in any visible cell — confirm.
model.fit(X, Y)

In [None]:
# display information regarding the regression: the model's score on the full training data
# NOTE(review): relies on `model`, which is not assigned in any visible cell — confirm.
print("model score: ", model.score(X, Y), "    ")
# print("model coefficient: ", model.coef_)
# print("model intercept: ", model.intercept_)

model score:  0.6336635394027026     


In [None]:
# Predict prices for the preprocessed Kaggle test set (`test_data`), not the validation split.
# The network was trained on the unscaled target, so no inverse scaling is applied
# (the previously referenced `y_scaler` is never defined in this notebook).
y_test_pred_scaled = nn_model.predict(test_data).flatten()
# The name y_test_pred_scaled is kept for compatibility; the values are already in price_doc units
y_test_pred = y_test_pred_scaled
# The CSV-writing cell reads its predictions from `test_prediction`
# NOTE(review): this submits the neural network's predictions — confirm that is the intended model.
test_prediction = y_test_pred

[9088462.40342109 9006610.79896988 6307634.82098109 ... 5172732.95264951
 5172732.95264951 5172732.95264951]


## write into csv
now we write the predictions into the csv file

In [None]:
# Load the sample submission so its row IDs and layout can be reused
sample_csv_path = r"D:\Users\DELL\OneDrive - Institute of Business Administration\IBA\sem5\machine learning\ipynb notebooks\challenger2\iml-fall-2024-challenge-2\sample_submission.csv"
sample_data = pd.read_csv(sample_csv_path)

# Replace the target column with our predictions
sample_data['price_doc'] = test_prediction
sample_data

# Write the submission file without the DataFrame index, then display it
output_csv_path = r"D:\Users\DELL\OneDrive - Institute of Business Administration\IBA\sem5\machine learning\ipynb notebooks\challenger2\iml-fall-2024-challenge-2\lasso1.csv"
sample_data.to_csv(output_csv_path, index=False)
sample_data

Unnamed: 0,row ID,price_doc
0,Row3,9.088462e+06
1,Row6,9.006611e+06
2,Row11,6.307635e+06
3,Row12,8.077054e+06
4,Row14,8.868798e+06
...,...,...
77784,Row18591dupl_228801,5.478390e+07
77785,Row18591dupl_228803,5.112033e+07
77786,Row18591dupl_228814,5.172733e+06
77787,Row18591dupl_228817,5.172733e+06


In [None]:
# Display the fitted estimator
# NOTE(review): `model` is not assigned in any visible cell — confirm where it comes from.
model