# Importing Libraries
in this part we will install all the necessary libraries on command prompt and then import the necessary functions from those libraries. 

In [158]:
# importing all the necessary libraries
import pandas as pd

from numpy import mean
import numpy as np
import time

# step 1: preprocessing
from sklearn.impute import SimpleImputer # import some strategic imputer to fill in any missing values using mean
from sklearn.preprocessing import MinMaxScaler, StandardScaler, MaxAbsScaler, RobustScaler, Normalizer # scale all the values to one range to avoid any biasness (this bias is seen in mostly naive bayes and knn etc)

from sklearn.impute import KNNImputer # import some strategic imputer to fill missing values using KNN (finds the nearest neighbour and fills it with that value)

from sklearn.feature_selection import SequentialFeatureSelector, SelectKBest, f_classif, VarianceThreshold

from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import PolynomialFeatures, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge, Lasso

# step 2: data division
from sklearn.model_selection import train_test_split, RepeatedKFold, cross_val_score, GridSearchCV, ParameterGrid # to divide the code into train/test using a specific percentage or with/without replacement

# step 3: model
from sklearn.tree import DecisionTreeRegressor

# step 4: displaying accuracy
from sklearn.metrics import roc_auc_score, accuracy_score # to display the accuracy of our tree
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, make_scorer

# step 5: warning filter
import warnings
warnings.filterwarnings('ignore')

# Data Loading
data shall be loaded into variables as data sets using pandas and csv readers. they will be checked to see if they are loaded properly and will be loaded as 2 sets: train and test as per given in the kaggle data

In [159]:
# lets load the training data set
train_data = pd.read_csv(r"D:\Users\DELL\OneDrive - Institute of Business Administration\IBA\sem5\machine learning\ipynb notebooks\challenger2\iml-fall-2024-challenge-2\train\train.csv")

# lets also check it by getting the first few rows of the data, there should be x1 - x78 and one target variable Y
train_data.head() 

Unnamed: 0,full_sq,life_sq,floor,product_type,sub_area,area_m,raion_popul,green_zone_part,indust_part,children_preschool,...,cafe_count_5000_price_2500,cafe_count_5000_price_4000,cafe_count_5000_price_high,big_church_count_5000,church_count_5000,mosque_count_5000,leisure_count_5000,sport_count_5000,market_count_5000,price_doc
0,43.0,27.0,4.0,Investment,Bibirevo,6407578.1,155572.0,0.189727,7e-05,9576.0,...,9.0,4.0,0.0,13.0,22.0,1.0,0.0,52.0,4.0,5850000.0
1,34.0,19.0,3.0,Investment,Nagatinskij Zaton,9589336.912,115352.0,0.372602,0.049637,6880.0,...,15.0,3.0,0.0,15.0,29.0,1.0,10.0,66.0,14.0,6000000.0
2,43.0,29.0,2.0,Investment,Tekstil'shhiki,4808269.831,101708.0,0.11256,0.118537,5879.0,...,10.0,3.0,0.0,11.0,27.0,0.0,4.0,67.0,10.0,5700000.0
3,77.0,77.0,4.0,Investment,Basmannoe,8398460.622,108171.0,0.015234,0.037316,5706.0,...,319.0,108.0,17.0,135.0,236.0,2.0,91.0,195.0,14.0,16331452.0
4,67.0,46.0,14.0,Investment,Nizhegorodskoe,7506452.02,43795.0,0.00767,0.486246,2418.0,...,62.0,14.0,1.0,53.0,78.0,1.0,20.0,113.0,17.0,9100000.0


In [160]:
# lets load the test data
test_data = pd.read_csv(r"D:\Users\DELL\OneDrive - Institute of Business Administration\IBA\sem5\machine learning\ipynb notebooks\challenger2\iml-fall-2024-challenge-2\test\test.csv")

# check if the data has been loaded by getting the first 5 rows - there should be x1 - x78 and no target variable Y as this is test data
test_data.head() 

Unnamed: 0,row ID,full_sq,life_sq,floor,product_type,sub_area,area_m,raion_popul,green_zone_part,indust_part,...,cafe_count_5000_price_1500,cafe_count_5000_price_2500,cafe_count_5000_price_4000,cafe_count_5000_price_high,big_church_count_5000,church_count_5000,mosque_count_5000,leisure_count_5000,sport_count_5000,market_count_5000
0,Row3,89.0,50.0,9.0,Investment,Mitino,12583540.0,178473.0,0.194703,0.069753,...,15.0,11.0,2.0,1.0,4.0,4.0,0.0,0.0,26.0,3.0
1,Row6,25.0,14.0,10.0,Investment,Sokol'niki,10320470.0,57405.0,0.523439,0.042307,...,144.0,81.0,16.0,3.0,38.0,80.0,1.0,27.0,127.0,8.0
2,Row11,38.0,19.0,11.0,Investment,Zapadnoe Degunino,7632940.0,78810.0,0.051844,0.437885,...,39.0,8.0,3.0,0.0,10.0,9.0,0.0,0.0,35.0,4.0
3,Row12,43.0,28.0,4.0,Investment,Kuncevo,52351770.0,142462.0,0.070662,0.035145,...,21.0,13.0,9.0,1.0,7.0,15.0,0.0,2.0,47.0,0.0
4,Row14,31.0,21.0,3.0,Investment,Lefortovo,8993640.0,89971.0,0.066941,0.306977,...,205.0,88.0,19.0,2.0,63.0,100.0,0.0,28.0,132.0,14.0


# Data Preprocessing
before we start processing this data and using algorithms, we will fix this data first, this is called data preprocessing

## split data into categorical and numerical
categorical will have one-hot and simple imputer of most frequent while numerical will have simple mean imputer and minmax scaler

In [161]:
categorical_cols = train_data.select_dtypes(include=["object"]).columns
numerical_cols = train_data.select_dtypes(exclude=["object"]).drop(columns=['price_doc']).columns
print(categorical_cols.shape)
print(numerical_cols.shape)

(15,)
(256,)


In [162]:
# num_transformer = Pipeline(steps=[
#     ("imputer", SimpleImputer(strategy="median")),
#     ("scaler", MinMaxScaler())
# ])

# cat_transformer = Pipeline(steps=[
#     ("imputer", SimpleImputer(strategy="most_frequent")),
#     ("onehot", OneHotEncoder(handle_unknown="ignore"))
# ])

In [163]:
# # Column transformer for preprocessing
# preprocessor = ColumnTransformer(
#     transformers=[
#         ("num", num_transformer, numerical_cols),
#         ("cat", cat_transformer, categorical_cols)
#     ]
# )

In [164]:
# train_data = preprocessor.fit_transform(train_data)
# print("train completed")
# test_data = preprocessor.transform(test_data)
# print("test data completed")

# imputer

In [165]:
num_imputer = SimpleImputer(strategy="mean")
train_data[numerical_cols] = num_imputer.fit_transform(train_data[numerical_cols])
test_data[numerical_cols] = num_imputer.transform(test_data[numerical_cols])

cat_imputer = SimpleImputer(strategy="most_frequent")
train_data[categorical_cols] = cat_imputer.fit_transform(train_data[categorical_cols])
test_data[categorical_cols] = cat_imputer.transform(test_data[categorical_cols])

# scaler

In [166]:
scaler = StandardScaler()
train_data[numerical_cols] = scaler.fit_transform(train_data[numerical_cols])
test_data[numerical_cols] = scaler.transform(test_data[numerical_cols])

# one-hot

In [167]:
train_data = pd.get_dummies(train_data, columns=categorical_cols, drop_first=True)
test_data = pd.get_dummies(test_data, columns=categorical_cols, drop_first=True)
# Align the test data with the training data columns
test_data = test_data.reindex(columns=train_data.columns, fill_value=0)

# test_data = test_data.drop(columns=['price_doc'], errors="ignore")

## correlation matrix
i tried getting the correlation matrix but apparently a 2000 columns matrix is very computationally expensive as it performs pairs for all. so dont run it. it takes too long and then fails. i ran for 5 minutes. 

In [168]:
# # DONT RUN
# corr_matrix = train_data.corr()
# print(corr_matrix)

# variance filter

In [169]:
# train_features = train_data.drop(columns=['price_doc'])

In [170]:
# feature_variances = train_features.var(axis=0)
# print(feature_variances.describe())

In [171]:
# from sklearn.feature_selection import VarianceThreshold

# selector = VarianceThreshold(threshold=0.001) 
# train_features_reduced = selector.fit_transform(train_features)
# print(train_features_reduced)
# print(train_features_reduced.shape)

# # test_features = selector.transform(test_data)

# PCA
principal component analysis is applied

In [172]:
# # plot an elbow graph to find the optimal number of components
# import matplotlib.pyplot as plt
# from sklearn.decomposition import PCA

# pca = PCA()
# pca.fit(train_features_reduced)  # Fit PCA to your dataset

# plt.plot(pca.explained_variance_ratio_)
# plt.xlabel('number of components')
# plt.ylabel('cumulative explained variance')
# pca.explained_variance_ratio_.round(3)
# cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
# print("Cumulative explained variance: ", cumulative_variance)
# print(cumulative_variance)

In [173]:
# print("Principal components shape:", pca.components_.shape)
# print("Principal components:\n", pca.components_)

# # Preserve the target variable
# train_copy = train_data.copy()  # Backup original data
# test_copy = test_data.copy()

# # Separate target variable and features
# train_features = train_data.drop(columns=['price_doc'])  # Features for PCA
# train_target = train_data['price_doc']                  # Target variable

# test_features = test_data.drop(columns=['price_doc'])    # Features for PCA

# # Apply PCA transformation
# pca = PCA(n_components=0.95)
# train_features_pca = pca.fit_transform(train_features)  # Fit and transform train data
# print("train transformed")
# test_features_pca = pca.transform(test_features)        # Transform test data

# print(train_features_pca.shape)
# print(test_features_pca.shape)

In [174]:
# # Reconstruct train_data and test_data with PCA-transformed features and target variable
# train_data = pd.DataFrame(train_features_pca)
# train_data['price_doc'] = train_target.reset_index(drop=True)

# test_data = pd.DataFrame(test_features_pca)
# test_data['price_doc'] = test_copy['price_doc'].reset_index(drop=True)

In [175]:
print("train shape: ", train_data.shape)
print("test shape: ", test_data.shape)

train shape:  (181507, 2200)
test shape:  (77789, 2200)


## Data Splitting - festures and targets
the data in train_data set is of x1 - x78 columns (79 variables) and one target variable (Y). we must split that data so that we can perform data preprocessing on the features variables (will be referred to as X).

In [176]:
# so in X, it is ALL the columns EXCEPT the last column known as 'Y' (we can confirm this using the train_data.head() we did earlier) so we must get all columns and DROP only the 'y' column
X = train_data.drop(columns=['price_doc'])
X # lets display X and see what it is now

Unnamed: 0,full_sq,life_sq,floor,area_m,raion_popul,green_zone_part,indust_part,children_preschool,preschool_education_centers_raion,children_school,...,big_market_raion_yes,nuclear_reactor_raion_yes,detention_facility_raion_yes,water_1line_yes,big_road1_1line_yes,railroad_1line_yes,ecology_good,ecology_no data,ecology_poor,ecology_satisfactory
0,-0.364709,-0.375224,-0.515142,-0.508697,1.231182,-0.241006,-1.014459,1.029869,0.260435,1.149999,...,False,False,False,False,False,False,True,False,False,False
1,-0.372807,-0.380537,-0.582855,-0.441569,0.506751,0.615637,-0.663081,0.351778,0.260435,0.519889,...,False,False,False,False,False,False,False,False,False,False
2,-0.364709,-0.373896,-0.650567,-0.542440,0.260999,-0.602482,-0.174655,0.100009,-0.080357,0.136387,...,False,False,False,False,False,False,False,False,True,False
3,-0.334116,-0.342017,-0.515142,-0.466694,0.377409,-1.058387,-0.750422,0.056496,0.942019,0.270070,...,False,False,False,False,False,True,False,False,False,False
4,-0.343114,-0.362605,0.161979,-0.485513,-0.782113,-1.093817,2.431993,-0.770494,-0.761941,-0.776160,...,False,False,False,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
181502,-0.360210,-0.371239,-0.582855,0.718121,-1.481796,1.616080,-0.973703,-1.291638,-1.443525,-1.312865,...,False,False,False,False,False,False,False,True,False,False
181503,-0.360210,-0.371239,-0.582855,0.718121,-1.481796,1.616080,-0.973703,-1.291638,-1.443525,-1.312865,...,False,False,False,False,False,False,False,True,False,False
181504,-0.360210,-0.371239,-0.582855,0.718121,-1.481796,1.616080,-0.973703,-1.291638,-1.443525,-1.312865,...,False,False,False,False,False,False,False,True,False,False
181505,-0.360210,-0.371239,-0.582855,0.718121,-1.481796,1.616080,-0.973703,-1.291638,-1.443525,-1.312865,...,False,False,False,False,False,False,False,True,False,False


In [177]:
# so as per our X output, we can see that number of columns in train_data is 79 and number of columns in X is 78 meaning we have successfully performed our removal of target variable
# now to get the target variable alone, we can just get it alone,
Y = train_data['price_doc']
Y # lets see what it is
# as per our Y output, we can see it is of one column and 246k rows which means we have successfully extracted the target variable column

0          5850000.0
1          6000000.0
2          5700000.0
3         16331452.0
4          9100000.0
             ...    
181502     3480000.0
181503     3480000.0
181504     3480000.0
181505     3480000.0
181506     3480000.0
Name: price_doc, Length: 181507, dtype: float64

# Filters
there are two types of filters to filter out columns/features:
- variance filter (a column which has same values throughout the column like all are sunny)
- correlation filter (two columns which are same like weight in kg and weight in pounds)

In [178]:
# print("X : ", X.shape)
# print("test data : ", test_data_processed.shape)

In [179]:
# variance filter
# ----------------------------- case  -----------------------------
# variance_filter = VarianceThreshold(threshold=0.001)  # Adjust the threshold if needed
# X = variance_filter.fit_transform(X)
# test_data_processed = variance_filter.fit_transform(test_data_processed)
X.shape

(181507, 2199)

In [180]:
# test_data_processed.shape

In [181]:
# # correlation filter
# # ----------------------------- case  -----------------------------
# corr_matrix = pd.DataFrame(X).corr().abs()
# upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
# to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > 0.9)]
# X = pd.DataFrame(X).drop(columns=to_drop)
# test_data_processed = pd.DataFrame(test_data_processed).drop(columns=to_drop)
X.shape

(181507, 2199)

In [182]:
# test_data_processed.shape

## Data Splitting - train and validate
now our test_data set is of rows with NO target variable whereas the train_data set is WITH target variable.
our rules in machine learning is that we must train half or 70% of the data and then we must check its accuracy using the remaining half or 30% of the data - we can only check accuracy IF we have the answers i.e. the target variable. 
So, what we need to do is, is split the train_data set into 2, by a 70% and 30% ratio. we train the model using the 70% and then test the model using the 30% and then use that model to predict the test_data set.

In [183]:
# holdout method
trainX, testX, trainY, testY = train_test_split(X, Y, test_size=0.3, random_state=2)

# functions
here we have defined functions like forward-backward selection, kbest selection & algorithm feature importance

In [200]:
from sklearn.feature_selection import SequentialFeatureSelector
import numpy as np

class VerboseSequentialFeatureSelector(SequentialFeatureSelector):
    def fit(self, X, y):
        print("Starting feature selection...")
        selected_features = []
        remaining_features = list(range(X.shape[1]))
        scores_per_iteration = []
        
        # Iterate for the number of features to select
        for i in range(1, self.n_features_to_select + 1):
            print(f"\nSelecting feature {i}/{self.n_features_to_select}...")
            
            # Call the parent fit method
            super().fit(X, y)
            selected_features = self.get_support(indices=True)
            
            # Gather scores for the current iteration
            if hasattr(self, 'estimator_') and hasattr(self.estimator_, 'score'):
                current_score = self.estimator_.score(X[:, selected_features], y)
            else:
                current_score = None
            
            # Update remaining features and selected features
            remaining_features = [
                idx for idx in range(X.shape[1]) if idx not in selected_features
            ]
            scores_per_iteration.append(current_score)
            
            # Detailed output
            print(f"Features selected so far: {selected_features}")
            if current_score is not None:
                print(f"Current model score: {current_score:.4f}")
            print(f"Remaining features: {remaining_features}")
            print(f"Scores so far: {scores_per_iteration}")
        
        print("\nFeature selection completed.")
        print(f"Final selected features: {selected_features}")
        return self


In [201]:
# forward backward selection

def fbselection(direction, sample_model, features, sample_X, sample_Y, X, trainX, trainY, testX, test_data_processed):

    print("starting")

    selection = VerboseSequentialFeatureSelector(sample_model, direction=direction, n_features_to_select=features, scoring='r2', n_jobs=-1)

    return modelSelector(sample_model, selection, sample_X, sample_Y, X, trainX, trainY, testX, test_data_processed)



def modelSelector(sample_model, selection, sample_X, sample_Y, X, trainX, trainY, testX, test_data_processed):

    print("start extracting")

    sample_X = selection.fit_transform(sample_X, sample_Y)

    print("extracted, transforming")

    trainX = selection.transform(trainX)

    testX = selection.transform(testX)                                  # Ensure the test set is transformed similarly

    test_data_processed = selection.transform(test_data_processed)      # test data is also transformed

    X = selection.transform(X)                                          # full data transforming

    print("transformed")

    return sample_model, X, trainX, trainY, testX, test_data_processed



# kbest selection

def kbest(sample_model, features, sample_X, sample_Y, X, trainX, trainY, testX, test_data_processed):

    print("starting")

    selection = SelectKBest(score_func=f_classif, k=features)

    return modelSelector(sample_model, selection, sample_X, sample_Y,X, trainX, trainY, testX, test_data_processed)

In [202]:
# feature importance function
def featureImportance(sample_model, features, X, trainX, trainY, testX, test_data_processed):
    print("fitting")
    
    # fit the model
    sample_model.fit(trainX, trainY)

    print("extracting features")

    # extract all the feature names from data
    importances = sample_model.feature_importances_
    feature_names = train_data_processed.drop(columns=['price_doc']).columns
    print(feature_names)

    # sort with respect to importance
    feature_importance_df = pd.DataFrame({
        'Feature': feature_names,
        'Importance': importances
    }).sort_values(by='Importance', ascending=False)

    # extract the top ones
    top_features = feature_importance_df['Feature'].head(features).values
    print(top_features)

    # change all data according to the top ones we have selected
    trainX = pd.DataFrame(trainX, columns=feature_names)[top_features]
    testX = pd.DataFrame(testX, columns=feature_names)[top_features]
    X = pd.DataFrame(X, columns=feature_names)[top_features]
    test_data_processed = pd.DataFrame(test_data_processed, columns=feature_names)[top_features]

    print("features extracted")
    
    # retrain the model
    sample_model.fit(trainX, trainY)

    print("features trained")
    
    return sample_model, X, trainX, trainY, testX, test_data_processed

## model intialization
here model is intialized

In [203]:
# declare the model here
model = DecisionTreeRegressor(random_state=0, max_depth=5, criterion='poisson')

# ridge best: ("model", Ridge(alpha=100, solver='lsqr', tol=0.001))

In [204]:
print("X shape -> ", X.shape)
print("trainX shape -> ", trainX.shape)
print("testX shape -> ", testX.shape)
print("test_data_processed shape -> ", test_data.shape)

X shape ->  (181507, 2199)
trainX shape ->  (127054, 2199)
testX shape ->  (54453, 2199)
test_data_processed shape ->  (77789, 2200)


# feature selection
here we will apply feature selection and feature importance

In [205]:
sample_train = train_data.sample(frac=0.1)
sample_X = sample_train.drop(columns=['price_doc'])
sample_Y = sample_train['price_doc']

In [206]:
print(sample_X)
print(sample_X.shape)
print(sample_Y.shape)

         full_sq   life_sq     floor    area_m  raion_popul  green_zone_part  \
65075  -0.364688 -0.374545 -0.448214 -0.551152     0.132269        -0.831097   
807    -0.348513 -0.363934 -0.041157 -0.290469     1.228570         0.454258   
61613  -0.368344 -0.367328 -0.111610 -0.563712    -0.196498        -0.039914   
16184  -0.319720  0.804836 -0.650567 -0.378397     1.643668        -0.217698   
74885  -0.351885 -0.371424 -0.535540 -0.473251     0.531787        -0.904848   
...          ...       ...       ...       ...          ...              ...   
135689 -0.369839 -0.380537 -0.494901 -0.530771    -0.113482        -0.206617   
123343 -0.386927 -0.381774 -0.010717 -0.448044     1.016644        -0.411738   
171079 -0.373921 -0.380812 -0.327878 -0.574591    -1.348905        -0.748397   
87273  -0.358499 -0.373232  0.118091 -0.404969     0.480544        -0.737157   
75327  -0.352143 -0.372865 -0.099964 -0.473251     0.531787        -0.904848   

        indust_part  children_preschool

In [207]:
# apply feature selection here
model, X, trainX, trainY, testX, test_data = fbselection( "forward", model, 10, sample_X, sample_Y, X, trainX, trainY, testX, test_data )

starting
start extracting
Starting feature selection...

Selecting feature 1/10...
Features selected so far: [   0   37   49  111  171  207  216  231  250 1113]
Remaining features: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189,

MemoryError: Unable to allocate 1.82 GiB for an array with shape (1926, 127054) and data type float64

In [222]:
# The final selected feature indices
selected_features = [0, 37, 49, 111, 171, 207, 216, 231, 250, 1113]

# Assuming your datasets are pandas DataFrames
# Extract the selected features from each dataset
X = X.iloc[:, selected_features]
trainX = trainX.iloc[:, selected_features]
testX = testX.iloc[:, selected_features]
test_data = test_data.iloc[:, selected_features]

# If the data is not in pandas format (e.g., NumPy arrays), convert to DataFrame or use NumPy indexing:
# X_selected = X[:, selected_features]
# trainX_selected = trainX[:, selected_features]
# testX_selected = testX[:, selected_features]
# test_data_selected = test_data[:, selected_features]

# Verify the shapes of the transformed datasets
print("X_selected shape:", X.shape)
print("trainX_selected shape:", trainX.shape)
print("testX_selected shape:", testX.shape)
print("test_data_selected shape:", test_data.shape)

X_selected shape: (181507, 10)
trainX_selected shape: (127054, 10)
testX_selected shape: (54453, 10)
test_data_selected shape: (77789, 10)


In [242]:
print(X.columns)
print(testX.columns)
print(test_data.columns)

Index(['full_sq', '0_17_all', 'build_count_frame', 'hospice_morgue_km',
       'trc_count_1500', 'leisure_count_2000', 'cafe_count_3000',
       'sport_count_3000', 'big_church_count_5000',
       'sub_area_Orehovo-Borisovo Sevenroe'],
      dtype='object')
Index(['full_sq', '0_17_all', 'build_count_frame', 'hospice_morgue_km',
       'trc_count_1500', 'leisure_count_2000', 'cafe_count_3000',
       'sport_count_3000', 'big_church_count_5000',
       'sub_area_Orehovo-Borisovo Sevenroe'],
      dtype='object')
Index(['full_sq', '0_17_all', 'build_count_frame', 'hospice_morgue_km',
       'trc_count_1500', 'leisure_count_2000', 'cafe_count_3000',
       'sport_count_3000', 'big_church_count_5000',
       'sub_area_Orehovo-Borisovo Seevrnoe'],
      dtype='object')


In [243]:
# Rename the column in test_data
test_data = test_data.rename(columns={
    'sub_area_Orehovo-Borisovo Seevrnoe': 'sub_area_Orehovo-Borisovo Sevenroe'
})

# Verify that the column names now match
print(test_data.columns)

Index(['full_sq', '0_17_all', 'build_count_frame', 'hospice_morgue_km',
       'trc_count_1500', 'leisure_count_2000', 'cafe_count_3000',
       'sport_count_3000', 'big_church_count_5000',
       'sub_area_Orehovo-Borisovo Sevenroe'],
      dtype='object')


# grid search

In [244]:
def gridsearch(param_grid, model, scorer, trainX, trainY):
    print("starting grid search")

    # intialize grid search
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, scoring=scorer, verbose=3)
    print("grid search is intialized")

    # fit the model
    grid_search.fit(trainX, trainY)
    print("grid search fitting completed")

    # display the best model grid search found
    best_model = grid_search.best_estimator_
    print(best_model)

    # display the best parameters of the best model
    best_parameters = grid_search.best_params_
    print(best_parameters)

    # display the best score of the best model
    print("Best cross-validated score:", grid_search.best_score_)

    # assign the best model our model
    model = best_model
    print("model assigned, grid search completed")

    return model

In [245]:
# Define a scoring metric (e.g., negative mean squared error)
# scorer = make_scorer(mean_squared_error, greater_is_better=False)

In [246]:
# sample from train_data
# sample_train = train_data.sample(frac=0.5)
# sample_X = sample_train.drop('price_doc', axis=1) 
# sample_Y = sample_train['price_doc']

In [247]:
# define hyper parameters of grid
# param_grid = {
#     'splitter': ['best', 'random']
# }

In [248]:
# model = gridsearch(param_grid, model, scorer, trainX, trainY)

## model running
here we run the model

In [250]:
# fit the model
model.fit(trainX, trainY)

In [251]:
# display information regarding the regression
print("model score: ", model.score(trainX, trainY))
# print("model coefficient: ", model.coef_)
# print("model intercept: ", model.intercept_)

model score:  0.651767602874536


In [252]:
# compute this predictions metrics
def metrics(y_pred, testY):
    print("starting to compute metrics")
    
    # # display the accuracy of this prediction
    # accuracy = accuracy_score(testY, y_pred)
    # print("model accuracy = ", accuracy, "   ")

    # # now lets calculate the ROC AUC score according to this prediction
    # roc_score = roc_auc_score(testY, y_pred)
    # print("roc score = ", roc_score, "   ")

    # display the mean squared error of this prediction
    mse = mean_squared_error(testY, y_pred)
    print("Mean squared error: %.2f" % mse, "   ")

    # display the root mean squared error
    rmse = np.sqrt(mse)  # Root Mean Squared Error
    print("Root Mean squared error: %.2f" % rmse, "   ")

    # display the mean absolute error of this prediction
    mae = mean_absolute_error(testY, y_pred)
    print("Mean absolute error: %.2f" % mae, "   ")

    # display the coeffeicient of determination of this preduction
    r2_Score = r2_score(testY, y_pred)
    print("Coefficient of determination: %.2f" % r2_Score, "    ")

In [253]:
# predict using this model USING PREDICT
y_pred = model.predict(testX)
print("successfully predicted")
metrics(y_pred, testY)

successfully predicted
starting to compute metrics
Mean squared error: 169740577777018.22    
Root Mean squared error: 13028452.62    
Mean absolute error: 6097042.84    
Coefficient of determination: 0.65     


In [254]:
# # predict using thus model USING PREDICTPROBA
# y_pred_proba = model.predict_proba(testX)[:, 1]
# print("successfully predicted")
# metrics(y_pred_proba, testY)

## predict for test dataset
fit the model and predict for test dataset

In [255]:
model.fit(X, Y)

In [256]:
# display information regarding the regression
print("model score: ", model.score(X, Y), "    ")
# print("model coefficient: ", model.coef_)
# print("model intercept: ", model.intercept_)

model score:  0.6500690352657126     


In [257]:
test_prediction = model.predict(test_data)#.drop(columns=['price_doc']))

# test_prediction=test_prediction[:, 1]

print(test_prediction)

[12654074.56319198  5934937.68490818  5934937.68490818 ...
  3146649.04548735  3146649.04548735  3146649.04548735]


## write into csv
now we write the predictions into the csv file

In [None]:
sample_data = pd.read_csv(r"D:\Users\DELL\OneDrive - Institute of Business Administration\IBA\sem5\machine learning\ipynb notebooks\challenger2\iml-fall-2024-challenge-2\sample_submission.csv")

sample_data['price_doc'] = test_prediction
sample_data

sample_data.to_csv(r"D:\Users\DELL\OneDrive - Institute of Business Administration\IBA\sem5\machine learning\ipynb notebooks\challenger2\iml-fall-2024-challenge-2\regTree2.csv", index=False) 
sample_data

Unnamed: 0,row ID,price_doc
0,Row3,1.265407e+07
1,Row6,5.934938e+06
2,Row11,5.934938e+06
3,Row12,5.934938e+06
4,Row14,5.934938e+06
...,...,...
77784,Row18591dupl_228801,5.697207e+07
77785,Row18591dupl_228803,5.521424e+07
77786,Row18591dupl_228814,3.146649e+06
77787,Row18591dupl_228817,3.146649e+06


In [259]:
model