<a href="https://colab.research.google.com/github/tellosofia/Predicting-Housing-Prices_Supervised-ML/blob/main/ML_Classification_Housing-Prices_Random-Forest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

1. Import libraries

In [10]:
# import necessary libraries

import pandas as pd                                           # pandas for dataframes
from sklearn.model_selection import train_test_split          # train_test_split for splitting data into train and test sets
from sklearn.impute import SimpleImputer                      # SimpleImputer for imputing missing values
from sklearn.tree import DecisionTreeClassifier               # DecisionTreeClassifier for Decision Tree model
from sklearn.metrics import accuracy_score                    # accuracy_score for accuracy calculation
from sklearn.pipeline import make_pipeline                    # make_pipeline for pipeline creation
from sklearn.preprocessing import OneHotEncoder               # OneHotEncoder for encoding categorical variables
from sklearn.compose import make_column_transformer           # make_column_transformer for column transformation in pipeline
from sklearn.preprocessing import StandardScaler              # StandardScaler for standardization
from sklearn.model_selection import GridSearchCV              # GridSearchCV for hyperparameter tuning
# from sklearn import set_config             # We can optionally set transform output globally,
# set_config(transform_output="pandas")      # or choose it for particular instances
import multiprocessing                                        # for parallel processing
from sklearn.ensemble import RandomForestClassifier           # RandomForestClassifier for Random Forest model

2. Read de data

In [11]:
# define the URL of the file on Google Drive
url = "https://drive.google.com/file/d/1l5R-6kl_dqp-47DTAvn4fdRUC9W-dmS6/view?usp=sharing"

# extract the file ID from the URL and create a direct download link
path = 'https://drive.google.com/uc?export=download&id=' + url.split('/')[-2]

# read the CSV file from the direct download link into a pandas DataFrame
data = pd.read_csv(path)

# display the first row of the DataFrame
data.head(1)

Unnamed: 0,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,Fireplaces,PoolArea,GarageCars,WoodDeckSF,ScreenPorch,Expensive,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
0,8450,65.0,856,3,0,0,2,0,0,0,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal


3. Set X and y

In [12]:
# display columns in dataframe
data.columns

Index(['LotArea', 'LotFrontage', 'TotalBsmtSF', 'BedroomAbvGr', 'Fireplaces',
       'PoolArea', 'GarageCars', 'WoodDeckSF', 'ScreenPorch', 'Expensive',
       'MSZoning', 'Condition1', 'Heating', 'Street', 'CentralAir',
       'Foundation', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'KitchenQual', 'FireplaceQu',
       'MSSubClass', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', '1stFlrSF',
       '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath',
       'FullBath', 'HalfBath', 'KitchenAbvGr', 'TotRmsAbvGrd', 'GarageYrBlt',
       'GarageArea', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'MiscVal',
       'MoSold', 'YrSold', 'Id', 'Alley', 'LotShape', 'LandContour',
       'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrTyp

In [13]:
# data.set_index('Id', inplace=True)

In [14]:
# exclude the 'Id' column
column_to_exclude = 'Id'

 # select all columns except 'Id'
columns_for_analysis = data.columns.difference([column_to_exclude])

In [15]:
# select the columns to be used for analysis from the dataset
X = data[columns_for_analysis]

# extract the target variable 'Expensive' from the features and assign it to y
# the 'Expensive' column is removed from X in the process
y = X.pop("Expensive")

4. Create pipeline

In [16]:
# select categorical column names by excluding numerical data types
X_cat_columns = X.select_dtypes(exclude="number").columns
# select numerical column names by including only numerical data types
X_num_columns = X.select_dtypes(include="number").columns

# create a pipeline for numerical features, using SimpleImputer to handle missing values
numeric_pipe = make_pipeline(
    SimpleImputer()  # imputes missing values with the mean by default
)

# Create a pipeline for categorical features
# using SimpleImputer to handle missing values and OneHotEncoder for encoding categorical variables
categoric_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="N_A"),  # imputes missing values with 'N_A'
    OneHotEncoder(drop="first", sparse_output=False, handle_unknown='ignore')  # One-hot encodes the categorical variables, drops the first level to avoid multicollinearity,
)                                                                              # and ignores unknown categories

# initialize a Random Forest Classifier
dtree = RandomForestClassifier()
# initialize a StandardScaler for feature scaling
scaler = StandardScaler()

5. Create column transformer

In [17]:
# create the ColumnTransformer to preprocess numerical and categorical columns
preprocessor = make_column_transformer(
    (numeric_pipe, X_num_columns),       # apply numeric_pipe to numerical columns (handling missing values)
    (categoric_pipe, X_cat_columns),     # apply categoric_pipe to categorical columns (handling missing values and encoding)
)

In [18]:
# create a full pipeline combining preprocessing, feature scaling, and classification
full_pipeline = make_pipeline(
    preprocessor,    # preprocess columns using preprocessor
    scaler,          # scale features using StandardScaler
    dtree            # apply RandomForestClassifier for classification
).set_output(transform='pandas')  # Set output transformation to return a pandas DataFrame

# display the full_pipeline object
full_pipeline

In [19]:
# create a param_grid for a Random Forest Classifier with Pipeline
param_grid = {
    'columntransformer__pipeline-1__simpleimputer__strategy': ['mean', 'median'],  # strategies for imputing missing values in numeric features
    'standardscaler__with_mean': [True, False],  # whether to center the data before scaling (StandardScaler parameter)
    'standardscaler__with_std': [True, False],   # whether to scale the data to unit variance (StandardScaler parameter)
    'randomforestclassifier__n_estimators': [100, 200, 300],  # number of trees in the random forest
    'randomforestclassifier__max_depth': [None, 10, 20],  # maximum depth of the trees in the random forest
    'randomforestclassifier__min_samples_split': [2, 5, 10],  # minimum number of samples required to split an internal node in the random forest
    'randomforestclassifier__min_samples_leaf': [1, 2, 4],  # minimum number of samples required to be at a leaf node in the random forest
    'randomforestclassifier__bootstrap': [True, False]  # whether bootstrap samples are used when building trees in the random forest
}

In [20]:
# perform grid search using GridSearchCV to find the best parameters for the pipeline
search = GridSearchCV(full_pipeline,     # full pipeline including preprocessing, scaling, and classification
                      param_grid,        # parameter grid to search over
                      cv=5,              # 5-fold cross-validation
                      scoring='accuracy',# use accuracy as the metric to optimize
                      verbose=1          # display progress messages
                     )

In [21]:
# ignore warnings during fitting process
import warnings
warnings.filterwarnings("ignore")

In [22]:
# fit the GridSearchCV object to the data
search.fit(X, y)

Fitting 5 folds for each of 1296 candidates, totalling 6480 fits


In [23]:
# print the best parameters found during the grid search
search.best_params_

{'columntransformer__pipeline-1__simpleimputer__strategy': 'median',
 'randomforestclassifier__bootstrap': False,
 'randomforestclassifier__max_depth': None,
 'randomforestclassifier__min_samples_leaf': 2,
 'randomforestclassifier__min_samples_split': 2,
 'randomforestclassifier__n_estimators': 100,
 'standardscaler__with_mean': False,
 'standardscaler__with_std': True}

In [24]:
# Pprint the best cross-validation score found during the grid search
search.best_score_

0.9561643835616438

In [25]:
# evaluate the model's accuracy on the training set
y_train_pred = search.predict(X)  # predict labels using the best estimator found by GridSearchCV

# calculate accuracy score between true labels (y) and predicted labels (y_train_pred)
accuracy_score(y, y_train_pred)

0.9993150684931507

In [26]:
# URL of the test data file hosted on Google Drive
test_url = "https://drive.google.com/file/d/1LB51vg8J2xQYc3gOdSVDZXseIbd5q3JC/view?usp=drive_link"

# extract the file ID from the URL
file_id = test_url.split('/')[-2]

# construct the download URL for the file using the file ID
test_path = "https://drive.google.com/uc?export=download&id=" + file_id

# read the test data from the CSV file into a Pandas DataFrame
test_data = pd.read_csv(test_path)

In [27]:
# display the columns in the test data
# since this is supposed to be submitted to a bootcamp contest, the test set doesn't have "Expensive" column, the labels
test_data.columns

Index(['Id', 'LotArea', 'LotFrontage', 'TotalBsmtSF', 'BedroomAbvGr',
       'Fireplaces', 'PoolArea', 'GarageCars', 'WoodDeckSF', 'ScreenPorch',
       'MSZoning', 'Condition1', 'Heating', 'Street', 'CentralAir',
       'Foundation', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'KitchenQual', 'FireplaceQu',
       'MSSubClass', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', '1stFlrSF',
       '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath',
       'FullBath', 'HalfBath', 'KitchenAbvGr', 'TotRmsAbvGrd', 'GarageYrBlt',
       'GarageArea', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'MiscVal',
       'MoSold', 'YrSold', 'Alley', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition2', 'BldgType',
       'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd',
       'MasVnrType', 'BsmtFinT

In [28]:
# extract the "Id" column from the test data and assign it to the variable "Ids"
Ids = test_data.pop("Id")

In [55]:
X_test = test_data

In [56]:
# make predictions using the best estimator found by GridSearchCV
y_test_pred = search.predict(X_test)

In [59]:
# display the predictions
y_test_pred

array([0, 0, 0, ..., 0, 0, 0])

In [60]:
# create a DataFrame with the "Id" column and the predictions
to_save = pd.concat([Ids, pd.DataFrame(y_test_pred)], axis=1)

In [61]:
# Rename the column with index 0 to 'Expensive' and save the DataFrame to a CSV file
to_save.rename(columns={0:'Expensive'}).to_csv('submission.csv', index=False)

In [None]:
from google.colab import files
files.download("./submission.csv")