In [240]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [241]:
dataset = pd.read_csv('house-prices-advanced-regression-techniques/train.csv')
dataset

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


In [242]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

## Check for missing value ans solving it

In [243]:
cols_with_missing_value = []
for i,j in zip(list(dataset.columns), list(dataset.isnull().sum())):
    if j>0:
        cols_with_missing_value.append(i)

numerical_columns = dataset.select_dtypes(include=[float, int]).columns.tolist()
categorical_columns = dataset.select_dtypes(include=[object, 'category']).columns.tolist()

numerical_columns_with_missing_value, categorical_columns_with_missing_value = [], []
for i in cols_with_missing_value:
    if i in numerical_columns:
        numerical_columns_with_missing_value.append(i)
    else:
        categorical_columns_with_missing_value.append(i)


In [244]:
numerical_columns_with_missing_value

['LotFrontage', 'MasVnrArea', 'GarageYrBlt']

In [245]:
categorical_columns_with_missing_value

['Alley',
 'MasVnrType',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Electrical',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PoolQC',
 'Fence',
 'MiscFeature']

### reassign missing value in numerical columns with median value of that columns

In [246]:
for column in numerical_columns_with_missing_value:
    median_value = dataset[column].median()
    dataset[column].fillna(median_value, inplace=True)

### reassign missing value in categorical columns with most frequent value of that columns

In [247]:
for column in categorical_columns_with_missing_value:
    mode_value = dataset[column].mode()[0]
    dataset[column].fillna(mode_value, inplace=True)

In [248]:
dataset.isnull().sum().value_counts()

0    81
dtype: int64

- All the missing values is dealed with

### Now lets check and remove all the duplicate values

In [249]:
dataset[dataset.duplicated()]

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice


- Since their is not duplicate values, so lets move to outlier detecting

In [250]:
def outlier_thresholds(df, col_name, q1=0.25, q3=0.75):
    quartile1 = df[col_name].quantile(q1)
    quartile3 = df[col_name].quantile(q3)
    interquantile_range = quartile3 - quartile1
    up_limit = quartile3 + 1.5 * interquantile_range
    low_limit = quartile1 - 1.5 * interquantile_range
    return low_limit, up_limit



In [251]:
numerical_cols = dataset.select_dtypes(include=[float, int]).columns.tolist()

def check_outlier(dataframe, col_name):
    low_limit, up_limit = outlier_thresholds(dataframe, col_name)
    if dataframe[(dataframe[col_name] > up_limit) | (dataframe[col_name] < low_limit)].any(axis=None):
        return True
    else:
        return False
    
cols_with_outliers = []
for i in numerical_cols:
    if check_outlier(dataset, i):
        cols_with_outliers.append(i)
print(cols_with_outliers)

['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'SalePrice']


### Now we will assign outliers with thresholds.

In [252]:
def replace_with_thresholds(dataframe, variable):
    low_limit, up_limit = outlier_thresholds(dataframe, variable)
    dataframe.loc[(dataframe[variable] < low_limit), variable] = low_limit
    dataframe.loc[(dataframe[variable] > up_limit), variable] = up_limit

for i in cols_with_outliers:
    replace_with_thresholds(dataset, i)

In [253]:
cols_with_outliers2 = []
for i in numerical_cols:
    if check_outlier(dataset, i):
        cols_with_outliers2.append(i)
print(cols_with_outliers2)

[]


- so, all the outliers are removed

#### Now lets deal with Categorical Values and convert them to numerical 

In [254]:
from sklearn.preprocessing import OneHotEncoder

categorical_columns = dataset.select_dtypes(include=[object, 'category']).columns.tolist()
encoder = OneHotEncoder(sparse=False, drop='first') 
encoded_array = encoder.fit_transform(dataset[categorical_columns])
encoded_df = pd.DataFrame(encoded_array, columns=encoder.get_feature_names_out(categorical_columns))
data_encoded = pd.concat([dataset.drop(columns=categorical_columns), encoded_df], axis=1)
data_encoded




Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1,60,65.0,8450.0,7,5.0,2003,2003,196.0,706.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,2,20,80.0,9600.0,6,7.5,1976,1976,0.0,978.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,3,60,68.0,11250.0,7,5.0,2001,2002,162.0,486.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,4,70,60.0,9550.0,7,5.0,1915,1970,0.0,216.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,5,60,84.0,14260.0,8,5.0,2000,2000,350.0,655.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,62.0,7917.0,6,5.0,1999,2000,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1456,1457,20,85.0,13175.0,6,6.0,1978,1988,119.0,790.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1457,1458,70,66.0,9042.0,7,7.5,1941,2006,0.0,275.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1458,1459,20,68.0,9717.0,5,6.0,1950,1996,0.0,49.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [255]:
categorical_columns = data_encoded.select_dtypes(include=[object, 'category']).columns.tolist()
categorical_columns

[]

In [256]:
df = data_encoded

- so all the categorical values is converted to numerical values

#### Dividing dataset into independent(X) and dependent(y) variables

In [257]:
X = df.drop('SalePrice',axis=1).values
y = df['SalePrice'].values

In [258]:
X

array([[1.000e+00, 6.000e+01, 6.500e+01, ..., 0.000e+00, 1.000e+00,
        0.000e+00],
       [2.000e+00, 2.000e+01, 8.000e+01, ..., 0.000e+00, 1.000e+00,
        0.000e+00],
       [3.000e+00, 6.000e+01, 6.800e+01, ..., 0.000e+00, 1.000e+00,
        0.000e+00],
       ...,
       [1.458e+03, 7.000e+01, 6.600e+01, ..., 0.000e+00, 1.000e+00,
        0.000e+00],
       [1.459e+03, 2.000e+01, 6.800e+01, ..., 0.000e+00, 1.000e+00,
        0.000e+00],
       [1.460e+03, 2.000e+01, 7.500e+01, ..., 0.000e+00, 1.000e+00,
        0.000e+00]])

In [259]:
y

array([208500., 181500., 223500., ..., 266500., 142125., 147500.])

#### Train Test Split of Dataset

In [260]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1168, 246), (292, 246), (1168,), (292,))

#### Standardization of data

In [261]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [262]:
X_train_scaled.shape

(1168, 246)

In [263]:
X_test_scaled.shape

(292, 246)

#### Apply Principal Component Analysis to find best features

In [264]:
from sklearn.decomposition import PCA
pca = PCA()
pca.fit(X_train_scaled)
explained_variance_ratio = pca.explained_variance_ratio_
cumulative_variance_ratio = np.cumsum(explained_variance_ratio)

for i,j in enumerate(cumulative_variance_ratio):
    if j>0.99:
        print(i)
        break
        

190


- We can see that only first 190 compenents captures almost 99% of variance of dataset so we will take first 190 principal component

In [265]:
pca = PCA(n_components=190)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)
X_train_pca.shape, X_test_pca.shape, y_train.shape, y_test.shape

((1168, 190), (292, 190), (1168,), (292,))

### Now lastly lets use Random Forest Regression Algorithm to make our Regression model

In [266]:
from sklearn.ensemble import RandomForestRegressor

rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)
rf_regressor.fit(X_train_pca, y_train)
y_pred = rf_regressor.predict(X_test_pca)



In [267]:
from sklearn.metrics import mean_squared_error, r2_score
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse:.2f}")
print(f"R^2 Score: {r2:.2f}")

Mean Squared Error: 616998792.80
R^2 Score: 0.87


### lets apply all the data preprocessing steps on test data

In [268]:
test_data = pd.read_csv('house-prices-advanced-regression-techniques/test.csv')
test_data

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,6,2006,WD,Normal
1455,2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,4,2006,WD,Abnorml
1456,2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,9,2006,WD,Abnorml
1457,2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,Shed,700,7,2006,WD,Normal


In [269]:
test_id = test_data['Id']
test_id

0       1461
1       1462
2       1463
3       1464
4       1465
        ... 
1454    2915
1455    2916
1456    2917
1457    2918
1458    2919
Name: Id, Length: 1459, dtype: int64

In [270]:
cols_with_missing_value = []
for i,j in zip(list(test_data.columns), list(test_data.isnull().sum())):
    if j>0:
        cols_with_missing_value.append(i)

numerical_columns = test_data.select_dtypes(include=[float, int]).columns.tolist()
categorical_columns = test_data.select_dtypes(include=[object, 'category']).columns.tolist()

numerical_columns_with_missing_value, categorical_columns_with_missing_value = [], []
for i in cols_with_missing_value:
    if i in numerical_columns:
        numerical_columns_with_missing_value.append(i)
    else:
        categorical_columns_with_missing_value.append(i)

In [271]:
categorical_columns_with_missing_value

['MSZoning',
 'Alley',
 'Utilities',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'KitchenQual',
 'Functional',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PoolQC',
 'Fence',
 'MiscFeature',
 'SaleType']

In [272]:
numerical_columns_with_missing_value

['LotFrontage',
 'MasVnrArea',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 'BsmtFullBath',
 'BsmtHalfBath',
 'GarageYrBlt',
 'GarageCars',
 'GarageArea']

In [273]:
for column in numerical_columns_with_missing_value:
    median_value = test_data[column].median()
    test_data[column].fillna(median_value, inplace=True)

for column in categorical_columns_with_missing_value:
    mode_value = test_data[column].mode()[0]
    test_data[column].fillna(mode_value, inplace=True)
    
test_data.isnull().sum().value_counts()

0    80
dtype: int64

In [274]:
dataset[dataset.duplicated()]

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice


In [275]:
def outlier_thresholds(df, col_name, q1=0.25, q3=0.75):
    quartile1 = df[col_name].quantile(q1)
    quartile3 = df[col_name].quantile(q3)
    interquantile_range = quartile3 - quartile1
    up_limit = quartile3 + 1.5 * interquantile_range
    low_limit = quartile1 - 1.5 * interquantile_range
    return low_limit, up_limit

In [276]:
numerical_cols = test_data.select_dtypes(include=[float, int]).columns.tolist()

def check_outlier(dataframe, col_name):
    low_limit, up_limit = outlier_thresholds(dataframe, col_name)
    if dataframe[(dataframe[col_name] > up_limit) | (dataframe[col_name] < low_limit)].any(axis=None):
        return True
    else:
        return False
    
cols_with_outliers = []
for i in numerical_cols:
    if check_outlier(test_data, i):
        cols_with_outliers.append(i)
print(cols_with_outliers)

def replace_with_thresholds(dataframe, variable):
    low_limit, up_limit = outlier_thresholds(dataframe, variable)
    dataframe.loc[(dataframe[variable] < low_limit), variable] = low_limit
    dataframe.loc[(dataframe[variable] > up_limit), variable] = up_limit

for i in cols_with_outliers:
    replace_with_thresholds(test_data, i)

['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal']


In [277]:
cols_with_outliers2 = []
for i in numerical_cols:
    if check_outlier(dataset, i):
        cols_with_outliers2.append(i)
print(cols_with_outliers2)

[]


In [278]:
encoded_array_test = encoder.transform(test_data[categorical_columns])
encoded_df_test = pd.DataFrame(encoded_array_test, columns=encoder.get_feature_names_out(categorical_columns))
test_data_encoded = pd.concat([test_data.drop(columns=categorical_columns), encoded_df_test], axis=1)
test_data_encoded

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1461,20,80.0,11622.00,5,6.0,1961,1961,0.0,468.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,1462,20,81.0,14267.00,6,6.0,1958,1958,108.0,923.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,1463,60,74.0,13830.00,5,5.0,1997,1998,0.0,791.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,1464,60,78.0,9978.00,6,6.0,1998,1998,20.0,602.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,1465,120,43.0,5005.00,8,5.0,1992,1992,0.0,263.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,145,33.0,1936.00,4,7.0,1970,1970,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1455,2916,145,33.0,1894.00,4,5.0,1970,1970,0.0,252.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1456,2917,20,105.0,17707.25,5,7.0,1960,1996,0.0,1224.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1457,2918,85,62.0,10441.00,5,5.0,1992,1992,0.0,337.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [279]:
categorical_columns = data_encoded.select_dtypes(include=[object, 'category']).columns.tolist()
categorical_columns

[]

In [280]:
X_test_scaled = scaler.transform(test_data_encoded.values)
X_test_scaled.shape

(1459, 246)

In [281]:
X_test_pca = pca.transform(X_test_scaled)
X_test_pca.shape

(1459, 190)

In [282]:
final_pred = rf_regressor.predict(X_test_pca)
final_pred

array([130559.67, 183845.53, 213431.96, ..., 182508.5 , 142101.3 ,
       239542.4 ])

In [283]:
submission = pd.DataFrame({'Id': test_id, 'SalePrice': final_pred})
submission.to_csv('house-prices-advanced-regression-techniques/submission.csv', index=False)