In [99]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [101]:
file_path = '/Users/viyankamoodley/Desktop/House_Price.csv'
data = pd.read_csv(file_path)

# handling missing vals in 'LotFrontage' based on 'Neighborhood' medians, according to research, house frontage is fairly similar in areas
data['LotFrontage'] = data.groupby('Neighborhood')['LotFrontage'].transform(
    lambda x: x.fillna(x.median())
)

# splitting data into target and features
X = data.drop(columns=['Id', 'SalePrice'])
y = data['SalePrice']

#70 - 30 split using 309 randomstate as per instructions
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=309)

# separating numerical and categorical cols
numerical_cols = X_train.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X_train.select_dtypes(include=['object']).columns

# prep numerical data
#using standardscaler to normalise skewed data
#https://scikit-learn.org/stable/modules/preprocessing.html#scaling-features-to-a-range
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# prep for cat using OrdinalEncoder to handle unknowns
# https://scikit-learn.org/stable/modules/impute.html

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='NA')),  # filling null categorical values with 'NA'
    ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))  # handling unknown categories with -1
])

# combining into one preprocesser for cols
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# using preprocessor on training data
X_train_preprocessed = preprocessor.fit_transform(X_train)

# tranforming test data w preprocessor
X_test_preprocessed = preprocessor.transform(X_test)

# checking shape to make sure data structure meets my expectation
print(f"Shape of X_train_preprocessed: {X_train_preprocessed.shape}")
print(f"Shape of X_test_preprocessed: {X_test_preprocessed.shape}")

# storing names of num and cat features
all_features = np.concatenate([numerical_cols, categorical_cols])

# making a df to store the preprocessed data
X_train_preprocessed_df = pd.DataFrame(X_train_preprocessed, columns=all_features)
X_test_preprocessed_df = pd.DataFrame(X_test_preprocessed, columns=all_features)

# saving preppd training and test datasets to csv files
X_train_preprocessed_df.to_csv('/Users/viyankamoodley/Desktop/House_Price_Train_Preprocessed_Updated.csv', index=False)
X_test_preprocessed_df.to_csv('/Users/viyankamoodley/Desktop/House_Price_Test_Preprocessed_Updated.csv', index=False)

Shape of X_train_preprocessed: (1022, 79)
Shape of X_test_preprocessed: (438, 79)


In [103]:
X_train_preprocessed_df.head() #checking 

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
0,0.094355,-0.260089,0.061833,0.660782,-0.500349,0.897528,0.708786,-0.575678,-0.96138,-0.310489,...,1.0,0.0,5.0,5.0,2.0,3.0,4.0,1.0,6.0,4.0
1,0.094355,-0.120811,0.017372,1.395624,-0.500349,1.202484,1.097845,-0.575678,-0.918642,-0.310489,...,1.0,2.0,5.0,5.0,2.0,3.0,4.0,1.0,6.0,4.0
2,2.508132,-2.163545,-0.842755,0.660782,-0.500349,0.931412,0.757418,0.586535,0.128445,-0.310489,...,5.0,3.0,5.0,5.0,2.0,3.0,4.0,1.0,6.0,4.0
3,-0.871156,-0.445792,-0.179351,-0.808901,1.316193,0.185964,1.146477,-0.575678,0.175457,0.546585,...,5.0,3.0,5.0,5.0,2.0,3.0,4.0,1.0,6.0,4.0
4,0.094355,0.157743,-0.175954,2.130466,-0.500349,1.100832,1.00058,0.608464,1.538807,-0.310489,...,1.0,0.0,5.0,5.0,2.0,3.0,4.0,1.0,6.0,4.0


In [105]:
X_test_preprocessed_df.head() #checking

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
0,-0.871156,0.529149,0.128373,0.660782,-0.500349,0.219848,-0.312494,0.553642,0.543006,-0.310489,...,1.0,2.0,5.0,5.0,2.0,3.0,4.0,1.0,0.0,0.0
1,-0.871156,0.436298,-0.099423,-0.808901,0.407922,-0.5256,-1.382406,-0.575678,0.115623,-0.310489,...,1.0,3.0,5.0,5.0,2.0,3.0,4.0,1.0,6.0,4.0
2,0.094355,0.250595,-0.100322,1.395624,-0.500349,1.100832,1.00058,1.310177,-0.96138,-0.310489,...,3.0,2.0,5.0,5.0,2.0,3.0,4.0,1.0,6.0,4.0
3,0.57711,-0.538643,-0.100721,0.660782,-0.500349,1.134716,1.00058,0.422071,0.438297,-0.310489,...,1.0,0.0,5.0,5.0,2.0,3.0,4.0,1.0,6.0,4.0
4,-0.871156,0.668427,0.130371,0.660782,-0.500349,0.185964,-0.361126,0.570089,0.795161,-0.310489,...,1.0,0.0,5.0,5.0,2.0,3.0,4.0,1.0,6.0,3.0


In [107]:

#https://scikit-learn.org/stable/modules/decomposition.html#pca
# pca for first method of dimensionality reduction
pca = PCA(n_components=0.95)  # getting number of components that explain 95% of variance
X_train_pca = pca.fit_transform(X_train_preprocessed)
X_test_pca = pca.transform(X_test_preprocessed)

print(f"Number of components after PCA: {pca.n_components_}")
# https://scikit-learn.org/stable/modules/feature_selection.html#univariate-feature-selection
# selectkbest for second method of dimensionality reduction using univariate feature selection 
k_best = SelectKBest(score_func=f_regression)
X_train_kbest = k_best.fit_transform(X_train_preprocessed, y_train)
X_test_kbest = k_best.transform(X_test_preprocessed)

# showing selected features from selectkbest
selected_kbest_features = np.array(all_features)[k_best.get_support()]
print(f"Selected features by SelectKBest: {selected_kbest_features}")


Number of components after PCA: 44
Selected features by SelectKBest: ['OverallQual' 'TotalBsmtSF' '1stFlrSF' 'GrLivArea' 'FullBath'
 'GarageCars' 'GarageArea' 'ExterQual' 'BsmtQual' 'KitchenQual']


In [109]:
# lr on pca data
linear_reg_pca = LinearRegression()
linear_reg_pca.fit(X_train_pca, y_train)
y_train_pred_pca = linear_reg_pca.predict(X_train_pca)
y_test_pred_pca = linear_reg_pca.predict(X_test_pca)

# ridge reg on pca transformed data
ridge_reg_pca = Ridge(alpha=0.5)
ridge_reg_pca.fit(X_train_pca, y_train)
y_train_pred_ridge_pca = ridge_reg_pca.predict(X_train_pca)
y_test_pred_ridge_pca = ridge_reg_pca.predict(X_test_pca)

# printing results of lr and irdge for pca
print("Linear Regression with PCA:")
print(f"Train MSE: {mean_squared_error(y_train, y_train_pred_pca)}")
print(f"Test MSE: {mean_squared_error(y_test, y_test_pred_pca)}")

print("Ridge Regression with PCA:")
print(f"Train MSE: {mean_squared_error(y_train, y_train_pred_ridge_pca)}")
print(f"Test MSE: {mean_squared_error(y_test, y_test_pred_ridge_pca)}")


# lr on slectkbest features
linear_reg_kbest = LinearRegression()
linear_reg_kbest.fit(X_train_kbest, y_train)
y_train_pred_kbest = linear_reg_kbest.predict(X_train_kbest)
y_test_pred_kbest = linear_reg_kbest.predict(X_test_kbest)

# ridge reg with selectkbest features
ridge_reg_kbest = Ridge(alpha=0.5)
ridge_reg_kbest.fit(X_train_kbest, y_train)
y_train_pred_ridge_kbest = ridge_reg_kbest.predict(X_train_kbest)
y_test_pred_ridge_kbest = ridge_reg_kbest.predict(X_test_kbest)

# printing results of lr and ridge for selectkbest
print("Linear Regression with SelectKBest:")
print(f"Train MSE: {mean_squared_error(y_train, y_train_pred_kbest)}")
print(f"Test MSE: {mean_squared_error(y_test, y_test_pred_kbest)}")

print("Ridge Regression with SelectKBest:")
print(f"Train MSE: {mean_squared_error(y_train, y_train_pred_ridge_kbest)}")
print(f"Test MSE: {mean_squared_error(y_test, y_test_pred_ridge_kbest)}")

Linear Regression with PCA:
Train MSE: 1207651130.0447474
Test MSE: 837237981.061347
Ridge Regression with PCA:
Train MSE: 1207651206.596266
Test MSE: 837278424.8034133
Linear Regression with SelectKBest:
Train MSE: 1416411378.7681773
Test MSE: 1113792942.308731
Ridge Regression with SelectKBest:
Train MSE: 1416411799.555341
Test MSE: 1113901764.9324772


In [111]:
# rf reg on pca
rf_reg_pca = RandomForestRegressor(random_state=309)
rf_reg_pca.fit(X_train_pca, y_train)
y_train_pred_rf_pca = rf_reg_pca.predict(X_train_pca)
y_test_pred_rf_pca = rf_reg_pca.predict(X_test_pca)

# printing rf ref results on pca
print("Random Forest Regression on PCA:")
print(f"Train MSE: {mean_squared_error(y_train, y_train_pred_rf_pca)}")
print(f"Test MSE: {mean_squared_error(y_test, y_test_pred_rf_pca)}")

# rf reg on selectkbest
rf_reg_kbest = RandomForestRegressor(random_state=309)
rf_reg_kbest.fit(X_train_kbest, y_train)
y_train_pred_rf_kbest = rf_reg_kbest.predict(X_train_kbest)
y_test_pred_rf_kbest = rf_reg_kbest.predict(X_test_kbest)

# printint rf reg  results on selectk
print("Random Forest Regression on SelectKBest:")
print(f"Train MSE: {mean_squared_error(y_train, y_train_pred_rf_kbest)}")
print(f"Test MSE: {mean_squared_error(y_test, y_test_pred_rf_kbest)}")


Random Forest Regression on PCA:
Train MSE: 190033663.85781825
Test MSE: 1040741294.2819452
Random Forest Regression on SelectKBest:
Train MSE: 201323091.84153545
Test MSE: 790028489.9397485


In [113]:
# main pages used for preprocessing:
# https://scikit-learn.org/stable/modules/preprocessing.html#scaling-features-to-a-range
# https://scikit-learn.org/stable/modules/impute.html
# https://scikit-learn.org/stable/modules/compose.html#columntransformer-for-numerical-and-categorical-data
# https://scikit-learn.org/stable/modules/compose.html#pipeline
# https://scikit-learn.org/stable/modules/preprocessing.html#encoding-categorical-features

#main pages used for dim reduction
#https://scikit-learn.org/stable/modules/decomposition.html#pca
# https://scikit-learn.org/stable/modules/feature_selection.html#univariate-feature-selection

#main pages used for modelling
#https://scikit-learn.org/stable/modules/linear_model.html
#https://scikit-learn.org/stable/modules/ensemble.html#random-forests
#https://scikit-learn.org/stable/modules/model_evaluation.html#mean-squared-error


Statement on Use of AI tools: During this assignment I used ChatGPT to help me troubleshoot my python code errors and to improve my academic writing in the report. 