## 1. Loading Data

In [172]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [173]:
# Read the training CSV from the sibling data directory and preview its
# first five rows.
train_data = pd.read_csv(filepath_or_buffer=r"../data/train.csv")
train_data.head(n=5)


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [174]:
# Read the test CSV from the sibling data directory and preview its first
# five rows.
test_data = pd.read_csv(filepath_or_buffer=r"../data/test.csv")
test_data.head(n=5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


## 2. Data Exploration

In [175]:
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appends a stray "None" line to the output.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [176]:
# Summary statistics for the numeric columns. Ending the cell with the bare
# expression (as the .head() cells do) renders a rich table instead of the
# wrapped plain-text dump that print() produces.
train_data.describe()

                Id   MSSubClass  LotFrontage        LotArea  OverallQual  \
count  1460.000000  1460.000000  1201.000000    1460.000000  1460.000000   
mean    730.500000    56.897260    70.049958   10516.828082     6.099315   
std     421.610009    42.300571    24.284752    9981.264932     1.382997   
min       1.000000    20.000000    21.000000    1300.000000     1.000000   
25%     365.750000    20.000000    59.000000    7553.500000     5.000000   
50%     730.500000    50.000000    69.000000    9478.500000     6.000000   
75%    1095.250000    70.000000    80.000000   11601.500000     7.000000   
max    1460.000000   190.000000   313.000000  215245.000000    10.000000   

       OverallCond    YearBuilt  YearRemodAdd   MasVnrArea   BsmtFinSF1  ...  \
count  1460.000000  1460.000000   1460.000000  1452.000000  1460.000000  ...   
mean      5.575342  1971.267808   1984.865753   103.685262   443.639726  ...   
std       1.112799    30.202904     20.645407   181.066207   456.098091  ..

In [177]:
# Count missing entries per column, then show the ten worst offenders.
null_counts = train_data.isnull().sum()
print(null_counts.sort_values(ascending=False)[:10])  # Top missing values

PoolQC          1453
MiscFeature     1406
Alley           1369
Fence           1179
MasVnrType       872
FireplaceQu      690
LotFrontage      259
GarageQual        81
GarageFinish      81
GarageType        81
dtype: int64


## 3. Feature Selection

In [178]:
# Numeric columns used as model inputs; these are standardised later on.
continuous_features = [
    "LotArea",
    "GrLivArea",
]


In [179]:
# Categorical columns used as model inputs; these are one-hot encoded later on.
categorical_features = [
    "Neighborhood",
    "HouseStyle",
]

In [180]:

# Name of the column the model is trained to predict.
target: str = "SalePrice"

## 4. Data Preprocessing

In [181]:
# Handling Missing Values
# Fill gaps in the numeric columns with each column's median. Non-numeric
# columns are left as they are because median(numeric_only=True) only yields
# values for numeric columns. The result is rebound rather than written with
# inplace=True so the cell produces a new frame from its input.
train_data = train_data.fillna(train_data.median(numeric_only=True))


In [182]:
# Impute the test set with the TRAINING medians so both splits are
# preprocessed with the same statistics (the scaler is likewise fit on the
# training data and only applied to the test data). Median entries for
# columns that do not exist in test_data (e.g. the target) are ignored by
# fillna. The result is rebound instead of using inplace=True.
test_data = test_data.fillna(train_data.median(numeric_only=True))


In [183]:
# One-Hot Encoding for Categorical Features
# Each listed column is replaced by indicator columns; the first level of
# every feature is dropped and acts as the implicit baseline.
train_data = pd.get_dummies(
    data=train_data,
    columns=categorical_features,
    drop_first=True,
)

In [184]:
# Encode the test set WITHOUT drop_first: get_dummies chooses the level to
# drop from the values present in the frame it is given, so dropping
# independently here could remove a different baseline than the training
# encoding did and mis-encode those rows once missing columns are zero-filled.
# The subsequent align(join="left") keeps only the training columns, which
# discards the training baseline's dummy and any level unseen in training.
test_data = pd.get_dummies(test_data, columns=categorical_features)

In [185]:
# Align train and test sets
# Keep the training columns as the reference: columns missing from the test
# frame are created and filled with 0, and test-only columns are dropped.
train_data, test_data = train_data.align(
    other=test_data,
    join="left",
    axis=1,
    fill_value=0,
)

In [186]:
# Feature Scaling
# Standardise features by centring on the mean and scaling to unit variance
# (the arguments below are the library defaults, spelled out for the reader).
scaler = StandardScaler(copy=True, with_mean=True, with_std=True)

In [187]:
# Fit the scaler on the training values of the continuous columns and write
# the standardised values back into the same columns.
scaled_train = scaler.fit_transform(train_data[continuous_features])
train_data[continuous_features] = scaled_train

In [188]:
# Apply the already-fitted scaler to the test columns (transform only, no
# refit) and write the standardised values back into the same columns.
scaled_test = scaler.transform(test_data[continuous_features])
test_data[continuous_features] = scaled_test

## 5. Model Training

In [189]:
# Model inputs: the continuous columns followed by every column whose name
# begins with one of the encoded categorical feature names.
category_prefixes = tuple(categorical_features)
dummy_columns = [name for name in train_data.columns if name.startswith(category_prefixes)]
X = train_data[continuous_features + dummy_columns]

In [190]:
# Target values: the single column named by `target`, taken for all rows.
y = train_data.loc[:, target]

In [191]:
# Log-transform target
# Natural log of the target; the model is fit on this transformed value and
# predictions are mapped back with np.exp in the test-prediction cell.
y_log = np.log(y)

In [192]:
# Train-Test Split
# Hold out 20% of the rows for validation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y_log,
    test_size=0.2,
    random_state=42,
)

In [193]:
# Train Model
# Ordinary least-squares regression with an intercept term (the default,
# written out explicitly).
model = LinearRegression(fit_intercept=True)

In [194]:
# Fit the regression on the training portion of the split.
model.fit(X=X_train, y=y_train)

## 6. Model Evaluation

In [195]:
# Predictions for the held-out rows; these are in the same log space as the
# target the model was fit on.
y_pred_log = model.predict(X=X_val)

In [196]:
# Root-mean-squared error between the log-transformed validation targets and
# the model's log-space predictions.
mse_log = mean_squared_error(y_true=y_val, y_pred=y_pred_log)
rmse = np.sqrt(mse_log)

In [197]:
# Report the validation error computed in the previous cell.
rmse_message = f"RMSE (Log-Transformed): {rmse}"
print(rmse_message)


RMSE (Log-Transformed): 0.19775234715617035


## 7. Test Predictions

In [198]:
# Predict on exactly the columns (and column order) the model was trained on
# by reusing X.columns instead of repeating the prefix-matching expression,
# so the two selections cannot drift apart.
# np.exp undoes the natural-log transform applied to the target before fitting.
test_features = test_data[X.columns]
test_predictions = np.exp(model.predict(test_features))


In [199]:
# Save results
# Two-column frame pairing each test row's Id with its predicted sale price.
submission = pd.DataFrame(
    data={
        "Id": test_data["Id"],
        "SalePrice": test_predictions,
    }
)

In [200]:
# Write the predictions to the working directory without the row index column.
submission.to_csv(path_or_buf="submission.csv", index=False)