In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Load the pre-split training and test sets from the working directory.
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Columns excluded from the feature matrix. 'price' is the regression target;
# the rest are presumably a saved index, a row identifier and non-numeric /
# categorical fields -- TODO confirm against the dataset description.
drop_cols = ['Unnamed: 0', 'id', 'date', 'zipcode', 'price']

# errors='ignore' skips any listed column that is absent from the frame.
X_train = train_df.drop(columns=drop_cols, errors='ignore')
# Target is price divided by 1000, so downstream errors are on that scale.
y_train = train_df['price'] / 1000

# Select the test features by the training column list so the scaler and the
# model see exactly the same features, in the same order, as at fit time.
# A column missing from test.csv raises KeyError here instead of failing later.
X_test = test_df.loc[:, X_train.columns]
y_test = test_df['price'] / 1000

# Fit the scaler on the training data only, then reuse its statistics for the
# test data so no test-set information leaks into preprocessing.
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [3]:
# 2.1 -- Fit ordinary least squares on the standardized training features and
# report in-sample fit quality. y_train is price / 1000, so the MSE is
# expressed in squared units of that rescaled target.
model = LinearRegression().fit(X_train_scaled, y_train)
intercept = model.intercept_

# In-sample predictions and the metrics derived from them.
y_train_pred = model.predict(X_train_scaled)
train_mse, train_r2 = (
    mean_squared_error(y_train, y_train_pred),
    r2_score(y_train, y_train_pred),
)

# One row per feature; coefficients are per unit of the *scaled* feature.
coef_df = pd.DataFrame({"Feature": X_train.columns, "Coefficient": model.coef_})

print("Train MSE:", train_mse)
print("Train R²:", train_r2)
print("Train Intercept:", intercept)
# Trailing expression so the notebook renders the coefficient table.
coef_df

Train MSE: 31486.16777579488
Train R²: 0.7265334318706018
Train Intercept: 520.414834000001


Unnamed: 0,Feature,Coefficient
0,bedrooms,-12.521962
1,bathrooms,18.527633
2,sqft_living,56.748837
3,sqft_lot,10.881868
4,floors,8.043721
5,waterfront,63.7429
6,view,48.200109
7,condition,12.964269
8,grade,92.231475
9,sqft_above,48.290089


In [4]:
# 2.2 -- Score the already-fitted model on the held-out test set, using the
# test features that were scaled with the training-set statistics.
y_test_pred = model.predict(X_test_scaled)

test_mse, test_r2 = (
    mean_squared_error(y_test, y_test_pred),
    r2_score(y_test, y_test_pred),
)

print("Test MSE:", test_mse)
print("Test R²:", test_r2)

Test MSE: 57628.1547056704
Test R²: 0.6543560876120953
