In [8]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# -------------------------------------------------
# Read both CSV splits and separate label from features
# -------------------------------------------------
# Column holding the value to be predicted.
target = "price"

# Both files are read with the same default pd.read_csv settings.
train_df, test_df = (pd.read_csv(path) for path in ("train.csv", "test.csv"))

# Features are every column except `target`; the label is the `target` column.
X_train, y_train = train_df.drop(columns=[target]), train_df[target]
X_test, y_test = test_df.drop(columns=[target]), test_df[target]

# =========================
# encode categorical variables
# =========================
# One-hot encode the training features. drop_first=True removes the first
# level of every categorical column so that level acts as the baseline.
X_train = pd.get_dummies(X_train, drop_first=True)

# Encode the test features WITHOUT drop_first. Each get_dummies call picks
# its "first" level from the data it is given, so dropping independently on
# the test split can remove a level the model has a coefficient for
# (e.g. when the training baseline level is absent from the test split).
# The alignment below would then re-create that column as all zeros and
# silently encode those rows as the baseline. Keeping every test level and
# letting the training columns decide what survives avoids this.
X_test = pd.get_dummies(X_test)

# Align train/test feature columns.
# join="left" keeps exactly the training columns, in training order:
#   - test-only columns (including the training baseline level and levels
#     never seen in training) are discarded;
#   - training columns missing from the test split are added, filled with 0.
X_train, X_test = X_train.align(X_test, join="left", axis=1, fill_value=0)

# -------------------------------------------------
# Fit ordinary least squares, then predict on both splits
# -------------------------------------------------
# fit() returns the estimator itself, so construction and fitting can be chained.
model = LinearRegression().fit(X_train, y_train)

# Predictions on the data the model was fitted on, and on the held-out test split.
y_train_pred, y_test_pred = (
    model.predict(features) for features in (X_train, X_test)
)

# -------------------------------------------------
# Score each split: mean squared error and R^2
# -------------------------------------------------
train_mse, train_r2 = (
    mean_squared_error(y_train, y_train_pred),
    r2_score(y_train, y_train_pred),
)
test_mse, test_r2 = (
    mean_squared_error(y_test, y_test_pred),
    r2_score(y_test, y_test_pred),
)

# Training scores, one blank line, then testing scores.
print("Training MSE:", train_mse)
print("Training R^2:", train_r2, end="\n\n")
print("Testing MSE:", test_mse)
print("Testing R^2:", test_r2)

# -------------------------------------------------
# Report the fitted intercept and the largest coefficients
# -------------------------------------------------
# Pair each feature column with its fitted coefficient ...
coef_df = pd.DataFrame({"feature": X_train.columns, "coefficient": model.coef_})
# ... and order the rows by absolute coefficient value, largest first.
coef_df = coef_df.sort_values(by="coefficient", key=pd.Series.abs, ascending=False)

print("\nIntercept:", model.intercept_)
# Heading and the first 15 rows, each on its own line.
print("\nTop coefficients:", coef_df.head(15), sep="\n")


Training MSE: 31043433137.295223
Training R^2: 0.7303787115834155

Testing MSE: 58389284496.820786
Testing R^2: 0.649790960024862

Intercept: 12909055.919975331

Top coefficients:
          feature    coefficient
6      waterfront  720512.932252
15            lat  582858.610398
9           grade   80365.075383
16           long  -79169.071678
7            view   64100.513072
2       bathrooms   25504.939035
5          floors   21349.275026
1        bedrooms  -17032.647611
8       condition   15898.373914
12       yr_built   -2600.891928
14        zipcode    -466.672997
3     sqft_living      83.263283
17  sqft_living15      61.839532
13   yr_renovated      42.930997
10     sqft_above      42.298967
