In [5]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Columns excluded from the feature set. errors='ignore' below lets the drop
# succeed even when some of these columns are absent from a given CSV.
DROP_COLUMNS = ['id', 'date', 'zipcode', 'Unnamed: 0']
# The target is divided by this factor before fitting, so the reported MSE is
# in (price / PRICE_SCALE)^2 units; multiply sqrt(MSE) by PRICE_SCALE to get
# an error in the original price units.
PRICE_SCALE = 1000

# Load both splits and remove the excluded columns in one step each.
train_df = pd.read_csv('train.csv').drop(columns=DROP_COLUMNS, errors='ignore')
test_df = pd.read_csv('test.csv').drop(columns=DROP_COLUMNS, errors='ignore')

# Everything except 'price' is a feature; 'price' is the regression target.
X_train = train_df.drop(columns=['price'])
y_train = train_df['price'] / PRICE_SCALE
# Select the test features by the training column labels so both matrices have
# exactly the same columns in the same order. A feature missing from the test
# file raises KeyError here instead of silently turning into a NaN column.
X_test = test_df[X_train.columns]
y_test = test_df['price'] / PRICE_SCALE

# Standardize with statistics computed on the training split only, so no
# information from the test split leaks into the preprocessing.
train_mean = X_train.mean()
# A constant training column has std == 0, which would turn the division below
# into NaN/inf and make the fit fail. Mapping 0 -> 1 leaves such a column as
# all zeros after centering.
train_std = X_train.std().replace(0, 1)

X_train = (X_train - train_mean) / train_std
X_test = (X_test - train_mean) / train_std

model = LinearRegression()
model.fit(X_train, y_train)

train_preds = model.predict(X_train)
test_preds = model.predict(X_test)

# One row per feature. Because the features are standardized, each coefficient
# is the change in (scaled) price per one standard deviation of that feature.
coef_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Coefficient': model.coef_
})

print(coef_df)
print("Train MSE:", mean_squared_error(y_train, train_preds))
print("Train R2:", r2_score(y_train, train_preds))
print("Test MSE:", mean_squared_error(y_test, test_preds))
print("Test R2:", r2_score(y_test, test_preds))

          Feature  Coefficient
0        bedrooms   -12.528228
1       bathrooms    18.536903
2     sqft_living    56.777233
3        sqft_lot    10.887313
4          floors     8.047746
5      waterfront    63.774795
6            view    48.224227
7       condition    12.970756
8           grade    92.277625
9      sqft_above    48.314252
10  sqft_basement    27.150611
11       yr_built   -67.676964
12   yr_renovated    17.280022
13            lat    78.414954
14           long    -1.035721
15  sqft_living15    45.600464
16     sqft_lot15   -12.936561
Train MSE: 31486.167775794882
Train R2: 0.7265334318706018
Test MSE: 57628.154705670386
Test R2: 0.6543560876120954


In [None]:
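# Because the features are standardized, coefficient magnitudes are directly comparable.
# grade has the largest coefficient (92.28), so it contributes the most to the linear regression model,
# followed by lat (78.41), yr_built (-67.68, a negative effect), and waterfront (63.77).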
# The R2 values for the training and test sets are 0.73 and 0.65, which is reasonably good: any R2 above 0 means the model beats a baseline that always predicts the mean price.
# Training MSE is 31486. Since the price was divided by 1000 before fitting, the typical error (RMSE) in original price units is sqrt(31486) * 1000, which is about 177,400.
# Test MSE is 57628. By the same reasoning as for the training MSE, the test RMSE in original price units is sqrt(57628) * 1000, which is about 240,100.
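# Since the training MSE is much smaller than the test MSE, the model fits the training data noticeably better than it generalizes to unseen data, which suggests some overfitting (or a difference between the train and test distributions).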