In [25]:
import pandas as pd

# Load each split, discard the "Id" column (a row identifier, not a feature),
# and cast every remaining field to a string so that all columns -- including
# SalePrice, which is converted back to float later -- are plain text.
#
# NOTE(review): the cast happens *before* fillna. On pandas versions where
# astype(str) renders a missing value as the literal text "nan", there is
# nothing left for fillna("NA") to replace. The order is kept as-is because the
# test data must be prepared the same way for its categories to line up.
train_data = (
    pd.read_csv("my_train.csv")
    .drop(columns=["Id"])
    .astype(str)
    .fillna("NA")
)
dev_data = (
    pd.read_csv("my_dev.csv")
    .drop(columns=["Id"])
    .astype(str)
    .fillna("NA")
)

In [26]:
from sklearn.preprocessing import OneHotEncoder

# Every column is text at this point, so each distinct value of each column
# becomes its own 0/1 indicator. Values that only occur outside the training
# split are encoded as all-zeros for that column (handle_unknown="ignore").
encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")

# Separate the predictors from the target for both splits
train_features = train_data.drop(columns=["SalePrice"])
dev_features = dev_data.drop(columns=["SalePrice"])

# Learn the category vocabulary from the training split only, then apply it
encoder.fit(train_features)
X_train = encoder.transform(train_features)
X_dev = encoder.transform(dev_features)

# SalePrice was stringified along with everything else; restore it to float
y_train = train_data["SalePrice"].astype(float)
y_dev = dev_data["SalePrice"].astype(float)

# One name per generated indicator column, in the column order of X_train
feature_names = encoder.get_feature_names_out()
print(f"Total binary features: {len(feature_names)}")

Total binary features: 7226


In [27]:
from sklearn.linear_model import LinearRegression
import numpy as np

# Regress on log(SalePrice) rather than the raw price; predictions therefore
# come out in log space and must be passed through np.exp before use.
y_train_log = np.log(y_train)

# Ordinary least squares on the one-hot indicators
model = LinearRegression()
model.fit(X=X_train, y=y_train_log)

In [21]:
from sklearn.metrics import mean_squared_log_error

# The model outputs log-prices; exponentiate to get back to the price scale
y_dev_pred_log = model.predict(X_dev)
y_dev_pred = np.exp(y_dev_pred_log)

# Root of the mean squared log error between actual and predicted prices
msle = mean_squared_log_error(y_dev, y_dev_pred)
rmsle = np.sqrt(msle)
print(f"RMSLE on dev set: {rmsle}")

RMSLE on dev set: 0.15271576743080872


In [30]:
import pandas as pd

# One learned weight per one-hot indicator, in the same order as feature_names.
# The model was fitted on log(SalePrice), so each weight is an additive shift
# in log-price, not in price.
coefficients = model.coef_

# Pair every indicator with its weight and order from most positive to most
# negative, so head() and tail() give the two extremes directly.
feature_importance = pd.DataFrame(
    {"Feature": feature_names, "Importance": coefficients}
).sort_values(by="Importance", ascending=False)

# Largest positive weights
print("Top 10 most positive features:")
print(feature_importance.head(10))

# Largest negative weights (last rows of the descending sort)
print("\nTop 10 most negative features:")
print(feature_importance.tail(10))

Top 10 most positive features:
                   Feature  Importance
5901            FullBath_3    0.139085
1204         OverallQual_9    0.138575
1162  Neighborhood_StoneBr    0.124563
4816          2ndFlrSF_472    0.113037
1203         OverallQual_8    0.108698
1398      RoofMatl_WdShngl    0.093503
5184        GrLivArea_1192    0.092076
1155  Neighborhood_NoRidge    0.089240
878           LotArea_8029    0.086179
6061          GarageCars_3    0.085705

Top 10 most negative features:
                Feature  Importance
1332  YearRemodAdd_1958   -0.087275
1195      OverallQual_1   -0.089350
6059       GarageCars_1   -0.093855
1207      OverallCond_3   -0.100881
2444     BsmtFinSF2_311   -0.107689
907        LotArea_8281   -0.108174
1198      OverallQual_3   -0.116983
7011  EnclosedPorch_236   -0.122746
5876      GrLivArea_968   -0.126993
15     MSZoning_C (all)   -0.183140


In [29]:
# Inspect the fitted intercept. The model was trained on log(SalePrice), so
# this value is in log-price units rather than a price.
bias = model.intercept_
print(f"Bias (intercept): {bias}")

Bias (intercept): 12.151878489176461


In [31]:
def _match_known_category(value, known):
    """Return the spelling of ``value`` that the encoder saw during fitting.

    pandas infers dtypes separately for each CSV file. A column parsed as
    integers in one file is stringified as "2", while the same column parsed as
    floats in another file (for instance because it contains a blank) is
    stringified as "2.0". The encoder compares strings exactly and ignores
    unknown ones, so such a mismatch would silently zero out the feature.

    Args:
        value: A cell value already converted to ``str``.
        known: Set of category strings the encoder learned for this column.

    Returns:
        ``value`` unchanged if it is already known or is not a whole number;
        otherwise the integer-style ("2") or float-style ("2.0") spelling of the
        same number, whichever is in ``known``. Falls back to ``value`` when
        neither spelling is known.
    """
    if value in known:
        return value
    try:
        number = float(value)
    except (TypeError, ValueError):
        return value
    # is_integer() is False for nan/inf, so missing-value tokens pass through
    if number.is_integer():
        for candidate in (str(int(number)), str(number)):
            if candidate in known:
                return candidate
    return value


# Load test data
test_data = pd.read_csv("test.csv")

# Keep the IDs aside for the submission file, then drop them from the features
test_data_id = test_data["Id"]
test_data = test_data.drop(columns=["Id"])

# Same string conversion, in the same order, as was applied to the training
# data so that missing values end up as the same token in both.
# NOTE(review): on pandas versions where astype(str) renders NaN as "nan", the
# fillna is a no-op; change both places together if that is not intended.
test_data = test_data.astype(str).fillna("NA")

# Align with exactly the columns the encoder was fitted on. Using the encoder's
# own record avoids assuming that SalePrice is the last training column.
test_data = test_data.reindex(columns=encoder.feature_names_in_, fill_value="NA")

# Reconcile "2" vs "2.0" style differences caused by per-file dtype inference
for column, categories in zip(encoder.feature_names_in_, encoder.categories_):
    known = set(categories)
    test_data[column] = test_data[column].map(
        lambda value, known=known: _match_known_category(value, known)
    )

# Transform test data
X_test = encoder.transform(test_data)

# Predict in log space, then convert back to prices
y_test_pred_log = model.predict(X_test)
y_test_pred = np.exp(y_test_pred_log)

# Save predictions
submission = pd.DataFrame({"Id": test_data_id, "SalePrice": y_test_pred})
submission.to_csv("predictions_part2.csv", index=False)
print("Submission file saved as predictions_part2.csv")


Submission file saved as predictions_part2.csv
