In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Load datasets
train_path = '/Users/tanishq/Desktop/Projects/home-data-for-ml-course/train.csv'
test_path = '/Users/tanishq/Desktop/Projects/home-data-for-ml-course/test.csv'
train_data = pd.read_csv(train_path)
test_data = pd.read_csv(test_path)

# Feature Selection: Use features for Time on Market prediction
features = ['Condition1', 'MSZoning', 'SaleCondition']
target = 'TimeOnMarket'

# Create 'TimeOnMarket' column as proxy (you can adjust this definition as needed)
train_data['TimeOnMarket'] = (train_data['YrSold'] - train_data['YearBuilt']) * 12 + train_data['MoSold']

# Handle Missing Values
num_cols = [col for col in train_data.select_dtypes(include=['float64', 'int64']).columns 
            if col not in ['SalePrice', 'TimeOnMarket']]
cat_cols = features

num_imputer = SimpleImputer(strategy='median')
cat_imputer = SimpleImputer(strategy='most_frequent')

# Impute missing values for train and test data
train_data[num_cols] = num_imputer.fit_transform(train_data[num_cols])
train_data[cat_cols] = cat_imputer.fit_transform(train_data[cat_cols])

# Ensure only valid columns are used in test_data
test_data_num_cols = [col for col in num_cols if col in test_data.columns]
test_data[test_data_num_cols] = num_imputer.transform(test_data[test_data_num_cols])
test_data[cat_cols] = cat_imputer.transform(test_data[cat_cols])



# Encode categorical features
encoder = OneHotEncoder(drop='first', sparse_output=False)
encoded_features = pd.DataFrame(encoder.fit_transform(train_data[cat_cols]), 
                                columns=encoder.get_feature_names_out(cat_cols))
encoded_test_features = pd.DataFrame(encoder.transform(test_data[cat_cols]), 
                                     columns=encoder.get_feature_names_out(cat_cols))

# Combine encoded features with numerical columns
train_combined = pd.concat([train_data[num_cols], encoded_features, train_data[target]], axis=1)
test_combined = pd.concat([test_data[num_cols], encoded_test_features], axis=1)

# Split Data into Features and Target
X = train_combined.drop(columns=[target])
y = train_combined[target]

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model: Random Forest Regressor
rf_model = RandomForestRegressor(n_estimators=500, random_state=42)
rf_model.fit(X_train, y_train)

# Predictions
y_pred = rf_model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

# Results
print(f"Time on Market Prediction RMSE: {rmse:.2f}")
print(f"Time on Market Prediction R²: {r2 * 100:.2f}%")

# Predict on Test Data
test_time_on_market = rf_model.predict(test_combined)

# Save Predictions
submission = pd.DataFrame({
    'Id': test_data['Id'],
    'TimeOnMarket': test_time_on_market
})
submission.to_csv('/Users/tanishq/Desktop/Projects/home-data-for-ml-course/time_on_market_submission.csv', index=False)
print("Time on Market predictions saved to time_on_market_submission.csv")


Time on Market Prediction RMSE: 8.03
Time on Market Prediction R²: 99.94%
Time on Market predictions saved to time_on_market_submission.csv
