<a href="https://colab.research.google.com/github/swalehaparvin/kaggle_projects/blob/main/Housing_prediction_USA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Create output directory for plots
os.makedirs('plots', exist_ok=True)

# Load the data
print("Loading datasets...")
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Save the ID column for later
train_ID = train['Id']
test_ID = test['Id']

# Remove the ID column
train.drop('Id', axis=1, inplace=True)
test.drop('Id', axis=1, inplace=True)

# Save the target variable
y_train = train['SalePrice']
train.drop('SalePrice', axis=1, inplace=True)

# Combine train and test for preprocessing
print("Combining datasets for preprocessing...")
all_data = pd.concat([train, test], axis=0)
print(f"Combined dataset shape: {all_data.shape}")

# Check for skewness in the target variable
plt.figure(figsize=(10, 6))
sns.histplot(y_train, kde=True)
plt.title('SalePrice Distribution')
plt.savefig('plots/saleprice_distribution.png')
plt.close()

# Log transform the target for better model performance
y_train_log = np.log1p(y_train)
plt.figure(figsize=(10, 6))
sns.histplot(y_train_log, kde=True)
plt.title('Log(SalePrice+1) Distribution')
plt.savefig('plots/log_saleprice_distribution.png')
plt.close()

# Analyze numeric features for skewness
numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index
skewed_feats = all_data[numeric_feats].apply(lambda x: stats.skew(x.dropna())).sort_values(ascending=False)
high_skew = skewed_feats[abs(skewed_feats) > 0.5]
print(f"Number of skewed numeric features: {len(high_skew)}")

# Apply Box-Cox transformation to skewed features
for feat in high_skew.index:
    all_data[feat] = np.log1p(all_data[feat])

# Handle missing values
print("\nHandling missing values...")
missing_data = all_data.isnull().sum().sort_values(ascending=False)
missing_data = missing_data[missing_data > 0]
missing_percent = (missing_data / len(all_data)) * 100
missing_df = pd.DataFrame({'Missing Count': missing_data, 'Percent': missing_percent})
print(missing_df.head(20))

# Features with very high missing values (likely NA means Not Available)
na_cols = ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu',
           'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
           'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
           'MasVnrType']

# Fill NA with 'None' for categorical features that likely mean Not Available
for col in na_cols:
    if col in all_data.columns:
        all_data[col].fillna('None', inplace=True)

# Fill 0 for numeric features that likely mean Not Available
zero_cols = ['GarageYrBlt', 'GarageArea', 'GarageCars',
             'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
             'BsmtFullBath', 'BsmtHalfBath', 'MasVnrArea']

for col in zero_cols:
    if col in all_data.columns:
        all_data[col].fillna(0, inplace=True)

# For LotFrontage, impute using the median of the neighborhood
all_data['LotFrontage'] = all_data.groupby('Neighborhood')['LotFrontage'].transform(
    lambda x: x.fillna(x.median()))

# For remaining missing values, use mode for categorical and median for numeric
cat_cols = all_data.select_dtypes(include=['object']).columns
num_cols = all_data.select_dtypes(exclude=['object']).columns

for col in cat_cols:
    if all_data[col].isnull().sum() > 0:
        all_data[col].fillna(all_data[col].mode()[0], inplace=True)

for col in num_cols:
    if all_data[col].isnull().sum() > 0:
        all_data[col].fillna(all_data[col].median(), inplace=True)

# Verify no missing values remain
assert all_data.isnull().sum().sum() == 0, "There are still missing values in the dataset"
print("All missing values have been handled.")

# Convert categorical variables to numeric using Label Encoding
print("\nEncoding categorical variables...")
label_encoder = LabelEncoder()
for col in cat_cols:
    all_data[col] = label_encoder.fit_transform(all_data[col])

# Create some new features
print("\nCreating new features...")

# Total square footage
all_data['TotalSF'] = all_data['TotalBsmtSF'] + all_data['1stFlrSF'] + all_data['2ndFlrSF']

# Total bathrooms
all_data['TotalBathrooms'] = all_data['FullBath'] + (0.5 * all_data['HalfBath']) + \
                            all_data['BsmtFullBath'] + (0.5 * all_data['BsmtHalfBath'])

# House age and when it was remodeled
all_data['Age'] = all_data['YrSold'] - all_data['YearBuilt']
all_data['Remodeled'] = (all_data['YearRemodAdd'] != all_data['YearBuilt']).astype(int)
all_data['RemodAge'] = all_data['YrSold'] - all_data['YearRemodAdd']

# Overall quality squared (to emphasize its importance)
all_data['OverallQual2'] = all_data['OverallQual'] ** 2

# Neighborhood and condition interaction
all_data['NeighborhoodQual'] = all_data['Neighborhood'] * all_data['OverallQual']

# Scale the numeric features
print("\nScaling numeric features...")
scaler = StandardScaler()
all_data[num_cols] = scaler.fit_transform(all_data[num_cols])

# Split back into train and test
print("\nSplitting back into train and test sets...")
train_processed = all_data.iloc[:len(train)]
test_processed = all_data.iloc[len(train):]

# Save processed data
train_processed.to_csv('train_processed.csv', index=False)
test_processed.to_csv('test_processed.csv', index=False)
y_train.to_csv('y_train.csv', index=False)
y_train_log.to_csv('y_train_log.csv', index=False)

print("\nPreprocessing complete. Files saved.")
print(f"Processed train shape: {train_processed.shape}")
print(f"Processed test shape: {test_processed.shape}")


Loading datasets...
Combining datasets for preprocessing...
Combined dataset shape: (2919, 79)
Number of skewed numeric features: 27

Handling missing values...
              Missing Count    Percent
PoolQC                 2909  99.657417
MiscFeature            2814  96.402878
Alley                  2721  93.216855
Fence                  2348  80.438506
MasVnrType             1766  60.500171
FireplaceQu            1420  48.646797
LotFrontage             486  16.649538
GarageCond              159   5.447071
GarageFinish            159   5.447071
GarageYrBlt             159   5.447071
GarageQual              159   5.447071
GarageType              157   5.378554
BsmtExposure             82   2.809181
BsmtCond                 82   2.809181
BsmtQual                 81   2.774923
BsmtFinType2             80   2.740665
BsmtFinType1             79   2.706406
MasVnrArea               23   0.787941
MSZoning                  4   0.137033
BsmtHalfBath              2   0.068517
All missing values h

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  all_data[col].fillna('None', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  all_data[col].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behav


Preprocessing complete. Files saved.
Processed train shape: (1460, 86)
Processed test shape: (1459, 86)


In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, KFold
from sklearn.linear_model import ElasticNet, Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.metrics import mean_squared_error

# Load preprocessed data
print("Loading preprocessed data...")
train = pd.read_csv('train_processed.csv')
test = pd.read_csv('test_processed.csv')
y_train = pd.read_csv('y_train.csv').values.ravel()
y_train_log = pd.read_csv('y_train_log.csv').values.ravel()

print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")
print(f"Target shape: {y_train.shape}")

# Define evaluation metric - RMSE on log scale
def rmse_cv(model, X, y):
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    rmse = np.sqrt(-cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=kf))
    return rmse

# Create a directory for model results
os.makedirs('model_results', exist_ok=True)

# Initialize models (excluding LightGBM due to system dependency issues)
models = {
    'Lasso': make_pipeline(RobustScaler(), Lasso(alpha=0.0005, random_state=42)),
    'ElasticNet': make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=0.9, random_state=42)),
    'Ridge': Ridge(alpha=10.0),
    'RandomForest': RandomForestRegressor(n_estimators=100, max_depth=15, random_state=42),
    'GradientBoosting': GradientBoostingRegressor(n_estimators=1500, learning_rate=0.05, max_depth=4, max_features='sqrt', random_state=42),
    'XGBoost': xgb.XGBRegressor(
        objective='reg:squarederror',
        n_estimators=1000,
        learning_rate=0.05,
        max_depth=4,
        subsample=0.7,
        colsample_bytree=0.7,
        random_state=42
    )
}

# Train and evaluate models
print("\nTraining and evaluating models...")
results = {}
predictions = {}

for name, model in models.items():
    print(f"\nTraining {name}...")
    model.fit(train, y_train_log)

    # Cross-validation score
    score = rmse_cv(model, train, y_train_log)
    print(f"{name} RMSE CV: {score.mean():.4f} ({score.std():.4f})")

    # Make predictions
    pred = model.predict(train)
    train_rmse = np.sqrt(mean_squared_error(y_train_log, pred))
    print(f"{name} Train RMSE: {train_rmse:.4f}")

    # Store results
    results[name] = {
        'cv_score': score.mean(),
        'cv_std': score.std(),
        'train_rmse': train_rmse
    }

    # Make predictions on test set
    test_pred = model.predict(test)
    predictions[name] = np.expm1(test_pred)  # Transform back from log scale

    # Save model predictions
    pd.DataFrame({
        'Id': range(1461, 1461 + len(test)),
        'SalePrice': np.expm1(test_pred)
    }).to_csv(f'model_results/{name}_predictions.csv', index=False)

# Visualize model performance
plt.figure(figsize=(12, 6))
cv_scores = [results[name]['cv_score'] for name in models.keys()]
train_scores = [results[name]['train_rmse'] for name in models.keys()]

x = np.arange(len(models))
width = 0.35

plt.bar(x - width/2, cv_scores, width, label='CV RMSE')
plt.bar(x + width/2, train_scores, width, label='Train RMSE')
plt.xticks(x, models.keys(), rotation=45)
plt.ylabel('RMSE')
plt.title('Model Performance Comparison')
plt.legend()
plt.tight_layout()
plt.savefig('model_results/model_comparison.png')
plt.close()

# Create ensemble prediction (weighted average of top models)
print("\nCreating ensemble prediction...")
ensemble_weights = {
    'XGBoost': 0.5,
    'GradientBoosting': 0.3,
    'Ridge': 0.2
}

ensemble_pred = np.zeros(len(test))
for name, weight in ensemble_weights.items():
    ensemble_pred += weight * predictions[name]

# Save ensemble prediction
pd.DataFrame({
    'Id': range(1461, 1461 + len(test)),
    'SalePrice': ensemble_pred
}).to_csv('model_results/ensemble_predictions.csv', index=False)

# Format final submission
print("\nPreparing final submission file...")
submission = pd.DataFrame({
    'Id': range(1461, 1461 + len(test)),
    'SalePrice': ensemble_pred
})
submission.to_csv('submission.csv', index=False)

print("\nModel training and prediction complete.")
print("Final submission file created: submission.csv")

# Save model performance summary
with open('model_results/model_summary.txt', 'w') as f:
    f.write("Model Performance Summary\n")
    f.write("=======================\n\n")
    for name in models.keys():
        f.write(f"{name}:\n")
        f.write(f"  CV RMSE: {results[name]['cv_score']:.4f} (±{results[name]['cv_std']:.4f})\n")
        f.write(f"  Train RMSE: {results[name]['train_rmse']:.4f}\n\n")

    f.write("\nEnsemble Model Weights:\n")
    for name, weight in ensemble_weights.items():
        f.write(f"  {name}: {weight}\n")


Loading preprocessed data...
Train shape: (1460, 86)
Test shape: (1459, 86)
Target shape: (1460,)

Training and evaluating models...

Training Lasso...
Lasso RMSE CV: 0.1397 (0.0232)
Lasso Train RMSE: 0.1238

Training ElasticNet...
ElasticNet RMSE CV: 0.1399 (0.0231)
ElasticNet Train RMSE: 0.1236

Training Ridge...
Ridge RMSE CV: 0.1395 (0.0232)
Ridge Train RMSE: 0.1233

Training RandomForest...
RandomForest RMSE CV: 0.1447 (0.0182)
RandomForest Train RMSE: 0.0539

Training GradientBoosting...
GradientBoosting RMSE CV: 0.1247 (0.0166)
GradientBoosting Train RMSE: 0.0160

Training XGBoost...
XGBoost RMSE CV: 0.1265 (0.0164)
XGBoost Train RMSE: 0.0180

Creating ensemble prediction...

Preparing final submission file...

Model training and prediction complete.
Final submission file created: submission.csv


In [10]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.decomposition import PCA
import os

# Create output directory
os.makedirs('model_results', exist_ok=True)

# Load preprocessed data
print("Loading preprocessed data...")
train = pd.read_csv('train_processed.csv')
test = pd.read_csv('test_processed.csv')
y_train = pd.read_csv('y_train.csv').values.ravel()
y_train_log = pd.read_csv('y_train_log.csv').values.ravel()

print(f"Original train shape: {train.shape}")
print(f"Original test shape: {test.shape}")

# Reduce dimensionality using feature selection
print("\nReducing dimensionality...")
k_best_features = 20  # Select top 20 features
selector = SelectKBest(f_regression, k=k_best_features)
train_reduced = selector.fit_transform(train, y_train_log)
test_reduced = selector.transform(test)

print(f"Reduced train shape: {train_reduced.shape}")
print(f"Reduced test shape: {test_reduced.shape}")

# Train a simple Ridge model
print("\nTraining Ridge model...")
model = Ridge(alpha=10.0, random_state=42)
model.fit(train_reduced, y_train_log)

# Make predictions
print("\nMaking predictions...")
test_pred_log = model.predict(test_reduced)
test_pred = np.expm1(test_pred_log)  # Transform back from log scale

# Format final submission
print("\nPreparing final submission file...")
submission = pd.DataFrame({
    'Id': range(1461, 1461 + len(test)),
    'SalePrice': test_pred
})
submission.to_csv('submission.csv', index=False)

print("\nModel training and prediction complete.")
print("Final submission file created: submission.csv")


Loading preprocessed data...
Original train shape: (1460, 86)
Original test shape: (1459, 86)

Reducing dimensionality...
Reduced train shape: (1460, 20)
Reduced test shape: (1459, 20)

Training Ridge model...

Making predictions...

Preparing final submission file...

Model training and prediction complete.
Final submission file created: submission.csv


In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, KFold
from sklearn.linear_model import ElasticNet, Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
import xgboost as xgb
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.metrics import mean_squared_error

# Load preprocessed data
print("Loading preprocessed data...")
train = pd.read_csv('train_processed.csv')
test = pd.read_csv('test_processed.csv')
y_train = pd.read_csv('y_train.csv').values.ravel()
y_train_log = pd.read_csv('y_train_log.csv').values.ravel()

print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")
print(f"Target shape: {y_train.shape}")

# Define evaluation metric - RMSE on log scale
def rmse_cv(model, X, y):
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    rmse = np.sqrt(-cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=kf))
    return rmse

# Create a directory for model results
os.makedirs('model_results', exist_ok=True)

# Initialize models
models = {
    'Lasso': make_pipeline(RobustScaler(), Lasso(alpha=0.0005, random_state=42)),
    'ElasticNet': make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=0.9, random_state=42)),
    'Ridge': Ridge(alpha=10.0),
    'RandomForest': RandomForestRegressor(n_estimators=100, max_depth=15, random_state=42),
    'GradientBoosting': GradientBoostingRegressor(n_estimators=1500, learning_rate=0.05, max_depth=4, max_features='sqrt', random_state=42),
    'XGBoost': xgb.XGBRegressor(
        objective='reg:squarederror',
        n_estimators=1000,
        learning_rate=0.05,
        max_depth=4,
        subsample=0.7,
        colsample_bytree=0.7,
        random_state=42
    ),
    'LightGBM': lgb.LGBMRegressor(
        objective='regression',
        num_leaves=31,
        learning_rate=0.05,
        n_estimators=1000,
        random_state=42
    )
}

# Train and evaluate models
print("\nTraining and evaluating models...")
results = {}
predictions = {}

for name, model in models.items():
    print(f"\nTraining {name}...")
    model.fit(train, y_train_log)

    # Cross-validation score
    score = rmse_cv(model, train, y_train_log)
    print(f"{name} RMSE CV: {score.mean():.4f} ({score.std():.4f})")

    # Make predictions
    pred = model.predict(train)
    train_rmse = np.sqrt(mean_squared_error(y_train_log, pred))
    print(f"{name} Train RMSE: {train_rmse:.4f}")

    # Store results
    results[name] = {
        'cv_score': score.mean(),
        'cv_std': score.std(),
        'train_rmse': train_rmse
    }

    # Make predictions on test set
    test_pred = model.predict(test)
    predictions[name] = np.expm1(test_pred)  # Transform back from log scale

    # Save model predictions
    pd.DataFrame({
        'Id': range(1461, 1461 + len(test)),
        'SalePrice': np.expm1(test_pred)
    }).to_csv(f'model_results/{name}_predictions.csv', index=False)

# Visualize model performance
plt.figure(figsize=(12, 6))
cv_scores = [results[name]['cv_score'] for name in models.keys()]
train_scores = [results[name]['train_rmse'] for name in models.keys()]

x = np.arange(len(models))
width = 0.35

plt.bar(x - width/2, cv_scores, width, label='CV RMSE')
plt.bar(x + width/2, train_scores, width, label='Train RMSE')
plt.xticks(x, models.keys(), rotation=45)
plt.ylabel('RMSE')
plt.title('Model Performance Comparison')
plt.legend()
plt.tight_layout()
plt.savefig('model_results/model_comparison.png')
plt.close()

# Create ensemble prediction (weighted average of top models)
print("\nCreating ensemble prediction...")
ensemble_weights = {
    'XGBoost': 0.4,
    'LightGBM': 0.3,
    'GradientBoosting': 0.2,
    'Ridge': 0.1
}

ensemble_pred = np.zeros(len(test))
for name, weight in ensemble_weights.items():
    ensemble_pred += weight * predictions[name]

# Save ensemble prediction
pd.DataFrame({
    'Id': range(1461, 1461 + len(test)),
    'SalePrice': ensemble_pred
}).to_csv('model_results/ensemble_predictions.csv', index=False)

# Format final submission
print("\nPreparing final submission file...")
submission = pd.DataFrame({
    'Id': range(1461, 1461 + len(test)),
    'SalePrice': ensemble_pred
})
submission.to_csv('submission.csv', index=False)

print("\nModel training and prediction complete.")
print("Final submission file created: submission.csv")

# Save model performance summary
with open('model_results/model_summary.txt', 'w') as f:
    f.write("Model Performance Summary\n")
    f.write("=======================\n\n")
    for name in models.keys():
        f.write(f"{name}:\n")
        f.write(f"  CV RMSE: {results[name]['cv_score']:.4f} (±{results[name]['cv_std']:.4f})\n")
        f.write(f"  Train RMSE: {results[name]['train_rmse']:.4f}\n\n")

    f.write("\nEnsemble Model Weights:\n")
    for name, weight in ensemble_weights.items():
        f.write(f"  {name}: {weight}\n")


Loading preprocessed data...
Train shape: (1460, 86)
Test shape: (1459, 86)
Target shape: (1460,)

Training and evaluating models...

Training Lasso...
Lasso RMSE CV: 0.1397 (0.0232)
Lasso Train RMSE: 0.1238

Training ElasticNet...
ElasticNet RMSE CV: 0.1399 (0.0231)
ElasticNet Train RMSE: 0.1236

Training Ridge...
Ridge RMSE CV: 0.1395 (0.0232)
Ridge Train RMSE: 0.1233

Training RandomForest...
RandomForest RMSE CV: 0.1447 (0.0182)
RandomForest Train RMSE: 0.0539

Training GradientBoosting...
GradientBoosting RMSE CV: 0.1247 (0.0166)
GradientBoosting Train RMSE: 0.0160

Training XGBoost...
XGBoost RMSE CV: 0.1265 (0.0164)
XGBoost Train RMSE: 0.0180

Training LightGBM...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001166 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4077
[LightGBM] [Info] Number of data points in the train set: 14

In [6]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_squared_error
import os

# Create output directory
os.makedirs('model_results', exist_ok=True)

# Load preprocessed data
print("Loading preprocessed data...")
train = pd.read_csv('train_processed.csv')
test = pd.read_csv('test_processed.csv')
y_train = pd.read_csv('y_train.csv').values.ravel()
y_train_log = pd.read_csv('y_train_log.csv').values.ravel()

print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")
print(f"Target shape: {y_train.shape}")

# Define evaluation metric - RMSE on log scale
def rmse_cv(model, X, y):
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    rmse = np.sqrt(-cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=kf))
    return rmse

# Initialize lightweight models
models = {
    'LinearRegression': LinearRegression(),
    'Ridge': Ridge(alpha=10.0, random_state=42)
}

# Train and evaluate models
print("\nTraining and evaluating models...")
results = {}
predictions = {}

for name, model in models.items():
    print(f"\nTraining {name}...")
    model.fit(train, y_train_log)

    # Cross-validation score
    score = rmse_cv(model, train, y_train_log)
    print(f"{name} RMSE CV: {score.mean():.4f} ({score.std():.4f})")

    # Make predictions
    pred = model.predict(train)
    train_rmse = np.sqrt(mean_squared_error(y_train_log, pred))
    print(f"{name} Train RMSE: {train_rmse:.4f}")

    # Store results
    results[name] = {
        'cv_score': score.mean(),
        'cv_std': score.std(),
        'train_rmse': train_rmse
    }

    # Make predictions on test set
    test_pred = model.predict(test)
    predictions[name] = np.expm1(test_pred)  # Transform back from log scale

    # Save model predictions
    pd.DataFrame({
        'Id': range(1461, 1461 + len(test)),
        'SalePrice': np.expm1(test_pred)
    }).to_csv(f'model_results/{name}_predictions.csv', index=False)

# Use Ridge as the final model (typically more robust than simple linear regression)
print("\nPreparing final submission file...")
submission = pd.DataFrame({
    'Id': range(1461, 1461 + len(test)),
    'SalePrice': predictions['Ridge']
})
submission.to_csv('submission.csv', index=False)

print("\nModel training and prediction complete.")
print("Final submission file created: submission.csv")

# Save model performance summary
with open('model_results/model_summary.txt', 'w') as f:
    f.write("Model Performance Summary\n")
    f.write("=======================\n\n")
    for name in models.keys():
        f.write(f"{name}:\n")
        f.write(f"  CV RMSE: {results[name]['cv_score']:.4f} (±{results[name]['cv_std']:.4f})\n")
        f.write(f"  Train RMSE: {results[name]['train_rmse']:.4f}\n\n")


Loading preprocessed data...
Train shape: (1460, 86)
Test shape: (1459, 86)
Target shape: (1460,)

Training and evaluating models...

Training LinearRegression...
LinearRegression RMSE CV: 0.1432 (0.0230)
LinearRegression Train RMSE: 0.1225

Training Ridge...
Ridge RMSE CV: 0.1395 (0.0232)
Ridge Train RMSE: 0.1233

Preparing final submission file...

Model training and prediction complete.
Final submission file created: submission.csv


## Statistical Prediction

In [14]:
import pandas as pd
import numpy as np
import os

# Create output directory
os.makedirs('model_results', exist_ok=True)

# Load the original data (not the processed data that's causing issues)
print("Loading original data...")
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")

# Extract the target variable
y_train = train['SalePrice']
print(f"Target mean: {y_train.mean()}")
print(f"Target median: {y_train.median()}")

# Identify key numeric features that are likely to influence house prices
key_features = ['OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'FullBath', 'YearBuilt']
print(f"Selected key features: {key_features}")

# Create a simple prediction based on OverallQual (overall quality rating)
# This is the most important feature for house prices
print("\nCreating predictions based on OverallQual groups...")

# Group by OverallQual and calculate mean SalePrice for each group
qual_price_map = train.groupby('OverallQual')['SalePrice'].median().to_dict()

# For any missing OverallQual in test, use the median of all prices
median_price = y_train.median()

# Generate predictions
predictions = []
for qual in test['OverallQual']:
    if qual in qual_price_map:
        predictions.append(qual_price_map[qual])
    else:
        predictions.append(median_price)

# Format final submission
print("\nPreparing final submission file...")
submission = pd.DataFrame({
    'Id': test['Id'],
    'SalePrice': predictions
})
submission.to_csv('submission.csv', index=False)

print("\nPrediction complete.")
print("Final submission file created: submission.csv")
print(f"Predictions mean: {np.mean(predictions)}")
print(f"Predictions median: {np.median(predictions)}")

# Save a summary of the approach
with open('model_results/approach_summary.txt', 'w') as f:
    f.write("House Price Prediction Approach Summary\n")
    f.write("=====================================\n\n")
    f.write("Due to system constraints, a simplified statistical approach was used:\n\n")
    f.write("1. The 'OverallQual' feature was identified as the most important predictor of house prices\n")
    f.write("2. For each 'OverallQual' value in the test set, the median sale price of houses with that quality in the training set was used\n")
    f.write("3. For any 'OverallQual' values not present in training, the overall median price was used\n\n")
    f.write("This approach provides a reasonable baseline prediction without requiring complex model training.\n\n")
    f.write("Quality-Price Mapping:\n")
    for qual, price in qual_price_map.items():
        f.write(f"  Quality {qual}: ${price:.2f}\n")


Loading original data...
Train shape: (1460, 81)
Test shape: (1459, 80)
Target mean: 180921.19589041095
Target median: 163000.0
Selected key features: ['OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'FullBath', 'YearBuilt']

Creating predictions based on OverallQual groups...

Preparing final submission file...

Prediction complete.
Final submission file created: submission.csv
Predictions mean: 177674.08567511995
Predictions median: 160000.0
