In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from joblib import dump

# Banner shown at the top of the run.
banner = "=" * 50
print(banner, "HOUSING PRICE PREDICTION MODEL", banner, sep="\n")

# Step 1: Load the data
# Read the housing CSV into a DataFrame and echo a quick summary of it.
print("\n1. Loading Data...")
data = pd.read_csv("/content/Housingdata.csv")
print("Data loaded successfully!")
print(f"Shape: {data.shape}")
print(f"Columns: {data.columns.tolist()}")
print("\nFirst 5 rows:")
print(data.head())

# Step 2: Data preprocessing
# Encodes text columns as integers, separates features from the "price"
# target, splits into train/test sets, and standardizes the features.
print("\n2. Data Preprocessing...")

# Work on a copy so the raw `data` frame is left untouched.
clean_data = data.copy()
print("Created data copy for cleaning")

# Find categorical columns (text columns)
cat_cols = clean_data.select_dtypes(include="object").columns
print(f"Categorical columns found: {list(cat_cols)}")

# Convert categorical data to numbers. Each fitted encoder is kept, keyed by
# column name, so the same mapping can be re-applied at prediction time.
encoders = {}
for col in cat_cols:
    encoder = LabelEncoder()
    clean_data[col] = encoder.fit_transform(clean_data[col])
    encoders[col] = encoder
    print(f"Encoded column: {col}")

print("All categorical columns converted to numbers")

# Separate features (X) and target (y)
X = clean_data.drop("price", axis=1)  # Features (everything except price)
y = clean_data["price"]               # Target (price we want to predict)
print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")

# Split BEFORE scaling. Fitting the scaler on the full dataset would let the
# test rows influence the mean/std used to transform the training rows
# (data leakage), which makes the test metrics optimistic.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the scaling parameters from the training split only, then apply the
# same transformation to the held-out split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full feature matrix scaled with the train-fitted scaler; kept under its
# original name in case later cells reference it.
X_scaled = scaler.transform(X)
print("Features scaled to standard range")

print(f"Training set: {X_train.shape[0]} samples")
print(f"Testing set: {X_test.shape[0]} samples")

# Step 3: Train different models
# Fits every candidate regressor on the training split and records its
# MSE and R² on the test split.
print("\n3. Training Models...")

# Candidate regressors, keyed by the short name used in the report.
models = dict(
    Linear=LinearRegression(),
    KNN=KNeighborsRegressor(),
    Tree=DecisionTreeRegressor(random_state=42),
    Forest=RandomForestRegressor(random_state=42),
)

# One result row (dict) per model, in the order the models were trained.
results = []
for name, model in models.items():
    print(f"\nTraining {name} model...")

    # Fit on the training split, then predict the held-out split.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Score the predictions against the true test targets.
    scores = {
        "Model": name,
        "MSE": mean_squared_error(y_test, y_pred),
        "R²": r2_score(y_test, y_pred),
    }
    results.append(scores)

    print(f"{name} - MSE: {scores['MSE']:.2f}, R²: {scores['R²']:.4f}")

# Step 4: Compare model performance
# Tabulate the per-model scores, highest R² first.
print("\n4. Model Comparison...")
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by="R²", ascending=False)
print(
    "\nModel Performance (sorted by R² score):",
    results_df.to_string(index=False),
    sep="\n",
)

# Step 5: Save the best model
# Picks the first row of the sorted results table and persists the matching
# fitted model together with the preprocessing objects.
print("\n5. Saving Best Model...")
top_row = results_df.iloc[0]
best_name, best_r2 = top_row["Model"], top_row["R²"]
best_model = models[best_name]

print(f"Best model: {best_name}", f"Best R² score: {best_r2:.4f}", sep="\n")

# Persist the fitted estimator.
dump(best_model, "/content/housingModel.joblib")
print("Best model saved as 'housingModel.joblib'")

# Persist the scaler and the per-column encoders for deployment.
dump(scaler, "/content/scaler.joblib")
dump(encoders, "/content/encoders.joblib")
print("Scaler saved as 'scaler.joblib'", "Encoders saved as 'encoders.joblib'", sep="\n")

# Persist the feature column names, in training order, for reference.
feature_names = list(X.columns)
dump(feature_names, "/content/feature_names.joblib")
print("Feature names saved as 'feature_names.joblib'")

# Closing summary of everything written above.
rule = "=" * 50
print(
    "\n" + rule,
    "ANALYSIS COMPLETE!",
    "Files saved:",
    "- housingModel.joblib (best model)",
    "- scaler.joblib (feature scaler)",
    "- encoders.joblib (categorical encoders)",
    "- feature_names.joblib (feature column names)",
    rule,
    sep="\n",
)

HOUSING PRICE PREDICTION MODEL

1. Loading Data...
Data loaded successfully!
Shape: (545, 13)
Columns: ['price', 'area', 'bedrooms', 'bathrooms', 'stories', 'mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'parking', 'prefarea', 'furnishingstatus']

First 5 rows:
      price  area  bedrooms  bathrooms  stories mainroad guestroom basement  \
0  13300000  7420         4          2        3      yes        no       no   
1  12250000  8960         4          4        4      yes        no       no   
2  12250000  9960         3          2        2      yes        no      yes   
3  12215000  7500         4          2        2      yes        no      yes   
4  11410000  7420         4          1        2      yes       yes      yes   

  hotwaterheating airconditioning  parking prefarea furnishingstatus  
0              no             yes        2      yes        furnished  
1              no             yes        3       no        furnished  
2              no     