In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Notebook-wide plotting defaults: seaborn's white grid style and a
# 10x6 default figure size for figures that do not pass their own figsize.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (10, 6)})

In [None]:
# Download the Kaggle housing dataset and read its CSV files
import kagglehub
import os

path = kagglehub.dataset_download("marcopale/housing")
print("Path to dataset files:", path)

# Show every entry found in the download directory
for file in os.listdir(path):
    print(f"- {file}")

# Build both CSV paths, then read the feature table and the target table
train_file, target_file = (
    os.path.join(path, csv_name) for csv_name in ("train.csv", "target.csv")
)
train_df, target_df = pd.read_csv(train_file), pd.read_csv(target_file)

# Quick sanity check of what was loaded (first five train columns only)
print(f"\nTrain columns: {train_df.columns.tolist()[:5]}...")
print(f"Target columns: {target_df.columns.tolist()}")

In [None]:
# Build the modelling frame `data`: the training features plus a 'SalePrice' column.
if 'SalePrice' in train_df.columns:
    print("\n'SalePrice' already present in train_df — no merge needed.")
    data = train_df.copy()
else:
    # Columns present in both files are the candidate join keys
    common_cols = set(train_df.columns).intersection(set(target_df.columns))
    print("Common columns between train and target files:", common_cols)

    if not common_cols:
        # Nothing to join on: fail with a clear message instead of an IndexError
        raise ValueError(
            "train and target files share no column, so the target cannot be merged"
        )

    if 'Id' in common_cols:
        merge_key = 'Id'
    elif 'PID' in common_cols:
        merge_key = 'PID'
    else:
        # Set iteration order is not stable between runs, so pick the
        # fallback key deterministically (alphabetically first).
        merge_key = sorted(common_cols, key=str)[0]

    # Bring over only the key plus the columns train_df does not already have.
    # Merging every shared column would create '_x'/'_y' duplicates that would
    # later be treated as model features.
    target_only_cols = [c for c in target_df.columns if c not in common_cols]
    data = pd.merge(
        train_df,
        target_df[[merge_key] + target_only_cols],
        on=merge_key,
        how='left',
    )
    print(f"\nMerged on key: {merge_key}")

# No 'SalePrice_x'/'SalePrice_y' clean-up is needed: the merge only runs when
# train_df has no 'SalePrice' column, so pandas never has to suffix it.
if 'SalePrice' not in data.columns:
    raise KeyError("'SalePrice' column not found in the train or target file")

# Drop identifier columns (row order / parcel id) so they are not used as features
cols_to_drop = [c for c in ('Order', 'PID') if c in data.columns]

if cols_to_drop:
    data = data.drop(columns=cols_to_drop)
    print(f"Dropped identifier columns: {cols_to_drop}")

print(f"\nDataset loaded successfully!")
print(f"Shape: {data.shape}")
print(f"Columns: {data.columns.tolist()[:10]}...")
print(f"\nFirst 5 rows:")
print(data.head())

In [None]:
# Explore the Dataset
print("Dataset Information")
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() would append a stray "None" line to the output.
data.info()
print("\nColumn Names:")
print(data.columns.tolist())

In [None]:
# Check for Missing Values
print("Data Preprocessing - Handling Missing Values")
missing_counts = data.isnull().sum()
missing_pct = (missing_counts / len(data)) * 100

# One row per column: absolute and relative amount of missing data, worst first
missing_info = pd.DataFrame({
    'Missing_Count': missing_counts,
    'Percentage': missing_pct
}).sort_values('Missing_Count', ascending=False)

print("\nTop 10 columns with missing values:")
print(missing_info[missing_info['Missing_Count'] > 0].head(10))

# Handle Missing Values
# Drop columns with more than 50% missing values (the target is never dropped)
high_missing = missing_info[missing_info['Percentage'] > 50].index.tolist()
if 'SalePrice' in high_missing:
    high_missing.remove('SalePrice')

if high_missing:
    print(f"\nDropping {len(high_missing)} columns with >50% missing data")
    data = data.drop(columns=high_missing)

# Fill numeric columns with median; the target column is left untouched.
# Assign the result back instead of `data[col].fillna(..., inplace=True)`:
# the chained in-place form modifies a temporary object and is not guaranteed
# to update `data` (it is a no-op under pandas Copy-on-Write).
numeric_cols = data.select_dtypes(include=[np.number]).columns
for col in numeric_cols:
    if col != 'SalePrice' and data[col].isnull().any():
        data[col] = data[col].fillna(data[col].median())

# Fill categorical columns with mode (most frequent value)
categorical_cols = data.select_dtypes(include=['object']).columns
for col in categorical_cols:
    if data[col].isnull().any():
        data[col] = data[col].fillna(data[col].mode()[0])

print(f"\nMissing values after handling: {data.isnull().sum().sum()}")

In [None]:
# Encode Categorical Variables
print("Encoding Categorical Variables")

# Categorical columns with at most 15 distinct values are replaced by integer
# labels; columns with more distinct values are removed from the frame.
le = LabelEncoder()
for col in categorical_cols:
    # Skip names that are no longer present in the frame
    if col not in data.columns:
        continue

    if data[col].nunique() > 15:
        print(f"Dropped high cardinality column: {col}")
        data = data.drop(columns=[col])
        continue

    # The encoder is re-fitted per column, on the values cast to strings
    data[col] = le.fit_transform(data[col].astype(str))
    print(f"Encoded: {col}")


In [None]:
# Descriptive statistics of the prepared frame, via DataFrame.describe()
print(" Exploratory Data Analysis (EDA)")
print("\nStatistical Summary:")
summary_stats = data.describe()
print(summary_stats)


In [None]:
# Three side-by-side views of the target: raw histogram, boxplot, and a
# histogram of its natural logarithm.
sale_price = data['SalePrice']
fig, (ax_hist, ax_box, ax_log) = plt.subplots(1, 3, figsize=(14, 5))

# Panel 1: raw price histogram
ax_hist.hist(sale_price, bins=50, edgecolor='black', color='skyblue')
ax_hist.set_xlabel('Sale Price')
ax_hist.set_ylabel('Frequency')
ax_hist.set_title('Distribution of House Prices')

# Panel 2: boxplot of the raw prices
ax_box.boxplot(sale_price)
ax_box.set_ylabel('Sale Price')
ax_box.set_title('Boxplot of House Prices')

# Panel 3: histogram after taking the natural log
ax_log.hist(np.log(sale_price), bins=50, edgecolor='black', color='green')
ax_log.set_xlabel('Log(Sale Price)')
ax_log.set_ylabel('Frequency')
ax_log.set_title('Log-Transformed Distribution')

fig.tight_layout()
fig.savefig('price_distribution.png', dpi=300, bbox_inches='tight')
print("\nSaved: price_distribution.png")
plt.show()

In [None]:
# Correlation of every column with the target, strongest positive first
print("Correlation with Sale Price")
corr_matrix = data.corr()
correlations = corr_matrix['SalePrice'].sort_values(ascending=False)
print("\nTop 10 features correlated with SalePrice:")
# Eleven rows are shown; SalePrice's correlation with itself is expected at
# the top of the descending list.
print(correlations.head(11))

In [None]:
# Heatmap of pairwise correlations among the 11 columns whose absolute
# correlation with SalePrice is largest.
abs_corr = correlations.abs()
top_features = abs_corr.sort_values(ascending=False).index[:11]
top_corr = data[top_features].corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    top_corr,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
    ax=ax,
)
ax.set_title('Correlation Matrix - Top 10 Features')
fig.tight_layout()
fig.savefig('correlation_matrix.png', dpi=300, bbox_inches='tight')
print("Saved: correlation_matrix.png")
plt.show()

In [None]:
# Scatter the four entries ranked 2nd-5th in the descending correlation list
# against the target, one panel each on a 2x2 grid.
top_4 = correlations.iloc[1:5].index

fig, axes = plt.subplots(2, 2, figsize=(14, 10))
axes = axes.ravel()

for ax, feature in zip(axes, top_4):
    corr_value = correlations[feature]
    ax.scatter(data[feature], data['SalePrice'], alpha=0.5)
    ax.set(
        xlabel=feature,
        ylabel='Sale Price',
        title=f'SalePrice vs {feature}\n(Corr: {corr_value:.3f})',
    )
    ax.grid(True, alpha=0.3)

fig.tight_layout()
fig.savefig('feature_relationships.png', dpi=300, bbox_inches='tight')
print("Saved: feature_relationships.png")
plt.show()

In [None]:
# Handle Outliers
# NOTE: this cell transforms `data` in place of the previous frame, so it must
# be run exactly once per kernel session (re-running applies the logs twice).
print("Handling Outliers")

# --- 1. Remove extreme outliers in 'Gr Liv Area' (rows with 4000 or more)
initial_shape = data.shape
# .copy() gives an independent frame, so the column assignments below do not
# write into a filtered view of the previous frame.
data = data[data['Gr Liv Area'] < 4000].copy()

print(f"Dropped {initial_shape[0] - data.shape[0]} extreme Gr Liv Area outliers")

# --- 2. Log-transform SalePrice (to reduce skewness)
# From here on the target is log1p(price); use np.expm1 to get dollars back.
data['SalePrice'] = np.log1p(data['SalePrice'])

print("Applied log transformation to SalePrice")

# --- 3. Log-transform skewed numeric features
numeric_feats = data.select_dtypes(include=[np.number]).drop(columns=['SalePrice']).columns
skewness = data[numeric_feats].skew().sort_values(ascending=False)

skewed_features = skewness[skewness.abs() > 0.75].index

# log1p is only defined for values greater than -1; transforming anything else
# would introduce NaN/-inf into the features and break model fitting.
in_log_domain = (data[skewed_features].min() > -1).to_numpy()
skipped_features = skewed_features[~in_log_domain]
skewed_features = skewed_features[in_log_domain]

if len(skipped_features) > 0:
    print(f"Skipping {len(skipped_features)} skewed features with values <= -1: "
          f"{skipped_features.tolist()}")
print(f"Log-transforming {len(skewed_features)} skewed features")

for feat in skewed_features:
    data[feat] = np.log1p(data[feat])

print("Outlier handling complete!")

In [None]:
# Separate the target column from the predictors
print("Preparing Data for Modeling")
# y is the 'SalePrice' column; X is every remaining column of the frame
y = data['SalePrice']
X = data.drop(columns='SalePrice')
print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")

In [None]:
 # Split into Training and Test Sets (80-20)
print("Splitting Data: 80% Training, 20% Testing")
# Hold out 20% of the rows; the fixed random_state keeps the split reproducible
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

n_train, n_test = len(X_train), len(X_test)
print(f"Training set: {n_train} samples (80%)")
print(f"Testing set: {n_test} samples (20%)")


In [None]:
# Train Linear Regression Model
print("Training Linear Regression Model")
model = LinearRegression()
model.fit(X_train, y_train)
print("Model trained successfully!")
# The target was log1p-transformed in the outlier-handling step, so the
# intercept is a log-scale value and must not be formatted as dollars.
print(f"\nIntercept (log1p scale): {model.intercept_:,.4f}")


In [None]:
# Display Top Feature Coefficients
# One row per feature, ordered by absolute coefficient size (largest first).
# Coefficients are on the scale of the model's target and are not standardized
# across features.
coefficients = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': model.coef_
}).sort_values('Coefficient', key=abs, ascending=False)

# Actually show the table this cell is meant to display
print(coefficients.head(10))

In [None]:
# Predict on both splits with the fitted model
print("Making Predictions")
y_train_pred, y_test_pred = (model.predict(split) for split in (X_train, X_test))
print("Predictions completed!")

In [None]:
# Evaluate Model Performance
print("Model Evaluation")

# The target was log1p-transformed before the split, so the metrics below are
# on the log1p(price) scale and must not be read as dollar amounts.

# Training set metrics (log scale)
train_mse = mean_squared_error(y_train, y_train_pred)
train_rmse = np.sqrt(train_mse)
train_r2 = r2_score(y_train, y_train_pred)

# Test set metrics (log scale)
test_mse = mean_squared_error(y_test, y_test_pred)
test_rmse = np.sqrt(test_mse)
test_r2 = r2_score(y_test, y_test_pred)

# Dollar-scale RMSE: undo log1p with expm1 on both actuals and predictions
train_rmse_usd = np.sqrt(mean_squared_error(np.expm1(y_train), np.expm1(y_train_pred)))
test_rmse_usd = np.sqrt(mean_squared_error(np.expm1(y_test), np.expm1(y_test_pred)))

print("\nTRAINING SET PERFORMANCE:")
print(f"  Mean Squared Error (MSE, log scale): {train_mse:,.4f}")
print(f"  Root Mean Squared Error (RMSE, log scale): {train_rmse:,.4f}")
print(f"  RMSE in dollars (back-transformed): ${train_rmse_usd:,.2f}")
print(f"  R-squared (R², log scale): {train_r2:.4f}")

print("\nTEST SET PERFORMANCE:")
print(f"  Mean Squared Error (MSE, log scale): {test_mse:,.4f}")
print(f"  Root Mean Squared Error (RMSE, log scale): {test_rmse:,.4f}")
print(f"  RMSE in dollars (back-transformed): ${test_rmse_usd:,.2f}")
print(f"  R-squared (R², log scale): {test_r2:.4f}")

print("\nINTERPRETATION:")
print(f"  The model explains {test_r2*100:.2f}% of variance in log house prices")
print(f"  Typical prediction error (RMSE): ${test_rmse_usd:,.2f}")

In [None]:
# Plot Predicted vs Actual Prices
# Both y_test and y_test_pred are on the log1p(price) scale (the target was
# transformed before the split), so the axes are labelled accordingly.

print("Visualizing Results")


fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Predicted vs Actual, with the y = x line marking a perfect prediction
axes[0].scatter(y_test, y_test_pred, alpha=0.6, edgecolors='k', linewidths=0.5)
axes[0].plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()],
             'r--', lw=2, label='Perfect Prediction')
axes[0].set_xlabel('Actual Sale Price (log1p scale)')
axes[0].set_ylabel('Predicted Sale Price (log1p scale)')
axes[0].set_title(f'Predicted vs Actual House Prices\n(R² = {test_r2:.4f})')
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Residual Plot (residuals stay on the log1p scale)
residuals = y_test - y_test_pred
axes[1].scatter(y_test_pred, residuals, alpha=0.6, edgecolors='k', linewidths=0.5)
axes[1].axhline(y=0, color='r', linestyle='--', lw=2)
axes[1].set_xlabel('Predicted Sale Price (log1p scale)')
axes[1].set_ylabel('Residuals (Actual - Predicted, log1p scale)')
axes[1].set_title('Residual Plot')
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('prediction_results.png', dpi=300, bbox_inches='tight')
print("Saved: prediction_results.png")
plt.show()

In [None]:
# Two diagnostic panels: residual histogram and the ten largest coefficients
fig, (ax_resid, ax_coef) = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: how the residuals spread around zero
ax_resid.hist(residuals, bins=50, edgecolor='black', alpha=0.7, color='orange')
ax_resid.axvline(x=0, color='r', linestyle='--', lw=2)
ax_resid.set(
    xlabel='Residual Value',
    ylabel='Frequency',
    title='Distribution of Residuals',
)
ax_resid.grid(True, alpha=0.3)

# Right panel: horizontal bars for the first ten rows of the coefficient table
top_10 = coefficients.head(10)
bar_positions = range(len(top_10))
ax_coef.barh(bar_positions, top_10['Coefficient'], color='steelblue')
ax_coef.set_yticks(bar_positions)
ax_coef.set_yticklabels(top_10['Feature'])
ax_coef.set_xlabel('Coefficient Value')
ax_coef.set_title('Top 10 Feature Importance')

fig.tight_layout()
fig.savefig('additional_analysis.png', dpi=300, bbox_inches='tight')
print("Saved: additional_analysis.png")
plt.show()

In [None]:
# Final Analysis and Conclusions
print("Analysis & Conclusions")

# Check for overfitting/underfitting: compare R² between the two splits
r2_diff = train_r2 - test_r2

# The model was fitted on log1p(SalePrice), so test_rmse is a log-scale number.
# Back-transform actuals and predictions with expm1 to report an error in dollars.
test_rmse_dollars = np.sqrt(
    mean_squared_error(np.expm1(y_test), np.expm1(y_test_pred))
)

print("\nMODEL PERFORMANCE ASSESSMENT:")
if r2_diff > 0.1:
    print("\nOVERFITTING DETECTED:")
    print(f"  Training R²: {train_r2:.4f}")
    print(f"  Test R²: {test_r2:.4f}")
    print(f"  Difference: {r2_diff:.4f}")
    print("  The model performs much better on training data than test data.")
    print("  Recommendation: Use regularization (Ridge/Lasso regression)")
elif test_r2 < 0.5:
    print("\nUNDERFITTING DETECTED:")
    print(f"  Test R² is low: {test_r2:.4f}")
    print("  The model is too simple to capture the relationships in the data.")
    print("  Recommendation: Add polynomial features or use a more complex model")
else:
    print("\nMODEL PERFORMS WELL:")
    print(f"  Training R²: {train_r2:.4f}")
    print(f"  Test R²: {test_r2:.4f}")
    print(f"  Difference: {r2_diff:.4f}")
    print("  The model generalizes reasonably well to unseen data.")

print(f"\nKEY FINDINGS:")
print(f"  1. Model explains {test_r2*100:.1f}% of log house price variance")
print(f"  2. Typical prediction error (RMSE): ${test_rmse_dollars:,.0f}")
print(f"  3. Largest coefficients (effect on log price, not dollars):")
# Coefficients act on log1p(price), so they are printed as plain numbers
for rank, row in enumerate(coefficients.head(3).itertuples(index=False), 1):
    print(f"     {rank}. {row.Feature}: {row.Coefficient:,.4f}")

print("ANALYSIS COMPLETE!")