In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder
import joblib
import warnings
# Silence every warning for the rest of the session so cell outputs stay
# uncluttered.
# NOTE(review): a blanket filter also hides pandas / scikit-learn warnings
# (deprecations, chained assignment) — consider narrowing it by category.
warnings.filterwarnings(action="ignore")

startup_message = "âœ… Libraries loaded successfully!"
print(startup_message)

âœ… Libraries loaded successfully!


In [2]:
# Read the training CSV from the notebook's working directory.
df = pd.read_csv('train.csv')

# Preview the top of the table (DataFrame.head() returns 5 rows by default).
print("Dataset loaded! Here are the first 5 rows:")
print(df.head())

# Report the table dimensions: rows are reported as houses and every column
# (whatever it holds) is counted as a feature.
n_houses, n_columns = df.shape
print(f"\nDataset has {n_houses} houses and {n_columns} features")

Dataset loaded! Here are the first 5 rows:
   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL         65.0     8450   Pave   NaN      Reg   
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   
4   5          60       RL         84.0    14260   Pave   NaN      IR1   

  LandContour Utilities  ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold  \
0         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
1         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      5   
2         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      9   
3         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
4         Lvl    AllPub  ...        0    NaN   NaN         NaN       0     12   

  YrSold  SaleType  SaleC

In [4]:
# Keep the 8 predictor columns plus the target ('SalePrice') — 9 columns total.
features_to_keep = ['OverallQual', 'GrLivArea', 'TotalBsmtSF', 'GarageCars',
                     'BedroomAbvGr', 'FullBath', 'YearBuilt', 'Neighborhood', 'SalePrice']

# Take an explicit copy of the column subset so that any later column
# assignment on `df` operates on an independent frame. Assigning into a frame
# obtained by selection can raise pandas' SettingWithCopyWarning, which the
# notebook's global warnings filter would otherwise hide.
df = df[features_to_keep].copy()

print("âœ… Selected only the 9 recommended features")
print(df.head())

âœ… Selected only the 9 recommended features
   OverallQual  GrLivArea  TotalBsmtSF  GarageCars  BedroomAbvGr  FullBath  \
0            7       1710          856           2             3         2   
1            6       1262         1262           2             3         2   
2            7       1786          920           2             3         2   
3            7       1717          756           3             3         1   
4            8       2198         1145           3             4         2   

   YearBuilt Neighborhood  SalePrice  
0       2003      CollgCr     208500  
1       1976      Veenker     181500  
2       2001      CollgCr     223500  
3       1915      Crawfor     140000  
4       2000      NoRidge     250000  


In [5]:
# Count the null entries in every column to see whether cleaning is needed.
missing_per_column = df.isna().sum()

print("Missing values in each column:")
print(missing_per_column)

Missing values in each column:
OverallQual     0
GrLivArea       0
TotalBsmtSF     0
GarageCars      0
BedroomAbvGr    0
FullBath        0
YearBuilt       0
Neighborhood    0
SalePrice       0
dtype: int64


In [6]:
# Impute gaps in the two numeric columns with each column's median
# (the median, not the mean/average).
# DataFrame.fillna with a per-column dict returns a new frame, which avoids
# assigning into columns of a frame that was produced by column selection
# (the pattern that triggers pandas' SettingWithCopyWarning).
median_fill = {
    'TotalBsmtSF': df['TotalBsmtSF'].median(),
    'GarageCars': df['GarageCars'].median(),
}
df = df.fillna(median_fill)

# Drop any row that still has a missing value in any remaining column.
df = df.dropna()

print("âœ… Missing data handled!")
print(f"Now we have {df.shape[0]} houses with complete information")

âœ… Missing data handled!
Now we have 1460 houses with complete information


In [7]:
# Feature choice: any 6 of the available predictor columns may be used.
# These six are the ones picked here; edit the list to experiment.
selected_features = [
    'OverallQual',
    'GrLivArea',
    'TotalBsmtSF',
    'GarageCars',
    'YearBuilt',
    'Neighborhood',
]

# y is the target column (sale price); X holds the predictor columns.
# X is an explicit copy so that later edits to it do not touch `df`.
y = df['SalePrice']
X = df[selected_features].copy()

print("âœ… Selected these 6 features:")
for position, column_name in enumerate(selected_features, start=1):
    print(f"  {position}. {column_name}")

âœ… Selected these 6 features:
  1. OverallQual
  2. GrLivArea
  3. TotalBsmtSF
  4. GarageCars
  5. YearBuilt
  6. Neighborhood


In [8]:
# 'Neighborhood' holds text codes (e.g. "CollgCr", "Veenker"), which the
# regression cannot consume directly, so each distinct name is mapped to an
# integer.
# NOTE(review): the integer codes carry an arbitrary ordering that a linear
# model will treat as a numeric quantity — consider one-hot encoding instead.

label_encoder = LabelEncoder()

# Fit on the untouched text column in `df` rather than on X['Neighborhood']:
# X's column is overwritten on this very line, so re-running the cell would
# otherwise refit the encoder on already-encoded integers and save a mapping
# that no longer knows the neighborhood names. X was built from `df`, so the
# rows line up one-to-one.
X['Neighborhood'] = label_encoder.fit_transform(df['Neighborhood'])

# Persist the fitted encoder so the same name -> integer mapping can be
# reused later.
joblib.dump(label_encoder, 'label_encoder.pkl')

print("âœ… Converted neighborhood names to numbers")
print("Example:", X.head())

âœ… Converted neighborhood names to numbers
Example:    OverallQual  GrLivArea  TotalBsmtSF  GarageCars  YearBuilt  Neighborhood
0            7       1710          856           2       2003             5
1            6       1262         1262           2       1976            24
2            7       1786          920           2       2001             5
3            7       1717          756           3       1915             6
4            8       2198         1145           3       2000            15


In [9]:
# Hold out 20% of the rows for evaluation and train on the remaining 80%.
# The fixed random_state makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print("âœ… Data split complete!")
print(f"Training set: {len(X_train)} houses")
print(f"Testing set: {len(X_test)} houses")

âœ… Data split complete!
Training set: 1168 houses
Testing set: 292 houses


In [10]:
# Fit a linear regression on the training split. fit() returns the
# estimator itself, so construction and training can be chained.
model = LinearRegression().fit(X_train, y_train)

print("âœ… Model training complete!")
print("The computer has learned to predict house prices!")

âœ… Model training complete!
The computer has learned to predict house prices!


In [11]:
# Score the held-out rows with the fitted model.
y_pred = model.predict(X_test)

print("âœ… Made predictions on test data")
print("\nExample predictions:")

# Show the first test row as a spot check: true price next to the prediction.
first_actual = y_test.iloc[0]
first_predicted = y_pred[0]
print(f"Actual Price: ${first_actual:,.0f}")
print(f"Predicted Price: ${first_predicted:,.0f}")

âœ… Made predictions on test data

Example predictions:
Actual Price: $154,500
Predicted Price: $145,702


In [12]:
# Compute the evaluation metrics on the held-out predictions.
# RMSE is derived from MSE so it is expressed in the same units as the price.
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

# Assemble the report line by line, then emit it in a single print call.
separator = "=" * 50
report_lines = [
    "ðŸ“Š MODEL PERFORMANCE:",
    separator,
    f"MAE (Mean Absolute Error): ${mae:,.2f}",
    "  â†’ On average, predictions are off by this amount",
    "",
    f"RMSE (Root Mean Squared Error): ${rmse:,.2f}",
    "  â†’ Typical error in predictions",
    "",
    f"RÂ² Score: {r2:.4f}",
    f"  â†’ {r2*100:.2f}% of price variation is explained by the model",
    "  â†’ (Closer to 1.0 = better)",
    separator,
]
print("\n".join(report_lines))

ðŸ“Š MODEL PERFORMANCE:
MAE (Mean Absolute Error): $25,445.10
  â†’ On average, predictions are off by this amount

RMSE (Root Mean Squared Error): $39,798.63
  â†’ Typical error in predictions

RÂ² Score: 0.7935
  â†’ 79.35% of price variation is explained by the model
  â†’ (Closer to 1.0 = better)


In [13]:
# Persist the fitted regressor to disk so it can be reloaded later without
# retraining.
model_path = 'house_price_model.pkl'
joblib.dump(model, model_path)

print(f"âœ… Model saved as '{model_path}'")
# The encoder file is not written here; this line only reports on it.
print("âœ… Encoder saved as 'label_encoder.pkl'")
print("\nðŸŽ‰ PART A COMPLETE! You've successfully built a machine learning model!")

âœ… Model saved as 'house_price_model.pkl'
âœ… Encoder saved as 'label_encoder.pkl'

ðŸŽ‰ PART A COMPLETE! You've successfully built a machine learning model!
