In [7]:
import numpy as np 
import pandas as pd 

In [8]:
# Load the training data from train.csv and preview the first rows
# (rich display of the last expression shows the table below).
df = pd.read_csv("train.csv")
df.head()

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Price,Address
0,79545.458574,5.682861,7.009188,4.09,23086.800503,1059034.0,"208 Michael Ferry Apt. 674\nLaurabury, NE 3701..."
1,79248.642455,6.0029,6.730821,3.09,40173.072174,1505891.0,"188 Johnson Views Suite 079\nLake Kathleen, CA..."
2,61287.067179,5.86589,8.512727,5.13,36882.1594,1058988.0,"9127 Elizabeth Stravenue\nDanieltown, WI 06482..."
3,63345.240046,7.188236,5.586729,3.26,34310.242831,1260617.0,USS Barnett\nFPO AP 44820
4,59982.197226,5.040555,7.839388,4.23,26354.109472,630943.5,USNS Raymond\nFPO AE 09386


In [9]:
# Summarise column dtypes and non-null counts to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 7 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Avg. Area Income              5000 non-null   float64
 1   Avg. Area House Age           5000 non-null   float64
 2   Avg. Area Number of Rooms     5000 non-null   float64
 3   Avg. Area Number of Bedrooms  5000 non-null   float64
 4   Area Population               5000 non-null   float64
 5   Price                         5000 non-null   float64
 6   Address                       5000 non-null   object 
dtypes: float64(6), object(1)
memory usage: 273.6+ KB


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import joblib
from typing import Tuple, Dict

class HousePricePipeline:
    """Train and apply an averaged ensemble of regressors for house prices.

    The workflow is: add one engineered feature (rooms / bedrooms ratio),
    median-impute and standardise the numeric features, fit a random forest,
    a gradient boosting model and a linear regression, and average their
    predictions.

    Attributes:
        preprocessing_pipeline: The fitted preprocessing ``Pipeline``, or
            ``None`` until ``train`` has run (or ``predict`` has loaded it
            from ``pipeline_path``).
        model: Dict mapping model name to fitted estimator, or ``None`` until
            ``train`` has run (or ``predict`` has loaded it from
            ``models_path``).
        feature_names: Columns fed to the preprocessing pipeline. Starts with
            the five raw columns; the engineered ratio column is added (once)
            the first time ``feature_engineering`` is called.
        pipeline_path: File the fitted preprocessing pipeline is saved to.
        models_path: File the dict of fitted models is saved to.
    """

    # Name of the single engineered column produced by feature_engineering.
    _RATIO_FEATURE = 'Rooms_to_Bedrooms_Ratio'

    def __init__(self,
                 pipeline_path: str = 'preprocessing_pipeline.pkl',
                 models_path: str = 'trained_models.pkl'):
        """Initialise an untrained pipeline.

        Args:
            pipeline_path: Where ``train`` saves (and ``predict`` may load)
                the fitted preprocessing pipeline.
            models_path: Where ``train`` saves (and ``predict`` may load)
                the dict of fitted models.
        """
        self.pipeline_path = pipeline_path
        self.models_path = models_path
        self.preprocessing_pipeline = None
        self.model = None
        self.feature_names = [
            'Avg. Area Income',
            'Avg. Area House Age',
            'Avg. Area Number of Rooms',
            'Avg. Area Number of Bedrooms',
            'Area Population'
        ]

    def create_preprocessing_pipeline(self) -> "Pipeline":
        """Create the (unfitted) preprocessing pipeline.

        Every column currently listed in ``self.feature_names`` is
        median-imputed and then standardised; columns not listed are dropped
        by the ``ColumnTransformer`` default.

        Returns:
            A ``Pipeline`` with a single ``'preprocessor'`` step.
        """
        # Snapshot the list so later changes to self.feature_names cannot
        # silently alter the column spec held by the transformer.
        numeric_features = list(self.feature_names)

        numeric_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ])

        preprocessor = ColumnTransformer(
            transformers=[
                ('num', numeric_transformer, numeric_features)
            ])

        return Pipeline([
            ('preprocessor', preprocessor)
        ])

    def feature_engineering(self, X: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``X`` with the engineered ratio column added.

        Args:
            X: Frame containing at least 'Avg. Area Number of Rooms' and
                'Avg. Area Number of Bedrooms'. It is not modified.

        Returns:
            A new frame with an extra 'Rooms_to_Bedrooms_Ratio' column.

        Raises:
            KeyError: If either source column is missing from ``X``.
        """
        X = X.copy()
        ratio = X['Avg. Area Number of Rooms'] / X['Avg. Area Number of Bedrooms']
        # A zero bedroom count yields +/-inf, which the imputer/scaler would
        # reject; mark it as missing so the median imputer fills it instead.
        X[self._RATIO_FEATURE] = ratio.replace([np.inf, -np.inf], np.nan)
        # Register the column only once: this method runs on every train()
        # and predict() call, and duplicates would make the preprocessing
        # pipeline select the same column repeatedly.
        if self._RATIO_FEATURE not in self.feature_names:
            self.feature_names.append(self._RATIO_FEATURE)
        return X

    def create_ensemble_model(self):
        """Create the unfitted base regressors.

        Returns:
            Dict with keys 'rf', 'gb' and 'lr' mapping to a random forest,
            a gradient boosting regressor and a linear regression. The two
            tree ensembles use ``random_state=42`` for reproducibility.
        """
        models = {
            'rf': RandomForestRegressor(n_estimators=100, random_state=42),
            'gb': GradientBoostingRegressor(n_estimators=100, random_state=42),
            'lr': LinearRegression()
        }

        return models

    def train_models(self, X_train: pd.DataFrame, y_train: pd.Series) -> Dict:
        """Fit every base regressor on the same training data.

        Args:
            X_train: Preprocessed training features.
            y_train: Training targets.

        Returns:
            Dict mapping model name to the fitted estimator.
        """
        trained_models = {}
        for name, model in self.create_ensemble_model().items():
            model.fit(X_train, y_train)
            trained_models[name] = model

        return trained_models

    def ensemble_predictions(self, models: Dict, X: pd.DataFrame) -> np.ndarray:
        """Average the predictions of all models.

        Args:
            models: Dict of fitted estimators exposing ``predict``.
            X: Preprocessed features to predict on.

        Returns:
            1-D array holding the unweighted mean prediction per row.

        Raises:
            ValueError: If ``models`` is empty.
        """
        if not models:
            raise ValueError("ensemble_predictions requires at least one model")
        predictions = np.column_stack([
            model.predict(X) for model in models.values()
        ])
        return np.mean(predictions, axis=1)

    def train(self, data_path: str) -> Tuple[Dict[str, float], Dict]:
        """Train the complete pipeline and return metrics.

        Reads the CSV, engineers features, holds out 20% of the rows
        (``random_state=42``), fits the preprocessing pipeline on the
        training split only, fits the ensemble, and evaluates on both splits.

        Side effects: stores the fitted objects on the instance and writes
        them to ``self.pipeline_path`` and ``self.models_path``.

        Args:
            data_path: Path to a CSV containing the feature columns and a
                'Price' target column. An 'Address' column, if present, is
                ignored.

        Returns:
            ``(metrics, trained_models)`` where ``metrics`` has the keys
            'train_r2', 'test_r2', 'train_rmse' and 'test_rmse'.
        """
        # Load and prepare data; tolerate files that lack the Address column.
        df = pd.read_csv(data_path)
        df = df.drop(columns=['Address'], errors='ignore')

        # Feature engineering
        X = self.feature_engineering(df.drop('Price', axis=1))
        y = df['Price']

        # Split the data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        # Fit preprocessing on the training split only to avoid leakage.
        self.preprocessing_pipeline = self.create_preprocessing_pipeline()
        X_train_processed = self.preprocessing_pipeline.fit_transform(X_train)
        X_test_processed = self.preprocessing_pipeline.transform(X_test)

        # Train models and keep them on the instance so predict() can use
        # them without a round trip through the filesystem.
        trained_models = self.train_models(X_train_processed, y_train)
        self.model = trained_models

        # Make predictions
        train_predictions = self.ensemble_predictions(trained_models, X_train_processed)
        test_predictions = self.ensemble_predictions(trained_models, X_test_processed)

        # Calculate metrics (cast to plain floats to match the annotation).
        metrics = {
            'train_r2': float(r2_score(y_train, train_predictions)),
            'test_r2': float(r2_score(y_test, test_predictions)),
            'train_rmse': float(np.sqrt(mean_squared_error(y_train, train_predictions))),
            'test_rmse': float(np.sqrt(mean_squared_error(y_test, test_predictions)))
        }

        # Save pipelines
        joblib.dump(self.preprocessing_pipeline, self.pipeline_path)
        joblib.dump(trained_models, self.models_path)

        return metrics, trained_models

    def predict(self, X: pd.DataFrame) -> np.ndarray:
        """Make predictions on new data.

        Uses the fitted objects held on the instance; anything not yet in
        memory (e.g. on a fresh instance) is loaded from ``self.pipeline_path``
        / ``self.models_path``.

        Args:
            X: Raw feature frame with the same columns used for training.

        Returns:
            1-D array of ensemble-averaged predictions, one per row of ``X``.
        """
        # SECURITY: joblib.load unpickles arbitrary objects; only load
        # artifact files that come from a trusted source.
        if self.preprocessing_pipeline is None:
            self.preprocessing_pipeline = joblib.load(self.pipeline_path)
        if self.model is None:
            self.model = joblib.load(self.models_path)

        # Feature engineering
        X = self.feature_engineering(X)

        # Preprocess the data
        X_processed = self.preprocessing_pipeline.transform(X)

        # Make predictions
        return self.ensemble_predictions(self.model, X_processed)

# Example usage: train the pipeline on train.csv and print the evaluation
# metrics. Note that train() also writes preprocessing_pipeline.pkl and
# trained_models.pkl (relative paths) as a side effect.
if __name__ == "__main__":
    pipeline = HousePricePipeline()
    metrics, models = pipeline.train("train.csv")
    print("Model Performance Metrics:", metrics)

Model Performance Metrics: {'train_r2': 0.9518282633263166, 'test_r2': 0.9085804191951348, 'train_rmse': np.float64(77611.96876769899), 'test_rmse': np.float64(106054.58661628918)}
