In [1]:
import pandas as pd
data = pd.read_csv("/Users/lehergulati/Downloads/CostarExport_1.csv") # getting the data

data.head()

Unnamed: 0,Property Address,Property City,Property State,Property County,Property Zip Code,Property Type,Sale Price,Sale Price Comment,Number Of Units,Size,...,Parcel Number 1 (Min),Parcel Number 2 (Max),Tenancy,Vacancy,Zoning,Studio Mix,One Bedroom Mix,Two Bedroom Mix,Three Bedroom Mix,Other Mix
0,1241 Irolo St,Los Angeles,CA,Los Angeles,90006,Multi-Family,"$2,990,000",Confirmed,4,6555,...,5080-035-009,5080-035-009,,,LAR3,,,,,2(4 + 4) 2(5 + 5)
1,722 Robinson St,Los Angeles,CA,Los Angeles,90026-3615,Multi-Family,"$1,460,000",Full Value,3,2898,...,5401-017-008,5401-017-008,,,,,,3(2 + 1),,
2,1821 S New England St,Los Angeles,CA,Los Angeles,90006-5313,Multi-Family,"$2,950,000",Confirmed,4,7443,...,5056-016-015,5056-016-015,,100.0,LARD1.5,,,,,1(5 + 4) 3(5 + 5)
3,1510-1514 S Cochran Ave,Los Angeles,CA,Los Angeles,90019-4068,Multi-Family,"$2,830,602",Confirmed,12,9166,...,5070-012-056,5070-012-056,,,LARD1.5,,8(1 + 1),4(2 + 1),,
4,951 S Berendo St,Los Angeles,CA,Los Angeles,90006,Multi-Family,"$1,750,000",Confirmed,14,4349,...,5078-007-003,5078-007-003,,3.9,LAR4,,,,,


In [2]:
# imports

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [3]:
# setting sale price as ouput and removing it from input

X = data.drop('Sale Price', axis=1)
y = data['Sale Price']

In [4]:
# establishing test & train data

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
# identifying numerical & categorical columns

num_cols = X_train.select_dtypes(include=['float64', 'int64']).columns
cat_cols = X_train.select_dtypes(include=['object']).columns

In [6]:
# preprocessing numerical

num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

In [7]:
# preprocessing categorical

cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [8]:
# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_cols),
        ('cat', cat_transformer, cat_cols)
    ])

In [9]:
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(),
    'Gradient Boosting': GradientBoostingRegressor()
}

In [10]:
# Train and evaluate models
for name, model in models.items():
    # Create and evaluate the pipeline
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('model', model)
                              ])

    # Train the model
    pipeline.fit(X_train, y_train)

    # Make predictions
    y_pred = pipeline.predict(X_test)

    # Evaluate the model
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Print the performance metrics
    print(f"Model: {name}")
    print(f"Mean Absolute Error: {mae}")
    print(f"Mean Squared Error: {mse}")
    print(f"R-squared: {r2}")
    print("="*40)




ValueError: Input contains NaN