In [14]:
import pandas as pd
data1 = pd.read_csv("./CostarExport_1.csv") # getting the data
data2 = pd.read_csv("./LA_multifamily.csv") # getting the data

columnsInD1 = data1.keys()
columnsInD2 = data2.keys()

validCols = ['Size', 'Net Income', 
                     'Number Of Units', 'Number Of Parking Spaces', 'Typical Floor (SF)',
                     'Number Of Studios Units', 'Number Of 1 Bedrooms Units', 'Number Of 2 Bedrooms Units',
                     'Number Of 3 Bedrooms Units']

print('Number Of Studios Units' in columnsInD2)
mixed = pd.merge(data2, data1,
                 on = validCols,
                 how = 'outer')

merged_file = './merged_file.csv'

mixed.to_csv(merged_file, index = False)
pd.DataFrame(mixed)


True


KeyError: 'Number Of Studios Units'

In [7]:
# imports

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import numpy as np
from sklearn.preprocessing import PolynomialFeatures

# cleanup
merged_file = r'D:\Code\Matthews\data\merged_data.csv'
df = pd.read_csv(merged_file) 

columns_to_clean = ["Sale Price", "Net Income", "Building SF", 
                    "Star Rating", "Price Per AC Land",
                    "Price Per Unit", "Down Payment", "Size", "Total Expense Amount",
                    'Parcel Number 1 (Min)', 'Parcel Number 2 (Max)', "Price Per SF Land"]
                    
for col in columns_to_clean:
    df[col] = df[col].str.replace(',', '')
    df[col] = df[col].str.replace('$', '')
    df[col] = df[col].str.replace(' Star', '')
    df[col] = df[col].str.replace('-', '')
    

# Convert the columns to numeric
df[columns_to_clean] = df[columns_to_clean].apply(pd.to_numeric, errors='coerce')
keep = ['Size', 'Building SF', 'Number Of Units', 'Number Of Floors', 
         'Price Per AC Land', 'Price Per SF Land', 'Asking Price', 
         'Number Of 1 Bedrooms Units', 'Number Of 2 Bedrooms Units', 
         'Floor Area Ratio', 'Number Of Parking Spaces', 
         'Number Of Studios Units', 'Typical Floor (SF)', 
         'Number Of 3 Bedrooms Units', 'Land Area AC', 'Land Area SF', 
         'Star Rating', 'Net Income', 'Year Built', 'Age',
         'Sale Price']

df = df[keep]
df.dropna(axis = 0, inplace=True)


num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])



# Save the cleaned dataset
cleaned_file = '/Users/lehergulati/Downloads/matthews-data-files/cleaned_dataset.csv'
df.to_csv(cleaned_file, index=False)

KeyError: 'Sale Price'

In [14]:
import pickle
from joblib import dump, load
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import numpy as np
from sklearn.preprocessing import PolynomialFeatures

# selected_features = ['Building SF', 'Size', 'Net Income', 'Number Of Units', 
#                      'Number Of Parking Spaces', 'Typical Floor (SF)',
#                      'Land Area SF', 'Land Area AC', 'Number Of 2 Bedrooms Units',
#                      'Number Of 3 Bedrooms Units', 'Number Of 1 Bedrooms Units']

# Building SF                   0.865942    0.184476 -0.184476    0.892998  
# Size                          0.865942    0.184476 -0.184476    0.892998  
# Net Income                    1.000000    0.259688 -0.259688    0.848735  
# Number Of Units               0.757420    0.105807 -0.105807    0.841576  
# Number Of Parking Spaces      0.752257    0.221623 -0.221623    0.806509  
# Typical Floor (SF)

num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

selected_features = ['Size', 'Net Income', 
                     'Number Of Units', 'Number Of Parking Spaces', 'Typical Floor (SF)',
                     'Number Of Studios Units', 'Number Of 1 Bedrooms Units', 'Number Of 2 Bedrooms Units',
                     'Number Of 3 Bedrooms Units']
# setting train test data

df = pd.read_csv('cleaned_dataset.csv')

df.dropna(axis = 0)

X = df[selected_features]
y = df['Sale Price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=15)


# models

models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(),
    'Gradient Boosting': GradientBoostingRegressor()
}

num_cols = X_train.select_dtypes(include=['float64', 'int64']).columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_cols),
    ])


# run models
maes = []
for name, model in models.items():
    
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('model', model)
                              ])
    
    pipeline.fit(X_train, y_train)

    with open(f"v2_model{name}.pkl", "wb") as f:
        pickle.dump(model, f)

    y_pred = pipeline.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    mpe = np.mean(np.abs((y_test - y_pred)/y_test))*100
    maes += [mae]

    print(f"Model: {name}")
    print(f"Mean Absolute Error: {mae}")
    print(f"R-squared: {r2}")
    print(f"Mean Percentage Error: {mpe}")
    print("="*40)





non_numeric_columns = X.select_dtypes(exclude=['number']).columns
X_train_numeric = X_train.drop(columns=non_numeric_columns)
X_test_numeric = X_test.drop(columns=non_numeric_columns)

poly = PolynomialFeatures(degree=2, interaction_only = 2, include_bias = False)

X_poly_train = poly.fit_transform(X_train_numeric)
X_poly_test = poly.transform(X_test_numeric)

rf = RandomForestRegressor(random_state = 42)
gb = GradientBoostingRegressor(random_state = 42)
high_performing_models = {
    'Random Forest': rf,
    'Gradient Boosting': gb
}

for name, model in high_performing_models.items():
    model.fit(X_poly_train, y_train)
    y_pred = model.predict(X_poly_test)
                           
    mpe = np.mean(np.abs((y_test - y_pred)/y_test))*100

    print(f"{name} - MAE: {mean_absolute_error(y_test, y_pred)}, R2: {r2_score(y_test, y_pred)}, MPE: {mpe}")
print("="*80)

Model: Linear Regression
Mean Absolute Error: 709779.538525532
R-squared: 0.9314609490590121
Mean Percentage Error: 27.978595886937757
Model: Random Forest
Mean Absolute Error: 545259.6971052632
R-squared: 0.924536715631016
Mean Percentage Error: 16.799344110058755
Model: Gradient Boosting
Mean Absolute Error: 566635.0208138095
R-squared: 0.9341271388117273
Mean Percentage Error: 17.312193365916993




Random Forest - MAE: 561723.8727631578, R2: 0.9363583277301212, MPE: 17.286924005305394
Gradient Boosting - MAE: 578117.3236018096, R2: 0.9182035906950465, MPE: 17.735956061878692


In [15]:
with open('maes.txt', "w") as f:
    for mae in maes:
        f.write(f"{mae}\n")