In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Load the dataset
data = pd.read_csv('cleaned_realtor_data_new.csv')

# Separate the regression target ('price') from the candidate feature columns
X = data.drop(columns='price')
y = data['price']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Columns routed to each preprocessing branch. Columns of X that appear in
# neither list are dropped by the ColumnTransformer (its default remainder).
# NOTE(review): 'zip_code' is scaled as a continuous number here although it is
# presumably an identifier rather than a quantity -- confirm this is intended.
numeric_features = ['bed', 'bath', 'acre_lot', 'house_size', 'zip_code']
# NOTE(review): one-hot encoding 'city' may create a very wide matrix if the
# column has many distinct values, which slows tree training -- verify cardinality.
categorical_features = ['status', 'city', 'state']

# Standardize numeric columns; one-hot encode categoricals, encoding categories
# unseen during fit as all-zero rows instead of raising at predict time
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Apply each transformer to its own column subset and concatenate the results
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ])

# Tree-based models to compare; uncomment an entry to include it in the run.
# Every entry ends with a comma so toggling lines never breaks the dict literal.
models = {
    # "Decision Tree": DecisionTreeRegressor(random_state=42),
    # n_jobs=-1 builds the trees on all available cores; it changes only how
    # the work is scheduled, not the model configuration.
    "Random Forest": RandomForestRegressor(n_estimators=50, random_state=42, n_jobs=-1),
    # "Extra Trees": ExtraTreesRegressor(n_estimators=50, random_state=42),
}

# Mean absolute error on the held-out test set, keyed by model name
mae_scores = {}

for name, model in models.items():
    # Chain preprocessing and the estimator so the transformers are fitted on
    # the training rows only and then re-applied unchanged to the test rows
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('model', model)])
    print("starting")
    pipeline.fit(X_train, y_train)
    predictions = pipeline.predict(X_test)
    mae_scores[name] = mean_absolute_error(y_test, predictions)
    # Report the running results after each model so progress is visible
    print(mae_scores)

print(mae_scores)


starting


KeyboardInterrupt: 