In [2]:
# data_preprocessing.py

import pandas as pd

def preprocess_data():
    df = pd.read_csv("listings.csv")

    # Parse price
    df["price"] = df["price"].replace('[\$,]', '', regex=True).astype(float)

    # Drop rows with missing required fields
    df = df.dropna(subset=["price", "last_review", "review_scores_rating", "room_type", "neighbourhood_cleansed", "number_of_reviews"])

    # Convert last_review to datetime
    df["last_review"] = pd.to_datetime(df["last_review"])

    # Extract season
    df["month"] = df["last_review"].dt.month
    df["season"] = df["month"].map({
        12: 'winter', 1: 'winter', 2: 'winter',
        3: 'spring', 4: 'spring', 5: 'spring',
        6: 'summer', 7: 'summer', 8: 'summer',
        9: 'fall', 10: 'fall', 11: 'fall'
    })

    # Create a clean dataframe
    df_clean = df[[
        "price",
        "neighbourhood_cleansed",
        "room_type",
        "number_of_reviews",
        "review_scores_rating",
        "season"
    ]].rename(columns={
        "neighbourhood_cleansed": "city",
        "room_type": "property_type",
        "review_scores_rating": "review_rating"
    })

    df_clean.to_csv("airbnb_ready.csv", index=False)
    print("âœ… Clean data saved to airbnb_ready.csv")

if __name__ == "__main__":
    preprocess_data()


âœ… Clean data saved to airbnb_ready.csv


In [3]:
# train_model.py

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import joblib

def train_model():
    df = pd.read_csv("airbnb_ready.csv")

    # Convert season to string to avoid errors
    df["season"] = df["season"].astype(str)

    # One-hot encode categorical columns
    df_encoded = pd.get_dummies(df, columns=["city", "property_type", "season"], drop_first=True)

    X = df_encoded.drop("price", axis=1)
    y = df_encoded["price"]

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train model
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Save model and feature columns
    joblib.dump((model, list(X.columns)), "pricing_model.pkl")
    print("âœ… Model trained and saved as pricing_model.pkl")

if __name__ == "__main__":
    train_model()


âœ… Model trained and saved as pricing_model.pkl


In [4]:
# recommend_price.py

import pandas as pd
import joblib

def recommend_price(input_features: dict):
    model, model_columns = joblib.load("pricing_model.pkl")
    df_input = pd.DataFrame([input_features])

    # One-hot encode input
    df_input = pd.get_dummies(df_input)

    # Add missing columns
    for col in model_columns:
        if col not in df_input.columns:
            df_input[col] = 0

    # Reorder columns
    df_input = df_input[model_columns]

    prediction = model.predict(df_input)[0]
    return round(prediction, 2)

if __name__ == "__main__":
    sample_input = {
        "number_of_reviews": 25,
        "review_rating": 4.8,
        "city_PINE HILLS": 1,
        "property_type_Private room": 1,
        "season_summer": 1
    }

    price = recommend_price(sample_input)
    print(f"ðŸ’° Suggested price: ${price}")


ðŸ’° Suggested price: $146.7
