In [3]:
!pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.7.1-cp313-cp313-win_amd64.whl.metadata (11 kB)
Collecting scipy>=1.8.0 (from scikit-learn)
  Downloading scipy-1.16.0-cp313-cp313-win_amd64.whl.metadata (60 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.7.1-cp313-cp313-win_amd64.whl (8.7 MB)
   ---------------------------------------- 0.0/8.7 MB ? eta -:--:--
   - -------------------------------------- 0.3/8.7 MB ? eta -:--:--
   ---- ----------------------------------- 1.0/8.7 MB 3.3 MB/s eta 0:00:03
   -------- ------------------------------- 1.8/8.7 MB 3.4 MB/s eta 0:00:03
   ------------ --------------------------- 2.6/8.7 MB 3.7 MB/s eta 0:00:02
   ---------------- ----------------------- 3.7/8.7 MB 3.8 MB/s eta 0:00:02
   ------------------- -------------------- 4.2/8.7 MB 3.6 MB/s eta 0:00:02
   ------------------------ --------------- 5.2/8.7 MB 3.9 MB/s eta 0:00:01
   ---

In [25]:
# 1. Imports
import pandas as pd
import numpy as np
import pickle

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# 2. Load full listings.csv (downloaded from Inside Airbnb)
df = pd.read_csv("merged_data.csv")  # Ensure this file has all 80+ columns

# 3. Select relevant features
selected_columns = [
    'price', 'room_type', 'neighbourhood_cleansed',
    'minimum_nights', 'number_of_reviews',
    'reviews_per_month', 'availability_365',
    'accommodates', 'bedrooms', 'beds'
]

df = df[selected_columns]

# 4. Drop missing values
df.dropna(inplace=True)

# 5. Convert price to numeric
df['price'] = df['price'].replace(r'[\$,]', '', regex=True).astype(float)

# 6. One-hot encode categorical variables
df = pd.get_dummies(df, columns=['room_type', 'neighbourhood_cleansed'], drop_first=True)

# 7. Define features and target
X = df.drop("price", axis=1)
y = np.log1p(df["price"])

# 8. Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 9. Train model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)


mae_log = mean_absolute_error(y_test, y_pred)
print("Original Price MAE (~):", np.expm1(mae_log))

# 10. Predict and evaluate
y_pred = model.predict(X_test)
y_pred_original = np.expm1(y_pred)
y_test_original = np.expm1(y_test)
print("MAE:", mean_absolute_error(y_test, y_pred))
print("R² Score:", r2_score(y_test, y_pred))

# 11. Save model and feature columns
with open("best_model.pkl", "wb") as f:
    pickle.dump(model, f)

with open("model_features.pkl", "wb") as f:
    pickle.dump(X.columns.tolist(), f)

print("✅ Model and features saved successfully.")


Original Price MAE (~): 0.37209326644509405
MAE: 0.3163375054517861
R² Score: 0.46555640420615485
✅ Model and features saved successfully.


In [29]:
# converting CSV to Json file
import pandas as pd

# Read CSV — path to your listings.csv file
df = pd.read_csv('static/listings.csv')

# Optional: Select columns you want to send, e.g. id, name, neighbourhood_cleansed, latitude, longitude, price
columns_to_keep = ['id', 'name', 'neighbourhood_cleansed', 'latitude', 'longitude', 'price', 'room_type']

df = df[columns_to_keep]

# Convert price from string like "$27,895.00" to float 27895.00
def clean_price(price_str):
    try:
        return float(price_str.replace('$', '').replace(',', ''))
    except:
        return None

df['price'] = df['price'].apply(clean_price)

# Convert to list of dicts
listings_json = df.to_dict(orient='records')

# Save to JSON file for Flask to serve
import json
with open('static/listings.json', 'w', encoding='utf-8') as f:
    json.dump(listings_json, f, ensure_ascii=False, indent=2)


In [30]:
# Rearranging neighbourhoods.geojson
import json

# List of Budapest kerület names in order
names = [
    "I. kerület", "II. kerület", "III. kerület", "IV. kerület", "V. kerület", "VI. kerület", "VII. kerület",
    "VIII. kerület", "IX. kerület", "X. kerület", "XI. kerület", "XII. kerület", "XIII. kerület", "XIV. kerület",
    "XV. kerület", "XVI. kerület", "XVII. kerület", "XVIII. kerület", "XIX. kerület", "XX. kerület", "XXI. kerület",
    "XXII. kerület", "XXIII. kerület"
]

with open('static/neighbourhoods.geojson', 'r', encoding='utf-8') as f:
    geojson = json.load(f)

for i, feature in enumerate(geojson['features']):
    feature['properties'] = feature.get('properties', {})
    feature['properties']['name'] = names[i] if i < len(names) else f"Unknown {i+1}"

with open('static/neighbourhoods.geojson', 'w', encoding='utf-8') as f:
    json.dump(geojson, f, ensure_ascii=False, indent=2)

print("Neighbourhood names assigned successfully.")

Neighbourhood names assigned successfully.


In [32]:
import json

geojson_path = "static/neighbourhoods.geojson"
with open(geojson_path, "r", encoding="utf-8") as f:
    data = json.load(f)

missing_names = []
for idx, feature in enumerate(data.get("features", [])):
    props = feature.get("properties", {})
    name = props.get("name")
    if not name or not isinstance(name, str) or not name.strip():
        missing_names.append((idx, props))

if missing_names:
    print("Features missing 'properties.name':")
    for idx, props in missing_names:
        print(f"Feature index: {idx}, properties: {props}")
else:
    print("All features have a valid 'properties.name' field.")

All features have a valid 'properties.name' field.


In [40]:
import json

with open('static/listings.json', 'r', encoding='utf-8') as f:
    listings = json.load(f)

neighbourhoods = set(d.get('neighbourhood_cleansed', '').strip() for d in listings)
room_types = set(d.get('room_type', '').strip() for d in listings)

print("Unique neighbourhood_cleansed values:")
for n in sorted(neighbourhoods):
    print(f"- '{n}'")

print("\nUnique room_type values:")
for r in sorted(room_types):
    print(f"- '{r}'")


Unique neighbourhood_cleansed values:
- 'I. kerület'
- 'II. kerület'
- 'III. kerület'
- 'IV. kerület'
- 'IX. kerület'
- 'V. kerület'
- 'VI. kerület'
- 'VII. kerület'
- 'VIII. kerület'
- 'X. kerület'
- 'XI. kerület'
- 'XII. kerület'
- 'XIII. kerület'
- 'XIV. kerület'
- 'XIX. kerület'
- 'XV. kerület'
- 'XVI. kerület'
- 'XVII. kerület'
- 'XVIII. kerület'
- 'XX. kerület'
- 'XXI. kerület'
- 'XXII. kerület'
- 'XXIII. kerület'

Unique room_type values:
- 'Entire home/apt'
- 'Hotel room'
- 'Private room'
- 'Shared room'


In [None]:
import pandas as pd

df = pd.read_csv('merged_data.csv')

# Fill missing numeric columns with zeros or appropriate defaults
df['minimum_nights'] = pd.to_numeric(df['minimum_nights'], errors='coerce').fillna(0).astype(int)
df['number_of_reviews'] = pd.to_numeric(df['number_of_reviews'], errors='coerce').fillna(0).astype(int)
df['reviews_per_month'] = pd.to_numeric(df['reviews_per_month'], errors='coerce').fillna(0)

# Also clean price column (remove $ and commas)
df['price'] = df['price'].replace('[\$,]', '', regex=True)
df['price'] = pd.to_numeric(df['price'], errors='coerce').fillna(0)

# Then save to JSON (records orient)
df.to_json('static/listings.json', orient='records', force_ascii=False)

  df['price'] = df['price'].replace('[\$,]', '', regex=True)


In [45]:
def clean_price(price_str):
    if not price_str:
        return 0
    try:
        return float(price_str.replace("$", "").replace(",", "").strip())
    except:
        return 0

filtered = [
    d for d in listings
    if d.get('neighbourhood_cleansed', '').strip() == "II. kerület"
    and d.get('room_type', '').strip() == "Entire home/apt"
    and abs(float(d.get('minimum_nights', 0)) - 3) <= 5
    and abs(float(d.get('number_of_reviews', 0)) - 10) <= 10
    and abs(float(d.get('reviews_per_month', 0) or 0) - 1.5) <= 2
]

print(f"Filtered count with looser ranges: {len(filtered)}")

if filtered:
    for d in filtered[:10]:
        print({
            'minimum_nights': d.get('minimum_nights'),
            'number_of_reviews': d.get('number_of_reviews'),
            'reviews_per_month': d.get('reviews_per_month'),
            'room_type': d.get('room_type'),
            'neighbourhood': d.get('neighbourhood_cleansed'),
            'price': clean_price(d.get('price')),
        })
else:
    print("No similar listings found.")


Filtered count with looser ranges: 287
{'minimum_nights': None, 'number_of_reviews': None, 'reviews_per_month': None, 'room_type': 'Entire home/apt', 'neighbourhood': 'II. kerület', 'price': 0}
{'minimum_nights': None, 'number_of_reviews': None, 'reviews_per_month': None, 'room_type': 'Entire home/apt', 'neighbourhood': 'II. kerület', 'price': 0}
{'minimum_nights': None, 'number_of_reviews': None, 'reviews_per_month': None, 'room_type': 'Entire home/apt', 'neighbourhood': 'II. kerület', 'price': 0}
{'minimum_nights': None, 'number_of_reviews': None, 'reviews_per_month': None, 'room_type': 'Entire home/apt', 'neighbourhood': 'II. kerület', 'price': 0}
{'minimum_nights': None, 'number_of_reviews': None, 'reviews_per_month': None, 'room_type': 'Entire home/apt', 'neighbourhood': 'II. kerület', 'price': 0}
{'minimum_nights': None, 'number_of_reviews': None, 'reviews_per_month': None, 'room_type': 'Entire home/apt', 'neighbourhood': 'II. kerület', 'price': 0}
{'minimum_nights': None, 'numbe

In [50]:
import pandas as pd
import numpy as np
import pickle

# Load your dataset (the cleaned and preprocessed CSV)
df = pd.read_csv('merged_data.csv')

# Clean price column as in training
df['price'] = df['price'].replace(r'[\$,]', '', regex=True).astype(float)

# Filter for similar listings with loose matching criteria
target = {
    'minimum_nights': 3,
    'number_of_reviews': 10,
    'reviews_per_month': 1.5,
    'room_type': 'Entire home/apt',
    'neighbourhood_cleansed': 'II. kerület'
}

filtered = df[
    (df['neighbourhood_cleansed'] == target['neighbourhood_cleansed']) &
    (df['room_type'] == target['room_type']) &
    (df['minimum_nights'].between(target['minimum_nights'] - 1, target['minimum_nights'] + 1)) &
    (df['number_of_reviews'].between(target['number_of_reviews'] - 2, target['number_of_reviews'] + 2)) &
    (df['reviews_per_month'].between(target['reviews_per_month'] - 1, target['reviews_per_month'] + 1))
]

print(f"Found {len(filtered)} similar listings.")

if len(filtered) > 0:
    actual_avg_price = filtered['price'].mean()
    print(f"Average actual price of similar listings: €{actual_avg_price:.2f}")
else:
    print("No similar listings found.")

# Load your saved model and feature list
with open("model/best_model.pkl", "rb") as f:
    model = pickle.load(f)
with open("model/model_features.pkl", "rb") as f:
    model_features = pickle.load(f)

# Prepare input vector for model prediction (remember to do one-hot encoding for categorical vars)
input_dict = {
    'minimum_nights': target['minimum_nights'],
    'number_of_reviews': target['number_of_reviews'],
    'reviews_per_month': target['reviews_per_month'],
    # For one-hot encoding:
    **{f"room_type_{target['room_type']}": 1},
    **{f"neighbourhood_cleansed_{target['neighbourhood_cleansed']}": 1},
}

# Set zeros for all other features expected by the model
for feat in model_features:
    if feat not in input_dict:
        input_dict[feat] = 0

# Create DataFrame
input_df = pd.DataFrame([input_dict], columns=model_features)

# Predict log(price)
log_pred = model.predict(input_df)[0]
predicted_price = np.expm1(log_pred)  # revert log1p

print(f"Predicted price for input: €{predicted_price:.2f}")


Found 4 similar listings.
Average actual price of similar listings: €35742.00
Predicted price for input: €17887.60
