In [7]:
import pandas as pd
import numpy as np
import os 
import matplotlib.pyplot as plt
import seaborn as sns
import ast
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor 
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score

In [8]:
airbnbDataSet_filename = os.path.join(os.getcwd(), "airbnbListingsData.csv")

df = pd.read_csv(airbnbDataSet_filename, header=0)

print(df.shape) 
list(df.columns)




(28022, 50)


['name',
 'description',
 'neighborhood_overview',
 'host_name',
 'host_location',
 'host_about',
 'host_response_rate',
 'host_acceptance_rate',
 'host_is_superhost',
 'host_listings_count',
 'host_total_listings_count',
 'host_has_profile_pic',
 'host_identity_verified',
 'neighbourhood_group_cleansed',
 'room_type',
 'accommodates',
 'bathrooms',
 'bedrooms',
 'beds',
 'amenities',
 'price',
 'minimum_nights',
 'maximum_nights',
 'minimum_minimum_nights',
 'maximum_minimum_nights',
 'minimum_maximum_nights',
 'maximum_maximum_nights',
 'minimum_nights_avg_ntm',
 'maximum_nights_avg_ntm',
 'has_availability',
 'availability_30',
 'availability_60',
 'availability_90',
 'availability_365',
 'number_of_reviews',
 'number_of_reviews_ltm',
 'number_of_reviews_l30d',
 'review_scores_rating',
 'review_scores_cleanliness',
 'review_scores_checkin',
 'review_scores_communication',
 'review_scores_location',
 'review_scores_value',
 'instant_bookable',
 'calculated_host_listings_count',
 'cal

In [9]:
# dropping_col = [ 'host_name',
#  'host_location',
#  'host_about',
#  'host_response_rate',
#  'host_acceptance_rate',
#  'host_is_superhost',
#  'host_listings_count',
#  'host_total_listings_count',
#  'host_has_profile_pic',
#  'host_identity_verified',
#  'n_host_verifications',
# 'calculated_host_listings_count',
#  'calculated_host_listings_count_entire_homes',
#  'calculated_host_listings_count_private_rooms',
#  'calculated_host_listings_count_shared_rooms', 'neighborhood_overview', 'name', 'description'] 

# # Avoid errors if columns are missing
# df = df.drop(columns=dropping_col, errors='ignore') 
# df = df.drop(columns=[col for col in df.columns if 'host' in col.lower()], errors='ignore')

# list(df.columns)
# print(df.shape)

# df.head()


In [10]:
# Check missing values
missing_val = df.isnull().sum()
print("Missing Values Before Filling is: \n", missing_val)

Missing Values Before Filling is: 
 name                                                5
description                                       570
neighborhood_overview                            9816
host_name                                           0
host_location                                      60
host_about                                      10945
host_response_rate                              11843
host_acceptance_rate                            11113
host_is_superhost                                   0
host_listings_count                                 0
host_total_listings_count                           0
host_has_profile_pic                                0
host_identity_verified                              0
neighbourhood_group_cleansed                        0
room_type                                           0
accommodates                                        0
bathrooms                                           0
bedrooms                                      

In [11]:
# Fill missing values: numeric columns -> mean, categorical columns -> 'drop'
for col in df.columns:
    if df[col].dtype == 'object':  # Categorical column
        df.dropna(subset=[col], inplace=True)

    else:  # Numeric column
        df[col].fillna(df[col].mean(), inplace=True)

# Check missing values after filling
missing_val_after = df.isnull().sum()
print("Missing Values After Filling is: \n", missing_val_after)

# # Display updated column list
# print("Updated Columns:\n", list(df.columns))

Missing Values After Filling is: 
 name                                            0
description                                     0
neighborhood_overview                           0
host_name                                       0
host_location                                   0
host_about                                      0
host_response_rate                              0
host_acceptance_rate                            0
host_is_superhost                               0
host_listings_count                             0
host_total_listings_count                       0
host_has_profile_pic                            0
host_identity_verified                          0
neighbourhood_group_cleansed                    0
room_type                                       0
accommodates                                    0
bathrooms                                       0
bedrooms                                        0
beds                                            0
amenities      

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values alw

In [12]:
# Convert 'amenities' column (which contains lists as strings) into actual lists
df['amenities'] = df['amenities'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else [])

# Convert each list of amenities into a set
df['amenities'] = df['amenities'].apply(set)

In [13]:
# Gather all unique amenities
all_amenities = set()

for amenities in df['amenities']:
    all_amenities.update(amenities)


In [14]:
print(all_amenities)

{'Pantene and Head & Shoulders shampoo', 'Beko (Turkish) stainless steel oven', 'Dedicated workspace: office chair, monitor, desk, and table', '57" HDTV with Amazon Prime Video, Netflix', 'Fridgedare 30 inches stainless steel gas stove', '48" HDTV with Apple TV, Netflix, HBO Max', 'Sonos Bluetooth sound system', 'ICE Air conditioner', 'Summit refrigerator', 'Game console: PS3, PS4, and Xbox Series X', 'black african soap body soap', 'ACURE conditioner', 'Luxury  conditioner', '30" HDTV', 'Samsung sound bar/ subwoofer sound system with Bluetooth and aux', 'Dr. Brommers body soap', 'Changing table', '32" HDTV with Netflix, standard cable', 'Many body soap', 'Fast wifi – 327 Mbps', 'Game console: Xbox 360', 'Amika conditioner', 'Fast wifi – 490 Mbps', '50" TV with standard cable', 'Electric stovetop cooker stainless steel electric stove', 'Central Air  conditioner', '50" HDTV with Amazon Prime Video', 'Dedicated workspace: office chair, monitor, table, and desk', 'KENMOORE refrigerator', 

In [15]:
import pandas as pd
from rapidfuzz import process, fuzz

# Define normalized amenity categories and common keywords
standard_amenities = {
    'wifi': ['wifi', 'fast wifi'],
    'tv': ['tv', 'hdtv', 'flat screen'],
    'streaming_services': ['netflix', 'hbo max', 'amazon prime video', 'apple tv', 'chromecast', 'roku'],
    'body_soap': ['body soap', 'bar soap', 'body wash'],
    'shampoo': ['shampoo'],
    'conditioner': ['conditioner'],
    'sound_system': ['sound system', 'bluetooth sound system', 'speaker'],
    'oven': ['oven', 'air fryer'],
    'stove': ['stove', 'gas stove', 'electric stove'],
    'workspace': ['workspace', 'monitor', 'desk', 'office chair'],
    'refrigerator': ['refrigerator', 'fridge', 'mini fridge'],
    'parking': ['parking', 'garage', 'driveway'],
    'children_amenities': ['children', 'books and toys', 'crib', 'baby bath'],
    'gym': ['gym', 'fitness'],
    'pool': ['pool', 'rooftop pool', 'heated pool']
}

def normalize_amenity(raw_amenity):
    raw_amenity = raw_amenity.lower()
    for category, keywords in standard_amenities.items():
        match_tuple = process.extractOne(raw_amenity, keywords, scorer=fuzz.partial_ratio)
        if match_tuple:
            match, score, _ = match_tuple
            if score > 80:
                return category
    return None

# Normalize amenities per listing
df['normalized_amenities'] = df['amenities'].apply(lambda amenity_list: {
    normalize_amenity(a) for a in amenity_list if normalize_amenity(a) is not None
})

# Create binary columns for each normalized amenity
all_amenities = set()
df['normalized_amenities'].apply(all_amenities.update)

amenities_df = pd.DataFrame([
    {amenity: int(amenity in amenities) for amenity in all_amenities}
    for amenities in df['normalized_amenities']
])

# Merge and clean up
df = df.drop(columns=['amenities', 'normalized_amenities'])
df = df.join(amenities_df)

# Optional: One-hot encode categorical features
# categorical_features = ['neighbourhood_group_cleansed', 'room_type']
# df = pd.get_dummies(df, columns=categorical_features)

# Show final structure
print("Final DataFrame Shape:", df.shape)
print("Final Columns:", df.columns.tolist())
df.head()


Final DataFrame Shape: (12217, 63)
Final Columns: ['name', 'description', 'neighborhood_overview', 'host_name', 'host_location', 'host_about', 'host_response_rate', 'host_acceptance_rate', 'host_is_superhost', 'host_listings_count', 'host_total_listings_count', 'host_has_profile_pic', 'host_identity_verified', 'neighbourhood_group_cleansed', 'room_type', 'accommodates', 'bathrooms', 'bedrooms', 'beds', 'price', 'minimum_nights', 'maximum_nights', 'minimum_minimum_nights', 'maximum_minimum_nights', 'minimum_maximum_nights', 'maximum_maximum_nights', 'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm', 'has_availability', 'availability_30', 'availability_60', 'availability_90', 'availability_365', 'number_of_reviews', 'number_of_reviews_ltm', 'number_of_reviews_l30d', 'review_scores_rating', 'review_scores_cleanliness', 'review_scores_checkin', 'review_scores_communication', 'review_scores_location', 'review_scores_value', 'instant_bookable', 'calculated_host_listings_count', 'calculated_

Unnamed: 0,name,description,neighborhood_overview,host_name,host_location,host_about,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,oven,wifi,sound_system,streaming_services,gym,pool,refrigerator,conditioner,body_soap,parking
0,Skylit Midtown Castle,"Beautiful, spacious skylit studio in the heart...",Centrally located in the heart of Manhattan ju...,Jennifer,"New York, New York, United States",A New Yorker since 2000! My passion is creatin...,0.8,0.17,True,8.0,...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0
1,"Whole flr w/private bdrm, bath & kitchen(pls r...","Enjoy 500 s.f. top floor in 1899 brownstone, w...",Just the right mix of urban center and local n...,LisaRoxanne,"New York, New York, United States",Laid-back Native New Yorker (formerly bi-coast...,0.09,0.69,True,1.0,...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0
3,Large Furnished Room Near B'way,Please don’t expect the luxury here just a bas...,"Theater district, many restaurants around here.",Shunichi,"New York, New York, United States",I used to work for a financial industry but no...,1.0,1.0,True,1.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
4,Cozy Clean Guest Room - Family Apt,"Our best guests are seeking a safe, clean, spa...",Our neighborhood is full of restaurants and ca...,MaryEllen,"New York, New York, United States",Welcome to family life with my oldest two away...,0.922255,0.785526,True,1.0,...,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0
5,"Lovely Room 1, Garden, Best Area, Legal rental","Beautiful house, gorgeous garden, patio, cozy ...",Neighborhood is amazing!<br />Best subways to ...,Laurie,"New York, New York, United States","Hello, \r\nI will be welcoming and helpful, w...",1.0,1.0,True,3.0,...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0


In [16]:

# Convert 'price' to numeric, removing non-numeric characters (like '$' or commas)
df['price'] = df['price'].replace('[\$,]', '', regex=True).astype(float)


In [17]:
def labels(columns):
    for col in columns:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
    return df


categorical_features = df.select_dtypes(exclude=['int']).columns
print("Boolean Columns:", print(len(categorical_features)), list(categorical_features))

df = labels(categorical_features)
df.head()

48
Boolean Columns: None ['name', 'description', 'neighborhood_overview', 'host_name', 'host_location', 'host_about', 'host_response_rate', 'host_acceptance_rate', 'host_is_superhost', 'host_listings_count', 'host_total_listings_count', 'host_has_profile_pic', 'host_identity_verified', 'neighbourhood_group_cleansed', 'room_type', 'bathrooms', 'bedrooms', 'beds', 'price', 'minimum_minimum_nights', 'maximum_minimum_nights', 'minimum_maximum_nights', 'maximum_maximum_nights', 'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm', 'has_availability', 'review_scores_rating', 'review_scores_cleanliness', 'review_scores_checkin', 'review_scores_communication', 'review_scores_location', 'review_scores_value', 'instant_bookable', 'reviews_per_month', 'tv', 'children_amenities', 'shampoo', 'workspace', 'oven', 'wifi', 'sound_system', 'streaming_services', 'gym', 'pool', 'refrigerator', 'conditioner', 'body_soap', 'parking']


Unnamed: 0,name,description,neighborhood_overview,host_name,host_location,host_about,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,oven,wifi,sound_system,streaming_services,gym,pool,refrigerator,conditioner,body_soap,parking
0,9486,1783,1464,1754,413,240,46,15,0,8,...,1,1,0,0,0,0,1,1,0,1
1,11681,3216,3797,2305,413,6623,3,66,0,1,...,1,1,0,0,0,0,1,1,0,1
3,6093,7132,8020,3577,413,5155,66,98,0,1,...,1,1,0,0,0,0,0,1,0,1
4,3742,6690,5212,2570,413,8667,58,76,0,1,...,1,1,0,0,1,0,0,1,1,1
5,6526,1617,4739,2210,413,2053,66,98,0,3,...,1,1,0,0,0,0,1,1,0,1


In [18]:
X = df.drop(columns="price")
y = df["price"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

print(X_train.shape, X_test.shape, y_train.shape, y_test)

(8551, 62) (3666, 62) (8551,) 9845     235
3344     150
13133    558
22942      6
19845    239
        ... 
17004    221
23705    431
12275     56
13065     35
5061      71
Name: price, Length: 3666, dtype: int64


In [19]:
params = {
    'max_depth': [3, 5, 7, 10, None],
    'min_samples_split': [2, 5, 10],
}

dt = DecisionTreeRegressor()
grid_search = GridSearchCV(dt, params, cv=5, scoring="accuracy")
grid_search.fit(X_train, y_train)

dt = grid_search.best_estimator_
predictions = dt.predict(X_test)

print(predictions)
mse = mean_squared_error(y_test, dt.predict(X_test))
# accuracy = accuracy_score(y_test, dt.predict(X_test))
print(mse)
# print(accuracy)

Traceback (most recent call last):
  File "/Users/jasonjiang/.pyenv/versions/3.9.9/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 971, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/Users/jasonjiang/.pyenv/versions/3.9.9/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 279, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
  File "/Users/jasonjiang/.pyenv/versions/3.9.9/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 376, in _score
    return self._sign * self._score_func(y_true, y_pred, **scoring_kwargs)
  File "/Users/jasonjiang/.pyenv/versions/3.9.9/lib/python3.9/site-packages/sklearn/utils/_param_validation.py", line 213, in wrapper
    return func(*args, **kwargs)
  File "/Users/jasonjiang/.pyenv/versions/3.9.9/lib/python3.9/site-packages/sklearn/metrics/_classification.py", line 231, in accuracy_score
    y_type, y_true, y_pred = _check_targets(y

[166. 196. 492. ... 185.  41.  41.]
9542.151936715767


In [20]:

y= df['price'] 
X = df.drop(columns='price', axis=1) 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1234)

param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20],
    'min_samples_split': [2, 5]
}

print('start creating model') 




start creating model


In [21]:
rf = RandomForestRegressor(n_estimators=200, random_state=1234)
rf.fit(X_train, y_train)

# Make predictions
rf_preds = rf.predict(X_test)

# Evaluate model
print('Model Performance:')
print('Mean Squared Error:', mean_squared_error(y_test, rf_preds))
print('R^2 Score:', r2_score(y_test, rf_preds))
print('End')

Model Performance:
Mean Squared Error: 4446.360904203869
R^2 Score: 0.6113479747463897
End
