In [194]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

In [195]:
path = ""
train = pd.read_csv(path + "train.csv")
test = pd.read_csv(path + "test.csv")

In [196]:
train['dataset'] = "train"
test['dataset'] = "test"
data = pd.concat([train,test], axis = 0)
categorical = ['property_type','room_type','bed_type','cancellation_policy','city']
data = pd.get_dummies(data, columns = categorical)

In [197]:
data['cleaning_fee'] = data.cleaning_fee.astype(int)

In [198]:
cols_to_drop = ['description', 'first_review','host_has_profile_pic', 'host_identity_verified', 'host_response_rate',
       'host_since', 'instant_bookable', 'last_review', 'latitude',
       'longitude', 'name', 'neighbourhood', 'number_of_reviews',
       'review_scores_rating', 'thumbnail_url', 'zipcode']

In [199]:
data.drop(cols_to_drop,axis=1,inplace=True)

In [201]:
train_amenities = data['amenities'].apply(lambda x : x[1:-1])

In [202]:
train_amenities = train_amenities.apply(lambda x : x.replace('"',''))

In [203]:
train_amenities = train_amenities.apply(lambda x : x.lower())

In [204]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [205]:
v = TfidfVectorizer(stop_words = 'english')

In [206]:
train_x = v.fit_transform(train_amenities)

In [207]:
from sklearn.cluster import KMeans

In [208]:
model = KMeans()

In [209]:
model.fit(train_x)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=8, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [210]:
data['amenities_cluster'] = model.labels_

In [212]:
data.drop(['amenities'],axis=1,inplace=True)

In [213]:
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import KFold
cv_groups = KFold(n_splits=3)
regr = RandomForestRegressor(random_state = 0, n_estimators = 10)

In [214]:
numerics = ['uint8','int16', 'int32', 'int64', 'float16', 'float32', 'float64']

In [215]:
train_x = data[data.dataset == "train"] \
    .select_dtypes(include=numerics) \
    .drop("log_price", axis = 1) \
    .fillna(0).values

In [216]:
test_x = data[data.dataset == "test"] \
    .select_dtypes(include=numerics) \
    .drop("log_price", axis = 1) \
    .fillna(0).values

In [217]:
train_y = data[data.dataset == "train"].log_price.values

In [218]:
for train_index, test_index in cv_groups.split(train_x):
    
    # Train the model using the training sets
    regr.fit(train_x[train_index], train_y[train_index])
    
    # Make predictions using the testing set
    pred_rf = regr.predict(train_x[test_index])
    
    # Calculate RMSE for current cross-validation split
    rmse = str(np.sqrt(np.mean((train_y[test_index] - pred_rf)**2)))
    
    print("RMSE for current split: " + rmse)

RMSE for current split: 0.5049917227234826
RMSE for current split: 0.5035516608034823
RMSE for current split: 0.5098629350342008


In [219]:
final_prediction = regr.predict(test_x)

In [220]:
final_prediction

array([4.1584674 , 4.75113795, 4.79456205, ..., 4.7666075 , 5.17166112,
       4.40863585])