In [8]:
from comment_analysis import *
from feature_tools import *
from cleaning_utility import *
import pandas as pd

In [None]:
# Directory holding the raw AIRBNB CSV files.
# NOTE(review): absolute, machine-specific location -- this is the single place to change
# when the notebook is run on another machine.
AIRBNB_DATA_DIR = '/Volumes/Disk2/Courses MA3/MA3 - ADA/AIRBNB data/DataSet'

# raw tables: detailed listings, neighbourhood names and reviews
df_listings = pd.read_csv(AIRBNB_DATA_DIR + '/2019-09-14_Amsterdam_listings_detailed.csv', header=0, low_memory=False)
df_neighbourhood = pd.read_csv(AIRBNB_DATA_DIR + '/NaT_Amsterdam_neighbourhoods.csv')
df_comments = pd.read_csv(AIRBNB_DATA_DIR + '/2019-09-14_Amsterdam_reviews.csv', header=0)

# drop neighbourhood_group column as it is only filled with NaN
df_neighbourhood = df_neighbourhood.drop(columns=['neighbourhood_group'])

# transform neighbourhood name into a categorical integer to create a new feature:
# the id of a neighbourhood is its index value + 1 (scalar broadcast, no helper list needed)
df_neighbourhood['neighborhood_id'] = df_neighbourhood.index.values + 1

# previously computed comment-sentiment table; it must expose a 'listing_id' column (used in the merge below)
# NOTE(review): unpickling can execute arbitrary code -- only load a pickle file that you trust
df_comments_sentiment = pd.read_pickle("../data/df_comments_sentiment_final.pkl")

# create a new dataframe for success metrics: the two rating columns of the listings joined
# with the sentiment table (default inner join, so only ids present in both tables are kept)
df_success_metrics = pd.merge(df_listings[['id', 'review_scores_rating', 'reviews_per_month']],
                              df_comments_sentiment, left_on='id', right_on='listing_id')
# change index to id
df_success_metrics = df_success_metrics.set_index('id')


# columns of the listings table that are retained as candidate features
features_ensemble = [
    'id',
    'host_since', 'host_response_rate', 'host_is_superhost', 'host_total_listings_count',
    'host_verifications', 'host_has_profile_pic', 'host_identity_verified',
    'neighbourhood_cleansed', 'property_type', 'room_type', 'bed_type', 'amenities',
    'price', 'security_deposit', 'cleaning_fee', 'guests_included', 'extra_people',
    'minimum_nights', 'maximum_nights', 'number_of_reviews',
    'instant_bookable', 'cancellation_policy',
]

# The features are grouped by type, each group being prepared differently before being used for ML.
# date column -- kept as a plain string, not a list
date_features = 'host_since'
# boolean-like columns
bool_features = [
    'host_is_superhost',
    'host_has_profile_pic',
    'host_identity_verified',
    'instant_bookable',
]
# columns holding a list of items
list_features = ['host_verifications', 'amenities']
# monetary columns; all of them except the last one are later expressed per person
price_features = ['price', 'security_deposit', 'cleaning_fee', 'extra_people']
# column containing the neighbourhood name
neighborhood_features = ['neighbourhood_cleansed']
# categorical string columns, converted to one-hot labels
string_features = ['property_type', 'cancellation_policy', 'room_type', 'bed_type']
# rate columns
rate_features = ['host_response_rate']

# columns whose NaN are replaced, together with the replacement value of each of them (0 everywhere)
# NOTE(review): 'number_of_amenities' is not part of features_ensemble -- presumably it is created
# during the preparation of list_features; TODO confirm
replace_nan_features = [
    'host_response_rate',
    'host_is_superhost',
    'host_total_listings_count',
    'security_deposit',
    'cleaning_fee',
    'number_of_amenities',
]
replace_values = [0] * len(replace_nan_features)
# instance of CleaningUtility() providing every preparation step used below
cu = CleaningUtility()

# FEATURES PREPARATION
# Work on a copy of the selected listing columns; each step receives the frame returned by
# the previous one, so the order of the chain matters.
df_features = (
    df_listings[features_ensemble]
    .copy()
    .pipe(cu.bool_to_int, bool_features)
    .pipe(cu.host_activity_period, date_features)
    .pipe(cu.list_to_number_of_services, list_features)
    .pipe(cu.format_price, price_features)
    .pipe(cu.format_rate, rate_features)
    .pipe(cu.replace_nan_by_values, replace_nan_features, replace_values)
    # new feature with neighbourhood_id which indicates in which neighborhood of this city
    # the listing is located
    .pipe(cu.string_to_id, neighborhood_features[0],
          df_neighbourhood.neighbourhood, df_neighbourhood.neighborhood_id)
    # string_features converted to one-hot labels (-> will increase the number of columns)
    .pipe(cu.convert_to_one_hot_label, string_features)
    # price per person, for every price feature except the last one ('extra_people')
    .pipe(cu.prices_per_person, price_features[0:-1], 'guests_included')
)

##############################################################
## now that all the data are expected to be numeric, convert #
## every column to a numeric dtype                           #
##############################################################
# errors='raise' stops the cell on the first value that cannot be parsed as a number
cols           = df_features.columns
df_features[cols] = df_features[cols].apply(pd.to_numeric, errors = 'raise')

##############################################################
############# keep only rows with non-nan values #############
##############################################################
# number of rows before the rows containing nan are removed (reported below)
tmp      = df_features.shape[0]
df_features = cu.select_numeric_column_only(df_features)
df_features = df_features.dropna()

##############################################################
##################### print cleaning info ####################
##############################################################
print('\nNumber of rows    before data set cleaning:       %.0f'%(df_listings.shape[0]))
print(  'Number of rows    after data set cleaning:        %.0f'%(tmp))
print(  'Number of rows    after removal of rows with nan: %.0f'%(df_features.shape[0]))
print(  'Number of columns before data set cleaning:       %.0f'%(df_listings.shape[1]))
print(  'Number of columns initially selected:             %.0f'%len(features_ensemble))
print(  'Number of columns after data set cleaning:        %.0f'%(df_features.shape[1]))

# explicit display() calls: a bare expression in the middle of a cell is evaluated
# but never rendered by the notebook
display(df_features.describe())
display(df_features.head())

# for all these step random_seed is set to 1
# (assigned first so that the preview below remains the last expression of the cell)
random_seed = 1

# merge to have metrics and features in the same dataFrame
# ('id' is the index of df_success_metrics and the key used on the df_features side)
df_features_w_metrics = pd.merge(df_success_metrics, df_features, left_on = 'id', right_on = 'id')

# set 'id' as the index of the dataFrame
df_features_w_metrics = df_features_w_metrics.set_index('id')

# last expression of the cell: the notebook renders this preview
df_features_w_metrics.head()

This data contains 484507 lines.




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



In [None]:
def perform_rf(metric, df_features_w_metrics, success_metrics_features, tune = False):
    '''
    Run the random forest analysis of the features for one success metric.

    The input DataFrame and the list of metrics are copied, so neither of them is
    modified. All metric columns except `metric` are dropped from the copy, which
    leaves `metric` as the label column handed to FeaturesTools.

    Parameters:
        metric: name of the metric column used as label; it must be present in
            `success_metrics_features` (list.remove raises ValueError otherwise)
        df_features_w_metrics: DataFrame containing the features and all the metrics
        success_metrics_features: list with the names of all the metric columns
        tune: forwarded to randomForestAnalysis as `tuneModelParameters`

    Returns:
        the two values returned by randomForestAnalysis, followed by the
        FeaturesTools instance built on the normalized features

    The seed comes from the global variable `random_seed`.
    '''
    working_frame = df_features_w_metrics.copy()

    # every metric but the requested one has to disappear from the frame
    other_metrics = success_metrics_features.copy()
    other_metrics.remove(metric)
    working_frame = working_frame.drop(columns = other_metrics)

    print('Random forest of features for the metric: ' + metric)

    # a first FeaturesTools instance only serves to normalize the features,
    # the analysis itself runs on a second instance built from the normalized frame
    raw_tools = FeaturesTools(working_frame, metric, random_seed = random_seed)
    normalized_frame = raw_tools.normalize_features()
    tools = FeaturesTools(normalized_frame, metric, random_seed = random_seed)

    importances, forest = tools.randomForestAnalysis(plotResults = True, tuneModelParameters = tune)
    return importances, forest, tools

In [None]:
# perform random forest for each metric
# NOTE(review): `success_metrics_features` is not assigned in the visible part of this notebook --
# presumably the list of the metric column names; confirm that it is defined before this cell
# when the notebook is run on a fresh kernel.
metric1 = 'review_scores_rating'
imp1, rf1, f1 = perform_rf(
    metric=metric1,
    df_features_w_metrics=df_features_w_metrics,
    success_metrics_features=success_metrics_features,
)