## Importing the libraries

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import seaborn as sns
%matplotlib inline


from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, mean_squared_error

## Reading in the data files

In [2]:
df_listings = pd.read_csv('Data/listings.csv', dtype={'listing_url': 'object',
                                                      'price': 'object',
                                                      'weekly_price': 'object',
                                                      'monthly_price': 'object',
                                                      'security_deposit': 'object',
                                                      'cleaning_fee': 'object',
                                                      'extra_people': 'object',
                                                      'license': 'object', 
                                                      'jurisdiction_names': 'object',})

## Data Exploration

### Listings Dataframe

In [None]:
df_listings.shape

In [None]:
pd.set_option('display.max_columns', None)

# show listings data sample
df_listings.head(3)

In [None]:
df_listings.describe()

In [None]:
set(df_listings.columns[df_listings.isnull().mean()>0.75])

In [None]:
neighbourhood_vals = df_listings['property_type'].value_counts()

# The below should be a bar chart of the proportion of individuals in each professional category if your status_vals
# is set up correctly.

(neighbourhood_vals[0:5]/df_listings.shape[0]).plot(kind="bar");
plt.title("Top 5 type of properties, that are most advertised");

In [None]:
room_vals = df_listings['room_type'].value_counts()

# The below should be a bar chart of the proportion of individuals in each professional category if your status_vals
# is set up correctly.

(room_vals/df_listings.shape[0]).plot(kind="bar");
plt.title("The type of rooms, are most advertised");

## The Business Questions

### What are Airbnb current challanges?

### Question 1
- What is the average price of the listings, for the different location within London?

In [None]:
price_df = df_listings[df_listings['price'].isnull() == False]

In [None]:
price_df['neighbourhood_cleansed'].value_counts()

In [None]:
price_df['price'] = price_df['price'].str.replace('$', '', regex=True)
price_df['price'] = price_df['price'].str.replace(',', '', regex=True)
price_df['price'] = price_df['price'].astype(float)

In [None]:
nb_price_avg = price_df.groupby('neighbourhood_cleansed', as_index=False).mean()
nb_price_avg = pd.DataFrame(nb_price_avg).sort_values(by='price', ascending=False)

In [None]:
nb_price_avg = nb_price_avg.rename(columns={'neighbourhood_cleansed':'Neighbourhood','price': 'Avg Price'})

In [None]:
plt.figure(figsize=(10,5))

sns.set(style="whitegrid")
chart = sns.barplot(x="Neighbourhood", y="Avg Price", data=nb_price_avg)

chart.set_xticklabels(
    chart.get_xticklabels(), 
    rotation=60, 
    horizontalalignment='right',
    fontweight='light',
    fontsize='medium', 
    rotation_mode='anchor'
)

## New Question on amenities

In [None]:
#def convert_to_num(column):
    
#    column = column.str.replace('{', '', regex=True).replace('}', '', regex=True).replace('\"', '', regex=True)
#    column = column.str.split(',')
        
#    return column


#df['amenities'] = convert_to_num(df['amenities'])

### Question 2

- What types of verifications are hosts using? How long did it take them to respond?

In [None]:
verifications_df = df_listings[df_listings['host_verifications'].isnull() == False]

In [None]:
verifications_df.shape

In [None]:
verifications_df = verifications_df[verifications_df['host_verifications'] != '[]']

In [None]:
verifications_df.shape

In [None]:
verifications_df['host_verifications'].value_counts()

In [None]:
def verifications_types(column):
    verifications_dict = {}
    temp = []

    for i in column:
        temp.append(eval(i))
    
    for i in temp:
        try:
            for j in i:
                if j not in verifications_dict:
                    verifications_dict[j] = 1
                else:
                    verifications_dict[j] += 1
        except:
            continue
            
    return verifications_dict
        
verifications_types = verifications_types(verifications_df['host_verifications'])

In [None]:
verifications_types

In [None]:
for k,v in verifications_types.items():
    verifications_types[k] = round(v / verifications_df.shape[0] * 100, 3)

In [None]:
verifications_types

In [None]:
verifications_types = pd.DataFrame.from_dict(verifications_types,orient='index',columns=['A']).reset_index()
verifications_types = verifications_types.rename(columns={'index':'Verification Type','A': '% of listings'})
verifications_types = verifications_types.sort_values(by='% of listings', ascending=False)

In [None]:
verifications_types

In [None]:
plt.figure(figsize=(10,5))

sns.set(style="whitegrid")
chart = sns.barplot(x="Verification Type", y="% of listings", data=verifications_types)

chart.set_xticklabels(
    chart.get_xticklabels(), 
    rotation=60, 
    horizontalalignment='right',
    fontweight='light',
    fontsize='medium', 
    rotation_mode='anchor'
)

### Question 3
- What features assist in the pricing of a listing? 
    - Could the price be predicted?
    - importance of the features, rank features

#### Cleaning the data

In [3]:
col_nulls = set(df_listings.columns[df_listings.isnull().mean()>0.70])

In [4]:
df = df_listings.drop(list(col_nulls), axis=1)
df = df[df.columns.drop(list(df.filter(regex='url')))]
df = df[df.columns.drop(list(df.filter(regex='id')))]
df = df[df.columns.drop(list(df.filter(regex='scraped')))]
df = df[df.columns.drop(list(df.filter(regex='first_review')))]
df = df[df.columns.drop(list(df.filter(regex='last_review')))]

In [5]:
df = df.drop(columns=['name','summary','space','description','neighborhood_overview','notes',
                     'transit','access','interaction','house_rules','host_name','host_about',
                      'host_since','host_neighbourhood','street','neighbourhood','market','latitude','longitude',
                     'host_location','city','state','zipcode','smart_location','country_code',
                      'country','amenities'])

In [6]:
df = df.dropna(subset=["price"])

In [7]:
df.head(3)

Unnamed: 0,experiences_offered,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,host_total_listings_count,host_verifications,host_has_profile_pic,neighbourhood_cleansed,...,instant_bookable,is_business_travel_ready,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,business,within a few hours,71%,89%,f,4.0,4.0,"['email', 'phone', 'facebook', 'reviews', 'off...",t,Islington,...,f,f,moderate,f,f,2,1,1,0,0.18
1,romantic,within a day,50%,67%,f,1.0,1.0,"['email', 'phone', 'reviews', 'jumio', 'govern...",t,Kensington and Chelsea,...,t,f,strict_14_with_grace_period,t,t,1,1,0,0,0.71
2,none,within an hour,80%,94%,f,18.0,18.0,"['email', 'phone', 'reviews', 'jumio', 'offlin...",t,Westminster,...,t,f,strict_14_with_grace_period,f,f,15,15,0,0,0.38


In [8]:
pd.set_option('display.max_rows', None)

df.dtypes

experiences_offered                              object
host_response_time                               object
host_response_rate                               object
host_acceptance_rate                             object
host_is_superhost                                object
host_listings_count                             float64
host_total_listings_count                       float64
host_verifications                               object
host_has_profile_pic                             object
neighbourhood_cleansed                           object
is_location_exact                                object
property_type                                    object
room_type                                        object
accommodates                                      int64
bathrooms                                       float64
bedrooms                                        float64
beds                                            float64
bed_type                                        

In [9]:
def data_float(column):
    column = column.str.replace('$', '', regex=True)
    column = column.str.replace(',', '', regex=True)
    column = column.astype(float)
    
    return column

df['price'] = data_float(df['price'])
df['security_deposit'] = data_float(df['security_deposit'])
df['cleaning_fee'] = data_float(df['cleaning_fee'])
df['extra_people'] = data_float(df['extra_people'])

In [10]:
def data_percentage(column):
    column = column.str.replace('%', '', regex=True)
    #column = column.str.replace(',', '', regex=True)
    column = column.astype('float') / 100
    
    return column

df['host_response_rate'] = data_percentage(df['host_response_rate'])
df['host_acceptance_rate'] = data_percentage(df['host_acceptance_rate'])

In [11]:
num_vars = df.select_dtypes(include=['float', 'int']).columns
for col in num_vars:
    df[col].fillna((df[col].mean()), inplace=True)

In [12]:
df['host_response_time'].fillna('No Response', inplace=True)
df['host_is_superhost'].fillna('f', inplace=True)
df['host_has_profile_pic'].fillna('f', inplace=True)
df['bed_type'].fillna('Unknown', inplace=True)

In [13]:
df.isnull().sum()

experiences_offered                             0
host_response_time                              0
host_response_rate                              0
host_acceptance_rate                            0
host_is_superhost                               0
host_listings_count                             0
host_total_listings_count                       0
host_verifications                              0
host_has_profile_pic                            0
neighbourhood_cleansed                          0
is_location_exact                               0
property_type                                   0
room_type                                       0
accommodates                                    0
bathrooms                                       0
bedrooms                                        0
beds                                            0
bed_type                                        0
price                                           0
security_deposit                                0


In [14]:
# Dummy the categorical variables
cat_vars = df.select_dtypes(include=['object']).copy().columns
for var in  cat_vars:
    # for each cat add dummy var, drop original column
    df = pd.concat([df.drop(var, axis=1), pd.get_dummies(df[var],
                                                         prefix=var, prefix_sep='_', drop_first=True)], axis=1)

#### Building the Model

In [15]:
X = df.drop('price', axis = 1)
y = df['price']

In [16]:
X.shape

(86358, 793)

In [17]:
X_new = SelectKBest(chi2, k=100).fit_transform(X, y)

In [18]:
X_new.shape

(86358, 100)

In [19]:
#split the data into train and test
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size = 0.2, random_state=42)

In [None]:
from sklearn.metrics import r2_score

def performance_metric(y_true, y_predict):
    """ Calculates and returns the performance score between 
        true and predicted values based on the metric chosen. """
    
    # TODO: Calculate the performance score between 'y_true' and 'y_predict'
    score = r2_score(y_true,y_predict)
    
    # Return the score
    return score

In [22]:
from sklearn.model_selection import ShuffleSplit

from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV

def fit_model(X, y):
    """ Performs grid search over the 'max_depth' parameter for a 
        decision tree regressor trained on the input data [X, y]. """
    
    # Create cross-validation sets from the training data
    cv_sets = ShuffleSplit(n_splits = 10, test_size = 0.20, random_state = 0)

    # TODO: Create a decision tree regressor object
    regressor = DecisionTreeRegressor(random_state = 42)

    # TODO: Create a dictionary for the parameter 'max_depth' with a range from 1 to 10
    params = {'max_depth': range(1,11)}

    # TODO: Transform 'performance_metric' into a scoring function using 'make_scorer' 
    ##scoring_fnc = make_scorer(performance_metric)

    # TODO: Create the grid search object #scoring=scoring_fnc
    grid = GridSearchCV(estimator=regressor,param_grid=params,cv=cv_sets)

    # Fit the grid search object to the data to compute the optimal model
    grid = grid.fit(X, y)

    # Return the optimal model after fitting the data
    return grid.best_estimator_

In [23]:
# Fit the training data to the model using grid search
reg = fit_model(X_train, y_train)

# Produce the value for 'max_depth'
print(reg.get_params()['max_depth'])

6
