# Data Loading

In [4]:
import pandas as pd
import csv
import re
import statsmodels.api as sm
from datetime import datetime
import matplotlib.pyplot as plt
import numpy as np

from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GroupShuffleSplit
from sklearn.ensemble import RandomForestRegressor

In [5]:
# File path for cities: San Francisco(sf), 
# Okaland(ok), Santa Clara(sc), Santa Cruze(scz)

file_url = '/Users/xzhou/github/project_archives/files_airbnb/{}/calendar.csv'

sf_file = file_url.format('sf_airbnb')
ok_file = file_url.format('ok_airbnb')
sc_file = file_url.format('sc_airbnb')
scz_file = file_url.format('scz_airbnb')

In [6]:
# Loaded price by date information for each city

pricing_sf = pd.read_csv(sf_file, parse_dates=['date'])
pricing_ok = pd.read_csv(ok_file, parse_dates=['date'])
pricing_sc = pd.read_csv(sc_file, parse_dates=['date'])
pricing_scz = pd.read_csv(scz_file, parse_dates=['date'])

print(pricing_sf.shape, pricing_ok.shape, pricing_sc.shape, pricing_scz.shape)

(2420680, 4) (1057770, 4) (2068820, 4) (573050, 4)


In [7]:
# Concatenate four files into one
# Created keys for each file for future references

frames = [pricing_sf, pricing_ok, pricing_sc, pricing_scz]
keys = ['pricing_sf', 'pricing_ok', 'pricing_sc', 'pricing_scz']

df_pricing = pd.concat(frames, keys=keys)

df_pricing.head(10)

Unnamed: 0,Unnamed: 1,listing_id,date,available,price
pricing_sf,0,958,2019-08-05,f,
pricing_sf,1,958,2019-08-04,f,
pricing_sf,2,958,2019-08-03,f,
pricing_sf,3,958,2019-08-02,f,
pricing_sf,4,958,2019-08-01,f,
pricing_sf,5,958,2019-07-31,f,
pricing_sf,6,958,2019-07-30,f,
pricing_sf,7,958,2019-07-29,f,
pricing_sf,8,958,2019-07-28,f,
pricing_sf,9,958,2019-07-27,f,


# Data Clean up

## Filter down to data with pricing information

In [8]:
# Filter to houses that are available and have listing prices

df_pricing_smaller = df_pricing[df_pricing['available']=='t']
df_pricing_smaller = df_pricing_smaller.copy()

print('Dimension for df_pricing_smaller: ', df_pricing_smaller.shape)
print ('\nNumber of Unique ids: ', df_pricing_smaller.listing_id.nunique())
print ('\nAll columns: ', df_pricing_smaller.columns)
print ('\nData insight:')
display( df_pricing_smaller.date.describe(include='all'))

Dimension for df_pricing_smaller:  (2853357, 4)

Number of Unique ids:  14770

All columns:  Index(['listing_id', 'date', 'available', 'price'], dtype='object')

Data insight:


count                 2853357
unique                    389
top       2018-11-03 00:00:00
freq                    10393
first     2018-08-06 00:00:00
last      2019-08-29 00:00:00
Name: date, dtype: object

## Format Conversion

In [9]:
# Noted columns with currency information are formated as strings, 
# and need to convert to float

def currency2float(string):
    """
    Convert currency to float
    """
    float_ = float(str(string).strip('$').replace(',',''))
    return float_

In [10]:
# Converted price to float format

df_pricing_smaller['price'] = df_pricing_smaller['price'].apply(
                              lambda x: currency2float(x))

## Select Data that have Monthly and Daily information

In [11]:
# Added 'month' column to keep track of month of date
# Added 'day_of_week' column based on Date. Default format: Monday=0, Sunday=6

df_pricing_smaller['month'] = df_pricing_smaller['date'].dt.month
df_pricing_smaller['day_of_week'] = df_pricing_smaller['date'].dt.weekday


print('Dimensions of df_pricing_smaller', df_pricing_smaller.shape)
display(df_pricing_smaller.head())
display(df_pricing_smaller.describe(include ='all'))

Dimensions of df_pricing_smaller (2853357, 6)


Unnamed: 0,Unnamed: 1,listing_id,date,available,price,month,day_of_week
pricing_sf,63,958,2019-05-02,t,181.0,5,3
pricing_sf,64,958,2019-05-01,t,181.0,5,2
pricing_sf,65,958,2019-04-30,t,181.0,4,1
pricing_sf,66,958,2019-04-29,t,181.0,4,0
pricing_sf,67,958,2019-04-28,t,181.0,4,6


Unnamed: 0,listing_id,date,available,price,month,day_of_week
count,2853357.0,2853357,2853357,2853357.0,2853357.0,2853357.0
unique,,389,1,,,
top,,2018-11-03 00:00:00,t,,,
freq,,10393,2853357,,,
first,,2018-08-06 00:00:00,,,,
last,,2019-08-29 00:00:00,,,,
mean,14055230.0,,,213.2333,6.571592,2.988646
std,8563144.0,,,703.5042,3.629179,2.003061
min,958.0,,,10.0,1.0,0.0
25%,6453774.0,,,80.0,3.0,1.0


In [12]:
# Counted number of months each house was listed in the last year

df_month_count = df_pricing_smaller.groupby(['listing_id']).month.nunique()  \
                 .reset_index()
df_month_count.rename(columns={'month':'month_count'}, inplace=True)
df_month_count.head()

Unnamed: 0,listing_id,month_count
0,958,10
1,3083,5
2,3264,3
3,5739,2
4,5858,12


In [13]:
# Counted number of days in a week each house was listed in the last year

df_day_of_week_count = df_pricing_smaller.groupby(['listing_id'])  \
                       .day_of_week.nunique().reset_index()
df_day_of_week_count.rename(columns={'day_of_week':'day_of_week_count'}, inplace=True)
df_day_of_week_count.head()

Unnamed: 0,listing_id,day_of_week_count
0,958,7
1,3083,7
2,3264,7
3,5739,7
4,5858,7


In [14]:
# Mergered count information with original dataframe

df_pricing_with_counts = pd.merge(left=df_pricing_smaller, 
                                  right=df_month_count,
                                  how='left', 
                                  left_on='listing_id', 
                                  right_on='listing_id')

df_pricing_with_counts = pd.merge(left=df_pricing_with_counts, 
                                  right=df_day_of_week_count,
                                  how='left', 
                                  left_on='listing_id', 
                                  right_on='listing_id')

display(df_pricing_with_counts.head())
display(df_pricing_with_counts.describe(include='all'))

Unnamed: 0,listing_id,date,available,price,month,day_of_week,month_count,day_of_week_count
0,958,2019-05-02,t,181.0,5,3,10,7
1,958,2019-05-01,t,181.0,5,2,10,7
2,958,2019-04-30,t,181.0,4,1,10,7
3,958,2019-04-29,t,181.0,4,0,10,7
4,958,2019-04-28,t,181.0,4,6,10,7


Unnamed: 0,listing_id,date,available,price,month,day_of_week,month_count,day_of_week_count
count,2853357.0,2853357,2853357,2853357.0,2853357.0,2853357.0,2853357.0,2853357.0
unique,,389,1,,,,,
top,,2018-11-03 00:00:00,t,,,,,
freq,,10393,2853357,,,,,
first,,2018-08-06 00:00:00,,,,,,
last,,2019-08-29 00:00:00,,,,,,
mean,14055230.0,,,213.2333,6.571592,2.988646,10.20866,6.995392
std,8563144.0,,,703.5042,3.629179,2.003061,2.696574,0.1206497
min,958.0,,,10.0,1.0,0.0,1.0,1.0
25%,6453774.0,,,80.0,3.0,1.0,9.0,7.0


## Removed Outliers

In [15]:
# Removed outliers for 'price'. Use 5% and 95% percentiles as cutoff

five_percentile = np.percentile(df_pricing_with_counts.price, 5)
ninety_five_percentile = np.percentile(df_pricing_with_counts.price, 95)

df_pricing_with_counts = df_pricing_with_counts[(
                         df_pricing_with_counts.price>=five_percentile)
                        & (df_pricing_with_counts.price<=ninety_five_percentile)]

display (df_pricing_with_counts.shape)

(2573002, 8)

In [16]:
# Drilled down to houses that price available in 12 months and 7 days a week

df_pricing_available = df_pricing_with_counts[
                                (df_pricing_with_counts['month_count']==12)
                                & (df_pricing_with_counts['day_of_week_count']==7)
                                             ]

print ('Dimensions for df_pricing_available: ',df_pricing_available.shape)
display (df_pricing_available.price.describe())

print ('Check for null values')
display (df_pricing_available.isnull().sum())

Dimensions for df_pricing_available:  (1480292, 8)


count    1.480292e+06
mean     1.667032e+02
std      1.075272e+02
min      4.100000e+01
25%      8.500000e+01
50%      1.370000e+02
75%      2.150000e+02
max      5.500000e+02
Name: price, dtype: float64

Check for null values


listing_id           0
date                 0
available            0
price                0
month                0
day_of_week          0
month_count          0
day_of_week_count    0
dtype: int64

In [17]:
# Group by month by weekday, and calculate an average pricing

df_pricing_by_month_weekday = df_pricing_available.groupby(['listing_id', 'month', 'day_of_week']) \
                              .price.mean()  \
                              .reset_index()
df_pricing_by_month_weekday.head()

Unnamed: 0,listing_id,month,day_of_week,price
0,5858,1,0,235.0
1,5858,1,1,235.0
2,5858,1,2,235.0
3,5858,1,3,235.0
4,5858,1,4,235.0


In [18]:
df_pricing_by_month_weekday.describe(include='all')

Unnamed: 0,listing_id,month,day_of_week,price
count,366935.0,366935.0,366935.0,366935.0
mean,13497900.0,6.453816,3.004633,167.118685
std,8700925.0,3.462652,2.002994,107.669596
min,5858.0,1.0,0.0,41.0
25%,5560256.0,3.0,1.0,85.0
50%,13735180.0,6.0,3.0,137.75
75%,21294620.0,9.0,5.0,217.0
max,28152470.0,12.0,6.0,550.0


In [19]:
# Check for null values if any

df_pricing_by_month_weekday.isnull().sum()

listing_id     0
month          0
day_of_week    0
price          0
dtype: int64

In [20]:
# Rename the average price column to avoid confusion

df_pricing_by_month_weekday.rename(columns={'price':'price_by_month_week'}, inplace=True)

In [21]:
df_pricing_by_month_weekday.shape

(366935, 4)

In [22]:
def get_month(number):
    """
    Given a number, return the name of month.
    
    """
    month_lookup = {1:'Jan.', 2:'Feb.', 3:'Mar.', 4:'Apr.', 5:'May.', 6:'Jun.',
                    7:'Jul.', 8:'Aug.', 9:'Sep.', 10:'Oct.', 11:'Nov.', 12:'Dec.'}
    
    return month_lookup[number]

In [23]:
def get_weekdays(number):
    """
    Given a number, return the name of month.
    
    """
    weekday_lookup = {0:'Monday', 1:'Tuesday', 2:'Wednesday', 3:'Thursday', 
                      4:'Friday', 5:'Saturday', 6:'Sunday'}
    
    return weekday_lookup[number]

In [24]:
df_pricing_by_month_weekday['month'] = df_pricing_by_month_weekday['month']  \
                                       .apply(lambda x: get_month(x))

In [25]:
df_pricing_by_month_weekday['day_of_week'] = df_pricing_by_month_weekday['day_of_week']  \
                                             .apply(lambda x: get_weekdays(x))

In [28]:
# This is to create visual to show seasonality

df_visual = pd.merge(left=df_pricing_by_month_weekday, 
                    right=df_listings_reviews,
                    how='inner',
                    left_on='listing_id',
                    right_on='id')

In [29]:
df_visual.shape

(243886, 37)

In [30]:
df_visual.columns

Index(['listing_id', 'month', 'day_of_week', 'price_by_month_week', 'id',
       'bedrooms', 'cleaning_fee', 'room_type_Entire home/apt',
       'reviews_per_month', 'city_San Francisco', 'city_Oakland', 'bathrooms',
       'accommodates', 'number_of_reviews', 'host_listings_count',
       'review_scores_rating_log10', 'host_yrs', 'guests_included',
       'calculated_host_listings_count', 'review_scores_location_log10',
       'beds', 'yrs_since_first_review', 'host_response_rate',
       'amenities_Air conditioning', 'review_scores_value_log10',
       'jurisdiction_names_Santa Cruz County, CA', 'amenities_Cable TV',
       'neighbourhood_cleansed_Palo Alto', 'host_is_superhost_f',
       'amenities_Lock on bedroom door', 'amenities_Private entrance',
       'amenities_Familykid friendly', 'district_Santa Clara',
       'neighbourhood_cleansed_San Jose', 'host_identity_verified_f',
       'ref_district', 'ref_jurisdiction'],
      dtype='object')

In [31]:
df_visual_entire_apartment = df_visual[df_visual['room_type_Entire home/apt']==1 ]
df_visual_single_room = df_visual[df_visual['room_type_Entire home/apt']!=1 ]

print(df_visual_entire_apartment.shape, df_visual_single_room.shape )

(135863, 37) (108023, 37)


In [32]:
df_visual_entire_apartment.ref_district.describe

<bound method NDFrame.describe of 0         San Francisco
1         San Francisco
2         San Francisco
3         San Francisco
4         San Francisco
5         San Francisco
6         San Francisco
7         San Francisco
8         San Francisco
9         San Francisco
10        San Francisco
11        San Francisco
12        San Francisco
13        San Francisco
14        San Francisco
15        San Francisco
16        San Francisco
17        San Francisco
18        San Francisco
19        San Francisco
20        San Francisco
21        San Francisco
22        San Francisco
23        San Francisco
24        San Francisco
25        San Francisco
26        San Francisco
27        San Francisco
28        San Francisco
29        San Francisco
              ...      
243856      Santa Clara
243857      Santa Clara
243858      Santa Clara
243859      Santa Clara
243860      Santa Clara
243861      Santa Clara
243862      Santa Clara
243863      Santa Clara
243864      Santa Clara
243865

In [33]:
df_visual_entire_apartment_sf = df_visual_entire_apartment[df_visual_entire_apartment['ref_district']=='San Francisco']
df_visual_entire_apartment_scz = df_visual_entire_apartment[df_visual_entire_apartment['ref_district']=='Santa Clara']

print(df_visual_entire_apartment_sf.shape, df_visual_entire_apartment_scz.shape )

(56047, 37) (35034, 37)


In [34]:
df_visual_entire_apartment_sf.id.nunique()

695

In [35]:
df_visual_entire_apartment_scz.id.nunique()

422

In [36]:
# Weekly pattern for entire apartment

df_week_entire_apartment = df_visual_entire_apartment.groupby(['ref_district', 'day_of_week'])  \
                                                     .price_by_month_week.mean()
df_week_entire_apartment

ref_district   day_of_week
Oakland        Friday         159.474027
               Monday         154.944938
               Saturday       159.725316
               Sunday         155.161974
               Thursday       155.060959
               Tuesday        154.571370
               Wednesday      154.744414
San Francisco  Friday         211.870899
               Monday         204.897288
               Saturday       211.966729
               Sunday         205.295141
               Thursday       204.853625
               Tuesday        205.165253
               Wednesday      205.049197
Santa Clara    Friday         209.905996
               Monday         208.723240
               Saturday       209.942656
               Sunday         208.205175
               Thursday       209.189709
               Tuesday        209.198270
               Wednesday      209.493972
Santa Cruz     Friday         279.242437
               Monday         265.768724
               Saturday       

In [37]:
# Weekly pattern for single room

df_week_single_room = df_visual_single_room.groupby(['ref_district', 'day_of_week'])  \
                                           .price_by_month_week.mean()
df_week_single_room

ref_district   day_of_week
Oakland        Friday          77.888629
               Monday          75.923754
               Saturday        77.889589
               Sunday          75.908070
               Thursday        75.999174
               Tuesday         75.945734
               Wednesday       75.944883
San Francisco  Friday         122.744881
               Monday         116.899184
               Saturday       122.918653
               Sunday         117.269371
               Thursday       117.207964
               Tuesday        117.112207
               Wednesday      117.236528
Santa Clara    Friday          82.902979
               Monday          81.616328
               Saturday        82.969004
               Sunday          81.619595
               Thursday        81.587401
               Tuesday         81.545600
               Wednesday       81.502136
Santa Cruz     Friday         127.845906
               Monday         122.235648
               Saturday       

In [38]:
# Montly Pattern for entire apartment

df_visual_entire_apartment.groupby(['ref_district', 'month']).price_by_month_week.mean()

ref_district   month
Oakland        Apr.     158.153970
               Aug.     157.264184
               Dec.     153.305877
               Feb.     156.082218
               Jan.     153.080845
               Jul.     159.672266
               Jun.     159.677606
               Mar.     157.705606
               May.     159.250697
               Nov.     151.698238
               Oct.     151.726567
               Sep.     156.919118
San Francisco  Apr.     207.472221
               Aug.     209.898199
               Dec.     203.178067
               Feb.     207.013189
               Jan.     205.855001
               Jul.     209.035882
               Jun.     208.817503
               Mar.     207.347120
               May.     207.832853
               Nov.     202.567577
               Oct.     206.984157
               Sep.     208.986456
Santa Clara    Apr.     211.766231
               Aug.     208.364194
               Dec.     205.188419
               Feb.     210.138842

In [39]:
# Montly Pattern for single room

df_visual_single_room.groupby(['ref_district', 'month']).price_by_month_week.mean()

ref_district   month
Oakland        Apr.      77.978622
               Aug.      77.144193
               Dec.      74.461501
               Feb.      76.915111
               Jan.      74.832027
               Jul.      78.095111
               Jun.      78.095111
               Mar.      77.971022
               May.      78.181333
               Nov.      73.789352
               Oct.      74.318885
               Sep.      75.846730
San Francisco  Apr.     119.739205
               Aug.     121.163280
               Dec.     114.110648
               Feb.     118.419817
               Jan.     115.921330
               Jul.     122.255816
               Jun.     121.711247
               Mar.     119.560597
               May.     120.704303
               Nov.     114.883151
               Oct.     118.288852
               Sep.     118.692875
Santa Clara    Apr.      83.150027
               Aug.      81.439842
               Dec.      80.502131
               Feb.      82.919020

In [40]:
df_pricing_final = pd.get_dummies(df_pricing_by_month_weekday, 
                                  prefix='',
                                  prefix_sep='',
                                  columns=['month', 'day_of_week'])

In [41]:
df_pricing_final.columns

Index(['listing_id', 'price_by_month_week', 'Apr.', 'Aug.', 'Dec.', 'Feb.',
       'Jan.', 'Jul.', 'Jun.', 'Mar.', 'May.', 'Nov.', 'Oct.', 'Sep.',
       'Friday', 'Monday', 'Saturday', 'Sunday', 'Thursday', 'Tuesday',
       'Wednesday'],
      dtype='object')

In [42]:
df_pricing_final.shape

(366935, 21)

In [27]:
pkl_file = '/Users/xzhou/github/project_archives/files_airbnb/{}.pkl'

pkl_path = pkl_file.format('listings_with_selected_features')

df_listings_reviews = pd.read_pickle(pkl_path)
df_listings_reviews.head()

Unnamed: 0,id,bedrooms,cleaning_fee,room_type_Entire home/apt,reviews_per_month,city_San Francisco,city_Oakland,bathrooms,accommodates,number_of_reviews,...,neighbourhood_cleansed_Palo Alto,host_is_superhost_f,amenities_Lock on bedroom door,amenities_Private entrance,amenities_Familykid friendly,district_Santa Clara,neighbourhood_cleansed_San Jose,host_identity_verified_f,ref_district,ref_jurisdiction
0,958,1.0,2.0,1,1.38,1,0,1.0,3,152,...,0,0,0,1,1,0,0,0,San Francisco,SAN FRANCISCO
1,5858,2.0,2.0,1,0.99,1,0,1.0,5,112,...,0,1,0,1,1,0,0,0,San Francisco,SAN FRANCISCO
2,7918,1.0,1.69897,0,0.16,1,0,4.0,2,17,...,0,1,1,1,0,0,0,0,San Francisco,SAN FRANCISCO
3,8142,1.0,1.69897,0,0.15,1,0,4.0,2,7,...,0,1,1,1,1,0,0,0,San Francisco,SAN FRANCISCO
4,8567,2.0,2.09691,1,0.27,1,0,1.0,6,30,...,0,1,0,0,1,0,0,1,San Francisco,SAN FRANCISCO


In [None]:
df_listings_reviews.shape

In [None]:
df_final = pd.merge(left=df_pricing_final, 
                    right=df_listings_reviews,
                    how='inner',
                    left_on='listing_id',
                    right_on='id')

In [None]:
df_final.shape

In [None]:
df_final.isnull().sum()

In [None]:
df_final.shape

In [None]:
df_final.columns

In [None]:
# Drop duplicate columns

df_final.drop(['id'], axis=1, inplace=True)

In [None]:
df_final.shape

In [None]:
# Create correlation matrix
corr_matrix = df_final.corr().abs()

# Select upper triangle of correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))

# Find index of feature columns with correlation greater than 0.90
to_drop = [column for column in upper.columns if any(upper[column] > 0.90)]

to_drop

In [None]:
# Drop features that have high correlation

df_final.drop(to_drop, axis=1, inplace=True)

In [None]:
df_final.shape

In [None]:
pkl_file = '/Users/xzhou/github/project_archives/files_airbnb/{}.pkl'
pkl_path = pkl_file.format('listings_seasonality')

try: 
    with open(pkl_path, 'rb') as file:
        df_final = pd.read_pickle(file)
except:
    with open(pkl_path, 'wb') as file:
        pd.to_pickle(df_final, file)

In [None]:
# Perform multiple modeling for price prediction

X = df_final.drop(['price_by_month_week', 'ref_district', 'ref_jurisdiction'], axis=1)
y = df_final.loc[:, ['listing_id', 'price_by_month_week']]

In [None]:
grp_by_listing_ids = df_final.groupby('listing_id').first().reset_index().listing_id

In [None]:
num_of_grps = len(grp_by_listing_ids)

training_ids = np.random.choice(listing_ids, int(num_of_grps*0.75), replace=False)
print(len(training_ids))
training_ids

In [None]:
testing_ids = set(grp_by_listing_ids) - set (training_ids)
len(testing_ids)

In [None]:
X_train = X[X['listing_id'].isin(training_ids)].drop('listing_id', axis=1)
X_test = X[X['listing_id'].isin(testing_ids)].drop('listing_id', axis=1)
y_train = y[y['listing_id'].isin(training_ids)].drop('listing_id', axis=1).values.ravel()
y_test = y[y['listing_id'].isin(testing_ids)].drop('listing_id', axis=1).values.ravel()

print(len(X_train), len(X_test), len(y_train), len(y_test))

In [None]:
ss = StandardScaler()
X_train_trans = ss.fit_transform(X_train)
X_test_trans = ss.transform (X_test)

In [None]:
# Model-LR

model=sm.OLS(y_train, sm.add_constant(X_train_trans))
fit=model.fit()
fit.summary()

In [None]:
# Model-Random Forest

rf = RandomForestRegressor(n_estimators=500, 
                           criterion='mse', 
                           random_state=42, 
                           n_jobs=-1)
rf.fit(X_train_trans, y_train.ravel())

y_train_pred = rf.predict(X_train_trans)
y_test_pred = rf.predict(X_test_trans)

print('MSE train: %.3f, test: %.3f' % (
        mean_squared_error(y_train, y_train_pred),
        mean_squared_error(y_test, y_test_pred)))
print('R^2 train: %.3f, test: %.3f' % (
        r2_score(y_train, y_train_pred),
        r2_score(y_test, y_test_pred)))

In [None]:
RMSE = 1043.202**0.5
RMSE

In [None]:
df_final.listing_id.nunique()

In [None]:
# Double check on price range. The range is as expected

y.describe(include='all')

In [None]:
# Map feature importance with labels

feature_labels = X.columns
feature_tuples = zip (feature_labels, rf.feature_importances_)
feature_lists = [list(feature_tuple) for feature_tuple in feature_tuples]

In [None]:
# Order features based on importance

feature_lists.sort(key=lambda x: abs(x[1]), reverse=True)
feature_lists

In [None]:
features = [list_[0] for list_ in feature_lists]
features

In [None]:
columns =['listing_id']
columns.extend(features)
columns.extend(['ref_district', 'ref_jurisdiction', 'price_by_month_week'])

In [None]:
df_data = df_final[columns]
df_data.head()

In [None]:
df_data.shape

In [None]:
df_data.rename(columns={'price_by_month_week':'price'}, inplace=True)

In [None]:
df_data.isnull().sum()

In [None]:
temp_df = df_data[['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']].copy()

days = temp_df.apply(lambda x: x.idxmax(), axis=1)
days

In [None]:
csv_file = '/Users/xzhou/github/project_archives/files_airbnb/{}.csv'

csv_path = csv_file.format('airbnb_data')

df_data.to_csv(csv_path)