In [None]:
# import some libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
import nltk
from nltk.corpus import stopwords
%matplotlib inline

In [None]:
# Load the three Seattle Airbnb tables (calendar, listings, reviews) from the
# local data/ directory, one DataFrame per CSV file.
calendar, listings, reviews = map(
    pd.read_csv,
    ('data/calendar.csv', 'data/listings.csv', 'data/reviews.csv'),
)

#### Business understanding of the project

Airbnb is an online marketplace that connects hosts and guests for lodging and tourism experiences, and it serves as an alternative to hotels. Its advantages and disadvantages compared to hotels are listed below.

Advantages: 
1. Could be less expensive than hotels
2. Added benefits of a kitchen and the host's hospitality.
3. Could be great for family travels.

Disadvantages:
1. For business travel, hotel might just be easier.
2. Hosts can be more quirky or friendly than hotel staff.
3. Rooms could be less clean than hotels. 
4. Sharing rooms with other guest could be unpredictable.
5. Prices are determined by the host and are not necessarily reasonable.
6. Overall, the experience is less standardized and might not be suitable for people who are looking for predictability.

On the other side of the market, hosts can make extra money by subletting a guest room during holidays, or their entire home while they are away. Commercial rentals are possible but might not be advisable, given the amount of money that can be made and the regulations on short-term rentals; they also might not be the intended purpose of the platform. Some disadvantages for hosts using Airbnb: some buildings do not allow short-term sublets; there are safety concerns about guests; and hosts may feel that guests mess up their home. Perhaps these can be mitigated by relatively longer stays, fewer guests, and exchanging information with guests before they arrive to get to know them.

Is superhost a good thing or not? perhaps not sure.

**Some questions** I try to answer with this dataset
1. Can you describe the vibe of each Seattle neighborhood using listing descriptions?
2. What are the busiest times of the year to visit Seattle? By how much do prices spike?
3. What do people generally say about these listings? perhaps in different areas?
4. Is price related to any of the other variables?

#### Data overview

Data are downloaded from https://www.kaggle.com/airbnb/seattle
The data were posted to Kaggle by Airbnb, but the original data were collected by Inside Airbnb (scraped from publicly available Airbnb data), a personally funded site not associated with Airbnb. More about Inside Airbnb and the original data can be found at http://insideairbnb.com/about.html

This dataset includes one year of data (price and availability) scraped from the website, so time trends of price and availability over the year can be examined. `listings` includes a description of the place, host information, neighborhood information, room details, prices, availabilities, review scores, and some other information. `reviews` includes guest comments for the listings.

In [None]:
# preview the first rows of the calendar table
calendar.head()

In [None]:
#pd.options.display.max_columns = 10
#listings.info()
# 3818 listings, 2751 hosts

In [None]:
#reviews.head()

After exploring the datasets with `df.head()`, `df.info()`, and `df.describe()`, we know that:

`calendar` includes the listing id and the price and availability for each day; `listings` includes full descriptions and the average review score; `reviews` includes a unique id for each reviewer and detailed comments.

#### Data preparation

In [None]:
# Clean up the price column in listings: remove "$" and "," and convert to float.
# Going through astype(str) and the vectorised .str accessor (instead of calling
# str methods inside a lambda) keeps missing prices as NaN rather than raising,
# and makes the cell safe to re-run once the column is already numeric.
listings['price'] = (
    listings['price']
    .astype(str)
    .str.replace(r'[$,]', '', regex=True)
    .astype(float)
)

In [None]:
# Similarly for calendar: strip "$" and "," from price and convert it to float,
# then parse the date column as datetimes.
# astype(str) turns missing prices into the text 'nan', which converts back to NaN,
# so no separate not-null mask is needed and the cell can be re-run after the
# column has already been converted (the original lambda failed on floats).
calendar['price'] = (
    calendar['price']
    .astype(str)
    .str.replace(r'[$,]', '', regex=True)
    .astype(float)
)
calendar['date'] = pd.to_datetime(calendar['date'])

#### Evaluation

In [None]:
# Word count for one listing description (the `space` text of the listing at index 0).
# Tried grouping descriptions by review score to see whether they differ,
# but the majority of reviews are very good (scores 9 or 10).
from nltk.probability import FreqDist

sw = set(stopwords.words('english'))
punc = {'.', ',', '(', ')'}
# NLTK's English stopword list is lowercase, so compare against the lowercased
# token; otherwise capitalised stopwords such as "The" are counted as content words.
tokens = [
    x for x in nltk.word_tokenize(listings['space'][0])
    if x.lower() not in sw and x not in punc
]
# Optional: lemmatize before counting, e.g.
#from nltk.stem import WordNetLemmatizer
#lemmatizer = WordNetLemmatizer()
#tokens = [lemmatizer.lemmatize(x) for x in tokens]
# most_common returns a list of (token, count) pairs, most frequent first
fdist = FreqDist(tokens).most_common(10)
fdist

In [None]:
# What did reviewers say about the listings? Attach each review's neighbourhood
# group and host name so that word counts can be computed per neighbourhood.
# The guard keeps this cell idempotent: merging a second time would suffix the
# duplicated columns (_x/_y) and break later lookups of
# reviews['neighbourhood_group_cleansed'].
if 'neighbourhood_group_cleansed' not in reviews.columns:
    reviews = reviews.merge(
        listings[['id', 'neighbourhood_group_cleansed', 'host_name']],
        how='inner',
        left_on='listing_id',
        right_on='id',
    )

In [None]:
# Most frequent words in the review comments of one neighbourhood group ('Central Area').
from nltk.probability import FreqDist

tokens_all = []
stopwords_to_remove = set(stopwords.words('english'))
# Punctuation plus frequent but uninformative tokens; a set gives O(1) membership tests.
others_to_remove = {'.', ',', '(', ')', '&', ':', '!', '-', '*', 'The', 'apartment', "'s", 'Seattle', 'This',
                    'I', 'We', 'us', 'It', 'stay', 'place', 'host', "n't", 'would', 'room', 'home'}
central_area_comments = reviews.loc[
    (reviews['neighbourhood_group_cleansed'] == 'Central Area') & pd.notnull(reviews['comments']),
    'comments',
]
for comment in central_area_comments:
    # extend() appends in place; rebuilding the list with `+` for every comment is quadratic.
    # Stopwords are matched on the lowercased token because NLTK's list is lowercase.
    tokens_all.extend(
        x for x in nltk.word_tokenize(comment)
        if x.lower() not in stopwords_to_remove and x not in others_to_remove
    )

# list of (token, count) pairs, most frequent first
fdist = FreqDist(tokens_all).most_common(30)
fdist

In [None]:
# need to download nltk if not used before otherwise will pop up TK errors
#nltk.download()

In [None]:
# scatter matrix of numerical variables to explore their relationships to price
# could use pandas plotting.scatter_matrix or seaborn pairplot
#pd.plotting.scatter_matrix(listings[['price', 'review_scores_value', 'square_feet', 'bedrooms', 'number_of_reviews']], 
#                           figsize = (9, 9))

# for seaborn pairplot, histogram diagonals don't accept NaNs, so the diagonals have to be KDE plots
#sns.pairplot(listings, vars=['price', 'review_scores_value', 'square_feet', 'bedrooms', 'number_of_reviews'], diag_kind='kde')

In [None]:
# Histogram of listing prices (30 bins). Price is on the x-axis; the y-axis
# counts listings per bin, so both axes are labelled explicitly.
fig, ax = plt.subplots()
listings['price'].plot.hist(bins=30, ax=ax)
ax.set_xlabel('price ($)')
ax.set_ylabel('number of listings');

In [None]:
# describe listing prices
# listings['price'].describe()

In [None]:
# Normalize price by number of bedrooms.
# Listings with 0 bedrooms are counted as 0.5 bedroom to avoid dividing by zero.
# NOTE: this overwrites `bedrooms` in `listings` itself, so every later cell that
# reads listings['bedrooms'] sees 0.5 instead of 0.
zero_bedroom_rows = listings['bedrooms'].eq(0)
listings.loc[zero_bedroom_rows, 'bedrooms'] = 0.5
listings['price_per_bed'] = listings['price'].div(listings['bedrooms'])

In [None]:
# get average price by location. groupby and visuals. visual could be matplotlib errorbar or seaborn pointplot

In [None]:
# How do `neighbourhood` and `neighbourhood_group_cleansed` differ? Inspect the
# distinct values in order to choose one of the two fields as the location group.
listings['neighbourhood'].unique()
# spot checks of how values of one field map onto the other:
#listings.loc[listings['neighbourhood_group_cleansed']=='Cascade', 'neighbourhood'].unique()
#listings.loc[listings['neighbourhood']=='Eastlake', 'neighbourhood_group_cleansed'].unique()

In [None]:
# Number of listings per neighbourhood group, largest first, shown as a bar chart.
neighborcts = (
    listings
    .groupby(['neighbourhood_group_cleansed'], as_index=False)['id']
    .count()
    .sort_values(by='id', ascending=False)
)
neighborcts.plot.bar(x='neighbourhood_group_cleansed', y='id')

In [None]:
# Price summary per neighbourhood group: mean, standard error of the mean, count,
# max, min and median.
# Grouping with the default as_index=True and then calling reset_index() turns the
# group key into an ordinary column exactly once, independent of how as_index=False
# is handled for list aggregations. String aggregator names replace np.mean, which
# newer pandas flags with a FutureWarning when passed to agg.
price_by_area = (
    listings
    .groupby('neighbourhood_group_cleansed')['price']
    .agg(['mean', 'sem', 'count', 'max', 'min', 'median'])
    .reset_index()
)
# displayed sorted by mean price; price_by_area itself keeps its original row order
price_by_area.sort_values(by='mean', ascending=False)

In [None]:
# matplotlib version: mean price per neighbourhood group, with error bars of 2 * SEM
tick_positions = price_by_area.index.tolist()
plt.errorbar(x=tick_positions, y=price_by_area['mean'], yerr=2 * price_by_area['sem'], fmt='o')
# one tick per group, labelled with the group name and rotated to stay readable
plt.xticks(tick_positions, price_by_area['neighbourhood_group_cleansed'], rotation=90);
#plt.show()
# seaborn alternative:
# sns.pointplot(x='neighbourhood_group_cleansed',y='price', data=listings, join=False)

In [None]:
# review by location
#review_by_area=listings.groupby(['neighbourhood_group_cleansed'],as_index=False)['review_scores_value'].agg([np.mean, 'sem', 'count', 'max', 'min', 'median']).reset_index()
#review_by_area.sort_values(by='mean', ascending=False)

In [None]:
# percent super host by location
#listings['host_is_superhost']=listings['host_is_superhost'].apply(lambda x: 1 if x=='t' else 0)
#pct_super_by_area=listings.groupby(['neighbourhood_group_cleansed'],as_index=False)['host_is_superhost'].mean()
#pct_super_by_area.sort_values(by='host_is_superhost', ascending=False)

In [None]:
## average price by review scores
#price_by_review=listings.groupby(['review_scores_value'],as_index=False)['price'].agg([np.mean, 'sem', 'count', 'max', 'min', 'median']).reset_index()
#price_by_review.sort_values(by='mean', ascending=False)
##plt.figure(figsize=(6.4,4.8))
#plt.errorbar(x=price_by_review.index.tolist(), y=price_by_review['mean'], yerr=price_by_review['sem']*2,fmt='o')
#plt.xticks(price_by_review.index.tolist())
#plt.gca().set_xticklabels(price_by_review['review_scores_value'],rotation=90)

In [None]:
# price by property type
#price_by_bedType=listings.groupby(['property_type'],as_index=False)['price'].agg([np.mean, 'sem', 'count', 'max', 'min', 'median']).reset_index()
#price_by_bedType.sort_values(by='mean', ascending=False)

In [None]:
# Time trends of the average price for each neighbourhood group: first merge the
# calendar with listings to attach the neighbourhood information.
#calendar['week']=calendar['date'].dt.week
# The guard keeps this cell idempotent: merging a second time would suffix the
# duplicated columns (_x/_y) and break the later groupby on
# 'neighbourhood_group_cleansed'.
if 'neighbourhood_group_cleansed' not in calendar.columns:
    calendar = pd.merge(
        calendar,
        listings[['id', 'neighbourhood_group_cleansed']],
        left_on='listing_id',
        right_on='id',
        how='inner',
    )

In [None]:
# Average price per neighbourhood group and date, drawn as one line per group.
neighbor_avg_by_date = (
    calendar
    .groupby(['neighbourhood_group_cleansed', 'date'], as_index=False)['price']
    .mean()
)
plt.figure(figsize=(12, 6))
sns.lineplot(x='date', y='price', hue='neighbourhood_group_cleansed', data=neighbor_avg_by_date)
# anchor the legend's upper-left corner just outside the right edge of the axes
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.);

#### Modeling

In [None]:
# Keep only the columns that are plausibly related to price (plus price itself),
# then look at pairwise relationships with a scatter matrix.
# listings.info()
#sns.set_context('talk') #rc={'axes.labelsize': 16})
price_related_columns = [
    'transit', 'neighbourhood_group_cleansed',
    'property_type', 'room_type', 'accommodates', 'bathrooms', 'bedrooms', 'beds', 'bed_type',
    'amenities', 'square_feet', 'number_of_reviews', 'price',
]
# .copy() so that later edits to listings_less do not touch listings
listings_less = listings.loc[:, price_related_columns].copy()
#sns.pairplot(listings_less, diag_kind = 'kde')
pd.plotting.scatter_matrix(listings_less, figsize=(10, 10));

In [None]:
#sns.reset_orig()

In [None]:
# Handle missing values:
# - property_type: fill with the most frequent value (mode)
# - bathrooms, bedrooms, beds: fill with the column median
# - square_feet: fill with 0 and flag those rows in is_nan_square_feet
# - transit: fill with an empty string
#
# Series.mode() returns a Series (there can be ties). Passing that Series to
# fillna aligns on index labels, so it would only fill a NaN that happens to sit
# at label 0. Take the first mode as a scalar so every missing value is filled.
most_common_property_type = listings_less['property_type'].mode().iloc[0]
listings_less['property_type'] = listings_less['property_type'].fillna(most_common_property_type)
for col in ['bathrooms', 'bedrooms', 'beds']:
    listings_less[col] = listings_less[col].fillna(listings_less[col].median())
listings_less['square_feet'] = listings_less['square_feet'].fillna(0)
listings_less.loc[listings_less['square_feet'] < 10, 'square_feet'] = 0  # change small values to zero
# 1 where square_feet is 0 after the two steps above (originally missing or < 10), else 0.
# int64 is spelled out so the later select_dtypes(['float64', 'int64']) picks this column up.
listings_less['is_nan_square_feet'] = listings_less['square_feet'].eq(0).astype('int64')
listings_less['transit'] = listings_less['transit'].fillna('')

In [None]:
# One-hot encode the categorical (object-dtype) columns. The free-text columns
# `transit` and `amenities` are excluded before encoding.
categorical_columns = listings_less.select_dtypes(include='object')
listings_less_cat = pd.get_dummies(
    categorical_columns.drop(columns=['transit', 'amenities'])
)

In [None]:
# Numeric predictors: every float64/int64 column except the target `price`.
numeric_columns = listings_less.select_dtypes(include=['float64', 'int64'])
listings_less_num = numeric_columns.drop(columns=['price'])

In [None]:
# Extract TF-IDF text features from the `amenities` column
# (the code reads `amenities`, not `transit`).
# tutorial for text classification https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

X_text = listings_less['amenities']

# raw term counts per listing
count_vect = CountVectorizer()
X_text_counts = count_vect.fit_transform(X_text)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# it exists and fall back to the old name on older installations.
if hasattr(count_vect, 'get_feature_names_out'):
    X_text_names = count_vect.get_feature_names_out()
else:
    X_text_names = count_vect.get_feature_names()
print(X_text_counts.shape)

# re-weight the counts with TF-IDF
tfidf_transformer = TfidfTransformer()
X_text_tfidf = tfidf_transformer.fit_transform(X_text_counts)
print(X_text_tfidf.shape)

# Reuse the listings index so the text features line up row-by-row with the
# other feature frames if they are concatenated later.
df_text = pd.DataFrame(
    data=X_text_tfidf.toarray(),
    columns=X_text_names,
    index=listings_less.index,
)

In [None]:
# Design matrix: one-hot encoded categoricals side by side with the numeric predictors.
# NOTE(review): the TF-IDF frame df_text is not included here - confirm whether that is intended.
feature_frames = [listings_less_cat, listings_less_num]
X = pd.concat(feature_frames, axis='columns')
# Target: listing price
y = listings.loc[:, 'price']

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for evaluation; the fixed random_state makes the split reproducible.
split_parts = train_test_split(X, y, test_size=0.33, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Fit an ordinary least-squares model on the training split and score it on the
# held-out split. r2_score is the coefficient of determination (R^2): 1.0 is a
# perfect fit, 0.0 is no better than always predicting the mean of y_test.
regr = LinearRegression().fit(X_train, y_train)
y_pred = regr.predict(X_test)
test_r2 = r2_score(y_test, y_pred)
print('Coefficient of determination: {0:.2f}'.format(test_r2))

In [None]:
# Table of fitted coefficients; show the 20 features with the largest absolute coefficient.
coefs_df = pd.DataFrame({
    'feature_name': X_train.columns,
    'coefs': regr.coef_,
})
coefs_df['abs_coefs'] = coefs_df['coefs'].abs()
coefs_df.sort_values('abs_coefs', ascending=False).head(20)