# Richie Bobby's Airbnb project
Here we explore the Airbnb data we scraped. 

In [1]:
import pandas as pd
import numpy as np
from shapely.geometry import Point # Shapely for converting latitude/longtitude to geometry
import geopandas as gpd # To create GeodataFrame

# Load the two scraped listing files and stack them into one frame.
airbnbto = pd.read_csv("listingsto.csv")
airbnbqc = pd.read_csv("listingsqc.csv")

# ignore_index=True gives the combined frame a fresh, unique 0..n-1 index.
# Without it each file contributes its own 0-based labels, so every label
# occurs twice and label-based alignment later on becomes ambiguous.
airbnb = pd.concat([airbnbto, airbnbqc], ignore_index=True)

# Keep one row per listing id and store the result. The original call used
# keep=False (which throws away *every* copy of a repeated id) and never
# assigned the de-duplicated frame, so `airbnb` was left untouched.
airbnb = airbnb.drop_duplicates(subset='id').reset_index(drop=True)
airbnb.head()


Unnamed: 0,id,latitude,longitude,room_type,reviews_per_month,number_of_reviews,price
0,1419,43.64617,-79.42451,Entire home/apt,0.16,7,469
1,8077,43.64105,-79.37628,Private room,1.45,169,100
2,12604,43.66724,-79.41598,Private room,,0,67
3,23691,43.69602,-79.45468,Private room,1.94,207,70
4,26654,43.6453,-79.3894,Entire home/apt,0.36,36,200


In [2]:
# (rows, columns) of the combined listings frame.
airbnb.shape

(23363, 7)

In [3]:
from matplotlib import pyplot as plt
# Apply the 'ggplot' style sheet to figures created from here on.
plt.style.use('ggplot')
# Render figures inline in the notebook output.
%matplotlib inline

In [4]:
# Column names of the combined frame, as a plain list.
airbnb.columns.tolist()

['id',
 'latitude',
 'longitude',
 'room_type',
 'reviews_per_month',
 'number_of_reviews',
 'price']

In [5]:
# First five rows of the combined frame.
airbnb.head()

Unnamed: 0,id,latitude,longitude,room_type,reviews_per_month,number_of_reviews,price
0,1419,43.64617,-79.42451,Entire home/apt,0.16,7,469
1,8077,43.64105,-79.37628,Private room,1.45,169,100
2,12604,43.66724,-79.41598,Private room,,0,67
3,23691,43.69602,-79.45468,Private room,1.94,207,70
4,26654,43.6453,-79.3894,Entire home/apt,0.36,36,200


In [6]:
# Sanity check on the container type of the combined data.
type(airbnb)

pandas.core.frame.DataFrame

In [7]:
# One bar slot per row of `airbnb`; each tick sits 0.4 to the right of its slot's left edge.
x_axis = np.arange(len(airbnb))
tick_locations = list(x_axis + 0.4)

In [None]:
# Draw one bar per listing row (height = number_of_reviews) and label every
# tick with that row's room_type.
# NOTE(review): nothing here aggregates by room type. The shape output above
# reports 23,363 rows, so this creates that many bars and tick labels;
# consider grouping by room_type before plotting.
plt.figure(figsize=(20,3))
plt.bar(x_axis, airbnb["number_of_reviews"], color='r', alpha=0.5, align="edge")
plt.xticks(tick_locations, airbnb["room_type"], rotation="vertical")

([<matplotlib.axis.XTick at 0x28326fcd978>,
  <matplotlib.axis.XTick at 0x28326fcd2e8>,
  <matplotlib.axis.XTick at 0x28326fcd160>,
  <matplotlib.axis.XTick at 0x283372fa860>,
  <matplotlib.axis.XTick at 0x283372facf8>,
  <matplotlib.axis.XTick at 0x28337308278>,
  <matplotlib.axis.XTick at 0x283373087b8>,
  <matplotlib.axis.XTick at 0x28337308cf8>,
  <matplotlib.axis.XTick at 0x28337310278>,
  <matplotlib.axis.XTick at 0x283373107b8>,
  <matplotlib.axis.XTick at 0x28337310cf8>,
  <matplotlib.axis.XTick at 0x28337310630>,
  <matplotlib.axis.XTick at 0x28337308710>,
  <matplotlib.axis.XTick at 0x28337316160>,
  <matplotlib.axis.XTick at 0x283373167b8>,
  <matplotlib.axis.XTick at 0x28337316cf8>,
  <matplotlib.axis.XTick at 0x2833731e278>,
  <matplotlib.axis.XTick at 0x2833731e7b8>,
  <matplotlib.axis.XTick at 0x2833731ecf8>,
  <matplotlib.axis.XTick at 0x28337327278>,
  <matplotlib.axis.XTick at 0x2833731e630>,
  <matplotlib.axis.XTick at 0x28337316710>,
  <matplotlib.axis.XTick at 0x28

In [None]:
# Set x and y limits
plt.xlim(-0.25, len(x_axis))
# The bars show number_of_reviews, so the y-range has to come from that
# column; the original took the maximum of "price" instead.
plt.ylim(0, airbnb["number_of_reviews"].max() + 10)

In [None]:
# Title and axis labels for the reviews-per-listing bar chart.
plt.title("Rooms Types that receive maximum reviews and hence are most popular/talked about")
# The closing quote was missing on this label, which made the whole cell a SyntaxError.
plt.xlabel("Room_Type")
plt.ylabel("Number_of_reviews")

In [None]:
plt.tight_layout()
# Save next to the notebook instead of to a user-specific absolute Windows
# path, so the cell also runs on other machines.
# NOTE(review): with the inline backend a figure is normally finalised when
# its cell ends, so this writes whatever figure is current in THIS cell;
# confirm the plotting calls run in the same cell as the save.
plt.savefig("roomtypevsreviews.png")
plt.show()

In [None]:
# Histogram of listing prices in 200 bins. Only the price column is plotted,
# so the labels and title describe a price distribution; the original ones
# claimed a price/room_type relationship that this chart does not show.
x = plt.hist(airbnb['price'], bins=200)
plt.xlabel('price')
plt.ylabel('number of listings')
plt.title('Distribution of listing prices', fontsize=20)
plt.xlim(0, 1200)    # show only the 0-1200 part of the price (x) axis

In [None]:
plt.tight_layout()
# Save next to the notebook instead of to a user-specific absolute Windows
# path, so the cell also runs on other machines. The file name is unchanged.
# NOTE(review): with the inline backend a figure is normally finalised when
# its cell ends, so this writes whatever figure is current in THIS cell;
# confirm the plotting calls run in the same cell as the save.
plt.savefig("Co-relation of Price and room_type.png")
plt.show()

In [None]:
# creating a geometry column: one shapely Point(longitude, latitude) per listing.
# Both coordinates are read from `airbnb`; the original referred to `listings`
# and `cities`, neither of which is defined in the visible notebook.
geometry = [Point(xy) for xy in zip(airbnb['longitude'], airbnb['latitude'])]

# Coordinate reference system : WGS84
# NOTE(review): newer geopandas/pyproj releases warn about the {'init': ...}
# form and prefer the string "EPSG:4326"; kept as-is here because the
# installed version is not visible from this notebook.
crs = {'init': 'epsg:4326'}

# Creating a Geographic data frame
gdf = gpd.GeoDataFrame(airbnb, crs=crs, geometry=geometry)

In [None]:
# First five rows of the geographic frame, including the new geometry column.
gdf.head()

In [None]:
# Plot all points: every listing geometry as a small blue circle marker.
gdf.plot(marker='o', color='b', markersize=0.5)

In [None]:
import plotly
# Switch plotly to notebook (offline) rendering before any iplot call.
# connected=True means it will download the latest version of plotly javascript library.
plotly.offline.init_notebook_mode(connected=True)


In [None]:
from plotly.graph_objs import Figure, Histogram, Layout

# Observed price range; it is cut into 100 equal-width bins below.
min_ = airbnb['price'].min()
max_ = airbnb['price'].max()

data = [
    Histogram(
        x=airbnb['price'],
        xbins={"start": min_, "end": max_, "size": (max_ - min_) / 100},
    )
]
layout = Layout(title="Costs", bargap=0.2)
fig = Figure(data=data, layout=layout)

# Render inline without the "export to plot.ly" link.
plotly.offline.iplot(fig, show_link=False, image_width=600, image_height=400)

In [None]:
from plotly.graph_objs import Scatter


# Marker-only scatter of price (y) against numRooms (x).
# NOTE(review): 'numRooms' does not appear in the airbnb.columns.tolist()
# output earlier in this notebook, so this lookup presumably raises KeyError
# on the current data. Confirm which dataset this cell was written for.
data = [Scatter(x=airbnb['numRooms'], y=airbnb['price'], mode = 'markers')]
layout = Layout(title="Price versus number of rooms")

fig = Figure(data=data, layout=layout)

plotly.offline.iplot(fig, show_link=False)

In [None]:
# Number of missing values in each column.
airbnb.isnull().sum(axis=0)

In [None]:
# Keep only the rows that have a 'checkin' value. .copy() makes airbnbNew an
# independent frame, so the column assignments made to it in later cells
# change airbnbNew itself rather than a slice of `airbnb` (which pandas
# flags with SettingWithCopyWarning).
# NOTE(review): 'checkin' is not among the columns listed earlier in this
# notebook. Confirm the dataset this cell expects.
airbnbNew = airbnb[airbnb['checkin'].notnull()].copy()
airbnbNew.isnull().sum(axis=0)

In [None]:
# Group the column names by their dtype to see which columns are numeric and which are text.
airbnbNew.columns.to_series().groupby(airbnbNew.dtypes).groups

Trying to generate a 3D plot, could probably do a better one...

In [None]:
# Importing Axes3D registers the '3d' projection on older Matplotlib versions.
from mpl_toolkits.mplot3d import Axes3D

fig = plt.figure()
# Request the 3D axes from the figure instead of constructing Axes3D(fig)
# directly: recent Matplotlib releases no longer attach a directly
# constructed Axes3D to the figure, which leaves the output blank.
ax = fig.add_subplot(111, projection='3d')

# Guests (x) and rooms (y) against price (z).
ax.scatter(airbnbNew['numGuests'], airbnbNew['numRooms'], airbnbNew['price'])
ax.set_xlabel('numGuests')
ax.set_ylabel('numRooms')
ax.set_zlabel('price')
plt.show()

In [None]:
def plot_corr(df,size=10):
    '''Plot the pairwise correlation matrix of a DataFrame as a colour grid.

    Input:
        df: pandas DataFrame; only its numeric and boolean columns are used.
        size: vertical and horizontal size of the plot, in inches.

    Returns:
        None. The figure is drawn on a newly created matplotlib figure.
    '''
    # Select the numeric/boolean columns explicitly. Older pandas skipped text
    # columns inside corr() on its own; pandas 2.x raises on them instead.
    corr = df.select_dtypes(include=['number', 'bool']).corr()
    fig, ax = plt.subplots(figsize=(size, size))
    ax.matshow(corr)
    # Vertical x labels keep long column names from running into each other.
    plt.xticks(range(len(corr.columns)), corr.columns, rotation=90);
    plt.yticks(range(len(corr.columns)), corr.columns);
plot_corr(airbnbNew)

From the above correlation matrix we can see that the ratings are heavily correlated with each other. Not surprising. We can also see that the price is highly correlated with the size of the place (numRooms, numGuests, numBeds, numBaths). It is only loosely correlated with the rest of the variables.

I'm personally a little surprised that longitude and latitude didn't play a bigger part in the prices. My first guess is that the weak correlation is because these values are not yet normalized, although Pearson correlation is unaffected by min-max scaling, so normalizing alone should not change it. Next I'll normalize the data and then see how it looks in the correlation matrix.

I'll also drop responseTimeShown and roomType and roomID. These will not play a part in the machine learning algorithm development. 

In [None]:
# Print the columns at positions 8 through 17 (iloc slice 8:18) for all rows of airbnbNew.
print(airbnbNew.iloc[:,8:18])

In [None]:
# Preview of the frame without these four columns. The result is not assigned
# and inplace is not set, so airbnbNew itself is unchanged.
# NOTE(review): a later cell drops several of these columns again and still
# reads 'roomID', so persisting the drop here would break that cell.
airbnbNew.drop(['responseTimeShown', 'roomType','roomID','bedType'], axis=1)

In [None]:
# Text (object) columns: 'bathType', 'bedType', 'bedroomType', 'shortDesc'

# These lookups are not the last expression of the cell, so the notebook does
# not echo them; they are kept only as a record of what was inspected.
airbnbNew.bathType.unique() # need to take out s in two of them
airbnbNew.bedroomType.unique() #bedroom(s), need to take out s
airbnbNew.shortDesc.unique() #fine


def _strip_plural_s(label):
    """Return `label` without one trailing 's'; non-strings (e.g. NaN) pass through unchanged."""
    # The original indexed word[-1] directly, which raises on a missing value
    # (float NaN) and on an empty string.
    if isinstance(label, str) and label.endswith('s'):
        return label[:-1]
    return label


airbnbNew.loc[:,'bathType'] = airbnbNew['bathType'].map(_strip_plural_s)
airbnbNew.loc[:,'bedroomType'] = airbnbNew['bedroomType'].map(_strip_plural_s)

Some more data cleaning getting ready for machine learning!

In [None]:
# One-hot encode the three categorical text columns.
bathDF = pd.get_dummies(airbnbNew['bathType'])
bedroomDF = pd.get_dummies(airbnbNew['bedroomType'])
shortDescDF = pd.get_dummies(airbnbNew['shortDesc'])

# The dummy frames are built from airbnbNew's own columns and therefore share
# its index, so a plain column-wise concat lines the rows up. The `join_axes`
# argument used originally was removed from pd.concat in pandas 1.0.
airbnbNew = pd.concat([airbnbNew, bathDF, bedroomDF, shortDescDF], axis=1)

# Dropping all the non numeric columns...or the ones with a ton of NAs (cough...host reviews...cough)
airbnbNew = airbnbNew.drop(
    ['bathType','roomType','bedroomType','shortDesc','bedType','responseTimeShown','numHostReviews'],
    axis=1,
)
# The original also called airbnbNew.set_index('roomID') here without
# assigning the result, which had no effect; roomID stays a column and is
# made the index in a later cell.

# Encode the superhost flag as 0/1.
airbnbNew['isSuperhost'] = (airbnbNew['isSuperhost'] == True).astype(int)
# Keep only listings priced below 700.
airbnbNew = airbnbNew[airbnbNew['price'] < 700]
# Call max(); the original printed the bound method object instead of the value.
print(airbnbNew['price'].max())
# Show a preview rather than the whole frame.
airbnbNew.head()

In [None]:
# Highest remaining price after the price filter applied in the previous cell.
print(airbnbNew['price'].max())

In [None]:
#Normalizing:
from sklearn.preprocessing import MinMaxScaler

# Scale every column currently in airbnbNew (this includes 'price') with a
# min-max scaler fitted on the same data.
min_max_scaler = MinMaxScaler()
np_scaled = min_max_scaler.fit_transform(airbnbNew)

# fit_transform returns a bare NumPy array, so the column labels have to be
# put back. `listNames` was previously assigned only in a commented-out line,
# which made `airbnb_Norm.columns = listNames` fail with NameError.
listNames = list(airbnbNew.columns.values)
airbnb_Norm = pd.DataFrame(np_scaled, columns=listNames)

print(airbnb_Norm.head(5))
print(airbnb_Norm.columns.values)

In [None]:
# Same correlation plot as before, now on the min-max scaled frame.
plot_corr(airbnb_Norm)

In [None]:
import seaborn as sns

# Move the column at position 14 to the front and keep all other columns in
# their existing order. The original used cols[:-14], which equals cols[:14]
# only when there are exactly 28 columns; for any other width it silently
# dropped or duplicated columns.
# NOTE(review): position 14 is presumably 'price'. Confirm, and consider
# selecting the column by name instead of by position.
cols = airbnb_Norm.columns.tolist()
cols = [cols[14]] + cols[:14] + cols[15:]
print(cols)
airbnb_Norm = airbnb_Norm[cols]

In [None]:
# Lower-triangle heatmap of the correlations between the scaled columns.
corr = airbnb_Norm.corr()
f, ax = plt.subplots(figsize=(10, 8))
# Boolean mask marking the cells to hide. The built-in `bool` replaces
# `np.bool`, an alias that was deprecated in NumPy 1.20 and later removed.
mask = np.zeros_like(corr, dtype=bool)
# Hide the upper triangle and the diagonal, which only mirror the lower half.
mask[np.triu_indices_from(mask)] = True
sns.heatmap(corr, mask=mask, cmap="YlGnBu",
           square=True, ax=ax)
plt.show()

# 2.   Machine Learning on Dataset: Linear Regression
Now the data is looking pretty good! We can now proceed to doing some machine learning I do believe!

In [None]:
# Make roomID the row index, which removes it from the columns.
# NOTE(review): not re-runnable as written. A second execution presumably
# fails because 'roomID' is no longer a column by then.
airbnbNew = airbnbNew.set_index('roomID')

In [None]:
# `cross_validation` is no longer imported: the module was removed from
# scikit-learn (its contents moved to sklearn.model_selection) and nothing in
# this cell used it.
from sklearn import svm, datasets
from sklearn import metrics
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score

# Number of rows, taken from the end of the shuffled data, held out for testing.
N_TEST_ROWS = 200

# Shuffle the labelled frame and the scaled frame with ONE shared permutation.
# The original shuffled only airbnbNew, so the targets (read from airbnbNew)
# no longer belonged to the feature rows (read from airbnb_Norm, still in its
# original order). Both frames are reassigned so they stay row-aligned even if
# this cell is executed again; the fixed seed makes the split reproducible.
assert len(airbnb_Norm) == len(airbnbNew), "scaled and labelled frames must have the same number of rows"
shuffle_order = np.random.RandomState(0).permutation(len(airbnbNew))
airbnbNew = airbnbNew.iloc[shuffle_order]
airbnb_Norm = airbnb_Norm.iloc[shuffle_order]

# Removing the price column from the matrix.
airbnb_NormNoY = airbnb_Norm.drop(['price'], axis=1)
# roomID is an identifier, not a property of a listing; leave it out of the
# features if the scaled frame still carries it.
airbnb_NormNoY = airbnb_NormNoY.drop(['roomID'], axis=1, errors='ignore')
# .values replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
airbnb_Matrix = airbnb_NormNoY.values
X = airbnb_Matrix
y = airbnbNew['price']

# Split the data into training/testing sets
airbnb_X_train = airbnb_Matrix[:-N_TEST_ROWS]
airbnb_X_test = airbnb_Matrix[-N_TEST_ROWS:]

# Split the targets into training/testing sets
airbnb_y_train = y[:-N_TEST_ROWS]
airbnb_y_test = y[-N_TEST_ROWS:]

# Create linear regression object
regr = linear_model.LinearRegression()

# Train the model using the training sets
regr.fit(airbnb_X_train, airbnb_y_train)

# Make predictions using the testing set
airbnb_y_pred = regr.predict(airbnb_X_test)

print('Variance score: %.2f' % r2_score(airbnb_y_test, airbnb_y_pred))
print("Mean squared error: %.2f"
      % mean_squared_error(airbnb_y_test, airbnb_y_pred))

zippedResult = list(zip(airbnb_y_test, airbnb_y_pred))

# Print every (actual, predicted) pair followed by the index label of the row
# it belongs to. Labels come from airbnb_y_test itself; the original counted
# backwards from the end of airbnbNew, which walks the held-out rows in
# reverse and so printed the wrong label next to almost every pair.
for row_label, result in zip(airbnb_y_test.index, zippedResult):
    print(result)
    print(row_label)
print(len(zippedResult))

# 3.    Machine Learning on Dataset: Random Forest

Because our linear regression is not super accurate, let's go a little deeper. Random forest time.

In [None]:
# Environment check: print the installed NumPy version and the interpreter's module search path.
import numpy
print(numpy.__version__)
import sys
print(sys.path)

In [None]:
import numpy as np
# The import of `inner1d` from the private numpy.core.umath_tests module was
# dropped: nothing in this cell uses it.
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Depth limit for every tree in the forest.
max_depth = 20

# Fit on the same train split as the linear model; random_state fixes the forest's randomness.
regr_rf = RandomForestRegressor(max_depth=max_depth, random_state=50)
regr_rf.fit(airbnb_X_train, airbnb_y_train)

# Predict on new data
airbnb_y_pred_rf = regr_rf.predict(airbnb_X_test)

zippedResult = list(zip(airbnb_y_test, airbnb_y_pred_rf))

# Print every (actual, predicted) pair followed by the index label of the row
# it belongs to. Labels come from airbnb_y_test itself; the original counted
# backwards from the end of airbnbNew, which walks the held-out rows in
# reverse and so printed the wrong label next to almost every pair.
for row_label, result in zip(airbnb_y_test.index, zippedResult):
    print(result)
    print(row_label)

print('Variance score: %.2f' % r2_score(airbnb_y_test, airbnb_y_pred_rf))
print("Mean squared error: %.2f" % mean_squared_error(airbnb_y_test, airbnb_y_pred_rf))

Some things are a little fishy here. A lot of the data is off. This is in part due to Airbnb's fluctuations in pricing depending on the day. Another aspect that is poorly factored in is the quality of the apartment.

In [None]:
import matplotlib  
import matplotlib.pyplot as plt  
import pandas as pd
#Inline Plotting for Ipython Notebook 
%matplotlib inline 

#pd.options.display.mpl_style = 'default' #Better Styling  
new_style = {'grid': False} #Remove grid  
matplotlib.rc('axes', **new_style)  
from matplotlib import rcParams  
rcParams['figure.figsize'] = (17.5, 17) #Size of figure  
rcParams['figure.dpi'] = 250

print(airbnb['longitude'].head(5))

P=airbnbNew.plot(kind='scatter', x='longitude', y='latitude',color='white',
                 xlim=(-74.06,-73.9),ylim=(40.67, 40.85),s=5,alpha=1)
P.set_facecolor('black') #Background Color


The above map shows all the latitudes and longitudes for all of the datapoints I scraped.

The plot is not the prettiest, so let's bring this into Leaflet...

1. Bring datasets into R.  Also need:
    - airbnbNew (full dataset with labels)
    - airbnb (this has all the latitudes and longitudes we can use)
    - airbnb_NormNoY (training X data for the RF training)

In [None]:
# Write each frame needed by the follow-up R/Leaflet step to its own CSV file.
for frame, csv_name in (
    (airbnbNew, "airbnbNew_Data.csv"),
    (airbnb, "airbnb_Data.csv"),
    (airbnb_NormNoY, "airbnb_NormNoY_Data.csv"),
    (airbnb_Norm, "airbnb_Norm_Data.csv"),
):
    frame.to_csv(csv_name)