In [None]:
# %% [markdown]
# 
# # BW AirBNB
# 
# ## Team Members
# - Jason Robinson [Model Prediction - NLP] 
# - Jimmy Slagle [Architecture]
# - Brandon Moore [Data Visualization]
# - Peter Geraghty [Hyperparameter Tuning]
# 
# ## Project Description
# 
# This project will involve training an Artificial Neural Network to predict the optimal pricing for specific properties of AirBnB locations within the city of New York. The modeling techniques used will consist of precision-based distribution and accuracy.
# 

# %%
from __future__ import absolute_import, division, print_function

import pathlib

# Import required packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd
%matplotlib inline
np.set_printoptions(precision=3, suppress=True)

#import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import r2_score

# Natural language processing
import spacy

# Neural networks
import tensorflow as tf
from tensorflow import keras
from tensorflow import feature_column
from tensorflow.keras import layers
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.layers.experimental import preprocessing
from keras.wrappers.scikit_learn import KerasRegressor


# Report the versions of the main libraries in use, one per line.
version_report = [
    f'Geopandas Version: {gpd.__version__}',
    f'Tensorflow Version: {tf.__version__}',
    f'Seaborn Version: {sns.__version__}',
    f'Spacy Version: {spacy.__version__}',
    f'Pandas Version: {pd.__version__}',
    f'Numpy Version: {np.__version__}',
]
print(*version_report, sep='\n')

# %% [markdown]
# 
# ## Load Data
# 

# %%
# Load and pre-process data

# The CSV location can be overridden with the AB_NYC_2019_CSV environment
# variable so the notebook is not tied to a single machine. When the variable
# is unset, the original download location is used.
import os

DATA_PATH = pathlib.Path(
    os.environ.get('AB_NYC_2019_CSV',
                   '/Users/jasonrobinson/Downloads/AB_NYC_2019.csv'))

df1 = pd.read_csv(DATA_PATH)
print(df1.shape)
df1.head(2)

# %%
# Work on a copy so the frame read from disk (`df1`) stays untouched.
df = df1.copy(deep=True)

# %%
# Column dtypes and non-null counts.
df.info()

# %%
# Number of missing values in each column.
df.isna().sum()

# %%
# Distinct values of two categorical columns, followed by the distinct
# neighbourhoods found among the first ten rows.
for column in ('neighbourhood_group', 'room_type'):
    print(df[column].unique())
df['neighbourhood'].iloc[:10].unique()

# %%
df.room_type.unique()

# %%
df.neighbourhood.iloc[:10].unique()

# %%
# Kendall rank correlation between the numeric columns.
# Non-numeric columns cannot be rank-correlated, and whether `DataFrame.corr`
# silently skips them or raises depends on the pandas version, so the numeric
# columns are selected explicitly.
corr = df.select_dtypes(include='number').corr(method='kendall')
fig, ax = plt.subplots(figsize=(8, 8))
sns.heatmap(corr, annot=True, ax=ax)
ax.set_title('Kendall correlation of numeric columns');

# %% [markdown]
# 
# ## Data Exploration
# 

# %%
# Summary statistics of the frame.
df.describe()

# %%
# Remove the columns considered unnecessary for the rest of the notebook.
unused_columns = ['name', 'id', 'host_id', 'host_name',
                  'reviews_per_month', 'last_review']
df = df.drop(columns=unused_columns)

# %%
# Visualization of neighbourhood locations.
plot_dims = (10, 9)
plt.figure(figsize=plot_dims)
# x / y are passed by keyword: recent seaborn releases no longer accept them
# positionally.
sns.scatterplot(data=df, x='longitude', y='latitude', hue='neighbourhood_group')
# NOTE(review): plt.ioff() switches matplotlib's interactive mode off for the
# rest of the session; kept as in the original - confirm it is intended.
plt.ioff()

# %% [markdown]
# 
# ## Preprocessing
# 

# %%
# Integer-encode the categorical columns: pd.factorize returns one integer
# code per row (first element of its result), which replaces the label.
for column in ('neighbourhood_group', 'neighbourhood', 'room_type'):
    df[column] = pd.factorize(df[column])[0]

# %%
# Standardize availability_365: subtract its mean and divide by its standard
# deviation.
# NOTE(review): the result is only bound to `availability`; this cell does not
# write it back into `df`.
availability_mean = df['availability_365'].mean()
availability_std = df['availability_365'].std()
availability = (df['availability_365'] - availability_mean) / availability_std

# %%
# Separate the target (`price`) from the feature columns.
# `DataFrame.drop` interprets a positional argument as *row labels*, so the
# target column has to be removed with `columns=`. Doing so keeps X and Y
# row-aligned, keeps the target out of the features, and makes any truncation
# of Y unnecessary.
Y = df['price']
X = df.drop(columns=['price'])

assert len(X) == len(Y), 'features and target must have the same number of rows'

X.shape, Y.shape

# %%
# Split the longitude and latitude ranges into equal-width buckets.
# The original computed `diff/100` without using the result, presumably
# intending 100 buckets per axis - TODO confirm the bucket count.
N_BUCKETS = 100

max_long = df['longitude'].max()
min_long = df['longitude'].min()
diff = max_long - min_long

max_lat = df['latitude'].max()
min_lat = df['latitude'].min()
d = max_lat - min_lat

# N_BUCKETS buckets need N_BUCKETS - 1 interior cut points, so the two end
# points produced by linspace are dropped. Each axis gets its own list.
long_boundaries = np.linspace(min_long, max_long, N_BUCKETS + 1)[1:-1].tolist()
lat_boundaries = np.linspace(min_lat, max_lat, N_BUCKETS + 1)[1:-1].tolist()

# %%
# Bucketize each coordinate with its own boundaries, cross the two bucketized
# columns into a single hashed feature and wrap it in a dense feature layer.
longitude = tf.feature_column.bucketized_column(
              tf.feature_column.numeric_column('longitude'), boundaries=long_boundaries)

latitude = tf.feature_column.bucketized_column(
              tf.feature_column.numeric_column('latitude'), boundaries=lat_boundaries)

# NOTE(review): hash_bucket_size=50 is kept from the original; it is much
# smaller than the number of possible (longitude, latitude) bucket pairs.
crossed_feature = tf.feature_column.crossed_column([longitude, latitude], hash_bucket_size=50)
feature_layer = tf.keras.layers.DenseFeatures([tf.feature_column.indicator_column(crossed_feature)])

# %% [markdown]
# ## Modeling Approach #1

# %%
# Split data into train test.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, shuffle=True, random_state=42)

# %%
from sklearn.preprocessing import MinMaxScaler

# Fit the scaler on the training split only, then apply the same scaling to
# the test split. The second positional argument of `fit_transform` is `y`,
# not a second dataset, so the test split has to be transformed separately.
# NOTE(review): the model cells visible in this notebook fit on X_train /
# X_test directly; `scaled` / `scaled_test` are not consumed by them.
scaler = MinMaxScaler()
scaled = scaler.fit_transform(X_train)
scaled_test = scaler.transform(X_test)

# %%
# Baseline: fit a linear regression on the training split and predict the
# target for the test split.
lin_reg = LinearRegression().fit(X_train, y_train)
y_pred = lin_reg.predict(X_test)

# %%
# Build a sequential model.
tf.keras.backend.set_floatx('float64')

# The input layer must be as wide as the number of feature columns, i.e. the
# second dimension of X_train (`len(X_train)` is the row count, an int).
n_features = X_train.shape[1]

model = tf.keras.Sequential([
    Dense(64, activation='relu', input_shape=[n_features]),
    Dense(32, activation='relu'),
    # Single linear output unit for the regression target.
    Dense(1, activation='linear')
])

opt = tf.keras.optimizers.Adam(learning_rate=0.01)
model.compile(optimizer=opt, loss='mean_squared_error')

# %%
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard
# Two callbacks: one multiplies the learning rate by 0.2 once val_loss has
# stalled for 2 epochs; the other stops training early with a patience of 5
# epochs on its default monitored quantity.
# NOTE(review): the test split doubles as validation data here.
lr_reduce = tf.keras.callbacks.ReduceLROnPlateau(factor=0.2, patience=2, monitor='val_loss')
early_stop = EarlyStopping(patience=5)
callbacks = [lr_reduce, early_stop]

fit_options = dict(
    validation_data=(X_test, y_test),
    callbacks=callbacks,
    epochs=50,
    batch_size=32,
    verbose=0,
)
history = model.fit(X_train, y_train, **fit_options)

# %%
# Layer-by-layer overview of the network.
model.summary()

# %%
# Per-epoch metrics recorded by fit(), with the epoch number as an extra
# column; show the last few epochs.
hist = pd.DataFrame(history.history).assign(epoch=history.epoch)
hist.tail()

# %%
# Evaluate the trained model on the test split.
model.evaluate(X_test, y_test)

# %% [markdown]
# ## Modeling Approach #2

# %%
# Rebuild the saved architecture, then load the saved weights into it.
# Requires bnb_model.json and bnb_model.h5 in the working directory; both are
# written by the "Save Final Model" section of this notebook.
from tensorflow.keras.models import model_from_json

with open('bnb_model.json', 'r') as json_file:
    loaded_model = model_from_json(json_file.read())
loaded_model.load_weights("bnb_model.h5")
print("Loaded model from disk")

# %%

# Evaluate loaded model on test data.
# Price is a continuous target, so a regression loss is used (the same one the
# model was trained with). MAE is in the units of `price`, so it is printed as
# a plain number rather than a percentage.
loaded_model.compile(loss='mean_squared_error', optimizer='rmsprop', metrics=['MAE'])
score = loaded_model.evaluate(X_test, y_test, verbose=0)
print("%s: %.2f" % (loaded_model.metrics_names[1], score[1]))

# %%
def model_2(n_features=None):
    """Build the second regression network (uncompiled).

    Parameters
    ----------
    n_features : int, optional
        Width of the input layer. When omitted, it is taken from the number
        of columns of the global ``X_train`` (``X_train.shape[1]``).

    Returns
    -------
    A ``Sequential`` model: Dense(128, relu) -> Dense(64, relu) -> Dense(1).
    """
    if n_features is None:
        # The input layer must match the number of feature columns, not the
        # number of training rows (which is what len(X_train) returns).
        n_features = X_train.shape[1]

    model2 = Sequential([
      Dense(128, activation='relu', input_shape=[n_features]),
      Dense(64, activation='relu'),
      Dense(1)
    ])

    return model2

# %%
# Instantiate the second network.
model2 = model_2()

# %%
# Plain SGD optimizer. `learning_rate` is the supported keyword (`lr` is a
# deprecated alias), consistent with how the Adam optimizer is configured for
# the first model.
learning_rate = 0.001
opt = SGD(learning_rate=learning_rate)
model2.compile(optimizer=opt,
               loss='mean_squared_error',
               metrics=['MAE'])

# %%
%%time
# Fit our model and demonstrate time.
history2 = model2.fit(
    X_train, y_train,
    epochs=30,
    verbose=0,
    validation_split = 0.2)

# %%
# Visualize the training progress using the information from the history object.
def plot_loss(history2):
  """Plot training and validation loss per epoch.

  Args:
    history2: object with a ``history`` dict containing the keys ``'loss'``
      and ``'val_loss'`` (e.g. the return value of ``model.fit``).
  """
  # Read the curves from the argument's `history` dict rather than from a
  # global `history` object.
  plt.plot(history2.history['loss'], label='loss')
  plt.plot(history2.history['val_loss'], label='val_loss')
  # NOTE(review): the y-axis is fixed to 0-10; confirm this suits the scale of
  # the loss being plotted.
  plt.ylim([0, 10])
  plt.xlabel('Epoch')
  plt.ylabel('Error [price]')
  plt.legend()
  plt.grid(True)

# Draw the loss curves recorded while fitting the second model.
plot_loss(history2)

# %% [markdown]
# ## Save Final Model

# %%
#pip install -q pyyaml h5py

# %%
# h5py is not referenced directly; presumably imported to confirm that HDF5
# support is installed before saving - TODO confirm.
import h5py

# %%
# Persist the model as two files: the architecture as JSON and the weights as
# HDF5. A full-model save to the same bnb_model.h5 path would be overwritten
# by the weights file written below, so only the weights are written there.
# Serialize model to JSON
model_json = model.to_json()
with open("bnb_model.json", "w") as json_file:
    json_file.write(model_json)

# Serialize weights to HDF5
model.save_weights("bnb_model.h5")
print("Saved model to disk")

# %%
# Load json and create model
# `model_from_json` is imported here because nothing earlier in the notebook
# brings it into scope.
from tensorflow.keras.models import model_from_json

# The context manager closes the file even if reading fails.
with open('bnb_model.json', 'r') as json_file:
    loaded_model_json = json_file.read()
loaded_model = model_from_json(loaded_model_json)


