In [9]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets, linear_model
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler


In [10]:
csv ="airbnb_dataset/dataset_nyc.csv"
csv_obj = open(csv)
csv_obj.readline()
data = pd.read_csv(csv,encoding ='utf8')
newData = data.drop(columns=["id","name","host_id","host_name","minimum_nights","number_of_reviews","last_review", "reviews_per_month","calculated_host_listings_count","availability_365"])
df = pd.DataFrame(newData)
df.head()

Unnamed: 0,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price
0,Brooklyn,Kensington,40.64749,-73.97237,Private room,149
1,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225
2,Manhattan,Harlem,40.80902,-73.9419,Private room,150
3,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89
4,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80


In [11]:
uniqueNeighborhoods = df["neighbourhood"].unique()

roomTypes= df["room_type"]

    
# Creating a dictionary to map every unique neighborhood to a value
uniqueNeighborhoodsDict = {}
value = 0
for name in uniqueNeighborhoods : 
    uniqueNeighborhoodsDict[name] = value
    value += 1

# print(uniqueNeighborhoodsDict)  


# Converting room_type into a value from 0-2 and adding them to the data frame
roomValues = []
for room in roomTypes.values : 
    if room == "Entire home/apt" : 
        roomValues.append(0)
    if room == "Shared room" : 
        roomValues.append(1)
    if room == "Private room":
        roomValues.append(2)
                
df['room_values'] = roomValues

realEstate = df[["neighbourhood", 'room_values']].values

prices = df["price"].values


In [12]:
# # Splitting data into a 80/20 : 80 - training 20 - testing 
x_train, x_test, y_train, y_test = train_test_split(realEstate, prices, test_size=0.2)

# x_train_values = [neighborhood name, room_values] 


# converting the neighborhood names to respective unique values
x_train_values = []
x_test_values = []

for name,value in x_train : 
    x_train_values.append([uniqueNeighborhoodsDict[name],value])
    
for name,value in x_test : 
    x_test_values.append([uniqueNeighborhoodsDict[name],value])
    

#x_train_values = [neighborhood_value, room_value] 
#x_test_values = [neighborhood_value, room_value]   
#y_train = [price]
#y_test = [price]
    
# print(x_train_values)
# print(x_test_values)
# print(y_train)
# print(y_test)


In [13]:
regr = linear_model.Ridge()
regr.fit(x_train_values, y_train)
y_pred = regr.predict(x_test_values)
actual_vs_predicted = pd.DataFrame({'Actual': y_test.flatten(), 'Predicted': y_pred.flatten()})
actual_vs_predicted

Unnamed: 0,Actual,Predicted
0,40,84.546579
1,30,95.256946
2,220,212.256106
3,150,216.808011
4,90,92.847113
...,...,...
9774,87,215.736975
9775,145,215.736975
9776,30,90.705040
9777,299,212.256106


In [14]:
print("Mean Squared Error : \n ", mean_squared_error(y_test, y_pred), "\n")
print("R2 Score : \n ", r2_score(y_test, y_pred), "\n")
print('Intercept:  \n', regr.intercept_, "\n")
print('Coefficients : \n', regr.coef_, "\n")

Mean Squared Error : 
  46735.12211395971 

R2 Score : 
  0.0725803670816847 

Intercept:  
 217.87904804111957 

Coefficients : 
 [ -0.26775917 -60.77553285] 



In [15]:
svr = SVR(C=3)
svr.fit(x_train_values, y_train)
y_pred = svr.predict(x_test_values)
actual_vs_predicted = pd.DataFrame({'Actual': y_test.flatten(), 'Predicted': y_pred.flatten()})
actual_vs_predicted

Unnamed: 0,Actual,Predicted
0,40,79.581338
1,30,89.984890
2,220,144.971522
3,150,147.431068
4,90,87.165173
...,...,...
9774,87,149.288787
9775,145,149.288787
9776,30,82.484923
9777,299,144.971522


In [16]:
print("Mean Squared Error : \n ", mean_squared_error(y_test, y_pred), "\n")
print("R2 Score : \n ", r2_score(y_test, y_pred), "\n")
print('Intercept:  \n', svr.intercept_, "\n")

Mean Squared Error : 
  49362.839785122895 

R2 Score : 
  0.02043549513589804 

Intercept:  
 [85.8214448] 



In [17]:
scaler = StandardScaler()
scaler.fit(x_train_values)
x_train_values = scaler.transform(x_train_values)
x_test_values = scaler.transform(x_test_values)

In [18]:
sgd = SGDRegressor(loss = "squared_loss", penalty = None)
sgd.fit(x_train_values, y_train)
y_pred = sgd.predict(x_test_values)

actual_vs_predicted = pd.DataFrame({'Actual': y_test.flatten(), 'Predicted': y_pred.flatten()})
actual_vs_predicted

Unnamed: 0,Actual,Predicted
0,40,81.441137
1,30,91.153645
2,220,217.323598
3,150,221.451414
4,90,88.968331
...,...,...
9774,87,220.480164
9775,145,220.480164
9776,30,87.025829
9777,299,217.323598


In [19]:
print("Mean Squared Error : \n ", mean_squared_error(y_test, y_pred), "\n")
print("R2 Score : \n ", r2_score(y_test, y_pred), "\n")
print('Intercept : \n', sgd.intercept_, "\n")
print('Coefficients : \n', sgd.coef_, "\n")

Mean Squared Error : 
  46769.72887967641 

R2 Score : 
  0.0718936245953975 

Intercept : 
 [154.17869333] 

Coefficients : 
 [ -8.48211996 -64.22917799] 



In [20]:
uniqueNeighborhoodsDict["Bensonhurst"]


user_data = [[76, 0]] 

lr_result = regr.predict(user_data)

svr_result = svr.predict(user_data)

scaler = StandardScaler()
scaler.fit(user_data)
user_data = scaler.transform(user_data)

sgd_result = sgd.predict(user_data)


print(lr_result)
print(svr_result)
print(sgd_result)

print("Final Price :" , (lr_result + svr_result + sgd_result) / 3)



[197.52935134]
[104.12798528]
[154.17869333]
Final Price : [151.94534331]


In [21]:
import anvil.server

anvil.server.connect("525VY5RLFTEPVKM3TPGHXGLD-APAOLI6YG344X4VA")


Connecting to wss://anvil.works/uplink
Anvil websocket open
Authenticated OK


In [22]:
import anvil.media

@anvil.server.callable 
def determine_price(data) : 
    
    lr_result = regr.predict(data)

    svr_result = svr.predict(data)

    scaler = StandardScaler()
    scaler.fit(data)
    data = scaler.transform(data)

    sgd_result = sgd.predict(data)
    
    result = (lr_result + svr_result + sgd_result) / 3
    
    return result
