In [512]:
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn import model_selection
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import matplotlib.pyplot as plt
import sys


# Read Development Data File

In [513]:
dataset = pd.read_csv("/content/development.csv")
dataset

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,12783632,NYC Mini Hotel,57230304,Imanuelly,Queens,Elmhurst,40.74037,-73.88610,Private room,75,1,2,5/26/2019,0.92,3,351
1,3463385,Gorgeous room in Manhattan,10698270,Evgenia,Manhattan,Upper East Side,40.76717,-73.95532,Private room,95,1,202,5/27/2019,3.31,2,263
2,17572926,Great 1 Bedroom on Upper East,36578169,James,Manhattan,Upper East Side,40.77984,-73.94725,Entire home/apt,130,2,0,,,1,0
3,33913644,"Modern and bright 2Bed 2Bath Bushwick, Brooklyn",50981314,Ofier,Brooklyn,Bushwick,40.70205,-73.91338,Entire home/apt,150,2,4,7/7/2019,1.64,1,89
4,9405895,Stylish and zen Brooklyn retreat,48775347,Mathieu,Brooklyn,Fort Greene,40.68914,-73.97853,Entire home/apt,325,3,16,4/20/2019,0.42,1,103
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39111,20678811,Flatbush Comfy Room (Rm# 2),147972663,Hyacinth,Brooklyn,East Flatbush,40.64850,-73.93855,Private room,40,3,58,6/12/2019,2.57,3,306
39112,29630190,Cozy Brooklyn Heights - Private Room,223032162,Dot,Brooklyn,Boerum Hill,40.68602,-73.99023,Private room,65,2,15,5/19/2019,1.80,2,0
39113,3811639,Beautiful sun-filled Loft BROOKLYN,10603767,Samantha,Brooklyn,Bushwick,40.70784,-73.92224,Entire home/apt,150,3,203,6/22/2019,3.42,1,308
39114,2404708,Williamsburg 1 bedroom appartement,12290324,Lara,Brooklyn,Williamsburg,40.71952,-73.96281,Entire home/apt,250,5,21,3/28/2019,0.35,1,284


In [514]:
dataset.dtypes

id                                  int64
name                               object
host_id                             int64
host_name                          object
neighbourhood_group                object
neighbourhood                      object
latitude                          float64
longitude                         float64
room_type                          object
price                               int64
minimum_nights                      int64
number_of_reviews                   int64
last_review                        object
reviews_per_month                 float64
calculated_host_listings_count      int64
availability_365                    int64
dtype: object

In [515]:
dataset.isnull().sum()

id                                   0
name                                13
host_id                              0
host_name                           19
neighbourhood_group                  0
neighbourhood                        0
latitude                             0
longitude                            0
room_type                            0
price                                0
minimum_nights                       0
number_of_reviews                    0
last_review                       8041
reviews_per_month                 8041
calculated_host_listings_count       0
availability_365                     0
dtype: int64

In [516]:
### After analysis, I have decided to drop these columns as they will not be useful in prediction
dataset.drop(["id","name","host_name","host_id","neighbourhood","neighbourhood_group", "last_review",'calculated_host_listings_count','availability_365'], axis=1, inplace=True)

In [517]:
dataset.isnull().sum()

latitude                0
longitude               0
room_type               0
price                   0
minimum_nights          0
number_of_reviews       0
reviews_per_month    8041
dtype: int64

# Preprocessing of Data

Removed **Outliers**

In [518]:
def find_outliers(df: pd.DataFrame) -> pd.DataFrame:
    #Find rows containing outliers with IQR
    q1 = df['price'].quantile(0.25)
    q3 = df['price'].quantile(0.75)
    iqr = q3-q1 #Interquartile range
    low  = q1-1.5*iqr
    high = q3+1.5*iqr
    df = df[(df["price"] < high) & (df["price"] > low)]
    return df

In [519]:
### Find outliers
dataset = find_outliers(dataset)

#dataset = dataset[dataset['price'] < 400]
dataset


Unnamed: 0,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,reviews_per_month
0,40.74037,-73.88610,Private room,75,1,2,0.92
1,40.76717,-73.95532,Private room,95,1,202,3.31
2,40.77984,-73.94725,Entire home/apt,130,2,0,
3,40.70205,-73.91338,Entire home/apt,150,2,4,1.64
4,40.68914,-73.97853,Entire home/apt,325,3,16,0.42
...,...,...,...,...,...,...,...
39111,40.64850,-73.93855,Private room,40,3,58,2.57
39112,40.68602,-73.99023,Private room,65,2,15,1.80
39113,40.70784,-73.92224,Entire home/apt,150,3,203,3.42
39114,40.71952,-73.96281,Entire home/apt,250,5,21,0.35


**Drop the rows contains null values**

In [520]:
#dataset['reviews_per_month'].fillna((dataset['reviews_per_month'].median()), inplace=True)
dataset.dropna(inplace=True)
dataset.isnull().sum()



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



latitude             0
longitude            0
room_type            0
price                0
minimum_nights       0
number_of_reviews    0
reviews_per_month    0
dtype: int64

In [521]:
import plotly.express as px

df = dataset
fig = px.scatter(df, x=df['longitude'], y=df['latitude'], color="price",size_max=15,size='price')
fig.show()

In [None]:
dataset.drop(['longitude','latitude'], axis=1, inplace=True)

Handiling categorical data using **OneHotEncoding **

In [523]:
dataset_onehot = pd.get_dummies(dataset, columns=["room_type"], prefix = ["rt"],drop_first=False)

In [524]:
dataset_onehot.insert(6, "rt_Hotel room", value=0)
dataset_onehot

Unnamed: 0,price,minimum_nights,number_of_reviews,reviews_per_month,rt_Entire home/apt,rt_Private room,rt_Hotel room,rt_Shared room
0,75,1,2,0.92,0,1,0,0
1,95,1,202,3.31,0,1,0,0
3,150,2,4,1.64,1,0,0,0
4,325,3,16,0.42,1,0,0,0
5,225,2,3,0.06,1,0,0,0
...,...,...,...,...,...,...,...,...
39111,40,3,58,2.57,0,1,0,0
39112,65,2,15,1.80,0,1,0,0
39113,150,3,203,3.42,1,0,0,0
39114,250,5,21,0.35,1,0,0,0


**Vitualization of Test Data with prices**

In [525]:
X1= dataset_onehot.loc[:, dataset_onehot.columns != 'price']

Y1 = dataset_onehot["price"]

numeric_cols = ['minimum_nights', 'number_of_reviews','reviews_per_month']

scaler = StandardScaler()
scaler.fit(X1[numeric_cols])
X1[numeric_cols] = scaler.transform(X1[numeric_cols]) 
X1




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,minimum_nights,number_of_reviews,reviews_per_month,rt_Entire home/apt,rt_Private room,rt_Hotel room,rt_Shared room
0,-0.271114,-0.568722,-0.273285,0,1,0,0
1,-0.271114,3.530335,1.135799,0,1,0,0
3,-0.215678,-0.527732,0.151209,1,0,0,0
4,-0.160241,-0.281788,-0.568072,1,0,0,0
5,-0.215678,-0.548227,-0.780319,1,0,0,0
...,...,...,...,...,...,...,...
39111,-0.160241,0.579014,0.699513,0,1,0,0
39112,-0.215678,-0.302283,0.245541,0,1,0,0
39113,-0.160241,3.550830,1.200652,1,0,0,0
39114,-0.049369,-0.179312,-0.609342,1,0,0,0


**Data Spliting for training and testing**

In [526]:
X_train, X_test, y_train, y_test = train_test_split(X1, Y1, test_size=0.20, random_state=42)

# Linear Regression

In [527]:
model=LinearRegression().fit(X_train,y_train)
pred_train= model.predict(X_train)
print("r2_train _score:",r2_score(y_train, pred_train))
print("Train Root Mean squared error",np.sqrt(mean_squared_error(y_train,pred_train)))
    
pred_test=model.predict(X_test)
print("r2_test_score:",r2_score(y_test,pred_test))
print("Test Root Mean squared error",np.sqrt(mean_squared_error(y_test,pred_test)))

df_pre = pd.DataFrame(pred_test)
df_pre['id'] = df_pre.index
df_pre.columns = ['price', 'id']
df_pre = df_pre[['id', 'price']]
df_pre


r2_train _score: 0.3937038893482818
Train Root Mean squared error 51.628885872130624
r2_test_score: 0.382572249408885
Test Root Mean squared error 53.31127745467123


Unnamed: 0,id,price
0,0,74.427537
1,1,78.562546
2,2,160.736165
3,3,159.591648
4,4,77.747235
...,...,...
5908,5908,78.518581
5909,5909,160.447354
5910,5910,160.450773
5911,5911,158.315131


In [528]:
model1=RandomForestRegressor(n_estimators=100).fit(X_train,y_train)
pred_train= model1.predict(X_train)
print("r2_train _score:",r2_score(y_train, pred_train))
print("Train Root Mean squared error",np.sqrt(mean_squared_error(y_train,pred_train)))
    
pred_test=model1.predict(X_test)
print("r2_test_score:",r2_score(y_test,pred_test))
print("Test Root Mean squared error",np.sqrt(mean_squared_error(y_test,pred_test)))


r2_train _score: 0.7788181209069445
Train Root Mean squared error 31.183519783146913
r2_test_score: 0.30660873882857564
Test Root Mean squared error 56.49567458338054


# Evaluation Data

In [529]:
Eval_data = pd.read_csv("/content/evaluation.csv")
Eval_data.dtypes

id                                  int64
host_id                             int64
host_name                          object
neighbourhood                      object
neighbourhood_group_cleansed      float64
latitude                          float64
longitude                         float64
room_type                          object
minimum_nights                      int64
availability_365                    int64
number_of_reviews                   int64
calculated_host_listings_count      int64
reviews_per_month                 float64
last_review                        object
dtype: object

In [530]:
Eval_data.drop(["id","host_name","host_id","neighbourhood","neighbourhood_group_cleansed", "last_review","latitude",'longitude','calculated_host_listings_count',"availability_365"], axis=1, inplace=True)

In [531]:
Eval_data.isnull().sum()

room_type               0
minimum_nights          0
number_of_reviews       0
reviews_per_month    1275
dtype: int64

In [532]:
Eval_data['reviews_per_month'].fillna((Eval_data['reviews_per_month'].mean()), inplace=True)
Eval_data.isnull().sum()

room_type            0
minimum_nights       0
number_of_reviews    0
reviews_per_month    0
dtype: int64

In [533]:
Eval_dataOnehot = pd.get_dummies(Eval_data, columns=['room_type'], prefix = ["rt"],drop_first=False)

In [534]:
Eval_dataOnehot

Unnamed: 0,minimum_nights,number_of_reviews,reviews_per_month,rt_Entire home/apt,rt_Hotel room,rt_Private room,rt_Shared room
0,3,18,0.170000,1,0,0,0
1,2,0,0.626964,1,0,0,0
2,2,0,0.626964,1,0,0,0
3,5,10,0.220000,1,0,0,0
4,14,56,1.360000,0,0,1,0
...,...,...,...,...,...,...,...
9982,2,18,0.480000,1,0,0,0
9983,2,11,0.310000,1,0,0,0
9984,4,0,0.626964,0,0,1,0
9985,2,5,0.100000,1,0,0,0


In [535]:
cols = ['minimum_nights', 'number_of_reviews','reviews_per_month']
Eval_dataOnehot[cols] = scaler.transform(Eval_dataOnehot[cols])
Eval_dataOnehot

Unnamed: 0,minimum_nights,number_of_reviews,reviews_per_month,rt_Entire home/apt,rt_Hotel room,rt_Private room,rt_Shared room
0,-0.160241,-0.240798,-0.715466,1,0,0,0
1,-0.215678,-0.609713,-0.446051,1,0,0,0
2,-0.215678,-0.609713,-0.446051,1,0,0,0
3,-0.049369,-0.404760,-0.685987,1,0,0,0
4,0.449559,0.538023,-0.013872,0,0,1,0
...,...,...,...,...,...,...,...
9982,-0.215678,-0.240798,-0.532697,1,0,0,0
9983,-0.215678,-0.384265,-0.632925,1,0,0,0
9984,-0.104805,-0.609713,-0.446051,0,0,1,0
9985,-0.215678,-0.507236,-0.756736,1,0,0,0


**Price Prediction using trained model**

In [536]:
pred= model.predict(Eval_dataOnehot)


In [537]:
df = pd.DataFrame(pred)


In [538]:
df['id'] = df.index
df.columns = ['price', 'id']
df = df[['id', 'price']]
df

Unnamed: 0,id,price
0,0,160.568859
1,1,160.516408
2,2,160.516408
3,3,160.329818
4,4,96.508454
...,...,...
9982,9982,160.534195
9983,9983,160.640149
9984,9984,98.238283
9985,9985,160.762910


In [543]:
df.to_csv('submission_reg.csv', index=False)

# Vitualization of Predicted Prices

In [540]:
EvalData = pd.read_csv("/content/evaluation.csv")
df = df
fig = px.scatter(df, x=EvalData['longitude'], y=EvalData['latitude'], color=df['price'])
fig.show()

# Price Prediction using RandomForestRegressor

In [541]:
pred= model1.predict(Eval_dataOnehot)
df = pd.DataFrame(pred)
df['id'] = df.index
df.columns = ['price', 'id']
df = df[['id', 'price']]
df

Unnamed: 0,id,price
0,0,181.563333
1,1,149.281667
2,2,149.281667
3,3,146.161714
4,4,87.430000
...,...,...
9982,9982,135.881333
9983,9983,110.177024
9984,9984,73.136000
9985,9985,185.959088


In [542]:
df = df
fig = px.scatter(df, x=EvalData['longitude'], y=EvalData['latitude'], color=df['price'])
fig.show()

In [544]:
df.to_csv('submission_RFR.csv', index=False)