<a href="https://colab.research.google.com/github/vardnan/Airbnb-Price-Prediction/blob/main/Economics_Coursework.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 1.0 Importing necessary libraries

In [1]:
! pip install https://github.com/pandas-profiling/pandas-profiling/archive/master.zip 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting https://github.com/pandas-profiling/pandas-profiling/archive/master.zip
  Downloading https://github.com/pandas-profiling/pandas-profiling/archive/master.zip
[K     \ 21.9 MB 141 kB/s
Collecting PyYAML>=5.0.0
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 5.3 MB/s 
Collecting visions[type_image_path]==0.7.5
  Downloading visions-0.7.5-py3-none-any.whl (102 kB)
[K     |████████████████████████████████| 102 kB 44.4 MB/s 
Collecting htmlmin>=0.1.12
  Downloading htmlmin-0.1.12.tar.gz (19 kB)
Collecting phik>=0.11.1
  Downloading phik-0.12.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (690 kB)
[K     |████████████████████████████████| 690 kB 38.1 MB/s 
[?25hCollecting tangled-up-in-unicode==0.2.0
  Downloading tangled_up_in_unicode-

In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
from pandas_profiling import ProfileReport
from sklearn.impute import SimpleImputer

ImportError: ignored

## 2.0 Importing the Airbnb dataset

In [None]:
# Load the raw listings from CSV. pd.read_csv already returns a DataFrame, so
# no extra pd.DataFrame(...) wrapper is needed.
dataset = pd.read_csv('airbnb_dataset.csv')

# Work on an independent copy so later cleaning steps never alter `dataset`.
df = dataset.copy()
display(df)

### 2.1 Setting "Price" as the last column for easier data processing

In [None]:
# Move 'Price' to the end of the frame so the target is the last column.
# len(df.columns) is evaluated after the pop, so the column is appended
# whatever the width of the dataset instead of relying on a fixed position.
last_column = df.pop('Price')
df.insert(len(df.columns), 'Price', last_column)
display(df)

### 2.2 Dropping listing column as it is unnecessary for price prediction

In [None]:
# Remove the 'Listing No.' column, which is not used for price prediction.
df = df.drop('Listing No.', axis=1)
display(df)

### 2.3 Validating data types of each column in the dataset

In [None]:
# Report the dtype pandas holds for every column.
dataTypeSeries = df.dtypes
print('Data type of each column in dataset:', dataTypeSeries, sep='\n')

#### 2.3.1 Changing dummy variables into boolean data type 

In [None]:
# Columns treated as dummy variables and cast to the boolean dtype.
# NOTE(review): 'Host listings count' reads like a count rather than a dummy;
# casting it to bool keeps only zero vs. non-zero. The cast is preserved here
# so results stay unchanged — confirm it is intended.
dummy_columns = [
    'Superhost',
    'Host identity verified',
    'Host listings count',
    'Accommodation type 1',
    'Accommodation type 2',
    'Accommodation type 3',
    'Accommodation type 4',
    'Entire home/apartment',
    'Private room',
    'Shared room',
    'Real bed',
    'Wireless Internet',
    'Breakfast',
    'Free parking',
    'Instant bookable',
    'Cancellation policy (Moderate plus strict)',
    'Smoking allowed',
    "Required guest's profile picture",
    "Required guest's phone verification",
]

# A single astype call replaces the per-column assignments; the original
# converted 'Host identity verified' twice, which was redundant.
df = df.astype({column: 'bool' for column in dummy_columns})

## 3.0 Data understanding

### 3.1 Identifying missing data

In [None]:
# Fraction of missing values per column, most incomplete columns first.
missing_counts = df.isna().sum()
missing_counts.sort_values(ascending=False) / len(df)

### 3.2 Identifying duplicate data

In [None]:
# Number of rows that exactly repeat an earlier row. A single count shows
# whether duplicates exist, which the row-by-row boolean Series does not.
df.duplicated().sum()

### 3.3 Identifying outlier values

In [None]:
# Box plot of the 'Price' column to make extreme values visible.
price_values = df['Price']
sns.boxplot(x=price_values)
plt.show()

### 3.4 Identifying invalid data, multi-collinearity, and other issues with Pandas Profiling

In [None]:
# Build a profiling report from the whole DataFrame.
profile = ProfileReport(df)
# Render the report inline in the notebook.
profile.to_notebook_iframe()

In [None]:
# Annotated heatmap of the pairwise correlations between columns.
sns.set(font_scale=1)

correlations = df.corr()
fig, ax = plt.subplots(figsize=(25, 12))
sns.heatmap(correlations, annot=True, linewidths=2, cmap='coolwarm', ax=ax)

#### 3.4.1 Identifying prices with an invalid value of zero

In [None]:
# Count the listings whose price is exactly zero.
df['Price'].eq(0).sum()

### 3.5 Data description

In [None]:
# Summary statistics of the DataFrame's columns, as produced by pandas' describe().
df.describe()

## 4.0 Data preparation

### 4.1 Handling missing values

In [None]:
# Replace missing values in each of these columns with the column mean.
# The result is assigned back to the column: fillna(..., inplace=True) called
# on df[...] acts on an intermediate object and is not guaranteed to update df
# (with pandas Copy-on-Write enabled it never does).
columns_to_impute = [
    'Review scores for overall rating',
    'Reviews per year',
    'Bathrooms',
]
for column in columns_to_impute:
    df[column] = df[column].fillna(df[column].mean())

In [None]:
# Re-check the share of missing values per column, largest first.
null_counts = df.isnull().sum().sort_values(ascending=False)
null_counts / len(df)

### 4.2 Handling outlier values

In [None]:
# Keep only listings priced below the cap; anything at or above it is dropped
# as an outlier. .copy() makes the filtered frame independent, so later
# modifications act on a standalone frame rather than on a slice.
PRICE_CAP = 2000
df = df[df['Price'] < PRICE_CAP].copy()

sns.boxplot(x=df['Price'])
plt.show()
# For the independent variables, the mean is not far from the maximum, which
# suggests they have no significant outliers.

### 4.3 Dropping prices with values of 0

In [None]:
# Remove listings whose price is zero by keeping every row where 'Price' is
# not 0. This avoids an in-place drop by index label on a frame that may
# itself be the result of an earlier filter.
df = df[df['Price'] != 0].copy()

# Sanity check: number of zero-priced rows remaining (expected to be 0).
(df['Price'] == 0).sum()

### 4.4 Handling multicollinearity


In [None]:
# Columns removed to address multicollinearity, dropped in one call.
multicollinear_columns = [
    'Accommodation type 3',
    'Private room',
    'Accommodates',
    "Required guest's profile picture",
]
df = df.drop(columns=multicollinear_columns)

In [None]:
# Show the DataFrame in its current state.
display(df)

In [None]:
# Number of columns currently in the frame (second entry of the shape tuple).
print(df.shape[1])

## 5.0 Modelling

### 5.1 Model training

#### 5.1.1 Setting independent and dependent variable for training

In [None]:
# Features are every column except 'Price'; the target is 'Price' itself.
# Selecting by name keeps X and y correct even if the column order changes,
# whereas positional slicing depends on 'Price' being the last column.
X = df.drop(columns='Price').values
y = df['Price'].values
print(y)

#### 5.1.2 Splitting dataset into training and test set

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; random_state=0 makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.3, random_state=0)
X_train, X_test, y_train, y_test = split_parts

### 5.2 Model building and prediction

#### 5.2.1 Multiple linear regression

In [None]:
from sklearn.linear_model import LinearRegression

# Fit the linear model on the training split; fit() returns the estimator
# itself, so it can be bound directly and then shown as the cell output.
lin_reg = LinearRegression().fit(X_train, y_train)
lin_reg

In [None]:
# Predicted prices (left column) printed next to the actual test prices (right column).
lin_pred = lin_reg.predict(X_test)
np.set_printoptions(precision=2)
lin_comparison = np.column_stack((lin_pred, y_test))
print(lin_comparison)

#### 5.2.2 Decision tree regression

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Fit a decision tree with a fixed seed on the training split; fit() returns
# the estimator, which is then shown as the cell output.
d_reg = DecisionTreeRegressor(random_state=0).fit(X_train, y_train)
d_reg

In [None]:
# Predicted prices (left column) printed next to the actual test prices (right column).
d_pred = d_reg.predict(X_test)
np.set_printoptions(precision=2)
tree_comparison = np.column_stack((d_pred, y_test))
print(tree_comparison)

#### 5.2.3 Random forest regression

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest settings: 200 trees, depth capped at 20, all CPU cores, fixed seed.
forest_params = {
    'n_estimators': 200,
    'max_depth': 20,
    'n_jobs': -1,
    'random_state': 0,
}
r_reg = RandomForestRegressor(**forest_params)
r_reg.fit(X_train, y_train)

In [None]:
# Predicted prices (left column) printed next to the actual test prices (right column).
r_pred = r_reg.predict(X_test)
np.set_printoptions(precision=2)
forest_comparison = np.column_stack((r_pred, y_test))
print(forest_comparison)

### 5.3 Model selection

In [None]:
from sklearn.metrics import r2_score

# R^2 of the linear regression predictions against the test targets.
print("Multiple linear regression")
r2_score(y_true=y_test, y_pred=lin_pred)

In [None]:
from sklearn.metrics import r2_score

# R^2 of the decision tree predictions against the test targets.
print("Decision tree regression")
r2_score(y_true=y_test, y_pred=d_pred)

In [None]:
from sklearn.metrics import r2_score

# R^2 of the random forest predictions against the test targets.
print("Random forest regression")
r2_score(y_true=y_test, y_pred=r_pred)

### 5.4 k-fold cross validation

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validated R^2 of the linear model on the training split,
# reported as mean and standard deviation in percent.
accuracies = cross_val_score(lin_reg, X_train, y_train, scoring='r2', cv=10)
mean_pct = accuracies.mean() * 100
std_pct = accuracies.std() * 100
print("r2: {:.2f} %".format(mean_pct))
print("r2 standard deviation: {:.2f} %".format(std_pct))

In [None]:
## Disabled draft: cross-validated error for the linear model.
## NOTE(review): as written this would not run if re-enabled — scikit-learn's
## scoring name is 'neg_mean_squared_error' (not 'mean_squared_error'), and the
## print call below is missing a comma between its arguments.
##from sklearn.model_selection import cross_val_score
##accuracies = cross_val_score(estimator = lin_reg, X = X_train, y = y_train, scoring='mean_squared_error', cv = 10)
##print("RMSE:" accuracies.mean())

In [None]:
## Disabled draft of an adjusted R^2 calculation.
## NOTE(review): `r2_scoring` is not defined in the visible cells, and the
## denominator (10-21-1) is negative — presumably the 10 should be the number
## of samples rather than the number of folds; confirm before re-enabling.
##adj_r2 = 1-(1-r2_scoring)*(10-1)/(10-21-1)
##print(adj_r2)

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validated R^2 of the decision tree on the training split,
# reported as mean and standard deviation in percent.
accuracies = cross_val_score(d_reg, X_train, y_train, scoring='r2', cv=10)
mean_pct = accuracies.mean() * 100
std_pct = accuracies.std() * 100
print("r2: {:.2f} %".format(mean_pct))
print("Standard deviation: {:.2f} %".format(std_pct))

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validated R^2 of the random forest on the training split,
# reported as mean and standard deviation in percent.
accuracies = cross_val_score(r_reg, X_train, y_train, scoring='r2', cv=10)
mean_pct = accuracies.mean() * 100
std_pct = accuracies.std() * 100
print("r2: {:.2f} %".format(mean_pct))
print("Standard deviation: {:.2f} %".format(std_pct))