In [None]:
wget 'https://www.kaggle.com/stevezhenghp/airbnb-price-prediction/download'

In [None]:
!unzip 'download.ZIP'

In [None]:


# %%
import pandas as pd
import numpy as np
!pip install pandas-profiling==2.*
!pip install category_encoders==2.*

# %%
# Running under Jupyter rather than Colab, so the Google Drive mount below is left disabled.
#from google.colab import drive
#drive.mount('/content/drive')

# %%
# Load the training data. `skiprows` drops the file line at 0-based index
# 26044 (the header is line 0) -- presumably a malformed record; TODO confirm.
df = pd.read_csv(
    filepath_or_buffer='train.csv',
    skiprows=[26044],
)
df.head(n=5)

# %%
import matplotlib.pyplot as plt
import datetime as dt
from pandas_profiling import ProfileReport
# Build the minimal profiling report and keep the report object itself in
# `profile`. `to_notebook_iframe()` only renders the report and returns None,
# so chaining it onto the constructor would leave `profile` bound to None.
profile = ProfileReport(df, minimal=True)
profile.to_notebook_iframe()

# %%
# Ten most frequent neighbourhood values (value_counts sorts by count, descending).
neighbourhood_counts = df['neighbourhood'].value_counts()
neighbourhood_counts.head(10)

# %%
def _to_ordinal(value):
    """Return the proleptic Gregorian ordinal of a timestamp, or NaN if missing.

    Missing dates (NaT) must not be handed to ``datetime.toordinal``: depending
    on the pandas version that either raises or yields a bogus ordinal, so the
    gap would never reach the imputer as a missing value.
    """
    return np.nan if pd.isna(value) else value.toordinal()


# Drop identifier / free-text columns that are not used as model features.
df_trainable = df.drop(
    columns=['id', 'thumbnail_url', 'amenities', 'description', 'name']
)

# Replace each date column, in place, by its day ordinal so the model sees a
# plain number; rows without a date stay NaN.
for date_col in ('first_review', 'host_since', 'last_review'):
    df_trainable[date_col] = pd.to_datetime(df_trainable[date_col]).map(_to_ordinal)

df_trainable.head()

# %%
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from category_encoders import OrdinalEncoder
from xgboost import XGBRegressor

# %%
# Separate the target (`log_price`) from the predictors, then hold out a
# quarter of the rows for validation with a fixed seed.
y = df_trainable['log_price']
X = df_trainable.drop(columns='log_price')
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

# %%
# Gradient-boosted regressor behind a preprocessing chain:
# encode -> impute -> scale -> model, each step with its default settings
# apart from the model's fixed random seed.
pipe = Pipeline(
    steps=[
        ('encode', OrdinalEncoder()),
        ('impute', SimpleImputer()),
        ('scale', MinMaxScaler()),
        ('model', XGBRegressor(random_state=41)),
    ]
)
pipe.fit(X_train, y_train)

# %%
# Importance scores exposed by the fitted model step of the pipeline.
pipe.named_steps['model'].feature_importances_

# %%
from xgboost import plot_importance
# Bar chart limited to the ten highest-ranked features of the fitted model step.
fitted_model = pipe.named_steps['model']
plot_importance(fitted_model, max_num_features=10)

# %%
# Look up the column names of the ten most important features directly from
# the fitted booster instead of hand-copying indices off the plot, so this
# cell stays correct when the data, seed or model changes.
# 'weight' is the importance type `plot_importance` uses by default.
# Assumes the booster was trained on an unnamed array, so its features are
# called 'f<position>', and that the preprocessing steps keep the column
# order and count of X_train -- TODO confirm.
_weights = pipe['model'].get_booster().get_score(importance_type='weight')
_top_features = sorted(_weights.items(), key=lambda item: item[1])[-10:][::-1]
X_train.columns[[int(name[1:]) for name, _ in _top_features]]

# %%
# Show every predictor column name of the training split, in positional order.
X_train.columns

# %%
# Predict the target for the held-out validation rows with whichever pipeline
# `pipe` is currently bound to.
y_pred = pipe.predict(X_val)

# %%
from sklearn.metrics import mean_squared_error as MSE
# Mean squared error on the validation split; arguments are passed by keyword
# in the (y_true, y_pred) order of the scikit-learn signature. The metric is
# symmetric, so the value is unchanged.
MSE(y_true=y_val, y_pred=y_pred)

# %%
from sklearn.linear_model import LinearRegression
# Rebind `pipe` to a linear-regression model behind the same
# encode -> impute -> scale preprocessing chain, then fit it on the training split.
pipe = Pipeline(
    steps=[
        ('encode', OrdinalEncoder()),
        ('impute', SimpleImputer()),
        ('scale', MinMaxScaler()),
        ('model', LinearRegression()),
    ]
)
pipe.fit(X_train, y_train)

# %%
# Rank the predictors by the absolute size of their fitted coefficient,
# largest first.
features = X_train.columns
coefficients = np.abs(pipe.named_steps['model'].coef_)
pd.Series(data=coefficients, index=features).sort_values(ascending=False)


