# %%
from sklearn.metrics import mean_squared_error as MSE
import matplotlib.pyplot as plt
from sklearn.feature_extraction import DictVectorizer
from scipy.sparse import hstack, csr_matrix
from xgboost import XGBRegressor
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.linear_model import Ridge
from sklearn.feature_extraction.text import TfidfVectorizer
from spacy.tokenizer import Tokenizer
import spacy
import repip
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.model_selection import train_test_split
import math
import numpy as np
import pandas as pd
!python - m spacy download en_core_web_lg

# %%

# %%
# Read the raw training data from disk and preview it.
df = pd.read_csv('train.csv')
df.head()

# %%
# Columns removed from the frame before any modelling is done.
unused_columns = [
    'bed_type', 'city', 'first_review', 'host_has_profile_pic',
    'host_identity_verified', 'host_response_rate', 'last_review',
    'neighbourhood', 'thumbnail_url', 'host_since', 'id', 'latitude',
    'longitude', 'name', 'number_of_reviews', 'review_scores_rating',
    'amenities', 'instant_bookable', 'cleaning_fee', 'zipcode',
]
df = df.drop(columns=unused_columns)
print(df.shape)
df.head()

# %%
# Discard every row that still has a missing value in any remaining column.
df = df.dropna()
print(df.shape)

# %%
# Separate the `log_price` target column from the remaining feature columns.
y = df['log_price']
X = df.drop(columns='log_price')

# %%
# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=23,
)

# %%
# Work on explicit copies so the column assignments below modify independent frames.
X_train = X_train.copy()
X_val = X_val.copy()

# Lowercase the free-text description in both splits.
for split in (X_train, X_val):
    split['description'] = split['description'].str.lower()

# %%
# Replace every character that is not an ASCII letter or digit with a space,
# applying the same rule to the training and validation descriptions.
for split in (X_train, X_val):
    split['description'] = split['description'].replace(
        '[^a-zA-Z0-9]', ' ', regex=True)

# %%
# TF-IDF vectorizer (default settings) for the cleaned description text.
tf = TfidfVectorizer()

# %%
# The vocabulary and IDF weights are learned from the training text only;
# the validation text is merely transformed with the already-fitted vectorizer.
train_text = X_train['description']
val_text = X_val['description']
X_train_tfidf = tf.fit_transform(train_text)
X_val_tfidf = tf.transform(val_text)

# %%
# DictVectorizer turns per-row dicts into a sparse matrix: string-valued
# entries become one-hot indicator columns, numeric entries are passed through.
enc = DictVectorizer()

# %%
X_train_tfidf

# %%
# Tabular listing attributes fed to the DictVectorizer (shared by both splits
# so the training and validation encodings cannot drift apart).
categ_cols = ['property_type', 'room_type', 'accommodates',
              'bathrooms', 'cancellation_policy', 'bedrooms', 'beds']

# Make sure to only transform validation/testing data as the transformer has already been fit
X_train_categ = enc.fit_transform(
    X_train[categ_cols].to_dict(orient='records'))
X_val_categ = enc.transform(
    X_val[categ_cols].to_dict(orient='records'))

# %%
# Dense view of the encoded training attributes for inspection. This cell must
# come after the fit above: X_train_categ does not exist before that cell runs.
X_train_categ.toarray()

# %%
# Build the final design matrices by placing the TF-IDF columns and the
# DictVectorizer columns side by side (sparse horizontal stack).
train_parts = (X_train_tfidf, X_train_categ)
val_parts = (X_val_tfidf, X_val_categ)
X_trained = hstack(train_parts)
X_vals = hstack(val_parts)


# %%
# Ridge regression with a fixed regularisation strength and seed.
ridge_params = dict(alpha=1.0, random_state=23)
clf = Ridge(**ridge_params)

# %%
# Fit on the stacked training features against the training target.
clf.fit(X_trained, y_train)

# %%
# Predictions for the validation rows, on the same scale as the training target.
rslt = clf.predict(X_vals)
rslt

# %%
# Attach the predictions and the true targets to the validation frame,
# converting both from log price back to a dollar amount with exp().
# np.exp(y_val) keeps y_val's index, so the assignment stays row-aligned.
X_val['pred_price'] = np.exp(rslt)
X_val['actual_price'] = np.exp(y_val)
X_val.head()

# %%
# Mean squared error on the log-price scale. Arguments follow the documented
# (y_true, y_pred) order of sklearn.metrics.mean_squared_error.
MSE(y_val, rslt)

# %%
# Regressor .score() reports the coefficient of determination (R^2)
# of the fitted model on the validation set.
clf.score(X_vals, y_val)

# %%
X_val.head(15)

# %%
X_val.tail(15)
