In [1]:
import torch
import pandas as pd
import numpy as np

In [2]:
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [3]:
# read data here
directory = "../data/archive/"
filename = "airbnb-listings.csv"
full_path = directory + filename

df = pd.read_csv(full_path, sep=';', nrows=100000)
x = len(df)
print(f"Read in {x} values")

  df = pd.read_csv(full_path, sep=';', nrows=100000)


Read in 100000 values


In [4]:
df = df[['ID', 'Summary', 'Price']]
df = df[df['Price']>0]
x = len(df)
print(f"Filtering out price=0 results in {x} values")
df = df[pd.notna(df['Summary'])]
x = len(df)
print(f"Filtering out N/A Summaries results in {x} values")

Filtering out price=0 results in 98498 values
Filtering out N/A Summaries results in 95235 values


In [5]:
df.head()

Unnamed: 0,ID,Summary,Price
0,4008728,‪This luxurious apartment is situated in the c...,600.0
1,7778612,"Beautiful, large (105m2, 2 floors) and quiet a...",175.0
2,8264596,This central located apartment will make you f...,125.0
3,2180729,Living like an Amsterdam resident in this ligh...,130.0
4,14463171,Studio Apartment Centre Amsterdam Perfect stud...,80.0


In [6]:
mean_len = df['Summary'].apply(len).mean()

mean_len

280.74930435239145

In [7]:
# load embeddings

embeddings_dict = {}

with open("../glove.6B/glove.6B.50d.txt", 'r', encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], "float32")
        embeddings_dict[word] = vector

In [8]:
# testing embeddings
from scipy import spatial

def find_closest_embeddings(embedding):
    return sorted(embeddings_dict.keys(), key=lambda word: spatial.distance.euclidean(embeddings_dict[word], embedding))

find_closest_embeddings(embeddings_dict["king"])[:5]


['king', 'prince', 'queen', 'uncle', 'ii']

In [9]:
def to_embedding(word):
    if word in embeddings_dict:
        return embeddings_dict[word]
    else:
        return None

In [10]:
def average_embedding(note_text):
    words = note_text.split(' ')
    lower_stripped = [word.strip().lower() for word in words]
    # print(lower_stripped)
    word_embeddings = [to_embedding(word) for word in lower_stripped if to_embedding(word) is not None]
    # print("Number of word embeddings found: " + str(len(word_embeddings)))
    return np.mean(word_embeddings, axis=0)

In [11]:
df['Embeddings']=df['Summary'].apply(average_embedding)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [12]:
print(f"Length before dropping = {len(df)}")
df = df.dropna()
print(f"Length after dropping = {len(df)}")

Length before dropping = 95235
Length after dropping = 94836


In [13]:
df.head()

Unnamed: 0,ID,Summary,Price,Embeddings
0,4008728,‪This luxurious apartment is situated in the c...,600.0,"[0.38274357, 0.4258689, -0.16039725, 0.0287610..."
1,7778612,"Beautiful, large (105m2, 2 floors) and quiet a...",175.0,"[0.41330466, 0.30931005, -0.000840808, 0.01371..."
2,8264596,This central located apartment will make you f...,125.0,"[0.49062666, 0.3795466, 0.063830316, -0.055309..."
3,2180729,Living like an Amsterdam resident in this ligh...,130.0,"[0.4084269, 0.41357902, -0.1686621, -0.0720989..."
4,14463171,Studio Apartment Centre Amsterdam Perfect stud...,80.0,"[0.40915027, 0.3366374, -0.048056837, -0.02218..."


In [14]:
expanded_embeddings = df['Embeddings'].apply(pd.Series)
expanded_embeddings.columns = {f'x_{i+1}' for i in range(expanded_embeddings.shape[1])}
cleaned_df = pd.concat([df['Price'], expanded_embeddings], axis=1)

cleaned_df.head()

Unnamed: 0,Price,x_27,x_25,x_4,x_50,x_17,x_23,x_24,x_10,x_42,...,x_44,x_28,x_41,x_30,x_21,x_45,x_34,x_11,x_39,x_33
0,600.0,0.382744,0.425869,-0.160397,0.028761,0.260482,0.072364,-0.398778,-0.30775,0.022348,...,0.020404,0.09556,0.127824,0.120138,-0.275871,0.119027,-0.009736,-0.496168,0.024827,-0.187749
1,175.0,0.413305,0.30931,-0.000841,0.013715,0.390502,0.198299,-0.442315,-0.21444,-0.35198,...,-0.14163,0.134946,0.239726,-0.163147,-0.191875,0.252649,-0.242484,-0.429803,0.062846,-0.030371
2,125.0,0.490627,0.379547,0.06383,-0.055309,0.316344,-0.11184,-0.490916,-0.197632,-0.149993,...,0.006888,0.011265,0.093517,0.148092,-0.253133,0.133435,-0.205685,-0.383044,0.040716,-0.125421
3,130.0,0.408427,0.413579,-0.168662,-0.072099,0.461521,0.009454,-0.512459,-0.227946,-0.128383,...,-0.030333,0.109661,0.07961,0.067164,-0.290861,0.230022,-0.159572,-0.52373,0.124678,-0.07703
4,80.0,0.40915,0.336637,-0.048057,-0.022188,0.27844,-0.061095,-0.42417,-0.259446,0.00186,...,0.080895,0.046259,0.001392,0.045865,-0.124832,0.233912,-0.031643,-0.539506,-0.033314,-0.202694


In [15]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

x_temp = df['Embeddings']
x = [np.array(embedding) for embedding in x_temp]
y = df['Price']

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)

In [16]:
regr = RandomForestRegressor(random_state=42).fit(x_train, y_train)

In [17]:
regr.score(x_test, y_test)

0.06078558259784539

In [18]:
test_pred = regr.predict(x_test)

res_df = pd.DataFrame({
    'Predictions': test_pred,
    'Ground Truth': y_test
})

res_df.head(20)

Unnamed: 0,Predictions,Ground Truth
92122,163.12,64.0
16138,196.83,70.0
42887,77.28506,80.0
1870,245.312866,250.0
11793,61.56,50.0
22895,167.45,100.0
11175,86.75,123.0
8914,149.94,352.0
46452,208.89,120.0
63716,119.07,75.0


In [19]:
def get_mse(x, y):
    return (x-y)**2

In [20]:
res_df['mse'] = res_df.apply(lambda row: get_mse(row['Predictions'], row['Ground Truth']), axis=1)

np.mean(res_df['mse'])

20081.53600235904

In [21]:
from sklearn.ensemble import AdaBoostRegressor

regr = AdaBoostRegressor(random_state=42).fit(x_train, y_train)

In [22]:
regr.score(x_test, y_test)

-0.13575014814294928

In [23]:
test_pred = regr.predict(x_test)

res_df = pd.DataFrame({
    'Predictions': test_pred,
    'Ground Truth': y_test
})

res_df.head(20)

Unnamed: 0,Predictions,Ground Truth
92122,233.400612,64.0
16138,233.400612,70.0
42887,94.056407,80.0
1870,222.746323,250.0
11793,94.056407,50.0
22895,233.400612,100.0
11175,94.056407,123.0
8914,233.400612,352.0
46452,233.400612,120.0
63716,172.183561,75.0


In [24]:
res_df['mse'] = res_df.apply(lambda row: get_mse(row['Predictions'], row['Ground Truth']), axis=1)

np.mean(res_df['mse'])

24283.706752183985

In [25]:
from sklearn.linear_model import LogisticRegression

regr = LogisticRegression(random_state=42).fit(x_train, y_train)

regr.score(x_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.04222025391201653

In [26]:
test_pred = regr.predict(x_test)

res_df = pd.DataFrame({
    'Predictions': test_pred,
    'Ground Truth': y_test
})

res_df.head(20)

Unnamed: 0,Predictions,Ground Truth
92122,50.0,64.0
16138,150.0,70.0
42887,70.0,80.0
1870,50.0,250.0
11793,60.0,50.0
22895,60.0,100.0
11175,35.0,123.0
8914,100.0,352.0
46452,250.0,120.0
63716,100.0,75.0


In [27]:
res_df['mse'] = res_df.apply(lambda row: get_mse(row['Predictions'], row['Ground Truth']), axis=1)

np.mean(res_df['mse'])

23093.63908220507