In [43]:
import torch
import pandas as pd
import numpy as np

In [44]:
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [45]:
# read data here
directory = "/home/tgbergendahl/research/NLP/data/archive/"
filename = "airbnb-listings.csv"
full_path = directory + filename

df = pd.read_csv(full_path, sep=';', nrows=50000)
x = len(df)
print(f"Read in {x} values")

Read in 50000 values


  df = pd.read_csv(full_path, sep=';', nrows=50000)


In [46]:
df = df[['ID', 'Summary', 'Price']]
df = df[df['Price']>0]
x = len(df)
print(f"Filtering out price=0 results in {x} values")
df = df[pd.notna(df['Summary'])]
x = len(df)
print(f"Filtering out N/A Summaries results in {x} values")

Filtering out price=0 results in 49115 values
Filtering out N/A Summaries results in 47527 values


In [47]:
df.head()

Unnamed: 0,ID,Summary,Price
0,4008728,‪This luxurious apartment is situated in the c...,600.0
1,7778612,"Beautiful, large (105m2, 2 floors) and quiet a...",175.0
2,8264596,This central located apartment will make you f...,125.0
3,2180729,Living like an Amsterdam resident in this ligh...,130.0
4,14463171,Studio Apartment Centre Amsterdam Perfect stud...,80.0


In [48]:
mean_len = df['Summary'].apply(len).mean()

mean_len

281.05396932270077

In [49]:
# load embeddings

embeddings_dict = {}

with open("/home/tgbergendahl/research/NLP/word-embeddings/glove.6B/glove.6B.50d.txt", 'r', encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], "float32")
        embeddings_dict[word] = vector

In [50]:
# testing embeddings
from scipy import spatial

def find_closest_embeddings(embedding):
    return sorted(embeddings_dict.keys(), key=lambda word: spatial.distance.euclidean(embeddings_dict[word], embedding))

find_closest_embeddings(embeddings_dict["king"])[:5]


['king', 'prince', 'queen', 'uncle', 'ii']

In [51]:
def to_embedding(word):
    if word in embeddings_dict:
        return embeddings_dict[word]
    else:
        return None

In [52]:
def average_embedding(note_text):
    words = note_text.split(' ')
    lower_stripped = [word.strip().lower() for word in words]
    # print(lower_stripped)
    word_embeddings = [to_embedding(word) for word in lower_stripped if to_embedding(word) is not None]
    # print("Number of word embeddings found: " + str(len(word_embeddings)))
    return np.mean(word_embeddings, axis=0)

In [53]:
df['Embeddings']=df['Summary'].apply(average_embedding)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [54]:
print(f"Length before dropping = {len(df)}")
df = df.dropna()
print(f"Length after dropping = {len(df)}")

Length before dropping = 47527
Length after dropping = 47312


In [55]:
df.head()

Unnamed: 0,ID,Summary,Price,Embeddings
0,4008728,‪This luxurious apartment is situated in the c...,600.0,"[0.38274357, 0.4258689, -0.16039725, 0.0287610..."
1,7778612,"Beautiful, large (105m2, 2 floors) and quiet a...",175.0,"[0.41330466, 0.30931005, -0.000840808, 0.01371..."
2,8264596,This central located apartment will make you f...,125.0,"[0.49062666, 0.3795466, 0.063830316, -0.055309..."
3,2180729,Living like an Amsterdam resident in this ligh...,130.0,"[0.4084269, 0.41357902, -0.1686621, -0.0720989..."
4,14463171,Studio Apartment Centre Amsterdam Perfect stud...,80.0,"[0.40915027, 0.3366374, -0.048056837, -0.02218..."


In [56]:
expanded_embeddings = df['Embeddings'].apply(pd.Series)
expanded_embeddings.columns = {f'x_{i+1}' for i in range(expanded_embeddings.shape[1])}
cleaned_df = pd.concat([df['Price'], expanded_embeddings], axis=1)

cleaned_df.head()

Unnamed: 0,Price,x_8,x_35,x_36,x_40,x_2,x_14,x_48,x_15,x_17,...,x_45,x_49,x_37,x_5,x_1,x_39,x_42,x_29,x_23,x_16
0,600.0,0.382744,0.425869,-0.160397,0.028761,0.260482,0.072364,-0.398778,-0.30775,0.022348,...,0.020404,0.09556,0.127824,0.120138,-0.275871,0.119027,-0.009736,-0.496168,0.024827,-0.187749
1,175.0,0.413305,0.30931,-0.000841,0.013715,0.390502,0.198299,-0.442315,-0.21444,-0.35198,...,-0.14163,0.134946,0.239726,-0.163147,-0.191875,0.252649,-0.242484,-0.429803,0.062846,-0.030371
2,125.0,0.490627,0.379547,0.06383,-0.055309,0.316344,-0.11184,-0.490916,-0.197632,-0.149993,...,0.006888,0.011265,0.093517,0.148092,-0.253133,0.133435,-0.205685,-0.383044,0.040716,-0.125421
3,130.0,0.408427,0.413579,-0.168662,-0.072099,0.461521,0.009454,-0.512459,-0.227946,-0.128383,...,-0.030333,0.109661,0.07961,0.067164,-0.290861,0.230022,-0.159572,-0.52373,0.124678,-0.07703
4,80.0,0.40915,0.336637,-0.048057,-0.022188,0.27844,-0.061095,-0.42417,-0.259446,0.00186,...,0.080895,0.046259,0.001392,0.045865,-0.124832,0.233912,-0.031643,-0.539506,-0.033314,-0.202694


In [57]:
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split

x_temp = df['Embeddings']
x = [np.array(embedding) for embedding in x_temp]
y = df['Price']

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)

In [58]:
regr = MLPRegressor(random_state=42, max_iter=500).fit(x_train, y_train)



In [59]:
regr.score(x_test, y_test)

0.149443301187273

In [60]:
test_pred = regr.predict(x_test)

res_df = pd.DataFrame({
    'Predictions': test_pred,
    'Ground Truth': y_test
})

res_df.head(20)

Unnamed: 0,Predictions,Ground Truth
22740,158.091873,250.0
49054,198.964108,137.0
26967,185.621518,191.0
24838,181.340327,25.0
31730,118.44147,42.0
46777,177.00895,94.0
47246,126.728921,50.0
31060,116.749581,94.0
49057,152.106494,255.0
25840,117.113846,75.0
