ΜΕΛΗ ΟΜΑΔΑΣ: \
ΚΑΓΙΑΤΣΚΑ ΕΡΙΚ - 1115202100043 \
ΚΑΛΑΜΠΟΚΗΣ ΕΥΑΓΓΕΛΟΣ - 1115202100045

In [39]:
import pandas as pd
from collections import Counter
import re

# Load data
train_2019_df = pd.read_csv('data_train/train_2019.csv')
train_2023_df = pd.read_csv('data_train/train_2023.csv')

# Concatenate them in one dataframe
df = pd.concat([train_2019_df, train_2023_df], ignore_index=True)

# We want to drop lanes that are duplicate
initial_row_count = df.shape[0]                                 # Initial row count
df = df.drop_duplicates(subset=['id', 'comments'], keep='first')  # Remove duplicates that are shown in both 2019 and 2023
final_row_count = df.shape[0]                                   # Final row count
rows_dropped = initial_row_count - final_row_count              # Rows dropped
print(f"Number of rows dropped: {rows_dropped}")                # Print number of rows dropped

# Edit comments
comments = df['comments'].dropna().astype(str).tolist()

# Add all comments in one text
all_comments = ' '.join(comments)

# Remove special characters etc.
words = re.findall(r'\b\w+\b', all_comments.lower())

# Find how many times each word exists by counting them
word_counts = Counter(words)

# 300 most used words
most_common_words = [word for word, count in word_counts.most_common(300)]

# Print them
print("Most common words:\n", most_common_words)

Number of rows dropped: 92
Most common words:
 ['apartment', 'great', 'us', 'athens', 'stay', 'place', 'location', 'host', 'everything', 'clean', 'nice', 'would', 'recommend', 'really', 'good', 'well', 'perfect', 'helpful', 'also', 'acropolis', 'close', 'walk', 'time', 'comfortable', 'metro', 'restaurants', 'flat', 'amazing', 'area', 'city', 'even', 'easy', 'view', 'walking', 'highly', 'definitely', 'wonderful', 'located', 'one', 'around', 'need', 'lovely', 'get', 'like', 'home', 'airport', 'beautiful', 'best', 'distance', 'back', 'made', 'night', 'spacious', 'could', 'neighborhood', 'quiet', 'br', 'needed', 'excellent', 'staying', 'house', 'friendly', 'experience', 'minutes', 'thank', 'station', 'room', 'gave', 'much', 'day', 'enjoyed', 'stayed', 'away', 'street', 'go', 'fantastic', 'thanks', 'right', 'many', 'places', 'loved', 'within', 'little', 'kind', 'kitchen', 'central', 'visit', 'center', 'two', 'super', 'feel', 'felt', 'recommended', 'balcony', 'airbnb', 'see', 'arrived', 'loc

In [40]:
import numpy as np
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity

# Train Word2Vec model with all words
model = Word2Vec([words], vector_size=50, window=5, min_count=1, workers=4)

# embeddings for most used words
embeddings = {word: model.wv[word] for word in most_common_words if word in model.wv}

# similarity matrix
word_list = list(embeddings.keys())
vectors = np.array([embeddings[word] for word in word_list])
similarity_matrix = cosine_similarity(vectors)

# Create dataframe with similarity matrix
similarity_df = pd.DataFrame(similarity_matrix, index=word_list, columns=word_list)

# Save to pickle file
similarity_df.to_pickle('data_similarity/similarity_matrix.pkl')

# Save also to csv file
similarity_df.to_csv('data_similarity/similarity_matrix.csv')

In [42]:
import random

def get_word_embedding(word, embedding_dict, vector_size=50):
    if word in embedding_dict:
        return embedding_dict[word]
    else:
        return np.random.uniform(-1, 1, vector_size)

# Example of word that exists
word = 'apartment'
embedding = get_word_embedding(word, embeddings)
print("'apartment' embedding:\n", embedding)

# Example of word that does not exist
word = 'example'
embedding = get_word_embedding(word, embeddings)
print("\n'example' embedding:\n", embedding)

'apartment' embedding:
 [-0.07819521 -0.04649974 -0.0350303  -0.19499712 -0.09973695 -0.11212839
  0.24799238  0.3445658  -0.32557988 -0.1451737  -0.12056579 -0.25358903
  0.10670515  0.10988517 -0.08064184  0.12573186 -0.09241755  0.16522697
 -0.34453133 -0.05753677  0.18802524  0.19304834  0.13482523 -0.22676836
  0.08581773  0.14687108 -0.08488967 -0.08562099 -0.3848446  -0.11507275
 -0.1254433  -0.20515294  0.04733507  0.03186057 -0.07379585  0.27417827
  0.07303321  0.07910142  0.27029976 -0.14123401  0.16486734 -0.07165905
 -0.0663771   0.09932023  0.33153063 -0.05610104 -0.04067687 -0.05535131
  0.16023691 -0.03112776]

'example' embedding:
 [-0.37703016  0.07417317 -0.96813391 -0.65633938 -0.42028431  0.5741845
 -0.54542296 -0.4217084   0.93441761  0.07287706 -0.25512392  0.71332215
 -0.72980057  0.73043407  0.00705634  0.32479506  0.56099063 -0.93078315
  0.99997873  0.88800785 -0.7140831   0.09549437 -0.22583657  0.11740727
  0.90161081  0.33430726 -0.76605384 -0.44464761  0.

In [127]:
def word_similarity(word1, word2, df_similarity):
    # Check if words are in the DataFrame index (assuming both are present)
    if word1 in df_similarity.index and word2 in df_similarity.index:
        return df_similarity.loc[word1, word2]
    else:
        emb1 = get_word_embedding(word1, embeddings)
        emb2 = get_word_embedding(word2, embeddings)
        cos = cosine_similarity(emb1.reshape(1, -1), emb2.reshape(1, -1))[0][0]
        return cos
    
word1 = "apartment"
word2 = "example"
print(f"Similarity between {word1} and {word2} is:")
print(word_similarity(word1, word2, similarity_df))     # We expect each time different result because words do not exist

word3 = "apartment"
word4 = "apartment"
print(f"Similarity between {word3} and {word4} is:")    # We expect same result because words exist.
print(word_similarity(word3, word4, similarity_df))     # Also approximately 1 because we have the same 2 words

Similarity between apartment and example is:
-0.12542367247911174
Similarity between apartment and apartment is:
0.99999994


In [147]:
# Semantic Neighbors
def top_n_neighbors(word, sm_df, N):
    if word in sm_df.index:
        # Get the row corresponding to the word from sm_df
        neighbors = sm_df.loc[word].sort_values(ascending=False).head(N)
        # Return list of (neighbor_word, similarity_score) tuples
        return list(neighbors.items())
    else:
        return None

# Maximum similarity of neighborhood
def max_similarity_of_neighborhoods(word1, word2, sm_df, N):
    neighbors1 = top_n_neighbors(word1, sm_df, N)
    neighbors2 = top_n_neighbors(word2, sm_df, N)
    
    if not neighbors1 or not neighbors2:
        return None
    
    max_sim1 = max([word_similarity(word1, neighbor, sm_df) for neighbor, _ in neighbors2])
    max_sim2 = max([word_similarity(word2, neighbor, sm_df) for neighbor, _ in neighbors1])
    
    return max(max_sim1, max_sim2)

# Correlation of similarities of neighborhoods
def correlation_of_neighborhoods(word1, word2, sm_df, N):
    neighbors1 = top_n_neighbors(word1, sm_df, N)
    neighbors2 = top_n_neighbors(word2, sm_df, N)
    
    if not neighbors1 or not neighbors2:
        return None
    
    similarities1 = np.array([word_similarity(word1, neighbor, sm_df) for neighbor, _ in neighbors2])
    similarities2 = np.array([word_similarity(word2, neighbor, sm_df) for neighbor, _ in neighbors1])
    
    return np.corrcoef(similarities1, similarities2)[0, 1]

# Sum of squared similarities of neighborhoods
def sum_squared_neighborhood_similarities(word1, word2, sm_df, N):
    neighbors1 = top_n_neighbors(word1, sm_df, N)
    neighbors2 = top_n_neighbors(word2, sm_df, N)
    
    if not neighbors1 or not neighbors2:
        return None
    
    similarities1 = np.array([word_similarity(word1, neighbor, sm_df) for neighbor, _ in neighbors2])
    similarities2 = np.array([word_similarity(word2, neighbor, sm_df) for neighbor, _ in neighbors1])
    
    return np.sqrt(np.sum(similarities1**2 + similarities2**2))

word1 = "apartment"
word2 = "apartment"
N = 5

print("Maximum Similarity of Neighborhoods:", max_similarity_of_neighborhoods(word1, word2, similarity_df, N))
print("Correlation of Neighborhoods:", correlation_of_neighborhoods(word1, word2, similarity_df, N))
print("Sum of Squared Neighborhood Similarities:", sum_squared_neighborhood_similarities(word1, word2, similarity_df, N))

Maximum Similarity of Neighborhoods: None
Correlation of Neighborhoods: None
Sum of Squared Neighborhood Similarities: None
