ΜΕΛΗ ΟΜΑΔΑΣ: \
ΚΑΓΙΑΤΣΚΑ ΕΡΙΚ - 1115202100043 \
ΚΑΛΑΜΠΟΚΗΣ ΕΥΑΓΓΕΛΟΣ - 1115202100045

In [82]:
import pandas as pd
from collections import Counter
import re

# Load data
train_2019_df = pd.read_csv('data_train/train_2019.csv')
train_2023_df = pd.read_csv('data_train/train_2023.csv')

# Concatenate them in one dataframe
df = pd.concat([train_2019_df, train_2023_df], ignore_index=True)

# We want to drop lanes that are duplicate
initial_row_count = df.shape[0]                                 # Initial row count
df = df.drop_duplicates(subset=['id', 'comments'], keep='first')  # Remove duplicates that are shown in both 2019 and 2023
final_row_count = df.shape[0]                                   # Final row count
rows_dropped = initial_row_count - final_row_count              # Rows dropped
print(f"Number of rows dropped: {rows_dropped}")                # Print number of rows dropped

# Edit comments
comments = df['comments'].dropna().astype(str).tolist()

# Add all comments in one text
all_comments = ' '.join(comments)

# Remove special characters etc.
words = re.findall(r'\b\w+\b', all_comments.lower())

# Find how many times each word exists by counting them
word_counts = Counter(words)

# 300 most used words
most_common_words = [word for word, count in word_counts.most_common(300)]

# Print them
print("Most common words:\n", most_common_words)

Number of rows dropped: 93
Most common words:
 ['apartment', 'great', 'us', 'athens', 'stay', 'place', 'location', 'host', 'everything', 'clean', 'nice', 'would', 'recommend', 'really', 'good', 'well', 'perfect', 'helpful', 'also', 'acropolis', 'close', 'walk', 'time', 'comfortable', 'metro', 'restaurants', 'flat', 'amazing', 'area', 'city', 'even', 'easy', 'view', 'walking', 'highly', 'definitely', 'wonderful', 'located', 'one', 'around', 'need', 'lovely', 'get', 'like', 'home', 'airport', 'beautiful', 'best', 'distance', 'back', 'made', 'night', 'spacious', 'could', 'neighborhood', 'quiet', 'br', 'needed', 'excellent', 'staying', 'house', 'friendly', 'experience', 'minutes', 'thank', 'station', 'room', 'gave', 'much', 'day', 'enjoyed', 'stayed', 'away', 'street', 'go', 'fantastic', 'thanks', 'right', 'many', 'places', 'loved', 'within', 'little', 'kind', 'kitchen', 'central', 'visit', 'center', 'two', 'super', 'feel', 'felt', 'recommended', 'balcony', 'airbnb', 'see', 'arrived', 'loc

In [83]:
import numpy as np
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity

# Train Word2Vec model with all words
model = Word2Vec([words], vector_size=50, window=5, min_count=1, workers=4)

# embeddings for most used words
embeddings = {word: model.wv[word] for word in most_common_words if word in model.wv}

# similarity matrix
word_list = list(embeddings.keys())
vectors = np.array([embeddings[word] for word in word_list])
similarity_matrix = cosine_similarity(vectors)

# Create dataframe with similarity matrix
similarity_df = pd.DataFrame(similarity_matrix, index=word_list, columns=word_list)

# Save to pickle file
similarity_df.to_pickle('data_similarity/similarity_matrix.pkl')

# Save also to csv file
similarity_df.to_csv('data_similarity/similarity_matrix.csv')

In [84]:
import random

def get_word_embedding(word, embedding_dict, vector_size=50):
    if word in embedding_dict:
        return embedding_dict[word]
    else:
        return np.random.uniform(-1, 1, vector_size)

# Example of word that exists
word = 'apartment'
embedding = get_word_embedding(word, embeddings)
print("'apartment' embedding:\n", embedding)

# Example of word that does not exist
word = 'example'
embedding = get_word_embedding(word, embeddings)
print("\n'example' embedding:\n", embedding)

'apartment' embedding:
 [-0.08883116 -0.11179578  0.15214497 -0.1346948   0.00311453 -0.06754884
  0.20030154  0.33600235 -0.30232838 -0.20672289  0.0774607  -0.2883072
  0.05963396  0.12183825 -0.03778592  0.11020812  0.09395687  0.04795019
 -0.32755166 -0.15530117  0.09946761  0.19695462  0.26405153 -0.18531065
 -0.00494699 -0.02430619 -0.27107126 -0.01693872 -0.36834285  0.04933509
 -0.17254475 -0.10483955  0.03162792 -0.11344472  0.09230474  0.21511403
  0.15855888  0.1168959   0.17786603 -0.05843613  0.29568493 -0.00688602
 -0.10350201  0.01167428  0.34207278 -0.08155998 -0.10743887  0.06838772
  0.160287    0.0640679 ]

'example' embedding:
 [ 0.44853713 -0.23952605 -0.81495648 -0.76954004  0.85065496 -0.4022519
  0.91092584 -0.30012776 -0.88270384 -0.37424423 -0.92179863 -0.815033
  0.04999589  0.70865638  0.37688544 -0.6770899   0.848643   -0.69424419
  0.12622742 -0.54933092  0.44431685 -0.49162514 -0.12467297  0.22791623
 -0.72114342 -0.36445524  0.41353335 -0.83533108  0.637

In [None]:
def word_similarity(word1, word2, df_similarity):
    # Check if words are in the DataFrame index 
    if word1 in df_similarity.index and word2 in df_similarity.index:
        return df_similarity.loc[word1, word2]
    else:
        emb1 = get_word_embedding(word1, embeddings)
        emb2 = get_word_embedding(word2, embeddings)
        cos = cosine_similarity(emb1.reshape(1, -1), emb2.reshape(1, -1))[0][0]
        return cos
    
word1 = "apartment"
word2 = "example"
print(f"Similarity between {word1} and {word2} is:")
print(word_similarity(word1, word2, similarity_df))     # We expect each time different result because words do not exist

word3 = "apartment"
word4 = "apartment"
print(f"Similarity between {word3} and {word4} is:")    # We expect same result because words exist.
print(word_similarity(word3, word4, similarity_df))     # Also approximately 1 because we have the same 2 words

In [None]:
# Semantic Neighbors
def top_n_neighbors(word, sm_df, N):
    if word in sm_df.index:
        # Get the row corresponding to the word from sm_df
        neighbors = sm_df.loc[word].sort_values(ascending=False).head(N)
        # Return list of (neighbor_word, similarity_score) tuples
        return list(neighbors.items())
    else:
        return None

# Maximum similarity of neighborhood
def max_similarity_of_neighborhoods(word1, word2, sm_df, N):
    neighbors1 = top_n_neighbors(word1, sm_df, N)
    neighbors2 = top_n_neighbors(word2, sm_df, N)
    
    if not neighbors1 or not neighbors2:
        return None
    
    max_sim1 = max([word_similarity(word1, neighbor, sm_df) for neighbor, _ in neighbors2])
    max_sim2 = max([word_similarity(word2, neighbor, sm_df) for neighbor, _ in neighbors1])
    
    return max(max_sim1, max_sim2)

# Correlation of similarities of neighborhoods
def correlation_of_neighborhoods(word1, word2, sm_df, N):
    neighbors1 = top_n_neighbors(word1, sm_df, N)
    neighbors2 = top_n_neighbors(word2, sm_df, N)
    
    if not neighbors1 or not neighbors2:
        return None
    
    similarities1 = np.array([word_similarity(word1, neighbor, sm_df) for neighbor, _ in neighbors1])
    similarities2 = np.array([word_similarity(word2, neighbor, sm_df) for neighbor, _ in neighbors1])
    
    first_cor = np.corrcoef(similarities1, similarities2)[0, 1]

    similarities3 = np.array([word_similarity(word1, neighbor, sm_df) for neighbor, _ in neighbors2])
    similarities4 = np.array([word_similarity(word2, neighbor, sm_df) for neighbor, _ in neighbors2])

    second_cor = np.corrcoef(similarities3, similarities4)[0, 1]

    return max(first_cor, second_cor)

# Sum of squared similarities of neighborhoods
def sum_squared_neighborhood_similarities(word1, word2, sm_df, N):
    neighbors1 = top_n_neighbors(word1, sm_df, N)
    neighbors2 = top_n_neighbors(word2, sm_df, N)
    
    if not neighbors1 or not neighbors2:
        return None
    
    similarities1 = np.array([word_similarity(word1, neighbor, sm_df) for neighbor, _ in neighbors2])
    similarities2 = np.array([word_similarity(word2, neighbor, sm_df) for neighbor, _ in neighbors1])
    
    sum_1 = sum(similarities1**2)
    sum_2 = sum(similarities2**2)
    
    return np.sqrt(sum_1 + sum_2)

word1 = "apartment"
word2 = "great"
N = 3

print(f"For words '{word1}' and '{word2}' and N={N}:")
print("Maximum Similarity of Neighborhoods:", max_similarity_of_neighborhoods(word1, word2, similarity_df, N))
print("Correlation of Neighborhoods:", correlation_of_neighborhoods(word1, word2, similarity_df, N))
print("Sum of Squared Neighborhood Similarities:", sum_squared_neighborhood_similarities(word1, word2, similarity_df, N))


word1 = "apartment"
word2 = "great"
N = 10

print(f"\nFor words '{word1}' and '{word2}' and N={N}:")
print("Maximum Similarity of Neighborhoods:", max_similarity_of_neighborhoods(word1, word2, similarity_df, N))
print("Correlation of Neighborhoods:", correlation_of_neighborhoods(word1, word2, similarity_df, N))
print("Sum of Squared Neighborhood Similarities:", sum_squared_neighborhood_similarities(word1, word2, similarity_df, N))


print(f"\nFor n={N}, neighbors of '{word2}' and it's similarities are:")
print(top_n_neighbors(word2, similarity_df, N))