In [1]:
# Import libraries
import pandas as pd

import nltk
from nltk.corpus import stopwords
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

from wordcloud import WordCloud
import matplotlib.pyplot as plt

import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
pd.options.mode.copy_on_write = True

def filter_stopwords(text, stop_words):
    words = text.split()
    filtered_words = []
    for word in words:
        if word.lower() not in stop_words:
            filtered_words.append(word)
    return ' '.join(filtered_words)

def clean_special_tags(text):
    # Remove HTML tags and special characters
    clean_text = re.sub(r'<[^>]+>', '', text)  # Remove HTML tags
    # Remove all characters except whitespace, English letters, and Greek letters
    clean_text = re.sub(r'[^\s\w\u0370-\u03FF]', '', clean_text)
    return clean_text

def clean(input_csv_path, output_csv_path):
    # Read the CSV file
    df = pd.read_csv(input_csv_path, usecols=['id', 'name', 'description'])

    # Drop duplicates    
    df.drop_duplicates(subset=['id'], inplace=True)

    # Drop NaN values
    df.dropna(subset=['description'], inplace=True)
    df['description'] = df['description'].apply(clean_special_tags)

    # Edit dataframe to keep only descriptions about Athens
    athens_df = df[df['description'].str.contains('Αθήνα|Αθήνες|Αθηνών|Athens', case=False, na=False)]

    # Download stop words
    nltk.download('stopwords')

    # Remove greek stop words
    stop_words = set(stopwords.words('greek'))
    custom_stopwords = ['της', 'από', 'είναι', 'ένα', 'βρίσκεται', 'λεπτά', ',', '.','-', 'πολύ', 'σας', 'μια', 'τους', '2', 'στα', 'πλήρως', 'υπάρχει', 'ή',\
                        'χλμ', 'όλα', 'μέτρα', 'Αθήνας.', 'Αθήνας,', '1', 'πιο']
    stop_words.update(custom_stopwords)
    
    athens_df['description'] = athens_df['description'].apply(filter_stopwords, stop_words=stop_words)

    # Remove english stop words from 'description' column
    stop_words = set(stopwords.words('english'))
    # More stopwords that we want to remove
    custom_stopwords = [ 'floor', 'double', 'located','&', 'two','kitchen', 'walk', 'away', 'close', 'stay', 'away', 'spaceThe',\
                        'one', 'fully', 'equipped', 'living', 'minutes', 'area', 'also', 'bed', 'room']
    # Update stop_words
    stop_words.update(custom_stopwords)
    
    athens_df['description'] = athens_df['description'].apply(filter_stopwords, stop_words=stop_words)

    # Join all descriptions to one string
    all_text = ' '.join(athens_df['description'])
    
    # Print top words used for Athens
    word_freq = nltk.FreqDist(all_text.split())
    print(word_freq.most_common(20))

    # New column from concatinating 'name' and 'description'
    df['name_description'] = df['name'].fillna('NULL') + ' ' + athens_df['description'].fillna('NULL')

    # df.dropna(subset=['name_description'], inplace=True)

    # Output .csv file
    df.to_csv(output_csv_path, index=False)

# Path to input CSV file
input_csv_path_2019 = 'data_train/train_2019.csv'
input_csv_path_2023 = 'data_train/train_2023.csv'

# Path to the output CSV file
output_csv_path_2019 = 'data_rec/rec_2019.csv'
output_csv_path_2023 = 'data_rec/rec_2023.csv'

# 2019
print("Year 2019")
clean(input_csv_path_2019, output_csv_path_2019)

# 2020
print("Year 2023")
clean(input_csv_path_2023, output_csv_path_2023)

Year 2019


[nltk_data] Downloading package stopwords to C:\Users\Erik
[nltk_data]     Kajacka\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


[('Athens', 13541), ('apartment', 13494), ('Acropolis', 5587), ('metro', 4438), ('station', 4161), ('center', 3700), ('bathroom', 3417), ('city', 3394), ('bedroom', 3299), ('renovated', 3114), ('walking', 2785), ('heart', 2680), ('building', 2568), ('access', 2561), ('distance', 2531), ('balcony', 2457), ('restaurants', 2277), ('comfortable', 2248), ('view', 2183), ('neighborhood', 2168)]
Year 2023


[nltk_data] Downloading package stopwords to C:\Users\Erik
[nltk_data]     Kajacka\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


[('apartment', 13633), ('Athens', 13025), ('Acropolis', 5442), ('center', 3553), ('renovated', 3512), ('metro', 3375), ('heart', 3366), ('city', 3346), ('bedroom', 3294), ('bathroom', 3114), ('comfortable', 2958), ('station', 2951), ('modern', 2826), ('access', 2444), ('balcony', 2317), ('walking', 2243), ('spacious', 2205), ('building', 2200), ('restaurants', 2034), ('distance', 1970)]


In [5]:
# 2.1

def tfidf(input_csv_path):
    # Read the CSV file
    df = pd.read_csv(input_csv_path)
    
    # Define TfidfVectorizer and its parameters with stopwords
    tfidf_vectorizer = TfidfVectorizer(max_df=1.0, min_df=1, max_features=1000, stop_words='english', ngram_range=(1, 2))

    # Transform vectorizer
    tfidf_matrix = tfidf_vectorizer.fit_transform(df['name_description'].values.astype('U'))

    # matrix TF-IDF
    print("TF-IDF Matrix shape:")
    print(tfidf_matrix.shape)

    print("TF-IDF Matrix array:")
    print(tfidf_matrix.toarray())

    # Get the names (words) from the TfidfVectorizer
    # feature_names = tfidf_vectorizer.get_feature_names_out()
    # print(feature_names)

# Path to input CSV file
input_csv_path_2019 = 'data_rec/rec_2019.csv'
input_csv_path_2023 = 'data_rec/rec_2023.csv'

# 2019
print("Year 2019")
tfidf(input_csv_path_2019)

# 2020
print("Year 2023")
tfidf(input_csv_path_2023)


Year 2019
TF-IDF Matrix shape:
(10266, 1000)
TF-IDF Matrix array:
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
Year 2023
TF-IDF Matrix shape:
(14340, 1000)
TF-IDF Matrix array:
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [6]:
# 2.2
from heapq import nlargest

def question2(input_csv_path):
    # Read the CSV file
    df = pd.read_csv(input_csv_path)

    # TF-IDF matrix
    tfidf_vectorizer = TfidfVectorizer(max_df=1.0, min_df=1, max_features=1000, stop_words='english', ngram_range=(1, 2))
    tfidf_matrix = tfidf_vectorizer.fit_transform(df['name_description'].values.astype('U'))

    # Cosine similarity
    cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

    ########## FOR EACH PROPERTY FIND TOP 100 MOST SIMILAR ############
    # # Dictionary to store similarities for each property
    # similar_properties = {}

    # # Loop through each property and find its similarity with all other properties
    # num_properties = len(cosine_sim)
    # for i in range(num_properties):
    #     # Store the similarities in a list
    #     similarities = [(j, cosine_sim[i][j]) for j in range(num_properties) if i != j]
    #     # Sort the similarities based on cosine similarity
    #     similarities.sort(key=lambda x: x[1], reverse=True)
    #     # Store the top 100 similar properties in the dictionary
    #     similar_properties[i] = similarities[:100]

    # return similar_properties
    
    ########### TOP 100 MOST SIMILAR IN GENERAL ######################
    # List for all similarities
    all_similarities = []

    # Loop through each property and find its similarity with all other properties
    num_properties = len(cosine_sim)
    for i in range(num_properties):
        for j in range(i+1, num_properties):
            all_similarities.append((i, j, cosine_sim[i][j]))

    # Sort the list based on cosine similarity
    all_similarities.sort(key=lambda x: x[2], reverse=True)

    print(all_similarities[:100])

# Path to input CSV file
input_csv_path_2019 = 'data_rec/rec_2019.csv'
input_csv_path_2023 = 'data_rec/rec_2023.csv'

# 2019
print("Year 2019")
question2(input_csv_path_2019)

# 2023
print("Year 2023")
question2(input_csv_path_2023)

Year 2019
[(941, 9539, 1.0000000000000004), (3079, 3149, 1.0000000000000004), (4216, 5016, 1.0000000000000004), (5318, 5319, 1.0000000000000004), (6141, 6143, 1.0000000000000004), (6911, 6912, 1.0000000000000004), (7827, 7831, 1.0000000000000004), (7827, 7832, 1.0000000000000004), (7831, 7832, 1.0000000000000004), (9061, 9062, 1.0000000000000004), (9061, 9063, 1.0000000000000004), (9062, 9063, 1.0000000000000004), (9193, 9194, 1.0000000000000004), (9969, 9970, 1.0000000000000004), (598, 7476, 1.0000000000000002), (965, 1243, 1.0000000000000002), (1166, 6359, 1.0000000000000002), (1245, 1248, 1.0000000000000002), (1680, 1728, 1.0000000000000002), (2041, 2042, 1.0000000000000002), (2370, 2371, 1.0000000000000002), (2923, 10037, 1.0000000000000002), (2961, 2962, 1.0000000000000002), (3770, 7898, 1.0000000000000002), (3784, 3879, 1.0000000000000002), (3974, 3975, 1.0000000000000002), (4194, 4195, 1.0000000000000002), (4398, 4429, 1.0000000000000002), (4542, 4544, 1.0000000000000002), (4710

In [8]:
# 2.3
def recommend(input_csv_path, item_id, num):
    # Read the CSV file
    df = pd.read_csv(input_csv_path)

    # TF-IDF matrix and descriptions
    tfidf_vectorizer = TfidfVectorizer(max_df=1.0, min_df=1, max_features=1000, stop_words='english', ngram_range=(1, 2))
    tfidf_matrix = tfidf_vectorizer.fit_transform(df['name_description'].values.astype('U'))  # Assuming df is defined elsewhere

    # Cosine similarity
    cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

    # Index of the item in the dataframe
    item_index = df[df['id'] == item_id].index[0]

    # Similarity scores of the item with all other items
    sim_scores = list(enumerate(cosine_sim[item_index]))

    # Sort similarity scores
    sim_scores_sorted = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get top N similar items (excluding itself)
    top_N_similar_items = sim_scores_sorted[1:num+1]

    # Print
    print(f"Recommending {num} listings similar to {df.iloc[item_index]['name']}")
    print("-" * 60)
    for i, (index, score) in enumerate(top_N_similar_items):
        print(f"Recommended: {df.iloc[index]['name']}")
        print(f"Description: {df.iloc[index]['description']}")
        print(f"(score: {score})")
        print()

# Path to input CSV file
input_csv_path_2019 = 'data_rec/rec_2019.csv'
input_csv_path_2023 = 'data_rec/rec_2023.csv'

# 2019
print("Year 2019")
recommend(input_csv_path_2019, 10595, 5)

# 2020
print("Year 2023")
recommend(input_csv_path_2023, 10595, 5)

Year 2019
Recommending 5 listings similar to 96m2, 3BR, 2BA, Metro, WI-FI etc...
------------------------------------------------------------
Recommended: 50m2, Metro, WI-FI, cableTV, more
Description: Athens Furnished Apartment No3 is 1bedroom apartment 50 square meters  excellent located  close to metro station  lovely  very clean  with all the facilities that you will need nice balcony excellent WiFi cable tv fully air conditioned Athens Furnished Apartment No3 is an excellent located close to metro lovely very clean 1bedroom apartment 50 square meters with all the facilities that you will need and a very nice balcony facing the inner garden to enjoy your breakfast in the morning or relax in the evening Fully equipped kitchen with everything you need to prepare your lunchdinner Nice Living room to relax and enjoy a movie or a sport event Clean nice bathroom For more than 2 people there is a great double sofabed in the living room Apartment No3 has everything you will need Telephone 

In [9]:
def collocation(input_csv_path):
    
    nltk.download('punkt')

    # Read .csv file
    df = pd.read_csv(input_csv_path)

    # Join texts in one text
    text_corpus = ' '.join(df['description'].astype(str))

    # List for words of text
    words = nltk.word_tokenize(text_corpus)

    #BigramCollocationFinder
    bigram_finder = BigramCollocationFinder.from_words(words)

    # Top 10 word collocations
    top_10_bigrams = bigram_finder.nbest(BigramAssocMeasures.likelihood_ratio, 10)

    print("Top 10 collocations:")
    for bigram in top_10_bigrams:
        print(bigram)

# Path to input CSV file
input_csv_path_2019 = 'data_rec/rec_2019.csv'
input_csv_path_2023 = 'data_rec/rec_2023.csv'

# 2019
print("Year 2019")
collocation(input_csv_path_2019)

# 2020
print("\nYear 2023")
collocation(input_csv_path_2023)


Year 2019


[nltk_data] Downloading package punkt to C:\Users\Erik
[nltk_data]     Kajacka\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Top 10 collocations:
('living', 'room')
('of', 'Athens')
('fully', 'equipped')
('in', 'the')
('walking', 'distance')
('metro', 'station')
('The', 'apartment')
('double', 'bed')
('heart', 'of')
('apartment', 'is')

Year 2023


[nltk_data] Downloading package punkt to C:\Users\Erik
[nltk_data]     Kajacka\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Top 10 collocations:
('living', 'room')
('fully', 'equipped')
('of', 'Athens')
('in', 'the')
('heart', 'of')
('walking', 'distance')
('double', 'bed')
('metro', 'station')
('apartment', 'is')
('the', 'heart')
