In [43]:
# Import libraries
import pandas as pd

import nltk
from nltk.corpus import stopwords
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

from wordcloud import WordCloud
import matplotlib.pyplot as plt

import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [44]:
pd.options.mode.copy_on_write = True

def filter_stopwords(text, stop_words):
    words = text.split()
    filtered_words = []
    for word in words:
        if word.lower() not in stop_words:
            filtered_words.append(word)
    return ' '.join(filtered_words)

def clean_special_tags(text):
    # Remove HTML tags and special characters
    clean_text = re.sub(r'<[^>]+>', '', text)  # Remove HTML tags
    # Remove all characters except whitespace, English letters, and Greek letters
    clean_text = re.sub(r'[^\s\w\u0370-\u03FF]', '', clean_text)
    return clean_text

def clean(input_csv_path, output_csv_path):
    # Read the CSV file
    df = pd.read_csv(input_csv_path, usecols=['id', 'name', 'description'])

    # Drop duplicates    
    df.drop_duplicates(subset=['id'], inplace=True)

    # Drop NaN values
    df.dropna(subset=['description'], inplace=True)
    df['description'] = df['description'].apply(clean_special_tags)

    # Edit dataframe to keep only descriptions about Athens
    athens_df = df[df['description'].str.contains('Αθήνα|Αθήνες|Αθηνών|Athens', case=False, na=False)]

    # Download stop words
    nltk.download('stopwords')

    # Remove greek stop words
    stop_words = set(stopwords.words('greek'))
    custom_stopwords = ['της', 'από', 'είναι', 'ένα', 'βρίσκεται', 'λεπτά', ',', '.','-', 'πολύ', 'σας', 'μια', 'τους', '2', 'στα', 'πλήρως', 'υπάρχει', 'ή',\
                        'χλμ', 'όλα', 'μέτρα', 'Αθήνας.', 'Αθήνας,', '1', 'πιο']
    stop_words.update(custom_stopwords)
    
    athens_df['description'] = athens_df['description'].apply(filter_stopwords, stop_words=stop_words)

    # Remove english stop words from 'description' column
    stop_words = set(stopwords.words('english'))
    # More stopwords that we want to remove
    custom_stopwords = [ 'floor', 'double', 'located','&', 'two','kitchen', 'walk', 'away', 'close', 'stay', 'away', 'spaceThe',\
                        'one', 'fully', 'equipped', 'living', 'minutes', 'area', 'also', 'bed', 'room']
    # Update stop_words
    stop_words.update(custom_stopwords)
    
    athens_df['description'] = athens_df['description'].apply(filter_stopwords, stop_words=stop_words)

    # Join all descriptions to one string
    all_text = ' '.join(athens_df['description'])
    
    # Print top words used for Athens
    word_freq = nltk.FreqDist(all_text.split())
    print(word_freq.most_common(20))

    # New column from concatinating 'name' and 'description'
    df['name_description'] = df['name'].fillna('NULL') + ' ' + athens_df['description'].fillna('NULL')

    # df.dropna(subset=['name_description'], inplace=True)

    # Output .csv file
    df.to_csv(output_csv_path, index=False)

# Path to input CSV file
input_csv_path_2019 = 'data_train/train_2019.csv'
input_csv_path_2023 = 'data_train/train_2023.csv'

# Path to the output CSV file
output_csv_path_2019 = 'data_rec/rec_2019.csv'
output_csv_path_2023 = 'data_rec/rec_2023.csv'

# 2019
print("Year 2019")
clean(input_csv_path_2019, output_csv_path_2019)

# 2020
print("Year 2023")
clean(input_csv_path_2023, output_csv_path_2023)

Year 2019


[nltk_data] Downloading package stopwords to C:\Users\Erik
[nltk_data]     Kajacka\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


[('Athens', 13541), ('apartment', 13494), ('Acropolis', 5587), ('metro', 4438), ('station', 4161), ('center', 3700), ('bathroom', 3417), ('city', 3394), ('bedroom', 3299), ('renovated', 3114), ('walking', 2785), ('heart', 2680), ('building', 2568), ('access', 2561), ('distance', 2531), ('balcony', 2457), ('restaurants', 2277), ('comfortable', 2248), ('view', 2183), ('neighborhood', 2168)]
Year 2023


[nltk_data] Downloading package stopwords to C:\Users\Erik
[nltk_data]     Kajacka\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


[('apartment', 13633), ('Athens', 13025), ('Acropolis', 5442), ('center', 3553), ('renovated', 3512), ('metro', 3375), ('heart', 3366), ('city', 3346), ('bedroom', 3294), ('bathroom', 3114), ('comfortable', 2958), ('station', 2951), ('modern', 2826), ('access', 2444), ('balcony', 2317), ('walking', 2243), ('spacious', 2205), ('building', 2200), ('restaurants', 2034), ('distance', 1970)]


In [45]:
# 2.1

def tfidf(input_csv_path):
    # Read the CSV file
    df = pd.read_csv(input_csv_path)
    print(df)
    # Ορίζουμε τον TfidfVectorizer με τις κατάλληλες παραμέτρους, συμπεριλαμβάνοντας τα stop words
    tfidf_vectorizer = TfidfVectorizer(max_df=1.0, min_df=1, max_features=1000, stop_words='english', ngram_range=(1, 2))

    # Εκπαιδεύουμε τον vectorizer στα δεδομένα μας
    tfidf_matrix = tfidf_vectorizer.fit_transform(df['name_description'].values.astype('U'))

    # matrix TF-IDF
    print(tfidf_matrix.shape)

    print("TF-IDF Matrix:")
    print(tfidf_matrix.toarray())
    # Get the feature names (words) from the TfidfVectorizer
    feature_names = tfidf_vectorizer.get_feature_names_out()
    # print(feature_names)

# Path to input CSV file
input_csv_path_2019 = 'data_rec/rec_2019.csv'
input_csv_path_2023 = 'data_rec/rec_2023.csv'

# 2019
print("Year 2019")
tfidf(input_csv_path_2019)

# 2020
print("Year 2023")
tfidf(input_csv_path_2023)


Year 2019
             id                                               name  \
0         10595                96m2, 3BR, 2BA, Metro, WI-FI etc...   
1         10988                 75m2, 2-br, metro, wi-fi, cable TV   
2         10990                  50m2, Metro, WI-FI, cableTV, more   
3         10993                Studio, metro, cable tv, wi-fi, etc   
4         10995                47m2, close to metro,cable TV,wi-fi   
...         ...                                                ...   
10261  32956579  Flat for 3 persons 10 minutes for center of At...   
10262  32958153               TRADITIONAL VINTAGE ROOM IN ATHENS 2   
10263  32958286                         TRADITIONAL VINTAGE ROOM 3   
10264  32958368        TRADITIONAL SINGLE ROOM WITH PRIVET BALCONY   
10265  32959125                                         ATH Hostel   

                                             description  \
0      Athens Furnished Apartment No6 is 3bedroom apa...   
1      Athens Furnished Apart

In [41]:
# 2.2
from heapq import nlargest

def question2(input_csv_path):
    # Read the CSV file
    df = pd.read_csv(input_csv_path)

    # Step 1: Compute TF-IDF matrix
    tfidf_vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf_vectorizer.fit_transform(df['name_description'].values.astype('U'))

    # Step 2: Calculate cosine similarity
    cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

    ########## FOR EACH PROPERTY FIND TOP 100 MOST SIMILAR ############
    # # Step 3: Create a dictionary to store similarities for each property
    # similar_properties = {}

    # # Step 4: Loop through each property and find its similarity with all other properties
    # num_properties = len(cosine_sim)
    # for i in range(num_properties):
    #     # Store the similarities in a list for easy sorting later
    #     similarities = [(j, cosine_sim[i][j]) for j in range(num_properties) if i != j]
    #     # Sort the similarities based on cosine similarity
    #     similarities.sort(key=lambda x: x[1], reverse=True)
    #     # Store the top 100 similar properties in the dictionary
    #     similar_properties[i] = similarities[:100]

    # return similar_properties
    
    ########### TOP 100 MOST SIMILAR IN GENERAL ######################
    # Step 3: Create a list to store all similarities
    all_similarities = []

    # Step 4: Loop through each property and find its similarity with all other properties
    num_properties = len(cosine_sim)
    for i in range(num_properties):
        for j in range(i+1, num_properties):
            all_similarities.append((i, j, cosine_sim[i][j]))

    # Step 5: Sort the list based on cosine similarity
    all_similarities.sort(key=lambda x: x[2], reverse=True)

    print(all_similarities[:100])

# Path to input CSV file
input_csv_path_2019 = 'data_rec/rec_2019.csv'
input_csv_path_2023 = 'data_rec/rec_2023.csv'

# 2019
print("Year 2019")
question2(input_csv_path_2019)

# 2023
print("Year 2023")
question2(input_csv_path_2023)

Year 2019
[(5809, 5811, 1.0000000000000007), (3104, 3722, 1.0000000000000004), (6454, 6496, 1.0000000000000004), (6454, 6499, 1.0000000000000004), (6496, 6499, 1.0000000000000004), (6705, 6706, 1.0000000000000004), (7340, 7650, 1.0000000000000004), (708, 7146, 1.0000000000000002), (2134, 7505, 1.0000000000000002), (2184, 2587, 1.0000000000000002), (2782, 2860, 1.0000000000000002), (3245, 3267, 1.0000000000000002), (3336, 5456, 1.0000000000000002), (3938, 3939, 1.0000000000000002), (4387, 4760, 1.0000000000000002), (4387, 4909, 1.0000000000000002), (4409, 4417, 1.0000000000000002), (4409, 4418, 1.0000000000000002), (4417, 4418, 1.0000000000000002), (4751, 5659, 1.0000000000000002), (4760, 4909, 1.0000000000000002), (4921, 5949, 1.0000000000000002), (5542, 6974, 1.0000000000000002), (5628, 5630, 1.0000000000000002), (5628, 5636, 1.0000000000000002), (5628, 5819, 1.0000000000000002), (5628, 5835, 1.0000000000000002), (5628, 5935, 1.0000000000000002), (5630, 5636, 1.0000000000000002), (563

In [46]:
# 2.3
def recommend(input_csv_path, item_id, num):
    # Read the CSV file
    df = pd.read_csv(input_csv_path)

    # Step 1: Retrieve the TF-IDF matrix and property descriptions
    tfidf_vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf_vectorizer.fit_transform(df['name_description'].values.astype('U'))  # Assuming df is defined elsewhere

    # Step 2: Calculate cosine similarity
    cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

    # Step 3: Get the index of the item in the dataframe
    item_index = df[df['id'] == item_id].index[0]

    # Step 4: Get the similarity scores of the item with all other items
    sim_scores = list(enumerate(cosine_sim[item_index]))

    # Step 5: Sort the similarity scores
    sim_scores_sorted = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Step 6: Get the top N similar items (excluding the item itself)
    top_N_similar_items = sim_scores_sorted[1:num+1]

    # Step 7: Print the recommendations
    print(f"Recommending {num} listings similar to {df.iloc[item_index]['name']}")
    print("-" * 60)
    for i, (index, score) in enumerate(top_N_similar_items):
        print(f"Recommended: {df.iloc[index]['name']}")
        print(f"Description: {df.iloc[index]['description']}")
        print(f"(score: {score})")
        print()

# Path to input CSV file
input_csv_path_2019 = 'data_rec/rec_2019.csv'
input_csv_path_2023 = 'data_rec/rec_2023.csv'

# 2019
print("Year 2019")
recommend(input_csv_path_2019, 10595, 5)

# 2020
print("Year 2023")
recommend(input_csv_path_2023, 10595, 5)

Year 2019
Recommending 5 listings similar to 96m2, 3BR, 2BA, Metro, WI-FI etc...
------------------------------------------------------------
Recommended: 50m2, Metro, WI-FI, cableTV, more
Description: Athens Furnished Apartment No3 is 1bedroom apartment 50 square meters  excellent located  close to metro station  lovely  very clean  with all the facilities that you will need nice balcony excellent WiFi cable tv fully air conditioned Athens Furnished Apartment No3 is an excellent located close to metro lovely very clean 1bedroom apartment 50 square meters with all the facilities that you will need and a very nice balcony facing the inner garden to enjoy your breakfast in the morning or relax in the evening Fully equipped kitchen with everything you need to prepare your lunchdinner Nice Living room to relax and enjoy a movie or a sport event Clean nice bathroom For more than 2 people there is a great double sofabed in the living room Apartment No3 has everything you will need Telephone 

In [54]:
def collocation(input_csv_path):
    
    nltk.download('punkt')

    # Διαβάζουμε το CSV αρχείο
    df = pd.read_csv(input_csv_path)

    # Συνδυάζουμε όλα τα κείμενα σε ένα μεγάλο κείμενο
    text_corpus = ' '.join(df['description'].astype(str))   #maybe name_description but it shows garbage bcause we have not clean name column

    # Δημιουργούμε μια λίστα λέξεων από το κείμενο
    words = nltk.word_tokenize(text_corpus)

    # Δημιουργούμε τον BigramCollocationFinder
    bigram_finder = BigramCollocationFinder.from_words(words)

    # Επιλέγουμε τους 10 πιο συχνούς συνδυασμούς λέξεων
    top_10_bigrams = bigram_finder.nbest(BigramAssocMeasures.likelihood_ratio, 10)

    print("Top 10 collocations:")
    for bigram in top_10_bigrams:
        print(bigram)

# Path to input CSV file
input_csv_path_2019 = 'data_rec/rec_2019.csv'
input_csv_path_2023 = 'data_rec/rec_2023.csv'

# 2019
print("Year 2019")
collocation(input_csv_path_2019)

# 2020
print("\nYear 2023")
collocation(input_csv_path_2023)


Year 2019


[nltk_data] Downloading package punkt to C:\Users\Erik
[nltk_data]     Kajacka\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Top 10 collocations:
('living', 'room')
('of', 'Athens')
('fully', 'equipped')
('in', 'the')
('walking', 'distance')
('metro', 'station')
('The', 'apartment')
('double', 'bed')
('heart', 'of')
('apartment', 'is')

Year 2023


[nltk_data] Downloading package punkt to C:\Users\Erik
[nltk_data]     Kajacka\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Top 10 collocations:
('living', 'room')
('fully', 'equipped')
('of', 'Athens')
('in', 'the')
('heart', 'of')
('walking', 'distance')
('double', 'bed')
('metro', 'station')
('apartment', 'is')
('the', 'heart')
