In [80]:
# Import libraries
import pandas as pd

import nltk
from nltk.corpus import stopwords
from wordcloud import WordCloud
import matplotlib.pyplot as plt

import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [128]:
pd.options.mode.copy_on_write = True

def filter_stopwords(text, stop_words):
    words = text.split()
    filtered_words = []
    for word in words:
        if word.lower() not in stop_words:
            filtered_words.append(word)
    return ' '.join(filtered_words)

def clean_special_tags(text):
    # Remove HTML tags and special characters
    clean_text = re.sub(r'<[^>]+>', '', text)  # Remove HTML tags
    # Remove special characters except whitespace, English letters, Greek letters
    clean_text = re.sub(r'[^\w\s\u0370-\u03FF]', '', clean_text)
    return clean_text

def clean(input_csv_path, output_csv_path):
    # Read the CSV file
    df = pd.read_csv(input_csv_path, usecols=['id', 'name', 'description'])
    
    # Drop NaN values
    df.dropna(subset=['description'], inplace=True)
    df['description'] = df['description'].apply(clean_special_tags)

    # Edit dataframe to keep only descriptions about Athens
    athens_df = df[df['description'].str.contains('Αθήνα|Αθήνες|Αθηνών|Athens', case=False, na=False)]

    # Download stop words
    nltk.download('stopwords')

    # Remove greek stop words
    stop_words = set(stopwords.words('greek'))
    custom_stopwords = ['της', 'από', 'είναι', 'ένα', 'βρίσκεται', 'λεπτά', ',', '.','-', 'πολύ', 'σας', 'μια', 'τους', '2', 'στα', 'πλήρως', 'υπάρχει', 'ή',\
                        'χλμ', 'όλα', 'μέτρα', 'Αθήνας.', 'Αθήνας,', '1', 'πιο']
    stop_words.update(custom_stopwords)
    
    athens_df['description'] = athens_df['description'].apply(filter_stopwords, stop_words=stop_words)

    # Remove english stop words from 'description' column
    stop_words = set(stopwords.words('english'))
    # More stopwords that we want to remove
    custom_stopwords = [ 'floor', 'double', 'located','&', 'two','kitchen', 'walk', 'away', 'close', 'stay', 'away', 'spaceThe',\
                        'one', 'fully', 'equipped', 'living', 'minutes', 'area', 'also', 'bed', 'room']
    # Update stop_words
    stop_words.update(custom_stopwords)
    
    athens_df['description'] = athens_df['description'].apply(filter_stopwords, stop_words=stop_words)

    # Join all descriptions to one string
    all_text = ' '.join(athens_df['description'])

    # Print top words used for Athens
    word_freq = nltk.FreqDist(all_text.split())
    print(word_freq.most_common(20))

    # New column from concatinating 'name' and 'description'
    df['name_description'] = df['name'].fillna('NULL') + ' ' + athens_df['description'].fillna('NULL')

    # Output .csv file
    df.to_csv(output_csv_path, index=False)

# Path to input CSV file
input_csv_path_2019 = 'data_train/train_2019.csv'
input_csv_path_2023 = 'data_train/train_2023.csv'

# Path to the output CSV file
output_csv_path_2019 = 'data_rec/rec_2019.csv'
output_csv_path_2023 = 'data_rec/rec_2023.csv'

# 2019
print("Year 2019")
clean(input_csv_path_2019, output_csv_path_2019)

# 2020
print("Year 2023")
clean(input_csv_path_2023, output_csv_path_2023)

Year 2019


[nltk_data] Downloading package stopwords to C:\Users\Erik
[nltk_data]     Kajacka\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


[('apartment', 36704), ('Athens', 36696), ('Acropolis', 15449), ('metro', 12166), ('station', 11348), ('center', 9981), ('bathroom', 9313), ('city', 9203), ('bedroom', 9089), ('renovated', 8386), ('walking', 7595), ('heart', 7275), ('access', 6956), ('distance', 6951), ('building', 6912), ('balcony', 6731), ('restaurants', 6196), ('comfortable', 6108), ('view', 6030), ('neighborhood', 5864)]
Year 2023


[nltk_data] Downloading package stopwords to C:\Users\Erik
[nltk_data]     Kajacka\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


[('apartment', 35448), ('Athens', 33897), ('Acropolis', 14523), ('renovated', 9502), ('center', 9451), ('heart', 8817), ('metro', 8792), ('bedroom', 8723), ('city', 8664), ('bathroom', 8128), ('station', 7723), ('comfortable', 7555), ('modern', 7236), ('access', 6393), ('balcony', 6059), ('building', 5980), ('walking', 5951), ('spacious', 5868), ('view', 5285), ('restaurants', 5233)]


In [129]:
# 2.1

def tfidf(input_csv_path):
    # Read the CSV file
    df = pd.read_csv(input_csv_path)

    # Ορίζουμε τον TfidfVectorizer με τις κατάλληλες παραμέτρους, συμπεριλαμβάνοντας τα stop words
    tfidf_vectorizer = TfidfVectorizer(max_df=1.0, min_df=1, max_features=1000, stop_words='english', ngram_range=(1, 2))

    # Εκπαιδεύουμε τον vectorizer στα δεδομένα μας
    tfidf_matrix = tfidf_vectorizer.fit_transform(df['name_description'].values.astype('U'))

    # matrix TF-IDF
    print(tfidf_matrix.shape)

    print("TF-IDF Matrix:")
    print(tfidf_matrix.toarray())
    # Get the feature names (words) from the TfidfVectorizer
    feature_names = tfidf_vectorizer.get_feature_names_out()
    # print(feature_names)

# Path to input CSV file
input_csv_path_2019 = 'data_rec/rec_2019.csv'
input_csv_path_2023 = 'data_rec/rec_2023.csv'

# 2019
print("Year 2019")
tfidf(input_csv_path_2019)

# 2020
print("Year 2023")
tfidf(input_csv_path_2023)


Year 2019
(27833, 1000)
TF-IDF Matrix:
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
['10' '10 min' '100' '12' '15' '15 min' '150' '1st' '20' '200' '2017'
 '2018' '24' '247' '25' '2nd' '30' '32' '35' '3rd' '40' '400' '45' '4th'
 '50' '500' '5min' '5th' '6th' 'able' 'ac' 'access' 'access apartment'
 'access entire' 'accessible' 'accommodate' 'accommodate people'
 'accommodation' 'acropolis' 'acropolis acropolis' 'acropolis apartment'
 'acropolis hill' 'acropolis metro' 'acropolis museum' 'acropolis plaka'
 'acropolis view' 'activities' 'additional' 'adults' 'adventurers'
 'adventurers business' 'agora' 'air' 'air condition' 'air conditioned'
 'air conditioning' 'airbnb' 'aircondition' 'airconditioned'
 'airconditioning' 'airport' 'airport port' 'airy' 'akropolis' 'allowed'
 'alternative' 'amazing' 'amazing view' 'amenities' 'ancient'
 'ancient agora' 'apart' 'apartment' 'apartme

In [1]:
# 2.2

def question2(input_csv_path):
    # Read the CSV file
    df = pd.read_csv(input_csv_path)

    # Step 1: Compute TF-IDF matrix
    tfidf_vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf_vectorizer.fit_transform(df['name_description'].values.astype('U'))

    # Step 2: Calculate cosine similarity
    cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

    # Step 3: Store similarity scores in a dictionary
    similar_properties = {}
    for i in range(len(df)):
        similar_properties[df.iloc[i]['id']] = cosine_sim[i]

    # Step 4: Sort the dictionary by similarity scores and keep the top 100 most similar properties
    top_100_similar_properties = {}
    for property_id, sim_scores in similar_properties.items():
        similar_properties_sorted = sorted(sim_scores, reverse=True)[1:101]  # Exclude self-similarity
        top_100_similar_properties[property_id] = similar_properties_sorted

    # Print or further process the top 100 similar properties
    print(top_100_similar_properties)

# Path to input CSV file
input_csv_path_2019 = 'data_rec/rec_2019.csv'
input_csv_path_2023 = 'data_rec/rec_2023.csv'

# 2019
print("Year 2019")
question2(input_csv_path_2019)

# 2020
print("Year 2023")
question2(input_csv_path_2023)

Year 2019


NameError: name 'tfidf' is not defined