In [None]:
# Import libraries
import pandas as pd
import sys, os
import matplotlib.pyplot as plt

import nltk
from nltk.corpus import stopwords
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

import folium
from folium import plugins

from wordcloud import WordCloud, STOPWORDS

from transformers import pipeline

import re

In [None]:
# Function for output csv file
pd.options.mode.copy_on_write = True


def data_exploration(df_combined, months_folders, output_csv_path, columns_to_select):

    # Number of NaN values replaced
    nan_replaced_count = 0

    # For each month
    for month_folder in months_folders:
        # Start by importing "listings.csv"
        input_csv_path = os.path.join(month_folder, 'listings.csv')
        
        # The dataframe
        df = pd.read_csv(input_csv_path, low_memory=False)
        
        # Include only the columns that we want and are in the .csv file
        columns_to_keep = [col for col in columns_to_select if col in df.columns]
        df_selected = df[columns_to_keep]
        
        # Columns from other .csv files
        additional_columns = set()
        
        # Check all other .csv files in the folder
        for filename in os.listdir(month_folder):
            if filename.endswith('.csv') and filename != 'listings.csv':  # We want .csv files but no the starting one, aka "listings.csv"
                file_path = os.path.join(month_folder, filename)
                other_df = pd.read_csv(file_path)

                # Iterate over each column
                for col in other_df.columns:
                    # If we find a column that we want but is not already in our dataframe, add it from the other .csv file
                    if col not in df_selected.columns and col in columns_to_select:
                        additional_columns.add(col)
                        # Add the column and its data to our dataframe
                        df_selected[col] = other_df[col]

                    elif  col in df_selected.columns and col in columns_to_select:
                        # Check if any value of the column in our dataframe is NaN
                        nan_mask = df_selected[col].isna()
                        
                        # Check if the corresponding value in the other dataframes is not NaN
                        non_nan_mask = ~other_df[col].isna()
                        
                        # Replace NaN values with not NaN ones
                        df_selected.loc[nan_mask & non_nan_mask, col] = other_df.loc[nan_mask & non_nan_mask, col]

                        # Number of NaN values replaced
                        nan_replaced_count += sum(nan_mask & non_nan_mask)

        # Concatenate every months dataframes
        df_combined = pd.concat([df_combined, df_selected], ignore_index=True)

    # Write the final dataframe to the output .csv file
    df_combined.to_csv(output_csv_path, index=False)

    print(f"Data from all months has been written to {output_csv_path}")
    print(f"The output CSV file has {df_selected.shape[1]} columns.")
    print(f"Number of NaN values replaced: {nan_replaced_count}")

    return df_combined

# Initialize a DataFrame to store data for 2019
dataframe_2019 = pd.DataFrame()

# Initialize a DataFrame to store data for 2023
dataframe_2023 = pd.DataFrame()

# Columns we want to select
columns_to_select_2019 = ['id', 'comments']

columns_to_select_2023 = ['id', 'comments']

### 2019 ###

# Each month's folder
months_folders = ['data/2019/april', 'data/2019/febrouary', 'data/2019/march']

# Path to the output CSV file
output_csv_path_2019 = 'data_train/train_2019.csv'

print("For 2019:")
dataframe_2019 = data_exploration(dataframe_2019, months_folders, output_csv_path_2019, columns_to_select_2019)

### 2023 ###

# Each month's folder
months_folders = ['data/2023/june', 'data/2023/march', 'data/2023/september']

# Path to the output CSV file
output_csv_path_2023 = 'data_train/train_2023.csv'

print("\nFor 2023:")
dataframe_2023 = data_exploration(dataframe_2023, months_folders, output_csv_path_2023, columns_to_select_2023)

In [None]:
# Προεπεξεργασία σχολίων
def preprocess_text(text):
    # Αφαίρεση αριθμών
    text = re.sub(r'\d+', '', text)
    # Αφαίρεση σημείων στίξης και μετατροπή σε μικρούς χαρακτήρες
    text = re.sub(r'[^\w\s]', '', text.lower())
    # Αφαίρεση URL
    text = re.sub(r'http\S+', '', text)
    
    return text

def filter_stopwords(text, stop_words):
    words = text.split()
    filtered_words = []
    for word in words:
        if word.lower() not in stop_words:
            filtered_words.append(word)
    return ' '.join(filtered_words)

def clean_special_tags(text):
    # Αφαίρεση HTML tags
    clean_text = re.sub(r'<[^>]+>', '', text)
    # Αφαίρεση όλων των χαρακτήρων εκτός από διαστήματα και αγγλικούς χαρακτήρες
    clean_text = re.sub(r'[^a-zA-Z\s]', '', clean_text)
    return clean_text

dataframe_2019.dropna(subset=['comments'], inplace=True)
dataframe_2023.dropna(subset=['comments'], inplace=True)

# Download stop words
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

dataframe_2019['comments'] = dataframe_2019['comments'].apply(preprocess_text)
dataframe_2023['comments'] = dataframe_2023['comments'].apply(preprocess_text)

dataframe_2019['comments'] = dataframe_2019['comments'].apply(filter_stopwords, stop_words = stop_words)
dataframe_2023['comments'] = dataframe_2023['comments'].apply(filter_stopwords, stop_words = stop_words)

dataframe_2019['comments'] = dataframe_2019['comments'].apply(clean_special_tags)
dataframe_2023['comments'] = dataframe_2023['comments'].apply(clean_special_tags)

dataframe_2019 = dataframe_2019[dataframe_2019['comments'].str.strip().astype(bool)]
dataframe_2023 = dataframe_2023[dataframe_2023['comments'].str.strip().astype(bool)]

dataframe_2019.to_csv(output_csv_path_2019, index=False)
dataframe_2023.to_csv(output_csv_path_2023, index=False)


In [72]:
# Φόρτωση μοντέλου ανάλυσης συναισθήματος
# sentiment_analysis = pipeline('sentiment-analysis', model='siebert/sentiment-roberta-large-english')
from transformers import pipeline, AutoTokenizer

# Load the sentiment analysis model and tokenizer
model_name = 'finiteautomata/bertweet-base-sentiment-analysis'
tokenizer = AutoTokenizer.from_pretrained(model_name)
sentiment_analysis = pipeline('sentiment-analysis', model=model_name, tokenizer=tokenizer, truncation=True)


def analyze_sentiment(data):
    results = []
    for idx, comment in zip(data['id'], data['comments']):
        result = sentiment_analysis(comment)[0]
        sentiment = result['label']
        if sentiment == 'LABEL_0':
            sentiment = 'negative'
        elif sentiment == 'LABEL_1':
            sentiment = 'neutral'
        else:
            sentiment = 'positive'
        results.append({'id': idx, 'review': comment, 'sentiment': sentiment})
    return pd.DataFrame(results)

# Ανάλυση συναισθήματος για τα έτη 2019 και 2023
sample_df_2019 = dataframe_2019.sample(n=1000)
sample_df_2023 = dataframe_2023.sample(n=1000)

sentiment_df_2019 = analyze_sentiment(sample_df_2019[['id', 'comments']])
sentiment_df_2023 = analyze_sentiment(sample_df_2023[['id', 'comments']])

sentiment_df_2019.to_csv("data_sentiment/sentiment_2019.csv", index = False)
sentiment_df_2023.to_csv("data_sentiment/sentiment_2023.csv", index = False)

# Calculate sentiment counts
sentiment_counts_2019 = sentiment_df_2019['sentiment'].value_counts()
sentiment_counts_2023 = sentiment_df_2023['sentiment'].value_counts()

# Print sentiment counts
print("Sentiment counts for 2019:")
print(sentiment_counts_2019)
print("\nSentiment counts for 2023:")
print(sentiment_counts_2023)

emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0
All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at finiteautomata/bertweet-base-sentiment-analysis.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


Sentiment counts for 2019:
sentiment
positive    1000
Name: count, dtype: int64

Sentiment counts for 2023:
sentiment
positive    1000
Name: count, dtype: int64
