In [1]:
# Import libraries
import pandas as pd
import sys, os
import matplotlib.pyplot as plt

import nltk
from nltk.corpus import stopwords
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

import folium
from folium import plugins

from wordcloud import WordCloud, STOPWORDS

from transformers import pipeline

import re

  torch.utils._pytree._register_pytree_node(


In [2]:
# Function for output csv file
pd.options.mode.copy_on_write = True


def data_exploration(df_combined, months_folders, output_csv_path, columns_to_select):

    # Number of NaN values replaced
    nan_replaced_count = 0

    # For each month
    for month_folder in months_folders:
        # Start by importing "listings.csv"
        input_csv_path = os.path.join(month_folder, 'listings.csv')
        
        # The dataframe
        df = pd.read_csv(input_csv_path, low_memory=False)
        
        # Include only the columns that we want and are in the .csv file
        columns_to_keep = [col for col in columns_to_select if col in df.columns]
        df_selected = df[columns_to_keep]
        
        # Columns from other .csv files
        additional_columns = set()
        
        # Check all other .csv files in the folder
        for filename in os.listdir(month_folder):
            if filename.endswith('.csv') and filename != 'listings.csv':  # We want .csv files but no the starting one, aka "listings.csv"
                file_path = os.path.join(month_folder, filename)
                other_df = pd.read_csv(file_path)

                # Iterate over each column
                for col in other_df.columns:
                    # If we find a column that we want but is not already in our dataframe, add it from the other .csv file
                    if col not in df_selected.columns and col in columns_to_select:
                        additional_columns.add(col)
                        # Add the column and its data to our dataframe
                        df_selected[col] = other_df[col]

                    elif  col in df_selected.columns and col in columns_to_select:
                        # Check if any value of the column in our dataframe is NaN
                        nan_mask = df_selected[col].isna()
                        
                        # Check if the corresponding value in the other dataframes is not NaN
                        non_nan_mask = ~other_df[col].isna()
                        
                        # Replace NaN values with not NaN ones
                        df_selected.loc[nan_mask & non_nan_mask, col] = other_df.loc[nan_mask & non_nan_mask, col]

                        # Number of NaN values replaced
                        nan_replaced_count += sum(nan_mask & non_nan_mask)

        # Concatenate every months dataframes
        df_combined = pd.concat([df_combined, df_selected], ignore_index=True)

    # Write the final dataframe to the output .csv file
    df_combined.to_csv(output_csv_path, index=False)

    print(f"Data from all months has been written to {output_csv_path}")
    print(f"The output CSV file has {df_selected.shape[1]} columns.")
    print(f"Number of NaN values replaced: {nan_replaced_count}")

    return df_combined

# Initialize a DataFrame to store data for 2019
dataframe_2019 = pd.DataFrame()

# Initialize a DataFrame to store data for 2023
dataframe_2023 = pd.DataFrame()

# Columns we want to select
columns_to_select_2019 = ['id', 'comments']

columns_to_select_2023 = ['id', 'comments']

### 2019 ###

# Each month's folder
months_folders = ['data/2019/april', 'data/2019/febrouary', 'data/2019/march']

# Path to the output CSV file
output_csv_path_2019 = 'data_train/train_2019.csv'

print("For 2019:")
dataframe_2019 = data_exploration(dataframe_2019, months_folders, output_csv_path_2019, columns_to_select_2019)

### 2023 ###

# Each month's folder
months_folders = ['data/2023/june', 'data/2023/march', 'data/2023/september']

# Path to the output CSV file
output_csv_path_2023 = 'data_train/train_2023.csv'

print("\nFor 2023:")
dataframe_2023 = data_exploration(dataframe_2023, months_folders, output_csv_path_2023, columns_to_select_2023)

For 2019:
Data from all months has been written to data_train/train_2019.csv
The output CSV file has 2 columns.
Number of NaN values replaced: 0

For 2023:
Data from all months has been written to data_train/train_2023.csv
The output CSV file has 2 columns.
Number of NaN values replaced: 0


In [3]:
# Προεπεξεργασία σχολίων
def preprocess_text(text):
    # Αφαίρεση αριθμών
    text = re.sub(r'\d+', '', text)
    # Αφαίρεση σημείων στίξης και μετατροπή σε μικρούς χαρακτήρες
    text = re.sub(r'[^\w\s]', '', text.lower())
    # Αφαίρεση URL
    text = re.sub(r'http\S+', '', text)
    
    return text

def filter_stopwords(text, stop_words):
    words = text.split()
    filtered_words = []
    for word in words:
        if word.lower() not in stop_words:
            filtered_words.append(word)
    return ' '.join(filtered_words)

def clean_special_tags(text):
    # Αφαίρεση HTML tags
    clean_text = re.sub(r'<[^>]+>', '', text)
    # Αφαίρεση όλων των χαρακτήρων εκτός από διαστήματα και αγγλικούς χαρακτήρες
    clean_text = re.sub(r'[^a-zA-Z\s]', '', clean_text)
    return clean_text

dataframe_2019.dropna(subset=['comments'], inplace=True)
dataframe_2023.dropna(subset=['comments'], inplace=True)

# Download stop words
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

dataframe_2019['comments'] = dataframe_2019['comments'].apply(preprocess_text)
dataframe_2023['comments'] = dataframe_2023['comments'].apply(preprocess_text)

dataframe_2019['comments'] = dataframe_2019['comments'].apply(filter_stopwords, stop_words = stop_words)
dataframe_2023['comments'] = dataframe_2023['comments'].apply(filter_stopwords, stop_words = stop_words)

dataframe_2019['comments'] = dataframe_2019['comments'].apply(clean_special_tags)
dataframe_2023['comments'] = dataframe_2023['comments'].apply(clean_special_tags)

dataframe_2019 = dataframe_2019[dataframe_2019['comments'].str.strip().astype(bool)]
dataframe_2023 = dataframe_2023[dataframe_2023['comments'].str.strip().astype(bool)]

dataframe_2019.to_csv(output_csv_path_2019, index=False)
dataframe_2023.to_csv(output_csv_path_2023, index=False)


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/vaggelis_kalabokis/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:




# Φόρτωση μοντέλου ανάλυσης συναισθήματος
# sentiment_analysis = pipeline('sentiment-analysis', model='siebert/sentiment-roberta-large-english')
from transformers import pipeline, AutoTokenizer

# Load the sentiment analysis model and tokenizer
model_name = 'finiteautomata/bertweet-base-sentiment-analysis'
tokenizer = AutoTokenizer.from_pretrained(model_name)
sentiment_analysis = pipeline('sentiment-analysis', model=model_name, tokenizer=tokenizer, truncation=True)


def analyze_sentiment(data):
    results = []
    for idx, comment in zip(data['id'], data['comments']):
        result = sentiment_analysis(comment)[0]
        sentiment = result['label']
        if sentiment == 'LABEL_0':
            sentiment = 'negative'
        elif sentiment == 'LABEL_1':
            sentiment = 'neutral'
        else:
            sentiment = 'positive'
        results.append({'id': idx, 'review': comment, 'sentiment': sentiment})
    return pd.DataFrame(results)

# Ανάλυση συναισθήματος για τα έτη 2019 και 2023
sample_df_2019 = dataframe_2019.sample(n=1000)
sample_df_2023 = dataframe_2023.sample(n=1000)

sentiment_df_2019 = analyze_sentiment(sample_df_2019[['id', 'comments']])
sentiment_df_2023 = analyze_sentiment(sample_df_2023[['id', 'comments']])

sentiment_df_2019.to_csv("data_sentiment/sentiment_2019.csv", index = False)
sentiment_df_2023.to_csv("data_sentiment/sentiment_2023.csv", index = False)

# Calculate sentiment counts
sentiment_counts_2019 = sentiment_df_2019['sentiment'].value_counts()
sentiment_counts_2023 = sentiment_df_2023['sentiment'].value_counts()

# Print sentiment counts
print("Sentiment counts for 2019:")
print(sentiment_counts_2019)
print("\nSentiment counts for 2023:")
print(sentiment_counts_2023)

emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0


Sentiment counts for 2019:
sentiment
positive    1000
Name: count, dtype: int64

Sentiment counts for 2023:
sentiment
positive    1000
Name: count, dtype: int64


In [9]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
from scipy.special import softmax
import numpy as np


# positive_words = ["great", "awesome", "glad", "happy", "fantastic", "good", "excellent", "amazing"]
# negative_words = ["terrible", "awful", "gross", "dirty", "disappointed", "bad", "poor", "horrible"]



positive_words = ["great", "awesome", "glad", "happy", "fantastic", "good", "excellent", "amazing",
                  "beautiful", "wonderful", "superb", "perfect", "delightful", "joyful", "splendid",
                  "marvelous", "terrific", "brilliant", "phenomenal", "fabulous", "vibrant", "positive",
                  "uplifting", "lovely", "ideal", "satisfying", "outstanding", "magnificent", "stellar",
                  "glorious", "radiant", "blissful", "ecstatic", "content", "thrilled", "charming", "sweet",
                  "heartwarming", "kind", "grateful", "optimistic", "inspiring", "remarkable", "captivating"]

negative_words = ["terrible", "awful", "gross", "dirty", "disappointed", "bad", "poor", "horrible",
                  "unpleasant", "unfortunate", "miserable", "distressing", "inferior", "grim", "bleak",
                  "unacceptable", "dreadful", "lousy", "painful", "sorrowful", "regrettable", "tragic",
                  "depressing", "dismal", "unsatisfactory", "heartbreaking", "unfavorable", "atrocious",
                  "abysmal", "deplorable", "pitiful", "abominable", "gloomy", "disheartening", "dreary",
                  "negative", "displeasing", "repugnant", "appalling", "detestable", "horrifying", "dire",
                  "shameful", "wretched", "unsuitable", "disgusting", "offensive"]


def contains_positive_words(text):
    return any(word in text for word in positive_words)

def contains_negative_words(text):
    return any(word in text for word in negative_words)

# Load model, tokenizer, and config
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

# Preprocess text (username and link placeholders)
def preprocess(text):
    new_text = []
    for t in text.split(" "):
        t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = 'http' if t.startswith('http') else t
        new_text.append(t)
    return " ".join(new_text)

# Function to analyze sentiment
def analyze_sentiment(data):
    results = []
    for idx, comment in zip(data['id'], data['comments']):
        comment = preprocess(comment)
        encoded_input = tokenizer(comment, return_tensors='pt', truncation=True, max_length=128)
        output = model(**encoded_input)
        scores = output[0][0].detach().numpy()
        scores = softmax(scores)
        ranking = np.argsort(scores)[::-1]
        sentiment = config.id2label[ranking[0]]
        results.append({'id': idx, 'review': comment, 'sentiment': sentiment})
    return pd.DataFrame(results)

# Sample data for analysis
# Assuming dataframe_2019 and dataframe_2023 are already defined
sample_df_2019 = dataframe_2019.sample(n=3000)
sample_df_2023 = dataframe_2023.sample(n=3000)



positive_reviews_2019 = sample_df_2019[sample_df_2019['comments'].apply(contains_positive_words)]
negative_reviews_2019 = sample_df_2019[sample_df_2019['comments'].apply(contains_negative_words)]
neutral_reviews_2019 = sample_df_2019[~(sample_df_2019['comments'].apply(contains_positive_words) | sample_df_2019['comments'].apply(contains_negative_words))]


positive_reviews_2023 = sample_df_2023[sample_df_2023['comments'].apply(contains_positive_words)]
negative_reviews_2023 = sample_df_2023[sample_df_2023['comments'].apply(contains_negative_words)]
neutral_reviews_2023 = sample_df_2023[~(sample_df_2023['comments'].apply(contains_positive_words) | sample_df_2023['comments'].apply(contains_negative_words))]

sentiment_df_positive_2019 = analyze_sentiment(positive_reviews_2019[['id', 'comments']])
sentiment_df_negative_2019 = analyze_sentiment(negative_reviews_2019[['id', 'comments']])
sentiment_df_neutral_2019 = analyze_sentiment(neutral_reviews_2019[['id', 'comments']])

sentiment_df_positive_2023 = analyze_sentiment(positive_reviews_2023[['id', 'comments']])
sentiment_df_negative_2023 = analyze_sentiment(negative_reviews_2023[['id', 'comments']])
sentiment_df_neutral_2023 = analyze_sentiment(neutral_reviews_2023[['id', 'comments']])

# # Analyze sentiment for the sampled data
# sentiment_df_2019 = analyze_sentiment(sample_df_2019[['id', 'comments']])
# sentiment_df_2023 = analyze_sentiment(sample_df_2023[['id', 'comments']])
sentiment_df_2019 = pd.concat([sentiment_df_positive_2019, sentiment_df_negative_2019, sentiment_df_neutral_2019])
sentiment_df_2023 = pd.concat([sentiment_df_positive_2023, sentiment_df_negative_2023, sentiment_df_neutral_2023])

sentiment_df_2019.to_csv("data_sentiment/sentiment_2019.csv", index = False)
sentiment_df_2023.to_csv("data_sentiment/sentiment_2023.csv", index = False)

# Calculate sentiment counts
sentiment_counts_2019 = sentiment_df_2019['sentiment'].value_counts()
sentiment_counts_2023 = sentiment_df_2023['sentiment'].value_counts()

# Print sentiment counts
print("Sentiment counts for 2019:")
print(sentiment_counts_2019)
print("\nSentiment counts for 2023:")
print(sentiment_counts_2023)


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Sentiment counts for 2019:
sentiment
positive    2596
neutral      507
negative      42
Name: count, dtype: int64

Sentiment counts for 2023:
sentiment
positive    2648
neutral      466
negative      18
Name: count, dtype: int64
