In [22]:
import pandas as pd
import numpy as np

In [31]:
from sklearn.model_selection import train_test_split

sentiment_df_2019 = pd.read_csv("data_sentiment/sentiment_2019.csv", low_memory=False)
sentiment_df_2023 = pd.read_csv("data_sentiment/sentiment_2023.csv", low_memory=False)

# Υποθέτουμε ότι sentiment_df_2019 και sentiment_df_2023 έχουν συγχωνευθεί σε ένα ενιαίο dataframe `df`
df = pd.concat([sentiment_df_2019, sentiment_df_2023])

# Αποθήκευση του αρχικού πλήθους των γραμμών
initial_row_count = df.shape[0]

# Αφαίρεση των διπλότυπων γραμμών όπου οι γραμμές του 2019 εμφανίζονται στο 2023
df = df.drop_duplicates(subset=['id', 'review'], keep='first')

# Αποθήκευση του τελικού πλήθους των γραμμών
final_row_count = df.shape[0]

# Υπολογισμός του αριθμού των γραμμών που αφαιρέθηκαν
rows_dropped = initial_row_count - final_row_count

# Εκτύπωση του αριθμού των γραμμών που αφαιρέθηκαν
print(f"Number of rows dropped: {rows_dropped}")

# Διαχωρισμός των δεδομένων σε training (80%) και testing (20%)
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Αποθήκευση σε αρχεία tsv
train_df.to_csv('data_tsv/train.tsv', index=False)
test_df.to_csv('data_tsv/test.tsv', index=False)

Number of rows dropped: 3


In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
from gensim.models import Word2Vec

# Tf-idf Vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
tfidf_train = tfidf_vectorizer.fit_transform(train_df['review'])
tfidf_test = tfidf_vectorizer.transform(test_df['review'])

# Αποθήκευση των Tf-idf χαρακτηριστικών
with open('data_pkl/tfidf_train.pkl', 'wb') as f:
    pickle.dump(tfidf_train, f)
with open('data_pkl/tfidf_test.pkl', 'wb') as f:
    pickle.dump(tfidf_test, f)


# Word2Vec
tokenized_reviews = [review.split() for review in train_df['review']]
word2vec_model = Word2Vec(tokenized_reviews, vector_size=50, window=5, min_count=1, workers=4)

# Μέθοδος για την εξαγωγή των embeddings από το Word2Vec μοντέλο
def get_word2vec_embeddings(reviews, model):
    embeddings = []
    for review in reviews:
        words = review.split()
        word_vecs = [model.wv[word] for word in words if word in model.wv]
        if word_vecs:
            embeddings.append(np.mean(word_vecs, axis=0))
        else:
            embeddings.append(np.zeros(model.vector_size))
    return np.array(embeddings)

word2vec_train = get_word2vec_embeddings(train_df['review'], word2vec_model)
word2vec_test = get_word2vec_embeddings(test_df['review'], word2vec_model)

# Αποθήκευση των Word2Vec χαρακτηριστικών
with open('data_pkl/word2vec_train.pkl', 'wb') as f:
    pickle.dump(word2vec_train, f)
with open('data_pkl/word2vec_test.pkl', 'wb') as f:
    pickle.dump(word2vec_test, f)

In [33]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report

target_names = ['positive', 'negative', 'neutral']

# Φόρτωση των χαρακτηριστικών Tf-idf
with open('data_pkl/tfidf_train.pkl', 'rb') as f:
    x_tfidf_train = pickle.load(f)

with open('data_pkl/tfidf_test.pkl', 'rb') as f:
    x_tfidf_test = pickle.load(f)

# Φόρτωση των χαρακτηριστικών Word Embeddings
with open('data_pkl/word2vec_train.pkl', 'rb') as f:
    x_word2vec_train = pickle.load(f)

with open('data_pkl/word2vec_test.pkl', 'rb') as f:
    x_word2vec_test = pickle.load(f)

# Φόρτωση των labels
train_df = pd.read_csv('data_tsv/train.tsv')
test_df = pd.read_csv('data_tsv/test.tsv')

y_train = train_df['sentiment']
y_test = test_df['sentiment']


# Λίστα ταξινομητών
classifiers = {
    "SVM": SVC(),
    "Random Forest": RandomForestClassifier(),
    "KNN": KNeighborsClassifier(n_neighbors=15)
}


def evaluate_classifier(clf, x_train, y_train, x_test, y_test):

    # 10-fold Cross Validation
    scores = cross_val_score(clf, x_train, y_train, cv=10)
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    
    # Assuming y_test and y_pred are your true labels and predictions respectively
    report = classification_report(y_test, y_pred, target_names=target_names, zero_division=0)

    # Check class distribution in predictions
    unique, counts = np.unique(y_pred, return_counts=True)
    class_counts = dict(zip(unique, counts))

    # Τελική αξιολόγηση στο σύνολο ελέγχου
    test_score = clf.score(x_test, y_test)

    return {
        'Results Cross Validation': scores,
        'Mean': np.mean(scores),
        'Final Results': test_score,
        'Class Counts in Predictions': class_counts,
        'Classification Report\n': report
    }

results = {}

for name, clf in classifiers.items():
    print(f"Evaluating {name} with TFIDF features...")
    results[f'{name} with TFIDF'] = evaluate_classifier(clf, x_tfidf_train, y_train, x_tfidf_test, y_test)
    print("Done.")

    print(f"Evaluating {name} with Word2Vec features...")
    results[f'{name} with Word2Vec'] = evaluate_classifier(clf, x_word2vec_train, y_train, x_word2vec_test, y_test)
    print("Done.")

# Εκτύπωση των αποτελεσμάτων
for key, value in results.items():
    print(f"\nResults for {key}:")
    for metric, score in value.items():
        print(f"{metric}: {score}")

Evaluating SVM with TFIDF features...
Done.
Evaluating SVM with Word2Vec features...
Done.
Evaluating Random Forest with TFIDF features...
Done.
Evaluating Random Forest with Word2Vec features...
Done.
Evaluating KNN with TFIDF features...
Done.
Evaluating KNN with Word2Vec features...
Done.

Results for SVM with TFIDF:
Results Cross Validation: [0.93125    0.90625    0.93125    0.91875    0.95       0.90625
 0.90625    0.93081761 0.91194969 0.91823899]
Mean: 0.9211006289308175
Final Results: 0.9425
Class Counts in Predictions: {'negative': 22, 'neutral': 26, 'positive': 352}
Classification Report
:               precision    recall  f1-score   support

    positive       1.00      0.81      0.90        27
    negative       1.00      0.59      0.74        44
     neutral       0.93      1.00      0.97       329

    accuracy                           0.94       400
   macro avg       0.98      0.80      0.87       400
weighted avg       0.95      0.94      0.94       400


Results for