In [1]:
import pandas as pd
import numpy as np


In [2]:
from sklearn.model_selection import train_test_split

sentiment_df_2019 = pd.read_csv("data_sentiment/sentiment_2019.csv", low_memory=False)
sentiment_df_2023 = pd.read_csv("data_sentiment/sentiment_2023.csv", low_memory=False)

# Υποθέτουμε ότι sentiment_df_2019 και sentiment_df_2023 έχουν συγχωνευθεί σε ένα ενιαίο dataframe `df`
df = pd.concat([sentiment_df_2019, sentiment_df_2023])

# Αποθήκευση του αρχικού πλήθους των γραμμών
initial_row_count = df.shape[0]

# Αφαίρεση των διπλότυπων γραμμών όπου οι γραμμές του 2019 εμφανίζονται στο 2023
df = df.drop_duplicates(subset=['id', 'review'], keep='first')

# Αποθήκευση του τελικού πλήθους των γραμμών
final_row_count = df.shape[0]

# Υπολογισμός του αριθμού των γραμμών που αφαιρέθηκαν
rows_dropped = initial_row_count - final_row_count

# Εκτύπωση του αριθμού των γραμμών που αφαιρέθηκαν
print(f"Number of rows dropped: {rows_dropped}")

# Διαχωρισμός των δεδομένων σε training (80%) και testing (20%)
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Αποθήκευση σε αρχεία tsv
train_df.to_csv('data_tsv/train.tsv', sep='\t', index=False)
test_df.to_csv('data_tsv/test.tsv', sep='\t', index=False)

Number of rows dropped: 281


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
from gensim.models import Word2Vec

# Tf-idf Vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
tfidf_train = tfidf_vectorizer.fit_transform(train_df['review'])
tfidf_test = tfidf_vectorizer.transform(test_df['review'])

# Αποθήκευση των Tf-idf χαρακτηριστικών
with open('data_pkl/tfidf_train.pkl', 'wb') as f:
    pickle.dump(tfidf_train, f)
with open('data_pkl/tfidf_test.pkl', 'wb') as f:
    pickle.dump(tfidf_test, f)


# Word2Vec
tokenized_reviews = [review.split() for review in train_df['review']]
word2vec_model = Word2Vec(tokenized_reviews, vector_size=50, window=5, min_count=1, workers=4)

# Μέθοδος για την εξαγωγή των embeddings από το Word2Vec μοντέλο
def get_word2vec_embeddings(reviews, model):
    embeddings = []
    for review in reviews:
        words = review.split()
        word_vecs = [model.wv[word] for word in words if word in model.wv]
        if word_vecs:
            embeddings.append(np.mean(word_vecs, axis=0))
        else:
            embeddings.append(np.zeros(model.vector_size))
    return np.array(embeddings)

word2vec_train = get_word2vec_embeddings(train_df['review'], word2vec_model)
word2vec_test = get_word2vec_embeddings(test_df['review'], word2vec_model)

# Αποθήκευση των Word2Vec χαρακτηριστικών
with open('data_pkl/word2vec_train.pkl', 'wb') as f:
    pickle.dump(word2vec_train, f)
with open('data_pkl/word2vec_test.pkl', 'wb') as f:
    pickle.dump(word2vec_test, f)

In [4]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import precision_score, recall_score, f1_score

# Φόρτωση των χαρακτηριστικών Tf-idf
with open('data_pkl/tfidf_train.pkl', 'rb') as f:
    tfidf_train = pickle.load(f)

with open('data_pkl/tfidf_test.pkl', 'rb') as f:
    tfidf_test = pickle.load(f)

# Φόρτωση των χαρακτηριστικών Word Embeddings
with open('data_pkl/word2vec_train.pkl', 'rb') as f:
    word2vec_train = pickle.load(f)

with open('data_pkl/word2vec_test.pkl', 'rb') as f:
    word2vec_test = pickle.load(f)

# Φόρτωση των labels
train_df = pd.read_csv('data_tsv/train.tsv', sep='\t')
test_df = pd.read_csv('data_tsv/test.tsv', sep='\t')

y_train = train_df['sentiment']
y_test = test_df['sentiment']


# Λίστα ταξινομητών
classifiers = {
    "SVM": SVC(kernel='linear'),
    "Random Forest": RandomForestClassifier(),
    "KNN": KNeighborsClassifier()
}

def evaluate_classifier(clf, x_train, y_train, x_test, y_test):
    # 10-fold Cross Validation
    scores = cross_val_score(clf, x_train, y_train, cv=10)
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    
    # Assuming y_test and y_pred are your true labels and predictions respectively
    precision = precision_score(y_test, y_pred, average=None)
    recall = recall_score(y_test, y_pred, average=None)
    fscore = f1_score(y_test, y_pred, average=None)

    accuracy = accuracy_score(y_test, y_pred)
    
    return {
        'CV Accuracy': np.mean(scores),
        'Test Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F-Measure': fscore
    }

results = {}

for name, clf in classifiers.items():
    print(f"Evaluating {name} with TFIDF features")
    results[f'{name} with TFIDF'] = evaluate_classifier(clf, tfidf_train, y_train, tfidf_test, y_test)
    
    print(f"Evaluating {name} with Word2Vec features")
    results[f'{name} with Word2Vec'] = evaluate_classifier(clf, word2vec_train, y_train, word2vec_test, y_test)

# Εκτύπωση των αποτελεσμάτων
for key, value in results.items():
    print(f"\nResults for {key}:")
    for metric, score in value.items():
        print(f"{metric}: {score}")



Evaluating SVM with TFIDF features
Evaluating SVM with Word2Vec features


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Evaluating Random Forest with TFIDF features
Evaluating Random Forest with Word2Vec features
Evaluating KNN with TFIDF features


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Evaluating KNN with Word2Vec features

Results for SVM with TFIDF:
CV Accuracy: 0.9572851426583158
Test Accuracy: 0.9675
Precision: [1.         0.93617021 0.97326733]
Recall: [0.14285714 0.9119171  0.98992951]
F-Measure: [0.25       0.92388451 0.98152771]

Results for SVM with Word2Vec:
CV Accuracy: 0.9545755045233124
Test Accuracy: 0.9608333333333333
Precision: [0.         0.94382022 0.96379648]
Recall: [0.         0.87046632 0.99194361]
F-Measure: [0.         0.90566038 0.97766749]

Results for Random Forest with TFIDF:
CV Accuracy: 0.9606193458594292
Test Accuracy: 0.9708333333333333
Precision: [1.         0.94652406 0.97519841]
Recall: [0.35714286 0.91709845 0.98992951]
F-Measure: [0.52631579 0.93157895 0.98250875]

Results for Random Forest with Word2Vec:
CV Accuracy: 0.9618693458594292
Test Accuracy: 0.9675
Precision: [1.         0.93582888 0.97321429]
Recall: [0.35714286 0.90673575 0.98791541]
F-Measure: [0.52631579 0.92105263 0.98050975]

Results for KNN with TFIDF:
CV Accuracy

In [17]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report

target_names = ['positive', 'negative', 'neutral']

# Φόρτωση των χαρακτηριστικών Tf-idf
with open('data_pkl/tfidf_train.pkl', 'rb') as f:
    x_tfidf_train = pickle.load(f)

with open('data_pkl/tfidf_test.pkl', 'rb') as f:
    x_tfidf_test = pickle.load(f)

# Φόρτωση των χαρακτηριστικών Word Embeddings
with open('data_pkl/word2vec_train.pkl', 'rb') as f:
    x_word2vec_train = pickle.load(f)

with open('data_pkl/word2vec_test.pkl', 'rb') as f:
    x_word2vec_test = pickle.load(f)

# Φόρτωση των labels
train_df = pd.read_csv('data_tsv/train.tsv', sep='\t')
test_df = pd.read_csv('data_tsv/test.tsv', sep='\t')

y_train = train_df['sentiment']
y_test = test_df['sentiment']


# Λίστα ταξινομητών
classifiers = {
    "SVM": SVC(),
    "Random Forest": RandomForestClassifier(),
    "KNN": KNeighborsClassifier(n_neighbors=15)
}

def evaluate_classifier(clf, x_train, y_train, x_test, y_test):

    # 10-fold Cross Validation
    scores = cross_val_score(clf, x_train, y_train, cv=10)
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    
    # Assuming y_test and y_pred are your true labels and predictions respectively
    report = classification_report(y_test, y_pred, target_names=target_names, zero_division=np.nan)

    # Τελική αξιολόγηση στο σύνολο ελέγχου
    test_score = clf.score(x_test, y_test)

    return {
        'Results Cross Validation': scores,
        'Mean': np.mean(scores),
        'Final Results': test_score,
        'Classification Report\n': report
    }

results = {}

for name, clf in classifiers.items():
    print(f"Evaluating {name} with TFIDF features")
    results[f'{name} with TFIDF'] = evaluate_classifier(clf, x_tfidf_train, y_train, x_tfidf_test, y_test)
    
    print(f"Evaluating {name} with Word2Vec features")
    results[f'{name} with Word2Vec'] = evaluate_classifier(clf, x_word2vec_train, y_train, x_word2vec_test, y_test)

# Εκτύπωση των αποτελεσμάτων
for key, value in results.items():
    print(f"\nResults for {key}:")
    for metric, score in value.items():
        print(f"{metric}: {score}")

Evaluating SVM with TFIDF features
Evaluating SVM with Word2Vec features
Evaluating Random Forest with TFIDF features
Evaluating Random Forest with Word2Vec features
Evaluating KNN with TFIDF features
Evaluating KNN with Word2Vec features

Results for SVM with TFIDF:
Results Cross Validation: [0.95       0.95208333 0.96666667 0.94583333 0.94791667 0.9625
 0.94791667 0.94791667 0.96875    0.96450939]
Mean: 0.9554092727905358
Final Results: 0.96
Classification Report
:               precision    recall  f1-score   support

    positive       1.00      0.07      0.13        14
    negative       0.95      0.85      0.90       193
     neutral       0.96      0.99      0.98       993

    accuracy                           0.96      1200
   macro avg       0.97      0.64      0.67      1200
weighted avg       0.96      0.96      0.95      1200


Results for SVM with Word2Vec:
Results Cross Validation: [0.94791667 0.94583333 0.95625    0.95       0.94791667 0.9625
 0.95208333 0.94791667 0.9