In [2]:
import pandas as pd
import numpy as np


In [3]:
from sklearn.model_selection import train_test_split

sentiment_df_2019 = pd.read_csv("data_sentiment/sentiment_2019.csv", low_memory=False)
sentiment_df_2023 = pd.read_csv("data_sentiment/sentiment_2023.csv", low_memory=False)

# Υποθέτουμε ότι sentiment_df_2019 και sentiment_df_2023 έχουν συγχωνευθεί σε ένα ενιαίο dataframe `df`
df = pd.concat([sentiment_df_2019, sentiment_df_2023])

# Αποθήκευση του αρχικού πλήθους των γραμμών
initial_row_count = df.shape[0]

# Αφαίρεση των διπλότυπων γραμμών όπου οι γραμμές του 2019 εμφανίζονται στο 2023
df = df.drop_duplicates(subset=['id', 'review'], keep='first')

# Αποθήκευση του τελικού πλήθους των γραμμών
final_row_count = df.shape[0]

# Υπολογισμός του αριθμού των γραμμών που αφαιρέθηκαν
rows_dropped = initial_row_count - final_row_count

# Εκτύπωση του αριθμού των γραμμών που αφαιρέθηκαν
print(f"Number of rows dropped: {rows_dropped}")

# Διαχωρισμός των δεδομένων σε training (80%) και testing (20%)
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Αποθήκευση σε αρχεία tsv
train_df.to_csv('data_tsv/train.tsv', sep='\t', index=False)
test_df.to_csv('data_tsv/test.tsv', sep='\t', index=False)

Number of rows dropped: 281


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
from gensim.models import Word2Vec

# Tf-idf Vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
tfidf_train = tfidf_vectorizer.fit_transform(train_df['review'])
tfidf_test = tfidf_vectorizer.transform(test_df['review'])

# Αποθήκευση των Tf-idf χαρακτηριστικών
with open('data_pkl/tfidf_train.pkl', 'wb') as f:
    pickle.dump(tfidf_train, f)
with open('data_pkl/tfidf_test.pkl', 'wb') as f:
    pickle.dump(tfidf_test, f)


# Word2Vec
tokenized_reviews = [review.split() for review in train_df['review']]
word2vec_model = Word2Vec(tokenized_reviews, vector_size=50, window=5, min_count=1, workers=4)

# Μέθοδος για την εξαγωγή των embeddings από το Word2Vec μοντέλο
def get_word2vec_embeddings(reviews, model):
    embeddings = []
    for review in reviews:
        words = review.split()
        word_vecs = [model.wv[word] for word in words if word in model.wv]
        if word_vecs:
            embeddings.append(np.mean(word_vecs, axis=0))
        else:
            embeddings.append(np.zeros(model.vector_size))
    return np.array(embeddings)

word2vec_train = get_word2vec_embeddings(train_df['review'], word2vec_model)
word2vec_test = get_word2vec_embeddings(test_df['review'], word2vec_model)

# Αποθήκευση των Word2Vec χαρακτηριστικών
with open('data_pkl/word2vec_train.pkl', 'wb') as f:
    pickle.dump(word2vec_train, f)
with open('data_pkl/word2vec_test.pkl', 'wb') as f:
    pickle.dump(word2vec_test, f)

In [18]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import precision_score, recall_score, f1_score

# Φόρτωση των χαρακτηριστικών Tf-idf
with open('data_pkl/tfidf_train.pkl', 'rb') as f:
    tfidf_train = pickle.load(f)

with open('data_pkl/tfidf_test.pkl', 'rb') as f:
    tfidf_test = pickle.load(f)

# Φόρτωση των χαρακτηριστικών Word Embeddings
with open('data_pkl/word2vec_train.pkl', 'rb') as f:
    word2vec_train = pickle.load(f)

with open('data_pkl/word2vec_test.pkl', 'rb') as f:
    word2vec_test = pickle.load(f)

# Φόρτωση των labels
train_df = pd.read_csv('data_tsv/train.tsv', sep='\t')
test_df = pd.read_csv('data_tsv/test.tsv', sep='\t')

y_train = train_df['sentiment']
y_test = test_df['sentiment']


# Λίστα ταξινομητών
classifiers = {
    "SVM": SVC(kernel='linear'),
    "Random Forest": RandomForestClassifier(),
    "KNN": KNeighborsClassifier()
}

def evaluate_classifier(clf, x_train, y_train, x_test, y_test):
    # 10-fold Cross Validation
    scores = cross_val_score(clf, x_train, y_train, cv=10)
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    
    # Assuming y_test and y_pred are your true labels and predictions respectively
    precision = precision_score(y_test, y_pred, average=None)
    recall = recall_score(y_test, y_pred, average=None)
    fscore = f1_score(y_test, y_pred, average=None)

    accuracy = accuracy_score(y_test, y_pred)
    
    return {
        'CV Accuracy': np.mean(scores),
        'Test Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F-Measure': fscore
    }

results = {}

for name, clf in classifiers.items():
    print(f"Evaluating {name} with TFIDF features")
    results[f'{name} with TFIDF'] = evaluate_classifier(clf, tfidf_train, y_train, tfidf_test, y_test)
    
    print(f"Evaluating {name} with Word2Vec features")
    results[f'{name} with Word2Vec'] = evaluate_classifier(clf, word2vec_train, y_train, word2vec_test, y_test)

# Εκτύπωση των αποτελεσμάτων
for key, value in results.items():
    print(f"\nResults for {key}:")
    for metric, score in value.items():
        print(f"{metric}: {score}")



Evaluating SVM with TFIDF features
Evaluating SVM with Word2Vec features


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Evaluating Random Forest with TFIDF features
Evaluating Random Forest with Word2Vec features
Evaluating KNN with TFIDF features
Evaluating KNN with Word2Vec features

Results for SVM with TFIDF:
CV Accuracy: 0.9572851426583158
Test Accuracy: 0.9675
Precision: [1.         0.93617021 0.97326733]
Recall: [0.14285714 0.9119171  0.98992951]
F-Measure: [0.25       0.92388451 0.98152771]

Results for SVM with Word2Vec:
CV Accuracy: 0.9545755045233124
Test Accuracy: 0.9591666666666666
Precision: [0.         0.93820225 0.962818  ]
Recall: [0.         0.86528497 0.99093656]
F-Measure: [0.         0.90026954 0.97667494]

Results for Random Forest with TFIDF:
CV Accuracy: 0.9595776791927626
Test Accuracy: 0.9666666666666667
Precision: [1.         0.95505618 0.96853491]
Recall: [0.35714286 0.88082902 0.99194361]
F-Measure: [0.52631579 0.91644205 0.9800995 ]

Results for Random Forest with Word2Vec:
CV Accuracy: 0.9608272442588726
Test Accuracy: 0.9716666666666667
Precision: [1.         0.95652174 0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [14]:
#Παραδειγμα με cross_validation

from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris

# Φόρτωση δεδομένων
iris = load_iris()
X, y = iris.data, iris.target

# Διαχωρισμός δεδομένων σε εκπαιδευτικό και σύνολο ελέγχου
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Δημιουργία μοντέλου
model = RandomForestClassifier()

# cross validation
cv_scores = cross_val_score(model, X_train, y_train, cv=10)
print("results cross validation:", cv_scores)
print("mean:", cv_scores.mean())

# Εκπαίδευση του μοντέλου στο πλήρες εκπαιδευτικό σύνολο
model.fit(X_train, y_train)

# Τελική αξιολόγηση στο σύνολο ελέγχου
test_score = model.score(X_test, y_test)
print("final results:", test_score)

results cross validation: [0.91666667 1.         1.         1.         0.66666667 0.83333333
 1.         1.         1.         0.91666667]
mean: 0.9333333333333332
final results: 1.0
