In [1]:
import pandas as pd
import numpy as np

In [13]:
from sklearn.model_selection import train_test_split

# Import from CSV sentiment data
sentiment_df_2019 = pd.read_csv("data_sentiment/sentiment_2019.csv", low_memory=False)
sentiment_df_2023 = pd.read_csv("data_sentiment/sentiment_2023.csv", low_memory=False)

# Concatenate sentiment_df_2019 and sentiment_df_2023 in one dataframe
df = pd.concat([sentiment_df_2019, sentiment_df_2023])

# We want to drop lanes that are duplicate
initial_row_count = df.shape[0]                                 # Initial row count
df = df.drop_duplicates(subset=['id', 'review'], keep='first')  # Remove duplicates that are shown in both 2019 and 2023
final_row_count = df.shape[0]                                   # Final row count
rows_dropped = initial_row_count - final_row_count              # Rows dropped
print(f"Number of rows dropped: {rows_dropped}")                # Print number of rows dropped

# Split data: training (80%) and testing (20%)
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Drop some data so we have good analogy between positive, negative and neutral sentiments
positive_comments_train = train_df[train_df['sentiment'] == 'positive'].sample(n=400, random_state=42)
train_df = train_df.drop(positive_comments_train.index)

positive_comments_test = test_df[test_df['sentiment'] == 'positive'].sample(n=0, random_state=42)
test_df = test_df.drop(positive_comments_test.index)

# Print sentiments counts
print("Train value counts:")
display(train_df['sentiment'].value_counts())
print("Test value counts:")
display(test_df['sentiment'].value_counts())


# Αποθήκευση σε αρχεία tsv
train_df.to_csv('data_tsv/train.tsv', index=False)
test_df.to_csv('data_tsv/test.tsv', index=False)

Number of rows dropped: 1
Train value counts:


sentiment
positive    196
neutral     111
negative     94
Name: count, dtype: int64

Test value counts:


sentiment
positive    165
neutral      43
negative     32
Name: count, dtype: int64

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
from gensim.models import Word2Vec

# Tf-idf Vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
tfidf_train = tfidf_vectorizer.fit_transform(train_df['review'])
tfidf_test = tfidf_vectorizer.transform(test_df['review'])

# Save/store Tf-idf characteristics
with open('data_pkl/tfidf_train.pkl', 'wb') as f:
    pickle.dump(tfidf_train, f)
with open('data_pkl/tfidf_test.pkl', 'wb') as f:
    pickle.dump(tfidf_test, f)


# Word2Vec
tokenized_reviews = [review.split() for review in train_df['review']]
word2vec_model = Word2Vec(tokenized_reviews, vector_size=50, window=5, min_count=1, workers=4)

# Extract embeddings from Word2Vec model
def get_word2vec_embeddings(reviews, model):
    embeddings = []                                                         # List to store the embeddings
    for review in reviews:                                                  # Loop through each review in the reviews list
        words = review.split()                                              # Split the review into words
        word_vecs = [model.wv[word] for word in words if word in model.wv]  # Get each word's vector if it's in vocabulary of the model
        if word_vecs:
            embeddings.append(np.mean(word_vecs, axis=0))                   # Compute the mean of the word vectors
        else:                                                               # If there are no word vectors
            embeddings.append(np.zeros(model.vector_size))                  # Append a zero vector
    return np.array(embeddings)                                             # Return list of embeddings


word2vec_train = get_word2vec_embeddings(train_df['review'], word2vec_model)
word2vec_test = get_word2vec_embeddings(test_df['review'], word2vec_model)

# Save/store Word2Vec characteristics
with open('data_pkl/word2vec_train.pkl', 'wb') as f:
    pickle.dump(word2vec_train, f)
with open('data_pkl/word2vec_test.pkl', 'wb') as f:
    pickle.dump(word2vec_test, f)

In [18]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
import numpy as np
import pickle

target_names = ['positive', 'negative', 'neutral']

# Load characteristics - Tf-idf
with open('data_pkl/tfidf_train.pkl', 'rb') as f:
    x_tfidf_train = pickle.load(f)
with open('data_pkl/tfidf_test.pkl', 'rb') as f:
    x_tfidf_test = pickle.load(f)

# Load characteristics - Word Embeddings
with open('data_pkl/word2vec_train.pkl', 'rb') as f:
    x_word2vec_train = pickle.load(f)
with open('data_pkl/word2vec_test.pkl', 'rb') as f:
    x_word2vec_test = pickle.load(f)

# Load labels
train_df = pd.read_csv('data_tsv/train.tsv')
test_df = pd.read_csv('data_tsv/test.tsv')

y_train = train_df['sentiment'] # Column 'sentiment'
y_test = test_df['sentiment']   # Column 'sentiment'

# Print class distribution in training data
print("Class distribution in training data:")
print(y_train.value_counts())

# Standardize the Word2Vec features
scaler = StandardScaler()
x_word2vec_train_scaled = scaler.fit_transform(x_word2vec_train)
x_word2vec_test_scaled = scaler.transform(x_word2vec_test)

# List od classifiers
classifiers = {
    "SVM": SVC(),
    "Random Forest": RandomForestClassifier(),
     "KNN": KNeighborsClassifier(n_neighbors=15)
}

def evaluate_classifier(clf, x_train, y_train, x_test, y_test):
    try:
        # 10-fold Cross Validation
        scores = cross_val_score(clf, x_train, y_train, cv=10)
        clf.fit(x_train, y_train)
        y_pred = clf.predict(x_test)

        # Assuming y_test and y_pred are your true labels and predictions respectively
        report = classification_report(y_test, y_pred, target_names=target_names, zero_division=0)

        # Check class distribution in predictions
        unique, counts = np.unique(y_pred, return_counts=True)
        class_counts = dict(zip(unique, counts))

        # Final score
        test_score = clf.score(x_test, y_test)

        # Return all values wanted and classification report
        return {
            'Results Cross Validation': scores,
            'Mean': np.mean(scores),
            'Final Results': test_score,
            'Class Counts in Predictions': class_counts,
            'Classification Report\n': report
        }
    except Exception as e:
        print(f"An error occurred during the evaluation of {clf.__class__.__name__}: {e}")
        return None

results = {}
for name, clf in classifiers.items():
    # Evaluate with tfidf
    print(f"Evaluating {name} with TFIDF features...")
    tfidf_results = evaluate_classifier(clf, x_tfidf_train, y_train, x_tfidf_test, y_test)
    if tfidf_results:
        results[f'{name} with TFIDF'] = tfidf_results
    print("Done.")

    # Evaluate with w2v
    print(f"Evaluating {name} with Word2Vec features...")
    word2vec_results = evaluate_classifier(clf, x_word2vec_train_scaled, y_train, x_word2vec_test_scaled, y_test)
    if word2vec_results:
        results[f'{name} with standardized Word2Vec'] = word2vec_results
    print("Done.")

# Print results
for key, value in results.items():
    print(f"\nResults for {key}:")
    for metric, score in value.items():
        print(f"{metric}: {score}")



Class distribution in training data:
sentiment
positive    196
neutral     111
negative     94
Name: count, dtype: int64
Evaluating SVM with TFIDF features...
Done.
Evaluating SVM with Word2Vec features...
Done.
Evaluating Random Forest with TFIDF features...
Done.
Evaluating Random Forest with Word2Vec features...
Done.
Evaluating KNN with TFIDF features...
Done.
Evaluating KNN with Word2Vec features...
Done.

Results for SVM with TFIDF:
Results Cross Validation: [0.73170732 0.775      0.875      0.825      0.7        0.875
 0.775      0.775      0.75       0.875     ]
Mean: 0.7956707317073171
Final Results: 0.8791666666666667
Class Counts in Predictions: {'negative': 22, 'neutral': 25, 'positive': 193}
Classification Report
:               precision    recall  f1-score   support

    positive       1.00      0.69      0.81        32
    negative       0.96      0.56      0.71        43
     neutral       0.85      1.00      0.92       165

    accuracy                           0.88 