In [1]:
import numpy as np
import pandas as pd
from scipy.stats import ttest_rel
from sklearn.base import clone
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from tabulate import tabulate

In [2]:
# Load the tweet dataset from the working directory and preview its first rows.
tweets_csv = "tweet_data.csv"
df = pd.read_csv(tweets_csv)
df.head()

Unnamed: 0.1,Unnamed: 0,author_id,author_name,date,emails,hashtags,language,likes,location,mentions,...,n_all_caps,n_excl_marks,n_words,n_urls,n_mentions,n_hashtags,text_semantic,n_nouns,n_verbs,n_adjectives
0,0,@Acutoronto,Acutoronto,"Mar 27, 2022",0.0,"motivation,acutoronto,acupuncture,tcm,fertilit...",en,0,,,...,0,0,22,0,0,19,sunday inspirationalquotes reproductivehealth,19,1,2
1,1,@LorraineZiff,Lorraine Ziff,"Mar 26, 2022",0.0,"lorraineziff,dearlorraine,smile,friendship,lua...",en,5,,larryziff,...,0,1,24,1,1,18,luau ready marinemax sarasota,22,0,2
2,2,@ImpactWellness,Chris Caito,"Mar 27, 2022",0.0,"MuhammadAli,john316,isaiah4031,philippians413,...",en,0,,,...,2,0,24,0,0,11,hated every minute training said quit suffer l...,19,3,1
3,3,@hiromiyoshihair,Hiro Miyoshi,"Mar 27, 2022",0.0,"mothersday,grateful,thankful",en,0,"London, England",,...,1,1,18,0,0,3,mother greateful wishing happiest day hiromiyo...,12,2,3
4,4,@lunaxbrightwin,Liz lvs Joy,"Mar 27, 2022",0.0,"WildsideOutNow,JOY,RedVelvet",en,6,,RVsmtown0,...,14,0,19,0,1,3,joy eres la mujer talentosa que existe red vel...,12,1,3


In [3]:
# Evaluation protocol: stratified 5-fold cross-validation repeated twice,
# seeded so the partitions are reproducible.
n_splits = 5
n_repeats = 2
rskf = RepeatedStratifiedKFold(
    n_splits=n_splits,
    n_repeats=n_repeats,
    random_state=1410,
)

# Classifiers under comparison; their insertion order defines axis 1 of `scores`.
clfs = dict(MNB=MultinomialNB(), SVM=SVC(random_state=1234))

# Accuracy for every (feature method, classifier, fold) combination.
# Axis 0 has one slot for each of the three feature methods evaluated below.
n_folds = n_repeats * n_splits
scores = np.zeros((3, len(clfs), n_folds))

# Metoda 1 -- TF-IDF

In [4]:
# Method 1: TF-IDF features built from the `text_semantic` column.
# str() coerces every entry to text so the vectorizer never receives a
# non-string object. Converting to NumPy arrays makes the fold indices
# produced by rskf.split() strictly positional, independent of the
# DataFrame's index labels.
X = df['text_semantic'].map(str).to_numpy()
y = df['sentiment'].map({'negative' : 0, 'neutral' : 1, 'positive' : 2}).astype(int).to_numpy()

for fold_id, (train_idx, test_idx) in enumerate(rskf.split(X, y)):
    # Fit the vocabulary and IDF weights on the training part only, once per
    # fold, and reuse the resulting matrices for every classifier.
    vectorizer = TfidfVectorizer()
    X_train = vectorizer.fit_transform(X[train_idx])
    X_test = vectorizer.transform(X[test_idx])

    for clf_id, clf_name in enumerate(clfs):
        # clone() gives a fresh, unfitted copy so folds do not share state.
        clf = clone(clfs[clf_name])
        y_pred = clf.fit(X_train, y[train_idx]).predict(X_test)
        scores[0, clf_id, fold_id] = accuracy_score(y[test_idx], y_pred)


# Metoda 2 -- metadane

In [5]:
# Method 2: numeric tweet metadata (counts and engagement figures) as features.
meta_columns = ['n_words', 'n_nouns', 'n_verbs', 'n_adjectives', 'n_all_caps', 'n_excl_marks', 'n_hashtags', 'n_mentions', 'n_urls', 'emails', 'quotes', 'retweets', 'likes']
X = df[meta_columns].astype(int).to_numpy()
y = df['sentiment'].map({'negative' : 0, 'neutral' : 1, 'positive' : 2}).astype(int).to_numpy()

# Position of each classifier along axis 1 of `scores`, derived from `clfs`
# so it stays consistent with the other method cells.
clf_ids = {name: idx for idx, name in enumerate(clfs)}

for fold_id, (train_idx, test_idx) in enumerate(rskf.split(X, y)):
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # MNB is trained directly on the raw integer features.
    clf = clone(clfs['MNB'])
    y_pred = clf.fit(X_train, y_train).predict(X_test)
    scores[1, clf_ids['MNB'], fold_id] = accuracy_score(y_test, y_pred)

    # SVM gets min-max scaled features; the scaler is fit on the training
    # part only and then applied unchanged to the test part.
    clf = clone(clfs['SVM'])
    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    y_pred = clf.fit(X_train_scaled, y_train).predict(scaler.transform(X_test))
    scores[1, clf_ids['SVM'], fold_id] = accuracy_score(y_test, y_pred)

# Metoda 3 -- mieszana

In [6]:
# Method 3: TF-IDF features built from the `text` column.
# NOTE(review): the section header calls this method "mixed", but only the
# TF-IDF representation of `text` is used here; no metadata columns are
# combined with it -- confirm this is the intended setup.
# str() coerces every entry to text so the vectorizer never receives a
# non-string object. Converting to NumPy arrays makes the fold indices
# produced by rskf.split() strictly positional, independent of the
# DataFrame's index labels.
X = df['text'].map(str).to_numpy()
y = df['sentiment'].map({'negative' : 0, 'neutral' : 1, 'positive' : 2}).astype(int).to_numpy()

for fold_id, (train_idx, test_idx) in enumerate(rskf.split(X, y)):
    # Fit the vocabulary and IDF weights on the training part only, once per
    # fold, and reuse the resulting matrices for every classifier.
    vectorizer = TfidfVectorizer()
    X_train = vectorizer.fit_transform(X[train_idx])
    X_test = vectorizer.transform(X[test_idx])

    for clf_id, clf_name in enumerate(clfs):
        # clone() gives a fresh, unfitted copy so folds do not share state.
        clf = clone(clfs[clf_name])
        y_pred = clf.fit(X_train, y[train_idx]).predict(X_test)
        scores[2, clf_id, fold_id] = accuracy_score(y[test_idx], y_pred)

In [7]:
# Print the complete accuracy array, indexed as (method, classifier, fold).
# set_printoptions changes NumPy's global print settings, so the 3-digit
# precision also applies to arrays printed by subsequent cells.
np.set_printoptions(precision=3)
print(scores)

[[[0.675 0.671 0.678 0.673 0.673 0.702 0.641 0.666 0.663 0.682]
  [0.666 0.648 0.669 0.667 0.638 0.685 0.651 0.667 0.646 0.643]]

 [[0.415 0.411 0.482 0.411 0.442 0.422 0.409 0.439 0.432 0.442]
  [0.527 0.516 0.522 0.543 0.528 0.526 0.532 0.55  0.491 0.527]]

 [[0.672 0.681 0.673 0.669 0.673 0.696 0.634 0.669 0.651 0.667]
  [0.677 0.672 0.643 0.658 0.652 0.681 0.641 0.697 0.649 0.657]]]


In [11]:
# Average accuracy over the folds: one row per method, one column per classifier.
print(scores.mean(axis=2))

[[0.672 0.658]
 [0.431 0.526]
 [0.668 0.663]]


# Analiza statystyczna

In [14]:
# Per-fold accuracies (10 folds each) of the four compared methods.
# The MNB and SVM rows repeat scores[0, 0] and scores[0, 1] (TF-IDF method)
# rounded to three decimals. The RNN and CNN rows are entered by hand.
# NOTE(review): presumably the RNN/CNN values come from separate experiments;
# the paired test below assumes they were measured on the same folds, in the
# same order, as MNB/SVM -- confirm.
results = np.array([[0.617, 0.652, 0.596, 0.402, 0.579, 0.626, 0.577, 0.655, 0.554, 0.670],
                    [0.689, 0.682, 0.683, 0.671, 0.681, 0.701, 0.652, 0.680, 0.677, 0.681],
                    [0.675, 0.671, 0.678, 0.673, 0.673, 0.702, 0.641, 0.666, 0.663, 0.682],
                    [0.666, 0.648, 0.669, 0.667, 0.638, 0.685, 0.651, 0.667, 0.646, 0.643]])

methods = ["RNN", "CNN", "MNB", "SVM"]

# Significance level. NOTE(review): defined but not applied anywhere in this cell.
alpha = 0.05

# Pairwise paired t-tests between all methods.
# The array keeps its historical name `w_statistic`, but ttest_rel returns a
# t-statistic, which is what the printed table is labelled as.
# Both matrices start as NaN and the diagonal is left untouched: a method is
# not tested against itself (that test is degenerate and yields NaN anyway).
# NOTE(review): folds from repeated cross-validation share training data, so
# the independence assumption of the paired t-test holds only approximately.
n_methods = len(methods)
w_statistic = np.full((n_methods, n_methods), np.nan)
p_value = np.full((n_methods, n_methods), np.nan)

for i in range(n_methods):
    for j in range(n_methods):
        if i == j:
            continue
        w_statistic[i, j], p_value[i, j] = ttest_rel(results[i], results[j])

# Build the table rows as plain lists (method name followed by its numeric
# row) so the floats are never coerced into a string array before formatting.
headers = list(methods)
w_statistic_table = tabulate(
    [[name, *row] for name, row in zip(methods, w_statistic)],
    headers,
    floatfmt=".3f",
)
p_value_table = tabulate(
    [[name, *row] for name, row in zip(methods, p_value)],
    headers,
    floatfmt=".3f",
)
print("\nt-statistic:\n", w_statistic_table, "\n\np-value:\n", p_value_table)

0 0 nan
-----
0 1 0.0044724039768617475
-----
0 2 0.008779459975274592
-----
0 3 0.02943243708353987
-----
1 0 0.0044724039768617475
-----
1 1 nan
-----
1 2 0.00675103896460789
-----
1 3 0.0010253512203780924
-----
2 0 0.008779459975274592
-----
2 1 0.00675103896460789
-----
2 2 nan
-----
2 3 0.015040670245087963
-----
3 0 0.02943243708353987
-----
3 1 0.0010253512203780924
-----
3 2 0.015040670245087963
-----
3 3 nan
-----

w-statistic:
          RNN      CNN      MNB      SVM
---  -------  -------  -------  -------
RNN  nan       -3.762   -3.331   -2.585
CNN    3.762  nan        3.497    4.763
MNB    3.331   -3.497  nan        2.997
SVM    2.585   -4.763   -2.997  nan 

p-value:
          RNN      CNN      MNB      SVM
---  -------  -------  -------  -------
RNN  nan        0.004    0.009    0.029
CNN    0.004  nan        0.007    0.001
MNB    0.009    0.007  nan        0.015
SVM    0.029    0.001    0.015  nan
