In [62]:
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC

In [63]:
# Read the tweet dataset from the working directory and preview its first rows.
data_path = "tweet_data.csv"
df = pd.read_csv(data_path)
df.head()

Unnamed: 0.1,Unnamed: 0,author_id,author_name,date,emails,hashtags,language,likes,location,mentions,...,n_all_caps,n_excl_marks,n_words,n_urls,n_mentions,n_hashtags,text_semantic,n_nouns,n_verbs,n_adjectives
0,0,@Acutoronto,Acutoronto,"Mar 27, 2022",0.0,"motivation,acutoronto,acupuncture,tcm,fertilit...",en,0,,,...,0,0,22,0,0,19,sunday inspirationalquotes reproductivehealth,19,1,2
1,1,@LorraineZiff,Lorraine Ziff,"Mar 26, 2022",0.0,"lorraineziff,dearlorraine,smile,friendship,lua...",en,5,,larryziff,...,0,1,24,1,1,18,luau ready marinemax sarasota,22,0,2
2,2,@ImpactWellness,Chris Caito,"Mar 27, 2022",0.0,"MuhammadAli,john316,isaiah4031,philippians413,...",en,0,,,...,2,0,24,0,0,11,hated every minute training said quit suffer l...,19,3,1
3,3,@hiromiyoshihair,Hiro Miyoshi,"Mar 27, 2022",0.0,"mothersday,grateful,thankful",en,0,"London, England",,...,1,1,18,0,0,3,mother greateful wishing happiest day hiromiyo...,12,2,3
4,4,@lunaxbrightwin,Liz lvs Joy,"Mar 27, 2022",0.0,"WildsideOutNow,JOY,RedVelvet",en,6,,RVsmtown0,...,14,0,19,0,1,3,joy eres la mujer talentosa que existe red vel...,12,1,3


In [64]:
# Cross-validation layout: stratified 5-fold repeated twice, i.e. 10 train/test
# splits per experiment. The fixed seed makes every call to ``rskf.split``
# yield the same sequence of splits.
n_repeats = 2
n_splits = 5
rskf = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=1410)

# Classifiers under comparison, keyed by a short label. The insertion order
# defines the classifier axis of ``scores``.
clfs = {
    "MNB": MultinomialNB(),
    "SVM": SVC(random_state=1234),
}

# Accuracy store with axes (feature-extraction method, classifier, fold).
n_evaluations = n_splits * n_repeats
scores = np.zeros((3, len(clfs), n_evaluations))

# Metoda 1 -- TF-IDF

In [65]:
# Method 1: TF-IDF features built from the `text_semantic` column.
#
# Every entry is coerced to ``str`` (a missing value becomes the literal text
# "nan") and the column is converted to a NumPy array so that the positional
# fold indices returned by ``rskf.split`` select rows by position, not by the
# DataFrame's index labels.
X = df['text_semantic'].apply(str).to_numpy()
y = df['sentiment'].map({'negative' : 0, 'neutral' : 1, 'positive' : 2}).astype(int).to_numpy()

# Folds form the outer loop: the vectorizer is fitted once per fold (on the
# training part only, so no vocabulary/IDF information leaks from the test
# part) and the resulting matrices are shared by all classifiers. Because
# ``rskf`` has a fixed random_state, the folds are the same ones each
# classifier would see if ``split`` were called once per classifier.
for fold_id, (train_idx, test_idx) in enumerate(rskf.split(X, y)):
    vectorizer = TfidfVectorizer()
    X_train = vectorizer.fit_transform(X[train_idx])
    X_test = vectorizer.transform(X[test_idx])

    for clf_id, clf_name in enumerate(clfs):
        # Clone so each fold starts from an unfitted estimator.
        clf = clone(clfs[clf_name])
        y_pred = clf.fit(X_train, y[train_idx]).predict(X_test)
        scores[0, clf_id, fold_id] = accuracy_score(y[test_idx], y_pred)


# Metoda 2 -- metadane

In [66]:
# Method 2: classification from numeric tweet metadata only.
metadata_columns = [
    'n_words', 'n_nouns', 'n_verbs', 'n_adjectives', 'n_all_caps',
    'n_excl_marks', 'n_hashtags', 'n_mentions', 'n_urls', 'emails',
    'quotes', 'retweets', 'likes',
]
X = df[metadata_columns].astype(int).to_numpy()
y = df['sentiment'].map({'negative' : 0, 'neutral' : 1, 'positive' : 2}).astype(int).to_numpy()

# Look the classifier slots up by name instead of hard-coding 0 / 1, so the
# results land in the right row of ``scores`` even if ``clfs`` is reordered.
clf_names = list(clfs)
mnb_id = clf_names.index('MNB')
svm_id = clf_names.index('SVM')

# One pass over the folds evaluates both classifiers on identical splits.
for fold_id, (train_idx, test_idx) in enumerate(rskf.split(X, y)):
    # MNB is trained directly on the unscaled integer features.
    clf = clone(clfs['MNB'])
    y_pred = clf.fit(X[train_idx], y[train_idx]).predict(X[test_idx])
    scores[1, mnb_id, fold_id] = accuracy_score(y[test_idx], y_pred)

    # SVM is trained on min-max scaled features; the scaler is fitted on the
    # training part only and then applied to the test part.
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X[train_idx])
    X_test = scaler.transform(X[test_idx])
    clf = clone(clfs['SVM'])
    y_pred = clf.fit(X_train, y[train_idx]).predict(X_test)
    scores[1, svm_id, fold_id] = accuracy_score(y[test_idx], y_pred)

# Metoda 3 -- mieszana

In [67]:
# Method 3: TF-IDF features built from the `text` column.
#
# NOTE(review): the section heading labels this the "mixed" method, but the
# code below uses only TF-IDF of `text` and never combines it with the
# metadata features -- confirm whether that is the intended experiment.
#
# Every entry is coerced to ``str`` (a missing value becomes the literal text
# "nan") and the column is converted to a NumPy array so that the positional
# fold indices returned by ``rskf.split`` select rows by position, not by the
# DataFrame's index labels.
X = df['text'].apply(str).to_numpy()
y = df['sentiment'].map({'negative' : 0, 'neutral' : 1, 'positive' : 2}).astype(int).to_numpy()

# Folds form the outer loop: the vectorizer is fitted once per fold (on the
# training part only) and the resulting matrices are shared by all
# classifiers. Because ``rskf`` has a fixed random_state, the folds are the
# same ones each classifier would see if ``split`` were called once per
# classifier.
for fold_id, (train_idx, test_idx) in enumerate(rskf.split(X, y)):
    vectorizer = TfidfVectorizer()
    X_train = vectorizer.fit_transform(X[train_idx])
    X_test = vectorizer.transform(X[test_idx])

    for clf_id, clf_name in enumerate(clfs):
        # Clone so each fold starts from an unfitted estimator.
        clf = clone(clfs[clf_name])
        y_pred = clf.fit(X_train, y[train_idx]).predict(X_test)
        scores[2, clf_id, fold_id] = accuracy_score(y[test_idx], y_pred)

In [68]:
# Dump the raw accuracies, indexed as scores[method, classifier, fold]:
#   method     0 = TF-IDF on `text_semantic`, 1 = metadata, 2 = TF-IDF on `text`
#   classifier follows the order of `clfs` (MNB, then SVM)
#   fold       one entry per split produced by `rskf` (n_repeats * n_splits)
print(scores)

[[[0.67503693 0.67060561 0.67751479 0.67307692 0.67307692 0.70162482
   0.64106352 0.66568047 0.66272189 0.68195266]
  [0.6661743  0.64844904 0.66863905 0.66715976 0.63757396 0.68537666
   0.65140325 0.66715976 0.6464497  0.64349112]]

 [[0.41506647 0.41063516 0.48224852 0.4112426  0.44230769 0.42245199
   0.40915805 0.43934911 0.43195266 0.44230769]
  [0.52732644 0.5155096  0.52218935 0.54289941 0.52810651 0.52584934
   0.53175775 0.55029586 0.49112426 0.52662722]]

 [[0.67208272 0.68094535 0.67307692 0.66863905 0.67307692 0.6957164
   0.63367799 0.66863905 0.65088757 0.66715976]
  [0.67651403 0.67208272 0.64349112 0.65828402 0.65236686 0.68094535
   0.64106352 0.69674556 0.64940828 0.65680473]]]
