In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from collections import defaultdict

from datasets import load_dataset

from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, Normalizer, MinMaxScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.svm import SVC
from sklearn.decomposition import NMF
from sklearn.linear_model import LogisticRegression

from gensim.models import Word2Vec, Doc2Vec

In [2]:
dataset = load_dataset("MrbBakh/Twitter_Sentiment")

In [3]:
X_train, y_train = dataset["train"][:5000].values()
X_test, y_test = dataset["test"][:1000].values()
len(X_train), len(X_test)

(5000, 1000)

## NB


In [4]:
m = make_pipeline(CountVectorizer(), MultinomialNB())
m.fit(X_train, y_train)

pred = m.predict(X_test)
accuracy_score(y_test, pred)

0.732

In [5]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.71      0.80      0.75       507
           1       0.76      0.66      0.71       493

    accuracy                           0.73      1000
   macro avg       0.74      0.73      0.73      1000
weighted avg       0.74      0.73      0.73      1000



In [6]:
confusion_matrix(y_test, pred)

array([[406, 101],
       [167, 326]], dtype=int64)

## NB 2


In [7]:
w2v = Word2Vec(pd.Series(X_train).str.split())
print(w2v.wv.vectors.shape)
w2v_dict = dict(zip(w2v.wv.index_to_key, w2v.wv.vectors))

(1422, 100)


In [10]:
max_len = pd.Series(X_train).str.len().max()


def sentence_2_vec(txt):
    arr = np.zeros((max_len))

    for i, c in enumerate(txt):

        stoi = w2v.wv.key_to_index.get(c)
        arr[i] = w2v.wv.vectors[stoi].mean()
    return arr.reshape(1, -1)


arr = np.zeros((len(X_train), max_len))


for i, ch in enumerate(X_train):
    arr[i] = sentence_2_vec(ch)
arr.shape

(5000, 222)

In [26]:
m = GaussianNB().fit(arr, y_train)
pred = m.predict(arr)
accuracy_score(y_train, pred)

0.5056

In [14]:
class MeanEmbeddingVectorizer(BaseEstimator, TransformerMixin):
    def __init__(self, word2vec, size=100):
        self.word2vec = word2vec
        self.dim = size

    def fit(self, X, y):
        return self

    def transform(self, X):
        return np.array(
            [
                np.mean(
                    [self.word2vec[w] for w in words if w in self.word2vec]
                    or [np.zeros(self.dim)],
                    axis=0,
                )
                for words in X
            ]
        )

In [23]:
pipeline = make_pipeline(
    MeanEmbeddingVectorizer(w2v_dict),
    GaussianNB(),
)

pipeline = pipeline.fit(X_train, y_train)
pred = pipeline.predict(X_test)


accuracy_score(y_test, pred)

0.507

In [16]:
class TfidfEmbeddingVectorizer(BaseEstimator, TransformerMixin):

    def __init__(self, word2vec, size=100):
        self.word2vec = word2vec
        self.word2weight = None
        self.dim = 100

    def fit(self, X, y):
        tfidf = TfidfVectorizer(analyzer=lambda x: x)
        tfidf.fit(X)
        # if a word was never seen - it must be at least as infrequent
        # as any of the known words - so the default idf is the max of
        # known idf's
        max_idf = max(tfidf.idf_)
        self.word2weight = defaultdict(
            lambda: max_idf, [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()]
        )
        return self

    def transform(self, X):
        return np.array(
            [
                np.mean(
                    [
                        self.word2vec[w] * self.word2weight[w]
                        for w in words
                        if w in self.word2vec
                    ]
                    or [np.zeros(self.dim)],
                    axis=0,
                )
                for words in X
            ]
        )

In [17]:
pipeline = make_pipeline(
    TfidfEmbeddingVectorizer(w2v_dict),
    GaussianNB(),
)


pipeline = pipeline.fit(X_train, y_train)
pred = pipeline.predict(X_test)


accuracy_score(y_test, pred)

0.544

## KNN


In [18]:
m = make_pipeline(TfidfVectorizer(), KNeighborsClassifier())
m.fit(X_train, y_train)
pred = m.predict(X_test)
accuracy_score(y_test, pred)

0.674

## SVM


In [19]:
m = make_pipeline(TfidfVectorizer(), SVC())
m.fit(X_train, y_train)
pred = m.predict(X_test)
accuracy_score(y_test, pred)

0.762

## TfidfVectorizer + NB


In [20]:
vectorizer = TfidfVectorizer(max_df=0.5, stop_words="english", use_idf=True)
m = make_pipeline(
    vectorizer,
    NMF(n_components=100, random_state=42),
    Normalizer(copy=False),
    MultinomialNB(alpha=0.01),
)
m = m.fit(X_train, y_train)
pred = m.predict(X_test)
accuracy_score(y_test, pred)

0.671

# Logistic


In [21]:
m = make_pipeline(TfidfVectorizer(), LogisticRegression())
m = m.fit(X_train, y_train)
pred = m.predict(X_test)
accuracy_score(y_test, pred)

0.756

In [22]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline

LR_pipeline = Pipeline(
    steps=[("tf", TfidfVectorizer()), ("lgrg", LogisticRegression())]
)

pgrid_lgrg = {
    "tf__max_features": [1000, 2000, 3000],
    "tf__ngram_range": [(1, 1), (1, 2)],
    "tf__use_idf": [True, False],
    "lgrg__penalty": ["l2", "none"],
    "lgrg__class_weight": ["balanced", None],
}

gs_lgrg = GridSearchCV(LR_pipeline, pgrid_lgrg, cv=2, n_jobs=-1, verbose=2)
gs_lgrg.fit(X_train, y_train)  # Train LR model
pred = gs_lgrg.predict(X_test)
accuracy_score(y_test, pred)

Fitting 2 folds for each of 48 candidates, totalling 96 fits


0.754