In [3]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, classification_report
from transformers import pipeline

In [4]:
X_train = pd.read_csv('data/X_train.csv').to_numpy().reshape(-1)
X_test = pd.read_csv('data/X_test.csv', usecols=['text']).to_numpy().reshape(-1)
y_train = pd.read_csv('data/y_train.csv').to_numpy().reshape(-1)
y_test = pd.read_csv('data/y_test.csv', usecols=['label']).to_numpy().reshape(-1)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((200,), (1115,), (200,), (1115,))

In [5]:
# Train baseline tf-idf model
def train_baseline_tfidf(X_train, y_train, X_test, y_test):
    """Trains a tf-idf model and evaluates its performance."""
    tfidf = TfidfVectorizer()
    X_train_tfidf = tfidf.fit_transform(X_train)
    X_test_tfidf = tfidf.transform(X_test)

    model = MultinomialNB()
    model.fit(X_train_tfidf, y_train)
    y_pred = model.predict(X_test_tfidf)

    print("Baseline Model Performance:")
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))

    return tfidf

In [6]:
# Extract keywords using transformer-based model
def extract_keywords(X_train, existing_vocab):
    """Extracts new keywords using a transformer-based model."""
    keyword_extractor = pipeline('ner', model='dbmdz/bert-large-cased-finetuned-conll03-english')
    
    new_keywords = set()
    for text in X_train:
        entities = keyword_extractor(text)
        for entity in entities:
            word = entity['word'].lower()
            if word not in existing_vocab:
                new_keywords.add(word)

    return new_keywords

In [7]:
# Update tf-idf vocabulary
def update_tfidf_vocabulary(tfidf, new_keywords):
    """Updates the tf-idf model vocabulary with new keywords."""
    existing_vocab = set(tfidf.vocabulary_.keys())
    updated_vocab = existing_vocab.union(new_keywords)
    return TfidfVectorizer(vocabulary=updated_vocab)

In [8]:
# Train updated tf-idf model
def train_updated_tfidf(X_train, y_train, X_test, y_test, updated_tfidf):
    """Trains and evaluates the model with the updated tf-idf vocabulary."""
    X_train_tfidf = updated_tfidf.fit_transform(X_train)
    X_test_tfidf = updated_tfidf.transform(X_test)

    model = MultinomialNB()
    model.fit(X_train_tfidf, y_train)
    y_pred = model.predict(X_test_tfidf)

    print("Updated Model Performance:")
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))

In [9]:
tfidf = train_baseline_tfidf(X_train, y_train, X_test, y_test)

Baseline Model Performance:
[[965   0]
 [145   5]]
              precision    recall  f1-score   support

           0       0.87      1.00      0.93       965
           1       1.00      0.03      0.06       150

    accuracy                           0.87      1115
   macro avg       0.93      0.52      0.50      1115
weighted avg       0.89      0.87      0.81      1115



In [10]:
new_keywords = extract_keywords(X_train, tfidf.vocabulary_)

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


In [11]:
updated_tfidf = update_tfidf_vocabulary(tfidf, new_keywords)

In [12]:
train_updated_tfidf(X_train, y_train, X_test, y_test, updated_tfidf)

Updated Model Performance:
[[965   0]
 [146   4]]
              precision    recall  f1-score   support

           0       0.87      1.00      0.93       965
           1       1.00      0.03      0.05       150

    accuracy                           0.87      1115
   macro avg       0.93      0.51      0.49      1115
weighted avg       0.89      0.87      0.81      1115

