In [1]:
!pip install lbl2vec

Collecting lbl2vec
  Using cached lbl2vec-1.0.2-py3-none-any.whl (24 kB)
Collecting gensim>=4.0.1
  Using cached gensim-4.3.0-cp39-cp39-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (24.0 MB)
Collecting transformers>=4.24.0
  Using cached transformers-4.26.0-py3-none-any.whl (6.3 MB)
Collecting torch>=1.13
  Using cached torch-1.13.1-cp39-cp39-manylinux1_x86_64.whl (887.4 MB)
Collecting syntok>=1.4.4
  Using cached syntok-1.4.4-py3-none-any.whl (24 kB)
Collecting sentence-transformers>=2.2.2
  Using cached sentence-transformers-2.2.2.tar.gz (85 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting ray>=2.1.0
  Using cached ray-2.2.0-cp39-cp39-manylinux2014_x86_64.whl (57.4 MB)
Collecting FuzzyTM>=0.4.0
  Using cached FuzzyTM-2.0.5-py3-none-any.whl (29 kB)
Collecting smart-open>=1.8.1
  Using cached smart_open-6.3.0-py3-none-any.whl (56 kB)
Collecting click>=7.0
  Using cached click-8.1.3-py3-none-any.whl (96 kB)
Collecting aiosignal
  Using cached aiosignal-1.3.1-py3-non

In [14]:
from ipywidgets import FloatProgress

In [2]:
!pip install scikit-learn



In [3]:
import pandas as pd
from sklearn.datasets import fetch_20newsgroups

In [8]:
# load data
train = fetch_20newsgroups(subset='train', shuffle=False)
test = fetch_20newsgroups(subset='test', shuffle=False)

# parse data to pandas DataFrames
newsgroup_test = pd.DataFrame({'article':test.data, 'class_index':test.target})
newsgroup_train = pd.DataFrame({'article':train.data, 'class_index':train.target})

# load labels with keywords
labels = pd.read_csv('./keywords/20newsgroups_keywords.csv',sep=';')


In [16]:
newsgroup_test 

Unnamed: 0,article,class_index,data_set_type
0,From: stimpy@dev-null.phys.psu.edu (Gregory Na...,10,test
1,From: kennejs@a.cs.okstate.edu (KENNEDY JAMES ...,16,test
2,From: perky@acs.bu.edu (Melissa Sherrin)\nSubj...,14,test
3,From: evansmp@uhura.aston.ac.uk (Mark Evans)\n...,18,test
4,From: sxs@extol.Convergent.Com (S. Sridhar)\nS...,5,test
...,...,...,...
7527,From: fennell@well.sf.ca.us (Michael Daniel Fe...,12,test
7528,From: tony@morgan.demon.co.uk (Tony Kidson)\nS...,8,test
7529,From: sadams@eis.calstate.edu (Steven Adams)\n...,4,test
7530,From: jtobias@cs.tamu.edu (Jason T Tobias)\nSu...,6,test


In [9]:
# split keywords by separator and save them as array
labels['keywords'] = labels['keywords'].apply(lambda x: x.split(' '))

# convert description keywords to lowercase
labels['keywords'] = labels['keywords'].apply(lambda description_keywords: [keyword.lower() for keyword in description_keywords])

# get number of keywords for each class
labels['number_of_keywords'] = labels['keywords'].apply(lambda row: len(row))

# lets check our keywords
print(labels)

   class_index          class_name               keywords  number_of_keywords
0            8     rec.motorcycles    [bikes, motorcycle]                   2
1            9  rec.sport.baseball             [baseball]                   1
2           10    rec.sport.hockey               [hockey]                   1
3           11           sci.crypt  [encryption, privacy]                   2


In [10]:
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import strip_tags
from gensim.models.doc2vec import TaggedDocument

# doc: document text string
# returns tokenized document
# strip_tags removes meta tags from the text
# simple preprocess converts a document into a list of lowercase tokens, ignoring tokens that are too short or too long 
# simple preprocess also removes numerical values as well as punktuation characters
def tokenize(doc):
    return simple_preprocess(strip_tags(doc), deacc=True, min_len=2, max_len=15)

# add data set type column
newsgroup_train['data_set_type'] = 'train'
newsgroup_test['data_set_type'] = 'test'

# concat train and test data
newsgroup_full_corpus = pd.concat([newsgroup_train,newsgroup_test]).reset_index(drop=True)

# reduce dataset to only articles that belong to classes where we defined our keywords
newsgroup_full_corpus = newsgroup_full_corpus[newsgroup_full_corpus['class_index'].isin(list(labels['class_index']))]

# tokenize and tag documents for Lbl2Vec training
newsgroup_full_corpus['tagged_docs'] = newsgroup_full_corpus.apply(lambda row: TaggedDocument(tokenize(row['article']), [str(row.name)]), axis=1)

# add doc_key column
newsgroup_full_corpus['doc_key'] = newsgroup_full_corpus.index.astype(str)

# add class_name column
newsgroup_full_corpus = newsgroup_full_corpus.merge(labels, left_on='class_index', right_on='class_index', how='left')

print(newsgroup_full_corpus.head())

                                             article  class_index  \
0  From: cubbie@garnet.berkeley.edu (            ...            9   
1  From: crypt-comments@math.ncsu.edu\nSubject: C...           11   
2  From: george@minster.york.ac.uk\nSubject: Non-...           11   
3  From: williac@govonca.gov.on.ca (Chris William...           10   
4  From: ayari@judikael.loria.fr (Ayari Iskander)...           10   

  data_set_type                                        tagged_docs doc_key  \
0         train  ([from, cubbie, garnet, berkeley, edu, subject...       0   
1         train  ([from, crypt, comments, math, ncsu, edu, subj...       2   
2         train  ([from, george, minster, york, ac, uk, subject...      11   
3         train  ([from, williac, govonca, gov, on, ca, chris, ...      12   
4         train  ([from, ayari, judikael, loria, fr, ayari, isk...      15   

           class_name               keywords  number_of_keywords  
0  rec.sport.baseball             [baseball]     

In [15]:
from lbl2vec import Lbl2Vec

# init model with parameters
Lbl2Vec_model = Lbl2Vec(keywords_list=list(labels.keywords), tagged_documents=newsgroup_full_corpus['tagged_docs'][newsgroup_full_corpus['data_set_type'] == 'train'], label_names=list(labels.class_name), similarity_threshold=0.43, min_num_docs=100, epochs=10)

# train model
Lbl2Vec_model.fit()

2023-01-31 02:32:18,472 - Lbl2Vec - INFO - Train document and word embeddings
2023-01-31 02:32:18,472 - Lbl2Vec - INFO - Train document and word embeddings
2023-01-31 02:32:26,286 - Lbl2Vec - INFO - Train label embeddings
2023-01-31 02:32:26,286 - Lbl2Vec - INFO - Train label embeddings


In [12]:
from sklearn.metrics import f1_score

# predict similarity scores
model_docs_lbl_similarities = Lbl2Vec_model.predict_model_docs()

# merge DataFrames to compare the predicted and true category labels
evaluation_train = model_docs_lbl_similarities.merge(newsgroup_full_corpus[newsgroup_full_corpus['data_set_type'] == 'train'], left_on='doc_key', right_on='doc_key')
y_true_train = evaluation_train['class_name']
y_pred_train = evaluation_train['most_similar_label']

print('F1 score:',f1_score(y_true_train, y_pred_train, average='micro'))

2023-01-31 02:31:24,251 - Lbl2Vec - INFO - Get document embeddings from model
2023-01-31 02:31:24,255 - Lbl2Vec - INFO - Calculate document<->label similarities


F1 score: 0.896234309623431


In [18]:
from lbl2vec import Lbl2TransformerVec

# init model using the default transformer-embedding model ("sentence-transformers/all-MiniLM-L6-v2")
model = Lbl2TransformerVec(keywords_list=["Not Found","Not Specified"], documents=["Nor Found" ,"Not Found","Not done." , "Not found",
"Not found."])

# train model
model.fit()

ValueError: keywords_list has to be an iterable list of lists with descriptive keywords of type str