In [None]:
from keybert import KeyBERT
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords
import pandas as pd
import torch
import os
from tqdm import tqdm, trange
from random import randint
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [None]:
# Shared WordNet lemmatizer instance; created once so it is not rebuilt for every row.
lemmatizer = WordNetLemmatizer()

# Hook tqdm into pandas so `Series.progress_apply` is available with a progress bar.
tqdm.pandas()

In [None]:
# Cache for the stopword set so the NLTK corpus is read once, not once per token.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stopword list as a frozenset, loading it on first use."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def preprocess(text, filter = ('NN', 'NNS', 'NNP', 'NNPS')):
    """Tokenize `text` and keep only the lemmatized words whose POS tag is in `filter`.

    Steps, in order: tokenize, lowercase + lemmatize, drop English stopwords and
    tokens that are not purely alphabetic, then POS-tag what is left and keep the
    words whose tag is listed in `filter`.

    Parameters
    ----------
    text : str
        Raw document. Any non-string value (e.g. the float NaN pandas produces
        for an empty CSV cell) is treated as an empty document.
    filter : collection of str
        Penn Treebank tags to keep. Defaults to the noun tags. Other common groups:
            NOUN: 'NN', 'NNS', 'NNP', 'NNPS'
            VERB: 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'
            ADJS: 'JJ', 'JJR', 'JJS'
            ADVS: 'RB', 'RBR', 'RBS'

    Returns
    -------
    list of str
        The surviving tokens, in their original order (empty list if nothing survives).
    """
    # Missing values arrive as non-strings; word_tokenize would raise on them.
    if not isinstance(text, str):
        return []

    stop_words = _english_stopwords()

    # Tokenization & lemmatization (on the lowercased token).
    tokens = [lemmatizer.lemmatize(word.lower()) for word in word_tokenize(text)]

    # Drop stopwords and anything containing non-letter characters (digits, punctuation, ...).
    tokens = [word for word in tokens if word not in stop_words and word.isalpha()]

    # NOTE(review): tagging runs on the lowercased, stopword-stripped tokens, so the
    # tags may differ from what tagging the raw sentence would give.
    return [word for word, tag in pos_tag(tokens) if tag in filter]

In [None]:
# Embedding model names; each one is handed to `KeyBERT(...)` in turn by the
# extraction loop, and also becomes part of that run's output directory name.
models = [
    'all-mpnet-base-v2',
    'multi-qa-mpnet-base-dot-v1',
    'all-distilroberta-v1',
    'all-MiniLM-L12-v2',
    'multi-qa-distilbert-cos-v1',
]

In [None]:
# Directory holding the input CSVs and the files inside it to process.
SOURCE_DIR = './datasets/original'
DATASET_FILES = [
    'roblox1.csv',
    'roblox2.csv',
    'roblox3.csv',
    'roblox4.csv',
    'roblox5.csv',
    'zepeto.csv',
]

# Loading and preprocessing do not depend on the embedding model, so do both once
# up front instead of repeating them for every model.
datasets = {}
for file_name in DATASET_FILES:
    frame = pd.read_csv(f'{SOURCE_DIR}/{file_name}', index_col = 0, low_memory = False)

    # KeyBERT treats a list as *many documents*; join the filtered tokens back into a
    # single string so each review is scored as one document with up to `top_n` keywords.
    documents = frame['content'].progress_apply(lambda text: ' '.join(preprocess(text)))

    datasets[file_name] = (frame, documents)

for model_name in models:
    print(f'TRAINING ON {model_name}')

    # Create the output directory first (and tolerate re-runs) so a pre-existing
    # directory cannot throw away a finished extraction pass.
    output_dir = f'./datasets/keybert-{model_name}-tokenized'
    os.makedirs(output_dir, exist_ok = True)

    model = KeyBERT(model_name)

    for file_name, (frame, documents) in datasets.items():
        keywords = documents.progress_apply(lambda doc: model.extract_keywords(doc, top_n = 10))

        # `assign` leaves the cached frame untouched, so every model starts from the
        # same input; each file is written as soon as it is done.
        (
            frame
            .assign(keybert_keywords = keywords)
            .reset_index(drop = True)
            .to_csv(f'{output_dir}/{file_name}')
        )