In [1]:
from keybert import KeyBERT
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
import pandas as pd
import torch
import os
from tqdm import tqdm, trange
from random import randint
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

2023-09-05 04:24:48.917394: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-09-05 04:24:48.970257: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Register tqdm's pandas integration so that Series/DataFrame objects gain
# `progress_apply`, which the keyword-extraction loop below relies on.
tqdm.pandas()

In [3]:
def morphs(text, noun = True, verb = False, adjective = False, adverb = False):
    """Return the tokens of `text` that belong to the selected parts of speech.

    The text is split with `word_tokenize` and tagged with `pos_tag` using the
    'universal' tagset; a token is kept when its tag is one of the enabled
    categories.

    Args:
        text: The string to tokenize and tag.
        noun: Keep tokens tagged 'NOUN'.
        verb: Keep tokens tagged 'VERB'.
        adjective: Keep tokens tagged 'ADJ'.
        adverb: Keep tokens tagged 'ADV'.

    Returns:
        A list of the kept tokens, in the order the tagger produced them.
        The list is empty when every flag is falsy.
    """
    tagged = pos_tag(word_tokenize(text), tagset = 'universal')

    # Pair each switch with the universal tag it enables, then keep the active ones.
    switches = (
        ('NOUN', noun),
        ('VERB', verb),
        ('ADJ', adjective),
        ('ADV', adverb),
    )
    wanted = {tag for tag, enabled in switches if enabled}

    return [word for word, tag in tagged if tag in wanted]

In [4]:
# Embedding models to compare. Each name is passed to KeyBERT(...) by the
# extraction loop and also becomes part of the output directory name
# (./datasets/keybert-<name>/).
# Presumably sentence-transformers checkpoint identifiers — TODO confirm.
models = [
    'all-mpnet-base-v2',
    'multi-qa-mpnet-base-dot-v1',
    'all-distilroberta-v1',
    'all-MiniLM-L12-v2',
    'multi-qa-distilbert-cos-v1'
]

In [None]:
# Keyword extraction: for every embedding model in `models`, add a
# `keybert_keywords` column (top-10 KeyBERT keywords of each row's `content`)
# to every dataset and save the result under ./datasets/keybert-<model_name>/.
dataset_files = [
    'roblox1.csv',
    'roblox2.csv',
    'roblox3.csv',
    'roblox4.csv',
    'roblox5.csv',
    'zepeto.csv',
]

# The input files do not depend on the model, so parse them once up front
# instead of re-reading all six CSVs on every model iteration.
originals = {
    file_name: pd.read_csv(f'./datasets/original/{file_name}', index_col = 0, low_memory = False)
    for file_name in dataset_files
}

for model_name in models:
    # BASIC OPTION
    print(f'TRAINING ON {model_name}')

    model = KeyBERT(model_name)

    # Create the output directory BEFORE the expensive extraction so a path
    # problem surfaces immediately. exist_ok=True is required: creating the
    # 'original' subdirectory already creates its parent, so a second plain
    # makedirs() on the parent (as the previous version did) raised
    # FileExistsError and no CSV was ever written; it also makes re-runs safe.
    output_dir = f'./datasets/keybert-{model_name}'
    os.makedirs(f'{output_dir}/original', exist_ok = True)

    for file_name, original in originals.items():
        keywords = original['content'].progress_apply(lambda x : model.extract_keywords(x, top_n = 10))

        # assign() returns a new frame, leaving the cached originals untouched
        # for the next model. Each file is saved as soon as it is done, so a
        # crash midway keeps the datasets already processed.
        (
            original
            .assign(keybert_keywords = keywords)
            .reset_index(drop = True)
            .to_csv(f'{output_dir}/{file_name}')
        )

In [None]:
import pandas as pd

In [None]:
# Load the roblox1 results saved for the 'distilbert-base-nli-mean-tokens' model.
# NOTE(review): that model is not in the `models` list above, so this file is
# not produced by this notebook's extraction loop — presumably it comes from
# an earlier run; confirm it exists or this cell fails on a fresh checkout.
df = pd.read_csv('./datasets/keybert-distilbert-base-nli-mean-tokens/roblox1.csv', index_col = 0)

In [None]:
# Preview rows 0-4: the text of each row followed by its extracted keywords,
# with a blank line between rows.
for row_label in range(5):
    sample = df.loc[row_label]
    for field in ('content', 'keybert_keywords'):
        print(sample[field])
    print()

In [None]:
# Load the roblox1 results saved for the 'all-mpnet-base-v2' model (the first
# entry of `models`). This rebinds `df`, replacing the frame loaded earlier.
df = pd.read_csv('./datasets/keybert-all-mpnet-base-v2/roblox1.csv', index_col = 0)

In [None]:
# Preview rows 0-4: the text of each row followed by its extracted keywords,
# with a blank line between rows.
for row_label in range(5):
    sample = df.loc[row_label]
    for field in ('content', 'keybert_keywords'):
        print(sample[field])
    print()