ENVIRON

In [1]:
import networkx as nx
import pandas as pd
import os
import numpy as np
import seaborn as sns
from tqdm import tqdm, trange
from itertools import combinations

In [2]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.tag import pos_tag

In [3]:
# Register tqdm with pandas so that `progress_apply` (used by the keyword
# preprocessing further down) is available and shows a progress bar.
tqdm.pandas()

In [4]:
# Shared WordNet lemmatizer; the preprocessing below calls `lemmatizer.lemmatize`
# on every extracted keyword.
lemmatizer = WordNetLemmatizer()
# English stopwords from NLTK, used below to filter keywords.
stopword_list = stopwords.words('english')

In [5]:
def flatten(container):
    """Lazily yield the leaf items of arbitrarily nested lists/tuples, in order.

    Only ``list`` and ``tuple`` instances are descended into; every other
    item (including strings) is yielded as-is.
    """
    for element in container:
        if not isinstance(element, (list, tuple)):
            yield element
            continue
        # Nested sequence: delegate to a recursive generator.
        yield from flatten(element)

In [6]:
def morphs(text, noun = True, verb = False, adjective = False, adverb = False):
    """Return the tokens of ``text`` whose universal POS tag is among the enabled classes.

    Parameters
    ----------
    text : sequence of tokens passed straight to ``pos_tag``.
    noun, verb, adjective, adverb : flags enabling the 'NOUN', 'VERB',
        'ADJ' and 'ADV' tags respectively.

    Returns
    -------
    list
        The tokens (first element of each tagged pair) whose tag is enabled,
        in tagging order.
    """
    # (universal tag, enabled?) table instead of one `if` per flag.
    requested = (
        ('NOUN', noun),
        ('VERB', verb),
        ('ADJ', adjective),
        ('ADV', adverb),
    )
    filters = [tag for tag, enabled in requested if enabled]

    return [token for token, tag in pos_tag(text, tagset = 'universal') if tag in filters]

In [7]:
# Input/output locations and tunables for the graph export below.
DATASET_DIR = './datasets/'
GRAPH_DIR = './graph/'
SKIPPED_OPTIONS = ('original', 'tfidf')     # sub-folders that are listed but not processed
MIN_KEYWORD_COUNT = 10                      # a keyword becomes a node only if it occurs MORE than this many times
ROBLOX_PARTS = range(1, 6)                  # roblox1.csv ... roblox5.csv


def _extract_keywords(df, drop_stopwords = False):
    """Add a cleaned ``keywords`` list column derived from ``keybert_keywords``.

    Rows whose ``keybert_keywords`` cell is missing are dropped first (the
    index is reset), because the string-to-list step cannot handle NaN.

    Parameters
    ----------
    df : DataFrame with a ``keybert_keywords`` column holding the string
        representation of a nested keyword list.
    drop_stopwords : when True, keywords found in ``stopword_list`` are removed
        as the final step.

    Returns
    -------
    DataFrame
        The filtered frame with ``keybert_keywords`` parsed and a new
        ``keywords`` column (one list of strings per row).
    """
    df = df.dropna(subset = ['keybert_keywords']).reset_index(drop = True)

    # NOTE(review): eval executes whatever expression is stored in the CSV. It is
    # kept to preserve behaviour; consider ast.literal_eval if the files are not trusted.
    df['keybert_keywords'] = df['keybert_keywords'].progress_apply(eval)                                                    # str2list
    # After flattening, every second element (starting at 0) is taken as the keyword.
    df['keywords'] = df['keybert_keywords'].progress_apply(lambda x : list(flatten(x))[0::2] if len(x) > 0 else [])         # select only keywords
    df['keywords'] = df['keywords'].progress_apply(lambda x : [y for y in x if y.isalpha()])                                # keep purely alphabetic keywords
    df['keywords'] = df['keywords'].progress_apply(lambda x : [lemmatizer.lemmatize(y) for y in x])                         # lemmatize
    df['keywords'] = df['keywords'].progress_apply(lambda x : list(set(x)))                                                 # drop duplicates
    df['keywords'] = df['keywords'].progress_apply(lambda x : [y for y in x if len(y) >= 3 and len(y) <= 15])               # 3 <= len(keyword) <= 15
    df['keywords'] = df['keywords'].progress_apply(morphs, noun = True, verb = True, adjective = False, adverb = False)     # keep nouns and verbs

    if drop_stopwords:
        df['keywords'] = df['keywords'].progress_apply(lambda x : [y for y in x if y not in stopword_list])

    return df


def _count_keywords(df, min_count = MIN_KEYWORD_COUNT):
    """Return a Series (indexed by keyword) of occurrence counts strictly greater than ``min_count``."""
    counts = df['keywords'].explode().reset_index(drop = True).reset_index().groupby('keywords').count()['index']
    return counts[counts > min_count]


def _build_multigraph(keyword_lists, counts):
    """Build a MultiGraph with one parallel edge per keyword co-occurrence.

    Nodes are the keywords in ``counts`` (node attribute ``weight`` = count).
    For every keyword list, each unordered pair of retained keywords adds one edge.
    """
    G = nx.MultiGraph()

    # ADD NODES
    for item in tqdm(counts.index):
        G.add_node(item, weight = counts[item])

    # ADD EDGES
    vocabulary = set(counts.index)          # built once; membership is tested for every keyword of every row
    for keywords in tqdm(keyword_lists):
        keyword_selected = [keyword for keyword in keywords if keyword in vocabulary]

        # Store each pair with its endpoints in sorted order.
        pairs = [(a, b) if a < b else (b, a) for a, b in combinations(keyword_selected, 2)]

        G.add_edges_from(pairs)

    return G


def _collapse_multigraph(G, counts):
    """Collapse parallel edges of ``G`` into a simple Graph whose edge ``weight`` is their sum.

    An edge without a ``weight`` attribute contributes 1.0. Nodes are re-created
    from ``counts`` with the same ``weight`` attribute as in ``G``.
    """
    H = nx.Graph()

    # MULTI to SINGLE - ADD NODES
    for item in tqdm(counts.index):
        H.add_node(item, weight = counts[item])

    # MULTI to SINGLE - ADD EDGES
    for u, v, data in tqdm(G.edges(data = True)):
        w = data.get('weight', 1.0)
        if H.has_edge(u, v):
            H[u][v]['weight'] += w
        else:
            H.add_edge(u, v, weight = w)

    return H


# List the dataset folders once, in a deterministic order, ignoring stray files
# (anything that is not a directory cannot contain the expected CSVs).
model_options = sorted(
    name for name in os.listdir(DATASET_DIR)
    if os.path.isdir(os.path.join(DATASET_DIR, name))
)

# Make sure the output folder exists before any expensive work is done.
os.makedirs(GRAPH_DIR, exist_ok = True)

for index, model_option in enumerate(model_options):
    # INDEXING
    print(f'[{index}/{len(model_options)}] - {model_option}')
    if model_option in SKIPPED_OPTIONS:
        continue

    # ------------------------------

    # DATASET - ROBLOX (five CSV parts concatenated)
    df = pd.concat([
        pd.read_csv(f'{DATASET_DIR}{model_option}/roblox{part}.csv', index_col = 0, low_memory = False)
        for part in ROBLOX_PARTS
    ])

    # PREPROCESS
    # NOTE(review): stopwords are removed for Zepeto but not for Roblox in the
    # original notebook; that asymmetry is preserved here — confirm it is intended.
    df = _extract_keywords(df, drop_stopwords = False)

    # GRAPH FORMULATION - ROBLOX
    counts = _count_keywords(df)
    G = _build_multigraph(df['keywords'], counts)
    H = _collapse_multigraph(G, counts)

    # WRITE GRAPH
    nx.write_graphml_lxml(H, f'{GRAPH_DIR}roblox-{model_option}.graphml')

    # ------------------------------

    # DATASET - ZEPETO
    df = pd.read_csv(f'{DATASET_DIR}{model_option}/zepeto.csv', index_col = 0, low_memory = False)

    # PREPROCESS
    df = _extract_keywords(df, drop_stopwords = True)

    # GRAPH FORMULATION - ZEPETO
    counts = _count_keywords(df)
    G = _build_multigraph(df['keywords'], counts)
    H = _collapse_multigraph(G, counts)

    # WRITE GRAPH
    nx.write_graphml_lxml(H, f'{GRAPH_DIR}zepeto-{model_option}.graphml')

[0/8] - tfidf
[1/8] - keybert-all-MiniLM-L12-v2


100%|██████████| 446700/446700 [00:19<00:00, 22877.32it/s]
100%|██████████| 446700/446700 [00:04<00:00, 94529.80it/s] 
100%|██████████| 446700/446700 [00:01<00:00, 284136.30it/s]
100%|██████████| 446700/446700 [00:17<00:00, 26239.80it/s]
100%|██████████| 446700/446700 [00:02<00:00, 209409.78it/s]
100%|██████████| 446700/446700 [00:01<00:00, 345648.04it/s]
100%|██████████| 446700/446700 [04:03<00:00, 1831.80it/s]
100%|██████████| 4810/4810 [00:00<00:00, 184488.93it/s]
100%|██████████| 446700/446700 [00:34<00:00, 12971.67it/s]
100%|██████████| 4810/4810 [00:00<00:00, 158208.60it/s]
100%|██████████| 6806260/6806260 [00:16<00:00, 417388.07it/s]
100%|██████████| 115407/115407 [00:02<00:00, 40042.48it/s]
100%|██████████| 115407/115407 [00:02<00:00, 57413.50it/s]
100%|██████████| 115407/115407 [00:00<00:00, 489599.20it/s]
100%|██████████| 115407/115407 [00:02<00:00, 53854.59it/s]
100%|██████████| 115407/115407 [00:00<00:00, 528396.68it/s]
100%|██████████| 115407/115407 [00:01<00:00, 75946.78i

[2/8] - keybert-all-distilroberta-v1


100%|██████████| 446700/446700 [00:19<00:00, 22706.64it/s]
100%|██████████| 446700/446700 [00:04<00:00, 96836.20it/s] 
100%|██████████| 446700/446700 [00:01<00:00, 253221.10it/s]
100%|██████████| 446700/446700 [00:15<00:00, 28695.04it/s]
100%|██████████| 446700/446700 [00:02<00:00, 181292.99it/s]
100%|██████████| 446700/446700 [00:01<00:00, 340063.39it/s]
100%|██████████| 446700/446700 [04:04<00:00, 1830.12it/s]
100%|██████████| 4729/4729 [00:00<00:00, 183954.22it/s]
100%|██████████| 446700/446700 [00:35<00:00, 12737.50it/s]
100%|██████████| 4729/4729 [00:00<00:00, 172150.74it/s]
100%|██████████| 6725091/6725091 [00:16<00:00, 416749.85it/s]
100%|██████████| 115407/115407 [00:02<00:00, 39786.70it/s]
100%|██████████| 115407/115407 [00:00<00:00, 170358.02it/s]
100%|██████████| 115407/115407 [00:01<00:00, 67912.35it/s]
100%|██████████| 115407/115407 [00:02<00:00, 52290.72it/s]
100%|██████████| 115407/115407 [00:00<00:00, 464851.80it/s]
100%|██████████| 115407/115407 [00:01<00:00, 75208.21i

[3/8] - keybert-all-mpnet-base-v2


100%|██████████| 446700/446700 [00:19<00:00, 22845.81it/s]
100%|██████████| 446700/446700 [00:04<00:00, 98362.79it/s] 
100%|██████████| 446700/446700 [00:01<00:00, 250827.80it/s]
100%|██████████| 446700/446700 [00:15<00:00, 28652.74it/s]
100%|██████████| 446700/446700 [00:01<00:00, 259753.15it/s]
100%|██████████| 446700/446700 [00:02<00:00, 209106.56it/s]
100%|██████████| 446700/446700 [04:00<00:00, 1854.40it/s]
100%|██████████| 4682/4682 [00:00<00:00, 187640.76it/s]
100%|██████████| 446700/446700 [00:34<00:00, 13119.81it/s]
100%|██████████| 4682/4682 [00:00<00:00, 176504.65it/s]
100%|██████████| 6750182/6750182 [00:16<00:00, 419983.01it/s]
100%|██████████| 115407/115407 [00:02<00:00, 39982.43it/s]
100%|██████████| 115407/115407 [00:02<00:00, 52713.16it/s]
100%|██████████| 115407/115407 [00:00<00:00, 471770.90it/s]
100%|██████████| 115407/115407 [00:02<00:00, 53077.47it/s]
100%|██████████| 115407/115407 [00:01<00:00, 75798.66it/s]
100%|██████████| 115407/115407 [00:00<00:00, 404368.41i

[4/8] - keybert-distilbert-base-nli-mean-tokens


100%|██████████| 446700/446700 [00:18<00:00, 23921.67it/s]
100%|██████████| 446700/446700 [00:04<00:00, 97526.31it/s] 
100%|██████████| 446700/446700 [00:01<00:00, 250635.33it/s]
100%|██████████| 446700/446700 [00:16<00:00, 26980.39it/s]
100%|██████████| 446700/446700 [00:00<00:00, 462477.34it/s]
100%|██████████| 446700/446700 [00:01<00:00, 225249.34it/s]
100%|██████████| 446700/446700 [04:03<00:00, 1838.08it/s]
100%|██████████| 4969/4969 [00:00<00:00, 180179.10it/s]
100%|██████████| 446700/446700 [00:34<00:00, 13106.93it/s]
100%|██████████| 4969/4969 [00:00<00:00, 186386.00it/s]
100%|██████████| 6604921/6604921 [00:15<00:00, 418286.40it/s]
100%|██████████| 115407/115407 [00:02<00:00, 39773.24it/s]
100%|██████████| 115407/115407 [00:00<00:00, 173083.68it/s]
100%|██████████| 115407/115407 [00:01<00:00, 67940.97it/s]
100%|██████████| 115407/115407 [00:02<00:00, 54701.52it/s]
100%|██████████| 115407/115407 [00:00<00:00, 511742.97it/s]
100%|██████████| 115407/115407 [00:01<00:00, 72234.20i

[5/8] - keybert-multi-qa-distilbert-cos-v1


100%|██████████| 446700/446700 [00:19<00:00, 22944.20it/s]
100%|██████████| 446700/446700 [00:04<00:00, 97187.59it/s] 
100%|██████████| 446700/446700 [00:01<00:00, 249439.62it/s]
100%|██████████| 446700/446700 [00:15<00:00, 28657.38it/s]
100%|██████████| 446700/446700 [00:01<00:00, 254467.02it/s]
100%|██████████| 446700/446700 [00:02<00:00, 216288.02it/s]
100%|██████████| 446700/446700 [04:00<00:00, 1855.29it/s]
100%|██████████| 4713/4713 [00:00<00:00, 190843.44it/s]
100%|██████████| 446700/446700 [00:31<00:00, 14023.75it/s]
100%|██████████| 4713/4713 [00:00<00:00, 189380.78it/s]
100%|██████████| 6315579/6315579 [00:14<00:00, 423198.66it/s]
100%|██████████| 115407/115407 [00:04<00:00, 27285.03it/s]
100%|██████████| 115407/115407 [00:00<00:00, 173848.22it/s]
100%|██████████| 115407/115407 [00:00<00:00, 456116.29it/s]
100%|██████████| 115407/115407 [00:03<00:00, 33966.85it/s]
100%|██████████| 115407/115407 [00:00<00:00, 509457.14it/s]
100%|██████████| 115407/115407 [00:00<00:00, 431361.6

[6/8] - keybert-multi-qa-mpnet-base-dot-v1


100%|██████████| 446700/446700 [00:22<00:00, 20184.92it/s]
100%|██████████| 446700/446700 [00:04<00:00, 97754.88it/s] 
100%|██████████| 446700/446700 [00:01<00:00, 250396.14it/s]
100%|██████████| 446700/446700 [00:14<00:00, 31324.14it/s]
100%|██████████| 446700/446700 [00:02<00:00, 202065.66it/s]
100%|██████████| 446700/446700 [00:01<00:00, 403232.67it/s]
100%|██████████| 446700/446700 [03:58<00:00, 1871.31it/s]
100%|██████████| 4470/4470 [00:00<00:00, 183302.43it/s]
100%|██████████| 446700/446700 [00:32<00:00, 13733.17it/s]
100%|██████████| 4470/4470 [00:00<00:00, 189337.11it/s]
100%|██████████| 6347736/6347736 [00:14<00:00, 427758.58it/s]
100%|██████████| 115407/115407 [00:02<00:00, 40457.24it/s]
100%|██████████| 115407/115407 [00:00<00:00, 174857.43it/s]
100%|██████████| 115407/115407 [00:00<00:00, 486309.00it/s]
100%|██████████| 115407/115407 [00:03<00:00, 33203.14it/s]
100%|██████████| 115407/115407 [00:00<00:00, 518834.17it/s]
100%|██████████| 115407/115407 [00:00<00:00, 424442.4

[7/8] - original
