ENVIRON

In [1]:
import networkx as nx
import pandas as pd
import os
import numpy as np
import seaborn as sns
from tqdm import tqdm, trange
from itertools import combinations

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [2]:
tqdm.pandas()

In [3]:
def flatten(container):
    for i in container:
        if isinstance(i, (list,tuple)):
            for j in flatten(i):
                yield j
        else:
            yield i

In [4]:
for index, model_option in enumerate(os.listdir('./datasets/')):
    # INDEXING
    print(f'[{index}/{len(os.listdir("./datasets/"))}] - {model_option}')
    if model_option == 'original':
        continue

    # DATASET - ROBLOX
    roblox1_df = pd.read_csv(f'./datasets/{model_option}/roblox1.csv', index_col = 0, low_memory = False)
    roblox2_df = pd.read_csv(f'./datasets/{model_option}/roblox2.csv', index_col = 0, low_memory = False)
    roblox3_df = pd.read_csv(f'./datasets/{model_option}/roblox3.csv', index_col = 0, low_memory = False)
    roblox4_df = pd.read_csv(f'./datasets/{model_option}/roblox4.csv', index_col = 0, low_memory = False)
    roblox5_df = pd.read_csv(f'./datasets/{model_option}/roblox5.csv', index_col = 0, low_memory = False)

    df = pd.concat([roblox1_df, roblox2_df, roblox3_df, roblox4_df, roblox5_df]).dropna(subset = ['keybert_keywords']).reset_index(drop = True)

    # PREPROCESS
    tqdm.pandas(desc = '\t(01/10)')
    df['keybert_keywords'] = df['keybert_keywords'].progress_apply(eval)

    tqdm.pandas(desc = '\t(02/10)')
    df['keywords'] = df['keybert_keywords'].progress_apply(lambda x : list(flatten(x))[0::2] if len(x) > 0 else '')

    count_df = df[['keywords', 'userName']].explode('keywords').groupby('keywords').count()

    count_df.columns = ['count']

    # GRAPH FORMULATION - ROBLOX
    G = nx.MultiGraph()
    unique_items = df['keywords'].explode().unique()

    # ADD NODES
    for item in tqdm(unique_items, desc = '\t(03/10)'):
        if count_df.loc[item]['count'] > np.percentile(count_df['count'], 99):
            G.add_node(item, keyword = item, weight = count_df.loc[item]['count'])
    
    selected = list(count_df[count_df['count'] > np.percentile(count_df['count'], 99)].index)
    selected = [word for word in selected if word.isalpha()]

    # ADD EDGES
    for keywords in tqdm(df['keywords'], desc = '\t(04/10)'):
        keyword_selected = [keyword for keyword in keywords if keyword in selected]

        pairs = list(combinations(keyword_selected, 2))
        pairs = [(pair[0], pair[1]) if pair[0] < pair[1] else (pair[1], pair[0]) for pair in pairs]
        
        G.add_edges_from(pairs)
    
    # MULTI to SINGLE - ROBLOX
    H = nx.Graph()
    unique_items = df['keywords'].explode().unique()

    H.add_nodes_from([node for node in G.nodes(data = True) if node[1]['keyword'] in selected])

    for u, v, data in tqdm(G.edges(data = True), desc = '\t(05/10)'):
        w = data['weight'] if 'weight' in data else 1.0
        if H.has_edge(u, v):
            H[u][v]['weight'] += w
        else:
            H.add_edge(u, v, weight=w)

    nx.write_graphml_lxml(H, f'./graph/roblox-{model_option}.graphml')

    # print(list(H.nodes(data = True))[:10])
    # print(list(H.edges(data = True))[:10])

    # DATASET - ZEPETO
    df = pd.read_csv(f'./datasets/{model_option}/zepeto.csv', index_col = 0, low_memory = False)

    # PREPROCESS
    tqdm.pandas(desc = '\t(06/10)')
    df['keybert_keywords'] = df['keybert_keywords'].progress_apply(eval)

    tqdm.pandas(desc = '\t(07/10)')
    df['keywords'] = df['keybert_keywords'].progress_apply(lambda x : list(flatten(x))[0::2] if len(x) > 0 else '')

    count_df = df[['keywords', 'userName']].explode('keywords').groupby('keywords').count()

    count_df.columns = ['count']
    
    # GRAPH FORMULATION - ZEPETO
    G = nx.MultiGraph()
    unique_items = df['keywords'].explode().unique()

    # ADD NODES
    for item in tqdm(unique_items, desc = '\t(08/10)'):
        if count_df.loc[item]['count'] > np.percentile(count_df['count'], 99):
            G.add_node(item, keyword = item, weight = count_df.loc[item]['count'])
    
    selected = list(count_df[count_df['count'] > np.percentile(count_df['count'], 99)].index)
    selected = [word for word in selected if word.isalpha()]

    # ADD EDGES
    for keywords in tqdm(df['keywords'], desc = '\t(09/10)'):
        keyword_selected = [keyword for keyword in keywords if keyword in selected]

        pairs = list(combinations(keyword_selected, 2))
        pairs = [(pair[0], pair[1]) if pair[0] < pair[1] else (pair[1], pair[0]) for pair in pairs]
        
        G.add_edges_from(pairs)
    
    # MULTI to SINGLE - ZEPETO
    H = nx.Graph()
    unique_items = df['keywords'].explode().unique()

    H.add_nodes_from([node for node in G.nodes(data = True) if node[1]['keyword'] in selected])

    for u, v, data in tqdm(G.edges(data = True), desc = '\t(10/10)'):
        w = data['weight'] if 'weight' in data else 1.0
        if H.has_edge(u, v):
            H[u][v]['weight'] += w
        else:
            H.add_edge(u, v, weight=w) 
    nx.write_graphml_lxml(H, f'./graph/zepeto-{model_option}.graphml')

    # print(list(H.nodes(data = True))[:10])
    # print(list(H.edges(data = True))[:10])


[0/18] - keybert-all-MiniLM-L12-v2-post-tokenized


	(01/10): 100%|██████████| 446700/446700 [00:16<00:00, 27672.14it/s]
	(02/10): 100%|██████████| 446700/446700 [00:04<00:00, 110516.94it/s]
	(03/10): 100%|██████████| 71316/71316 [00:58<00:00, 1218.47it/s]
	(04/10): 100%|██████████| 446700/446700 [00:56<00:00, 7842.46it/s] 
	(05/10): 100%|██████████| 10441418/10441418 [00:19<00:00, 528545.64it/s]
	(06/10): 100%|██████████| 115407/115407 [00:02<00:00, 49367.52it/s]
	(07/10): 100%|██████████| 115407/115407 [00:00<00:00, 213945.18it/s]
	(08/10): 100%|██████████| 36755/36755 [00:17<00:00, 2128.16it/s]
	(09/10): 100%|██████████| 115407/115407 [00:07<00:00, 15077.05it/s]
	(10/10): 100%|██████████| 753946/753946 [00:01<00:00, 527498.26it/s]


[1/18] - keybert-all-MiniLM-L12-v2-tokenized


	(01/10): 100%|██████████| 446700/446700 [00:21<00:00, 20615.69it/s]
	(02/10): 100%|██████████| 446700/446700 [00:04<00:00, 107105.06it/s]
	(03/10): 100%|██████████| 53506/53506 [00:43<00:00, 1220.87it/s]
	(04/10): 100%|██████████| 446700/446700 [00:52<00:00, 8494.28it/s] 
	(05/10): 100%|██████████| 11723171/11723171 [00:21<00:00, 535502.56it/s]
	(06/10): 100%|██████████| 115407/115407 [00:04<00:00, 23688.22it/s]
	(07/10): 100%|██████████| 115407/115407 [00:00<00:00, 194218.15it/s]
	(08/10): 100%|██████████| 29936/29936 [00:14<00:00, 2011.30it/s]
	(09/10): 100%|██████████| 115407/115407 [00:03<00:00, 31466.11it/s]
	(10/10): 100%|██████████| 774065/774065 [00:01<00:00, 538653.68it/s]


[2/18] - keybert-all-MiniLM-L12-v2


	(01/10): 100%|██████████| 446700/446700 [00:20<00:00, 22070.85it/s]
	(02/10): 100%|██████████| 446700/446700 [00:04<00:00, 105006.98it/s]
	(03/10): 100%|██████████| 71316/71316 [00:58<00:00, 1212.27it/s]
	(04/10): 100%|██████████| 446700/446700 [00:57<00:00, 7776.86it/s] 
	(05/10): 100%|██████████| 10441418/10441418 [00:19<00:00, 524189.18it/s]
	(06/10): 100%|██████████| 115407/115407 [00:03<00:00, 29717.94it/s]
	(07/10): 100%|██████████| 115407/115407 [00:01<00:00, 66664.30it/s]
	(08/10): 100%|██████████| 36755/36755 [00:17<00:00, 2096.04it/s]
	(09/10): 100%|██████████| 115407/115407 [00:04<00:00, 27166.32it/s]
	(10/10): 100%|██████████| 753946/753946 [00:01<00:00, 528437.30it/s]


[3/18] - keybert-all-distilroberta-v1-post-tokenized


	(01/10): 100%|██████████| 446700/446700 [00:19<00:00, 22573.87it/s]
	(02/10): 100%|██████████| 446700/446700 [00:04<00:00, 105580.26it/s]
	(03/10): 100%|██████████| 72347/72347 [01:12<00:00, 1004.75it/s]
	(04/10): 100%|██████████| 446700/446700 [00:58<00:00, 7667.19it/s] 
	(05/10): 100%|██████████| 10597033/10597033 [00:20<00:00, 523827.93it/s]
	(06/10): 100%|██████████| 115407/115407 [00:02<00:00, 48663.77it/s]
	(07/10): 100%|██████████| 115407/115407 [00:01<00:00, 66375.46it/s]
	(08/10): 100%|██████████| 37217/37217 [00:18<00:00, 2046.98it/s]
	(09/10): 100%|██████████| 115407/115407 [00:04<00:00, 25910.73it/s]
	(10/10): 100%|██████████| 768098/768098 [00:01<00:00, 521483.60it/s]


[4/18] - keybert-all-distilroberta-v1-tokenized


	(01/10): 100%|██████████| 446700/446700 [00:25<00:00, 17359.20it/s]
	(02/10): 100%|██████████| 446700/446700 [00:04<00:00, 105041.27it/s]
	(03/10): 100%|██████████| 53506/53506 [00:45<00:00, 1182.65it/s]
	(04/10): 100%|██████████| 446700/446700 [00:53<00:00, 8331.47it/s] 
	(05/10): 100%|██████████| 11723171/11723171 [00:21<00:00, 533978.67it/s]
	(06/10): 100%|██████████| 115407/115407 [00:03<00:00, 30860.79it/s]
	(07/10): 100%|██████████| 115407/115407 [00:01<00:00, 66332.44it/s]
	(08/10): 100%|██████████| 29936/29936 [00:14<00:00, 2015.19it/s]
	(09/10): 100%|██████████| 115407/115407 [00:03<00:00, 31244.66it/s] 
	(10/10): 100%|██████████| 774065/774065 [00:01<00:00, 539304.25it/s]


[5/18] - keybert-all-distilroberta-v1


	(01/10): 100%|██████████| 446700/446700 [00:19<00:00, 23364.41it/s]
	(02/10): 100%|██████████| 446700/446700 [00:04<00:00, 93909.15it/s] 
	(03/10): 100%|██████████| 72347/72347 [01:13<00:00, 986.71it/s] 
	(04/10): 100%|██████████| 446700/446700 [00:58<00:00, 7692.54it/s] 
	(05/10): 100%|██████████| 10597033/10597033 [00:20<00:00, 514565.92it/s]
	(06/10): 100%|██████████| 115407/115407 [00:03<00:00, 30883.29it/s]
	(07/10): 100%|██████████| 115407/115407 [00:01<00:00, 68172.66it/s]
	(08/10): 100%|██████████| 37217/37217 [00:18<00:00, 2037.52it/s]
	(09/10): 100%|██████████| 115407/115407 [00:04<00:00, 25162.99it/s]
	(10/10): 100%|██████████| 768098/768098 [00:01<00:00, 517447.18it/s]


[6/18] - keybert-all-mpnet-base-v2-post-tokenized


	(01/10): 100%|██████████| 446700/446700 [00:20<00:00, 21722.03it/s]
	(02/10): 100%|██████████| 446700/446700 [00:04<00:00, 100879.15it/s]
	(03/10): 100%|██████████| 69603/69603 [00:39<00:00, 1758.60it/s]
	(04/10): 100%|██████████| 446700/446700 [00:59<00:00, 7553.00it/s] 
	(05/10): 100%|██████████| 10519125/10519125 [00:21<00:00, 496882.75it/s]
	(06/10): 100%|██████████| 115407/115407 [00:02<00:00, 47219.03it/s]
	(07/10): 100%|██████████| 115407/115407 [00:00<00:00, 198261.24it/s]
	(08/10): 100%|██████████| 36424/36424 [00:14<00:00, 2526.11it/s]
	(09/10): 100%|██████████| 115407/115407 [00:04<00:00, 25280.66it/s]
	(10/10): 100%|██████████| 760106/760106 [00:01<00:00, 522075.50it/s]


[7/18] - keybert-all-mpnet-base-v2-tokenized


	(01/10): 100%|██████████| 446700/446700 [00:26<00:00, 16913.15it/s]
	(02/10): 100%|██████████| 446700/446700 [00:04<00:00, 101988.73it/s]
	(03/10): 100%|██████████| 53506/53506 [00:44<00:00, 1195.72it/s]
	(04/10): 100%|██████████| 446700/446700 [00:53<00:00, 8318.07it/s] 
	(05/10): 100%|██████████| 11723171/11723171 [00:22<00:00, 527430.37it/s]
	(06/10): 100%|██████████| 115407/115407 [00:06<00:00, 18989.14it/s]
	(07/10): 100%|██████████| 115407/115407 [00:00<00:00, 191749.65it/s]
	(08/10): 100%|██████████| 29936/29936 [00:15<00:00, 1974.05it/s]
	(09/10): 100%|██████████| 115407/115407 [00:03<00:00, 31452.70it/s]
	(10/10): 100%|██████████| 774065/774065 [00:01<00:00, 541600.44it/s]


[8/18] - keybert-all-mpnet-base-v2


	(01/10): 100%|██████████| 446700/446700 [00:20<00:00, 22293.32it/s]
	(02/10): 100%|██████████| 446700/446700 [00:04<00:00, 102532.70it/s]
	(03/10): 100%|██████████| 69603/69603 [00:38<00:00, 1795.39it/s]
	(04/10): 100%|██████████| 446700/446700 [00:56<00:00, 7843.96it/s] 
	(05/10): 100%|██████████| 10519125/10519125 [00:20<00:00, 524593.62it/s]
	(06/10): 100%|██████████| 115407/115407 [00:02<00:00, 48123.19it/s]
	(07/10): 100%|██████████| 115407/115407 [00:01<00:00, 66327.53it/s]
	(08/10): 100%|██████████| 36424/36424 [00:14<00:00, 2546.72it/s]
	(09/10): 100%|██████████| 115407/115407 [00:04<00:00, 25198.15it/s]
	(10/10): 100%|██████████| 760106/760106 [00:01<00:00, 486570.37it/s]


[9/18] - keybert-distilbert-base-nli-mean-tokens-post-tokenized


	(01/10): 100%|██████████| 446700/446700 [00:20<00:00, 21403.47it/s]
	(02/10): 100%|██████████| 446700/446700 [00:04<00:00, 101567.91it/s]
	(03/10): 100%|██████████| 71975/71975 [01:12<00:00, 993.84it/s] 
	(04/10):  12%|█▏        | 53119/446700 [00:09<01:04, 6055.51it/s]