ENVIRON

In [1]:
import networkx as nx
import pandas as pd
import os
import numpy as np
import seaborn as sns
from tqdm import tqdm, trange
from itertools import combinations

In [2]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.tag import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
tqdm.pandas()

In [4]:
lemmatizer = WordNetLemmatizer()
stopword_list = stopwords.words('english')

In [5]:
vectorizer = TfidfVectorizer()

In [6]:
def flatten(container):
    for i in container:
        if isinstance(i, (list,tuple)):
            for j in flatten(i):
                yield j
        else:
            yield i

In [7]:
def morphs(text, noun = True, verb = False, adjective = False, adverb = False):
    poses = pos_tag(text, tagset = 'universal')
    filters = []

    if noun:
        filters.append('NOUN')
    if verb:
        filters.append('VERB')
    if adjective:
        filters.append('ADJ')
    if adverb:
        filters.append('ADV')

    return [pos[0] for pos in poses if pos[1] in filters]

COUNT OVER 10

In [8]:
# DATASET - ROBLOX
roblox1_df = pd.read_csv(f'./datasets/tfidf/roblox1.csv', index_col = 0, low_memory = False)
roblox2_df = pd.read_csv(f'./datasets/tfidf/roblox2.csv', index_col = 0, low_memory = False)
roblox3_df = pd.read_csv(f'./datasets/tfidf/roblox3.csv', index_col = 0, low_memory = False)
roblox4_df = pd.read_csv(f'./datasets/tfidf/roblox4.csv', index_col = 0, low_memory = False)
roblox5_df = pd.read_csv(f'./datasets/tfidf/roblox5.csv', index_col = 0, low_memory = False)

df = pd.concat([roblox1_df, roblox2_df, roblox3_df, roblox4_df, roblox5_df]).dropna(subset = ['keywords']).reset_index(drop = True)

# PREPROCESS
df['keywords'] = df['keywords'].progress_apply(lambda x : x.split())
df['keywords'] = df['keywords'].progress_apply(lambda x : [y for y in x if y.isalpha()])                                # exclude if numeric
df['keywords'] = df['keywords'].progress_apply(lambda x : [lemmatizer.lemmatize(y) for y in x])                         # lemmatize
df['keywords'] = df['keywords'].progress_apply(lambda x : list(set(x)))                                                 # drop duplicate
df['keywords'] = df['keywords'].progress_apply(lambda x : [y for y in x if len(y) >= 3 and len(y) <= 15])               # 3 <= len(keyword) <= 15
df['keywords'] = df['keywords'].progress_apply(morphs, noun = True, verb = True, adjective = False, adverb = False)     # select noun

# OPTIONAL: TF-IDF FILTERING
documents = df['keywords'].progress_apply(lambda x : ' '.join(x))
tfidf_matrix = vectorizer.fit_transform(documents)
words = vectorizer.get_feature_names_out()
filter_num = 200

tfidf_dict = {}
for doc_idx, doc in enumerate(tqdm(documents)):
    feature_idx = tfidf_matrix[doc_idx].nonzero()[1]
    tfidf_scores = zip(feature_idx, [tfidf_matrix[doc_idx, x] for x in feature_idx])
    for word_idx, score in tfidf_scores:
        word = words[word_idx]
        tfidf_dict[word] = score

tfidf_dict = {key: value for value, key in sorted([(score, word) for word, score in tfidf_dict.items()], reverse = True)}

# GRAPH FORMULATION - ROBLOX
G = nx.MultiGraph()
counts = df['keywords'].explode().reset_index(drop = True).reset_index().groupby('keywords').count()['index']
counts = counts[counts > 10]

# ADD NODES
for item in tqdm(counts.index):
    G.add_node(item, weight = counts[item])

# ADD EDGES
for keywords in tqdm(df['keywords']):
    keyword_selected = [keyword for keyword in keywords if keyword in counts.index]

    pairs = list(combinations(keyword_selected, 2))
    pairs = [(pair[0], pair[1]) if pair[0] < pair[1] else (pair[1], pair[0]) for pair in pairs]
    
    G.add_edges_from(pairs)

# MULTI to SINGLE - ROBLOX
H = nx.Graph()

# MULTI to SINGLE - ADD NODES
for item in tqdm(counts.index):
    H.add_node(item, weight = counts[item])

# MULTI to SINGLE - ADD EDGES
for u, v, data in tqdm(G.edges(data = True)):
    w = data['weight'] if 'weight' in data else 1.0
    if H.has_edge(u, v):
        H[u][v]['weight'] += w
    else:
        H.add_edge(u, v, weight=w)

# WRITE GRAPH
nx.write_graphml_lxml(H, f'./graph/roblox-tfidf10.graphml')

100%|██████████| 445931/445931 [00:01<00:00, 303994.14it/s]
100%|██████████| 445931/445931 [00:01<00:00, 264014.70it/s]
100%|██████████| 445931/445931 [00:16<00:00, 27134.55it/s]
100%|██████████| 445931/445931 [00:01<00:00, 342368.39it/s]
100%|██████████| 445931/445931 [00:01<00:00, 282027.95it/s]
100%|██████████| 445931/445931 [03:47<00:00, 1960.05it/s]
100%|██████████| 445931/445931 [00:00<00:00, 956335.40it/s]
100%|██████████| 445931/445931 [02:16<00:00, 3259.43it/s]
100%|██████████| 8300/8300 [00:00<00:00, 214816.44it/s]
100%|██████████| 445931/445931 [01:25<00:00, 5194.06it/s] 
100%|██████████| 8300/8300 [00:00<00:00, 224266.88it/s]
100%|██████████| 19817863/19817863 [00:40<00:00, 485862.03it/s]


In [9]:
# DATASET - ZEPETO
df = pd.read_csv(f'./datasets/tfidf/zepeto.csv', index_col = 0, low_memory = False)

# PREPROCESS
df = df.dropna(subset = ['keywords'])
df['keywords'] = df['keywords'].progress_apply(lambda x : x.split())
df['keywords'] = df['keywords'].progress_apply(lambda x : [y for y in x if y.isalpha()])                                # exclude if numeric
df['keywords'] = df['keywords'].progress_apply(lambda x : [lemmatizer.lemmatize(y) for y in x])                         # lemmatize
df['keywords'] = df['keywords'].progress_apply(lambda x : list(set(x)))                                                 # drop duplicate
df['keywords'] = df['keywords'].progress_apply(lambda x : [y for y in x if len(y) >= 3 and len(y) <= 15])               # 3 <= len(keyword) <= 15
df['keywords'] = df['keywords'].progress_apply(morphs, noun = True, verb = True, adjective = False, adverb = False)     # select noun

# OPTIONAL: TF-IDF FILTERING
documents = df['keywords'].progress_apply(lambda x : ' '.join(x))
tfidf_matrix = vectorizer.fit_transform(documents)
words = vectorizer.get_feature_names_out()
filter_num = 200

tfidf_dict = {}
for doc_idx, doc in enumerate(tqdm(documents)):
    feature_idx = tfidf_matrix[doc_idx].nonzero()[1]
    tfidf_scores = zip(feature_idx, [tfidf_matrix[doc_idx, x] for x in feature_idx])
    for word_idx, score in tfidf_scores:
        word = words[word_idx]
        tfidf_dict[word] = score

tfidf_dict = {key: value for value, key in sorted([(score, word) for word, score in tfidf_dict.items()], reverse = True)}

# GRAPH FORMULATION - ZEPETO
G = nx.MultiGraph()
counts = df['keywords'].explode().reset_index(drop = True).reset_index().groupby('keywords').count()['index']
counts = counts[counts > 10]

# ADD NODES
for item in tqdm(counts.index):
    G.add_node(item, weight = counts[item])
    
# ADD EDGES
for keywords in tqdm(df['keywords']):
    keyword_selected = [keyword for keyword in keywords if keyword in counts.index]

    pairs = list(combinations(keyword_selected, 2))
    pairs = [(pair[0], pair[1]) if pair[0] < pair[1] else (pair[1], pair[0]) for pair in pairs]
    
    G.add_edges_from(pairs)

# MULTI to SINGLE - ZEPETO
H = nx.Graph()

# MULTI to SINGLE - ADD NODES
for item in tqdm(counts.index):
    H.add_node(item, weight = counts[item])

# MULTI to SINGLE - ADD EDGES
for u, v, data in tqdm(G.edges(data = True)):
    w = data['weight'] if 'weight' in data else 1.0
    if H.has_edge(u, v):
        H[u][v]['weight'] += w
    else:
        H.add_edge(u, v, weight=w)

# WRITE GRAPH
nx.write_graphml_lxml(H, f'./graph/zepeto-tfidf10.graphml')

100%|██████████| 99015/99015 [00:00<00:00, 674031.33it/s]
100%|██████████| 99015/99015 [00:00<00:00, 537239.48it/s]
100%|██████████| 99015/99015 [00:01<00:00, 54296.00it/s]
100%|██████████| 99015/99015 [00:00<00:00, 591154.46it/s]
100%|██████████| 99015/99015 [00:00<00:00, 447369.11it/s]
100%|██████████| 99015/99015 [00:32<00:00, 3000.46it/s]
100%|██████████| 99015/99015 [00:00<00:00, 860190.29it/s]
100%|██████████| 99015/99015 [00:23<00:00, 4252.45it/s]
100%|██████████| 3072/3072 [00:00<00:00, 222387.37it/s]
100%|██████████| 99015/99015 [00:17<00:00, 5610.24it/s] 
100%|██████████| 3072/3072 [00:00<00:00, 212839.90it/s]
100%|██████████| 1786651/1786651 [00:03<00:00, 459031.57it/s]


WITHOUT FILTERING

In [10]:
# DATASET - ROBLOX
roblox1_df = pd.read_csv(f'./datasets/tfidf/roblox1.csv', index_col = 0, low_memory = False)
roblox2_df = pd.read_csv(f'./datasets/tfidf/roblox2.csv', index_col = 0, low_memory = False)
roblox3_df = pd.read_csv(f'./datasets/tfidf/roblox3.csv', index_col = 0, low_memory = False)
roblox4_df = pd.read_csv(f'./datasets/tfidf/roblox4.csv', index_col = 0, low_memory = False)
roblox5_df = pd.read_csv(f'./datasets/tfidf/roblox5.csv', index_col = 0, low_memory = False)

df = pd.concat([roblox1_df, roblox2_df, roblox3_df, roblox4_df, roblox5_df]).dropna(subset = ['keywords']).reset_index(drop = True)

# PREPROCESS
df['keywords'] = df['keywords'].progress_apply(lambda x : x.split())
df['keywords'] = df['keywords'].progress_apply(lambda x : [y for y in x if y.isalpha()])                                # exclude if numeric
df['keywords'] = df['keywords'].progress_apply(lambda x : [lemmatizer.lemmatize(y) for y in x])                         # lemmatize
df['keywords'] = df['keywords'].progress_apply(lambda x : list(set(x)))                                                 # drop duplicate
df['keywords'] = df['keywords'].progress_apply(lambda x : [y for y in x if len(y) >= 3 and len(y) <= 15])               # 3 <= len(keyword) <= 15
df['keywords'] = df['keywords'].progress_apply(morphs, noun = True, verb = True, adjective = False, adverb = False)     # select noun

# OPTIONAL: TF-IDF FILTERING
documents = df['keywords'].progress_apply(lambda x : ' '.join(x))
tfidf_matrix = vectorizer.fit_transform(documents)
words = vectorizer.get_feature_names_out()
filter_num = 200

tfidf_dict = {}
for doc_idx, doc in enumerate(tqdm(documents)):
    feature_idx = tfidf_matrix[doc_idx].nonzero()[1]
    tfidf_scores = zip(feature_idx, [tfidf_matrix[doc_idx, x] for x in feature_idx])
    for word_idx, score in tfidf_scores:
        word = words[word_idx]
        tfidf_dict[word] = score

tfidf_dict = {key: value for value, key in sorted([(score, word) for word, score in tfidf_dict.items()], reverse = True)}

# GRAPH FORMULATION - ROBLOX
G = nx.MultiGraph()
counts = df['keywords'].explode().reset_index(drop = True).reset_index().groupby('keywords').count()['index']
# counts = counts[counts > 10]

# ADD NODES
for item in tqdm(counts.index):
    G.add_node(item, weight = counts[item])

# ADD EDGES
for keywords in tqdm(df['keywords']):
    keyword_selected = [keyword for keyword in keywords if keyword in counts.index]

    pairs = list(combinations(keyword_selected, 2))
    pairs = [(pair[0], pair[1]) if pair[0] < pair[1] else (pair[1], pair[0]) for pair in pairs]
    
    G.add_edges_from(pairs)

# MULTI to SINGLE - ROBLOX
H = nx.Graph()

# MULTI to SINGLE - ADD NODES
for item in tqdm(counts.index):
    H.add_node(item, weight = counts[item])

# MULTI to SINGLE - ADD EDGES
for u, v, data in tqdm(G.edges(data = True)):
    w = data['weight'] if 'weight' in data else 1.0
    if H.has_edge(u, v):
        H[u][v]['weight'] += w
    else:
        H.add_edge(u, v, weight=w)

# WRITE GRAPH
nx.write_graphml_lxml(H, f'./graph/roblox-tfidf.graphml')

100%|██████████| 445931/445931 [00:03<00:00, 129352.84it/s]
100%|██████████| 445931/445931 [00:01<00:00, 223551.33it/s]
100%|██████████| 445931/445931 [00:15<00:00, 29180.72it/s]
100%|██████████| 445931/445931 [00:01<00:00, 223730.04it/s]
100%|██████████| 445931/445931 [00:02<00:00, 207243.90it/s]
100%|██████████| 445931/445931 [03:50<00:00, 1938.77it/s]
100%|██████████| 445931/445931 [00:00<00:00, 887612.60it/s]
100%|██████████| 445931/445931 [02:17<00:00, 3240.44it/s]
100%|██████████| 76990/76990 [00:00<00:00, 209502.81it/s]
100%|██████████| 445931/445931 [01:38<00:00, 4514.13it/s] 
100%|██████████| 76990/76990 [00:00<00:00, 206687.37it/s]
100%|██████████| 21413808/21413808 [00:48<00:00, 443371.44it/s]


In [11]:
# DATASET - ZEPETO
df = pd.read_csv(f'./datasets/tfidf/zepeto.csv', index_col = 0, low_memory = False)

# PREPROCESS
df = df.dropna(subset = ['keywords'])
df['keywords'] = df['keywords'].progress_apply(lambda x : x.split())
df['keywords'] = df['keywords'].progress_apply(lambda x : [y for y in x if y.isalpha()])                                # exclude if numeric
df['keywords'] = df['keywords'].progress_apply(lambda x : [lemmatizer.lemmatize(y) for y in x])                         # lemmatize
df['keywords'] = df['keywords'].progress_apply(lambda x : list(set(x)))                                                 # drop duplicate
df['keywords'] = df['keywords'].progress_apply(lambda x : [y for y in x if len(y) >= 3 and len(y) <= 15])               # 3 <= len(keyword) <= 15
df['keywords'] = df['keywords'].progress_apply(morphs, noun = True, verb = True, adjective = False, adverb = False)     # select noun

# OPTIONAL: TF-IDF FILTERING
documents = df['keywords'].progress_apply(lambda x : ' '.join(x))
tfidf_matrix = vectorizer.fit_transform(documents)
words = vectorizer.get_feature_names_out()
filter_num = 200

tfidf_dict = {}
for doc_idx, doc in enumerate(tqdm(documents)):
    feature_idx = tfidf_matrix[doc_idx].nonzero()[1]
    tfidf_scores = zip(feature_idx, [tfidf_matrix[doc_idx, x] for x in feature_idx])
    for word_idx, score in tfidf_scores:
        word = words[word_idx]
        tfidf_dict[word] = score

tfidf_dict = {key: value for value, key in sorted([(score, word) for word, score in tfidf_dict.items()], reverse = True)}

# GRAPH FORMULATION - ZEPETO
G = nx.MultiGraph()
counts = df['keywords'].explode().reset_index(drop = True).reset_index().groupby('keywords').count()['index']
# counts = counts[counts > 10]

# ADD NODES
for item in tqdm(counts.index):
    G.add_node(item, weight = counts[item])
    
# ADD EDGES
for keywords in tqdm(df['keywords']):
    keyword_selected = [keyword for keyword in keywords if keyword in counts.index]

    pairs = list(combinations(keyword_selected, 2))
    pairs = [(pair[0], pair[1]) if pair[0] < pair[1] else (pair[1], pair[0]) for pair in pairs]
    
    G.add_edges_from(pairs)

# MULTI to SINGLE - ZEPETO
H = nx.Graph()

# MULTI to SINGLE - ADD NODES
for item in tqdm(counts.index):
    H.add_node(item, weight = counts[item])

# MULTI to SINGLE - ADD EDGES
for u, v, data in tqdm(G.edges(data = True)):
    w = data['weight'] if 'weight' in data else 1.0
    if H.has_edge(u, v):
        H[u][v]['weight'] += w
    else:
        H.add_edge(u, v, weight=w)

# WRITE GRAPH
nx.write_graphml_lxml(H, f'./graph/zepeto-tfidf.graphml')

100%|██████████| 99015/99015 [00:00<00:00, 682901.46it/s]
100%|██████████| 99015/99015 [00:00<00:00, 538779.10it/s]
100%|██████████| 99015/99015 [00:01<00:00, 53687.86it/s]
100%|██████████| 99015/99015 [00:00<00:00, 567856.50it/s]
100%|██████████| 99015/99015 [00:00<00:00, 412753.97it/s]
100%|██████████| 99015/99015 [00:33<00:00, 2983.06it/s]
100%|██████████| 99015/99015 [00:00<00:00, 871210.36it/s]
100%|██████████| 99015/99015 [00:23<00:00, 4246.05it/s]
100%|██████████| 39121/39121 [00:00<00:00, 211388.65it/s]
100%|██████████| 99015/99015 [00:24<00:00, 3985.18it/s] 
100%|██████████| 39121/39121 [00:00<00:00, 213979.36it/s]
100%|██████████| 2351277/2351277 [00:05<00:00, 395663.44it/s]
