# import libraries
코랩 환경에서 실행하였습니다. 데이터 다운로드 및 처리에 필요한 라이브러리를 설치하고 임포트합니다.

In [None]:
!pip install ijson
!pip install fastparquet



In [None]:
from collections import Counter, defaultdict
import heapq

from huggingface_hub import hf_hub_download
from tqdm import tqdm
import dask.dataframe as dd
import pandas as pd

# 그래프 데이터 생성

코랩 환경에서 약 35분 소요되었습니다. 이 부분은 뒤의 데이터 로딩 코드로 대체할 수 있습니다.

## 데이터 다운로드
나무위키 문서 데이터셋을 다운로드하고 dask 라이브러리를 사용해 불러옵니다.

데이터셋 크기가 매우 커서 pandas를 사용하여 한 번에 불러올 수 없습니다.

In [None]:
# Fetch the NamuWiki dump (a single Parquet file) from the Hugging Face
# dataset repo "heegyu/namuwiki". The value returned by hf_hub_download is
# kept in `file_path` and handed to the Parquet reader in the next cell.
file_path = hf_hub_download(
    repo_id="heegyu/namuwiki", repo_type='dataset', filename="namuwiki_20210301.parquet")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


namuwiki_20210301.parquet:   0%|          | 0.00/3.03G [00:00<?, ?B/s]

In [None]:
# Open the Parquet file as a Dask DataFrame (`ddf`) rather than a pandas one,
# because the dataset is too large to load with pandas in one go.
# Later cells read its 'title' and 'text' columns.
ddf = dd.read_parquet(file_path)

## 데이터 처리



나무위키 문법에서 링크를 나타내는 `[[...]]` 부분을 추출하는 함수입니다.

각주(`[*...]`) 안의 링크는 무시하며, `파일:`로 시작하는 링크와 외부 링크(`http://`, `https://`)도 제외합니다.

In [None]:
def extract_double_bracket_links(text):
    """
    Count the [[...]] link targets that appear in a NamuWiki-markup string.

    Scanning rules:
    - For a link written as [[target|label]], only `target` is counted.
    - A footnote opens at '[*' and closes at the next single ']'; links
      found while a footnote is open are skipped.
    - Targets beginning with '파일:', 'http://' or 'https://' are skipped.
    - Scanning stops at a '[[' that has no closing ']]'.

    Parameters:
        text (str): Markup to scan.

    Returns:
        dict: Maps each accepted link target to its number of occurrences,
        with keys in order of first appearance.
    """
    ignored_prefixes = ("파일:", "http://", "https://")
    link_counts = Counter()
    inside_footnote = False
    length = len(text)
    pos = 0

    while pos < length:
        pair = text[pos:pos + 2]

        if pair == "[[":
            close = text.find("]]", pos + 2)
            if close == -1:
                # An unterminated link means no further link can complete.
                break

            # Keep only the part before the first '|' (the link target).
            target = text[pos + 2:close].partition("|")[0]
            pos = close + 2

            # Links inside a footnote are consumed but never counted.
            if not inside_footnote and not target.startswith(ignored_prefixes):
                link_counts[target] += 1

        elif inside_footnote:
            # Inside a footnote only a single ']' matters: it closes it.
            if text[pos] == "]":
                inside_footnote = False
            pos += 1

        elif pair == "[*":
            inside_footnote = True
            pos += 2

        else:
            pos += 1

    return dict(link_counts)


빠른 접근을 위해 id와 title 각각을 인덱스로 하는 Series를 만듭니다.

In [None]:
# id -> title: materialize the 'title' column into pandas and renumber it
# 0..n-1, so a document's id is simply its row position.
s_id_title = ddf['title'].compute().reset_index(drop=True)
# title -> id: invert the Series above. `verify_integrity=True` makes
# set_index raise if any title occurs more than once; the result is sorted
# by title.
s_title_id = s_id_title.reset_index().set_index('title', verify_integrity=True)['index'].sort_index()

문서 텍스트 데이터를 분석하여 문서 간 연결 관계를 저장하는 DataFrame을 만듭니다.

In [None]:
def create_link_edges_dataframe_dask(df_dask):
    """
    Build the directed link edge list for every document in `df_dask`.

    Each row's text is scanned with `extract_double_bracket_links`; every
    link whose target is a known title yields one record. Links to titles
    that are not in the module-level `s_title_id` mapping are dropped.

    Parameters:
        df_dask: Dask DataFrame with 'title' and 'text' columns.

    Returns:
        The result of `df_dask.map_partitions(...)`: a Dask DataFrame with
        integer columns 'from_title', 'to_title' and 'count'. Despite the
        names, the first two columns hold document ids taken from
        `s_title_id`, not title strings.

    Raises:
        KeyError: at compute time, if a row's title is missing from
        `s_title_id`.
    """
    # Snapshot the title -> id Series into a plain dict once per call.
    # The per-link membership test and id lookup below then cost a hash
    # lookup each instead of a pandas `.loc` / `in Index` call.
    title_to_id = s_title_id.to_dict()

    def partition_func(partition_df):
        print('running partition')

        records = (
            (title_to_id[row.title], title_to_id[to_title], count)
            for row in partition_df.itertuples(index=False)
            for to_title, count in extract_double_bracket_links(row.text).items()
            if to_title in title_to_id
        )

        return pd.DataFrame(records, columns=['from_title', 'to_title', 'count'])

    # Apply per partition; `meta` declares the output schema up front.
    link_df = df_dask.map_partitions(partition_func, meta={'from_title': int, 'to_title': int, 'count': int})

    return link_df


In [None]:
# Run the per-partition link extraction over the whole dataset and collect
# the result into a single in-memory DataFrame (the slow step of this section).
edges_df = create_link_edges_dataframe_dask(ddf).compute()

running partition
running partition
running partition
running partition
running partition
running partition
running partition
running partition
running partition
running partition
running partition
running partition
running partition
running partition
running partition
running partition
running partition
running partition
running partition
running partition
running partition
running partition
running partition
running partition
running partition
running partition
running partition
running partition
running partition
running partition
running partition
running partition
running partition
running partition


In [None]:
edges_df

Unnamed: 0,from_title,to_title,count
0,1,143751,2
1,1,128620,1
2,1,128612,2
3,1,118944,1
4,1,77175,1
...,...,...,...
20741,867022,203442,1
20742,867022,283833,1
20743,867022,181673,1
20744,867022,1181,1


In [None]:
# Put the edges in a canonical (source id, target id) order before saving.
edges_df = edges_df.sort_values(['from_title', 'to_title'])

In [None]:
# Save the directed edge list; the DataFrame index is not written.
edges_df.to_csv('./edges_df.csv', index=False)

In [None]:
# Save the titles only (index=False): a document's id is not stored
# explicitly and is implied by its row position in the file.
s_id_title.to_csv('./s_id_title.csv', index=False)

양방향 연결이 모두 존재하는 경우만을 모은 `edges_undirected_df`를 만듭니다. 약 30초 소요됩니다.

In [None]:
# Swap the endpoint columns so that every edge (a -> b) of edges_df is also
# available as a row describing (b -> a).
reversed_df = edges_df.rename(columns={'from_title': 'to_title', 'to_title': 'from_title'})

# Inner merge on both endpoints: a pair (a, b) survives only when both
# a -> b and b -> a exist. count_x is the a -> b link count and count_y the
# b -> a link count.
bidirectional = edges_df.merge(reversed_df, on=['from_title', 'to_title'])

# Edge weight = mean of the inverse link counts of the two directions, so
# pairs that link to each other more often get a smaller weight.
bidirectional['weight'] = ((1 / bidirectional['count_x']) + (1 / bidirectional['count_y'])) / 2

# Emit every mutual pair in both orientations. The merge result should
# already contain both (a, b) and (b, a), so this mainly adds duplicates,
# which are removed in the next step.
edges_undirected_df = pd.concat([
    bidirectional[['from_title', 'to_title', 'weight']],
    bidirectional[['to_title', 'from_title', 'weight']].rename(columns={'to_title': 'from_title', 'from_title': 'to_title'})
])

# Remove the duplicated rows, sort by (from, to) and renumber the index.
edges_undirected_df = edges_undirected_df.drop_duplicates().sort_values(['from_title', 'to_title']).reset_index(drop=True)

# Save the undirected (mutual-link) weighted edge list without the index.
edges_undirected_df.to_csv('./edges_undirected_df.csv', index=False)


체크섬을 확인하여 동일한 결과를 얻었는지 확인합니다.
- `f2c7e4cfc9895e3dc1eabc323f6fa6164eb515e5e229394bd2c795cf020ee391  ./s_id_title.csv`
- `2b475e5fffb6e5154a3ae3dd6dc63f6b0c5261b50c914b316787a5c1e1c295e6  ./edges_df.csv`
- `eb6ee598b1598a84e178922918b5264d0a57215cd9c5cdfcdd10f9cac17df652  ./edges_undirected_df.csv`

In [None]:
!sha256sum ./s_id_title.csv
!sha256sum ./edges_df.csv
!sha256sum ./edges_undirected_df.csv

f2c7e4cfc9895e3dc1eabc323f6fa6164eb515e5e229394bd2c795cf020ee391  ./s_id_title.csv
2b475e5fffb6e5154a3ae3dd6dc63f6b0c5261b50c914b316787a5c1e1c295e6  ./edges_df.csv
eb6ee598b1598a84e178922918b5264d0a57215cd9c5cdfcdd10f9cac17df652  ./edges_undirected_df.csv
