# Relic data: keyword extraction notebook

- 2024.12, tk

In [1]:
import time, os
from datetime import datetime
import pandas as pd
from collections import deque
from openpyxl import Workbook, load_workbook
from openpyxl.worksheet.worksheet import Worksheet
from IPython.display import display
from pprint import pprint
import re

In [2]:
# Record the wall-clock start so the last cell can report the total elapsed time.
start_time = time.time()

## Environment

In [3]:
# Path of the input Excel workbook - please update to match your environment
ifile = './keyword/용어입력.xlsx'

## Functions

In [4]:
def filename_suffix()->str:
    """Build a suffix for output file names in the form '<process id>-<HHMMSS>'.

    The time part is the current local time at the moment of the call.
    """
    pid = os.getpid()
    stamp = datetime.now().strftime("%H%M%S")
    return f'{pid}-{stamp}'

In [5]:
def nonempty_rows_count(sheet:Worksheet) -> int: 
    """Count the rows of `sheet` that contain at least one filled cell.

    A cell counts as filled unless its value is None or an empty string, so
    legitimate falsy data such as 0 or False still marks its row as non-empty
    (a plain truthiness test would skip such rows).

    Args:
        sheet: Worksheet to scan; only `iter_rows()` and each cell's `.value` are used.

    Returns:
        Number of rows holding at least one filled cell.
    """
    return sum(
        1
        for row in sheet.iter_rows()
        if any(cell.value is not None and cell.value != '' for cell in row)
    )


In [6]:
def workbook_info(workbook: Workbook, head:int=5, tail:int=5, datadisplay:bool=False) -> None:
    """Print workbook properties and a short summary of every sheet.

    Args:
        workbook: Workbook to describe.
        head: Number of leading rows to print per sheet when `datadisplay` is
            True; a falsy value skips the head sample.
        tail: Number of trailing rows to print per sheet when `datadisplay` is
            True; a falsy value skips the tail sample.
        datadisplay: When True, also print sample rows of each sheet.
    """
    # Basic workbook properties
    print("\nWorkbook Properties:")
    print(f"Modified: {workbook.properties.modified}")
    print(f"Number of sheets: {len(workbook.sheetnames)}")
    print(f"Names of sheets: {workbook.sheetnames}")

    # Per-sheet summary
    for sheet_name in workbook.sheetnames:
        sheet = workbook[sheet_name]
        print(f"\nSheet: {sheet.title}")
        print(f"  Columns: {sheet.max_column}")
        print(f"  Rows: {sheet.max_row}")
        print(f"  Nonempty rows: {nonempty_rows_count(sheet)}")

        if not datadisplay:
            continue

        if head:
            print(f"  First {head} rows:")
            for row in sheet.iter_rows(min_row=1, max_row=head, values_only=True):
                print(f"    {row}")
        if tail:
            # Start exactly `tail` rows before the end, and never below row 1:
            # openpyxl row numbers are 1-based, so a smaller value is invalid.
            first_tail_row = max(1, sheet.max_row - tail + 1)
            print(f"  Tail {tail} rows:")
            for row in sheet.iter_rows(min_row=first_tail_row, values_only=True):
                print(f"    {row}")
        

In [7]:
def dataframe_info(df:pd.DataFrame, head:int=5, tail:int=5, datadisplay:bool=False) -> None:
    """Print the shape and column names of `df`, optionally followed by sample rows.

    Args:
        df: DataFrame to describe.
        head: Number of leading rows to display; a falsy value skips them.
        tail: Number of trailing rows to display; a falsy value skips them.
        datadisplay: When True, display the head/tail samples as well.
    """
    print(f'shape: {df.shape}')
    print(f'columns:\n{df.columns.tolist()}')

    # Nothing more to show unless sample rows were requested.
    if not datadisplay:
        return

    if head:
        print(f'head {head} rows:')
        display(df.head(head))

    if tail:
        print(f'tail {tail} rows:')
        display(df.tail(tail))

In [8]:
def replace_chars(text, old_chars, new_char):
    """
    Substitute every character listed in `old_chars` with `new_char`.

    The substitutions are applied one character at a time, in the order the
    characters appear in `old_chars`, each on the result of the previous one.

    Args:
      text: The input string.
      old_chars: The characters to be replaced.
      new_char: The replacement for each of those characters.

    Returns:
      The string after all substitutions have been applied.
    """
    result = text
    for unwanted in old_chars:
        result = result.replace(unwanted, new_char)
    return result

## Excel file information

In [9]:
# Open the input workbook and print its summary (properties and per-sheet counts)
# without sample rows.
iwb = load_workbook(ifile)
workbook_info(iwb, datadisplay=False)


Workbook Properties:
Modified: 2024-12-14 10:28:26
Number of sheets: 1
Names of sheets: ['Sheet1']

Sheet: Sheet1
  Columns: 3
  Rows: 24353
  Nonempty rows: 24353


## Read the Excel file into a DataFrame

In [10]:
# Input file layout

input_sheets_all = ['Sheet1']
input_sheets_exclude = []

# Stop early when the workbook's sheets differ from the expected list.
assert set(iwb.sheetnames) == set(input_sheets_all), 'input_sheets is not match with input file sheet'

input_columns = ['sheet', 'relic_id', '6문단']  # 3
input_skiprows = 1
input_usecols = [0, 1, 2]

# Skip the first row and name the three columns explicitly; NA detection is
# off and every column is read as object dtype.
input_df = pd.read_excel(
    ifile,
    sheet_name=input_sheets_all[0],
    skiprows=input_skiprows,
    usecols=input_usecols,
    header=None,
    names=input_columns,
    na_filter=False,
    dtype='object',
    engine='openpyxl',
)

input_all_rows = len(input_df)


In [11]:

# Strip surrounding whitespace from string cells only. Series.str.strip() would
# turn every non-string cell (e.g. a numeric relic_id) into NaN and lose its value.
string_cols = input_df.select_dtypes(include='object').columns
input_df[string_cols] = input_df[string_cols].apply(
    lambda col: col.map(lambda value: value.strip() if isinstance(value, str) else value)
)

# Split into valid / invalid rows: a row is valid only when its '6문단' cell is
# a non-empty string. Using one mask and its negation guarantees that every
# row lands in exactly one of the two frames.
has_keywords = input_df['6문단'].map(
    lambda value: isinstance(value, str) and len(value) > 0
).astype(bool)
input_invalid_df = input_df[~has_keywords]
input_valid_df = input_df[has_keywords]

# Remove duplicate rows: for identical '6문단' text only the last occurrence is
# kept; the earlier occurrences are collected separately for reporting.
input_duplicate_df = input_valid_df[input_valid_df.duplicated(subset=['6문단'], keep='last')]
input_valid_df = input_valid_df.drop_duplicates(subset=['6문단'], keep='last')

input_invalid_rows = input_invalid_df.shape[0]
input_duplicate_rows = input_duplicate_df.shape[0]
input_valid_rows = input_valid_df.shape[0]

# Every input row must be accounted for exactly once.
assert input_valid_rows + input_invalid_rows + input_duplicate_rows == input_df.shape[0]

idf = input_valid_df

In [12]:
# Report how the input rows were split into valid, invalid and duplicate rows.
print(f'input rows all={input_all_rows}, valid={input_valid_rows}, invalid={input_invalid_rows}, duplicate={input_duplicate_rows}')

input rows all=24352, valid=22484, invalid=216, duplicate=1652


In [13]:
# Shape and columns of the valid, de-duplicated rows.
dataframe_info(idf, datadisplay=False)

shape: (22484, 3)
columns:
['sheet', 'relic_id', '6문단']


In [14]:
# Inspect the rows rejected because their '6문단' cell is missing or empty.
dataframe_info(input_invalid_df, datadisplay=True)

shape: (216, 3)
columns:
['sheet', 'relic_id', '6문단']
head 5 rows:


Unnamed: 0,sheet,relic_id,6문단
2090,캡션검수1126(수정),PS0100200100104028200000_A1,
2091,캡션검수1126(수정),PS0100200100104028400000_A1,
2092,캡션검수1126(수정),PS0100200100104028900000_A1,
2093,캡션검수1126(수정),PS0100200100104029100000_A1,
2094,캡션검수1126(수정),PS0100200100104029400000_A1,


tail 5 rows:


Unnamed: 0,sheet,relic_id,6문단
2301,캡션검수1126(수정),PS0100201100102067800000_A1,
2302,캡션검수1126(수정),PS0100201100102078700000_A1,
2303,캡션검수1126(수정),PS0100201100102079600000_A1,
2304,캡션검수1126(수정),PS0100201100102081100000_A1,
2305,캡션검수1126(수정),PS0100201100102081400000_A1,


In [15]:
# Inspect the rows dropped because a later row has the same '6문단' text.
dataframe_info(input_duplicate_df, datadisplay=True)

shape: (1652, 3)
columns:
['sheet', 'relic_id', '6문단']
head 5 rows:


Unnamed: 0,sheet,relic_id,6문단
54,캡션검수 1125,PS0100200100103674300000_A1,"'한국전통문화유산', '한국전통문양', '상평통보', '엽전', '문자문', '문자..."
77,캡션검수 1125,PS0100200100103676700000_A1,"'한국전통문화유산', '한국전통문양', '상평통보', '엽전', '문자문', '문자..."
206,캡션검수 1125,2010-0833-0000023_A1,"한국전통문화유산', '한국전통문양', '당초문암막새', '암막새', '식물문', '..."
208,캡션검수 1125,2010-0876-0000055_A1,"한국전통문화유산', '한국전통문양', '연화문수막새', '수막새', '식물문', '..."
210,캡션검수 1125,2010-0960-0000002_A1,"한국전통문화유산', '한국전통문양', '연화문수막새', '식물문', '연꽃문', '..."


tail 5 rows:


Unnamed: 0,sheet,relic_id,6문단
23559,1214-2,PS0100100201100105100000_A1,"한국전통문화유산', '한국전통문양', '연꽃무늬 수막새', '복합문', '연꽃문',..."
23576,1214-2,PS0100100201100106500000_A2,"한국전통문화유산, 한국전통문양, 연꽃무늬 수막새, 수막새, 식물문, 순수함, 고귀함"
23731,1214-2,PS0100100102001428700000_A1,"한국전통문화유산, 한국전통문양, 국화문 장경병, 식물문, 생활소품, 생명력, 조화"
24089,1214-2,PS0100100102001638800000_A1,"한국전통문화유산, 한국전통문양, 백자철화대나무무늬항아리, 식물문, 생활소품, 자연, 조화"
24294,1214-2,PS0100100102002327200000_A2,"한국전통문화유산, 한국전통문양, 연화문 수막새, 식물문, 청정, 불멸"


In [16]:
# Debug exports: always write the valid rows; write the invalid and duplicate
# rows only when there are any.
valid_path = f'./keyword/keyinput-valid-{filename_suffix()}.xlsx'
idf.to_excel(valid_path, index=False)

if input_invalid_rows:
    invalid_path = f'./keyword/keyinput-invalid-{filename_suffix()}.xlsx'
    input_invalid_df.to_excel(invalid_path, index=False)

if input_duplicate_rows:
    duplicate_path = f'./keyword/keyinput-duplicate-{filename_suffix()}.xlsx'
    input_duplicate_df.to_excel(duplicate_path, index=False)

## Write to output file

In [18]:
# Characters removed from every keyword: straight quotes, the backslash and
# curly single quotes. The previous literal used the invalid escape sequences
# "\‘" and "\’", which Python keeps as a backslash plus the quote (and warns
# about); the same character set is spelled out explicitly here.
strip_chars = "'\"\\‘’"

keyset = set()    # unique (keyword, position) pairs
lookup = list()   # one (keyword, position, sheet, relic_id) record per occurrence

for row in idf.itertuples(index=False):
    # Columns are read by position: 0 = sheet, 1 = relic_id, 2 = '6문단' text.
    keystmt = row[2]

    # Positions are 1-based and count every comma-separated slot, including
    # slots that are skipped below.
    for i, w in enumerate(keystmt.split(','), start=1):
        w = replace_chars(w, strip_chars, '').strip()
        # Skip empty slots and literal 'nan' text.
        if not w or w.lower() == 'nan':
            continue

        keyset.add((w, i))  # a set ignores repeats, no membership test needed
        lookup.append((w, i, row[0], row[1]))

keylist = sorted(keyset)
keys = [keypos[0] for keypos in keylist]
posi = [keypos[1] for keypos in keylist]

# Unique keyword/position table; the two extra columns are created empty
# (presumably to be filled in later - confirm with the workflow owner).
odf = pd.DataFrame({'기존용어': keys, '용어순번': posi})
odf['한글용어'] = None
odf['영문용어'] = None
odf = odf.sort_values(by=['기존용어', '용어순번'])

# Usage table: where each keyword/position pair occurs.
ldf = pd.DataFrame(lookup, columns=['기존용어', '용어순번', 'sheet', 'relic_id'])
ldf = ldf.sort_values(by=['기존용어', '용어순번', 'sheet', 'relic_id'])


In [19]:
dataframe_info(odf, datadisplay=True)

shape: (12888, 4)
columns:
['기존용어', '용어순번', '한글용어', '영문용어']
head 5 rows:


Unnamed: 0,기존용어,용어순번,한글용어,영문용어
0,12각반,3,,
1,1900,3,,
2,1903년 발행한 독수리 보통우표,3,,
3,1933년도 농구 경기 규칙,3,,
4,1전 청동화,3,,


tail 5 rows:


Unnamed: 0,기존용어,용어순번,한글용어,영문용어
12883,힘,10,,
12884,힘,11,,
12885,힘과 권위,9,,
12886,힘과 보호,7,,
12887,蓮花文 수막새,3,,


In [20]:
dataframe_info(ldf, datadisplay=True)

shape: (168236, 4)
columns:
['기존용어', '용어순번', 'sheet', 'relic_id']
head 5 rows:


Unnamed: 0,기존용어,용어순번,sheet,relic_id
23283,12각반,3,캡션검수1126(수정),PS0100200100102343400000_A1
27037,1900,3,캡션검수1201,PS0100202500800808800000_A1
27648,1903년 발행한 독수리 보통우표,3,캡션검수1201,PS0100203400600627200000_A1
27198,1933년도 농구 경기 규칙,3,캡션검수1201,PS0100202500800965400000_A1
12602,1전 청동화,3,캡션검수1126(수정),PS0100100101101786500000_A1


tail 5 rows:


Unnamed: 0,기존용어,용어순번,sheet,relic_id
103851,힘,10,"캡션검수1212(1,4문단)",PS0100100100500286300000_A1
53360,힘,11,캡션검수1206,7825_A1
111376,힘과 권위,9,"캡션검수1212(1,4문단)",PS0100100102002941000000_A1
104240,힘과 보호,7,"캡션검수1212(1,4문단)",PS0100100100500623500000_A3
8993,蓮花文 수막새,3,캡션검수 1125,PS0100101000101400100000_A1


In [21]:
# Renumber the rows from 1 and export the keyword table; the index is written
# as the '식별번호' column.
odf = odf.reset_index(drop=True)
odf.index = odf.index + 1
odf_path = f'./keyword/용어작업-{filename_suffix()}.xlsx'
odf.to_excel(odf_path, sheet_name='용어작업', index=True, index_label='식별번호')

In [22]:
# Renumber the rows from 1 and export the keyword usage table; the index is
# written as the '식별번호' column.
ldf = ldf.reset_index(drop=True)
ldf.index = ldf.index + 1
ldf_path = f'./keyword/용어사용-{filename_suffix()}.xlsx'
ldf.to_excel(ldf_path, sheet_name='용어사용', index=True, index_label='식별번호')

In [23]:
# Report the wall-clock time elapsed since start_time was recorded.
end_time = time.time()
print(f"Done, eplapsed seconds = {end_time - start_time}")


Done, eplapsed seconds = 82.73450374603271
