In [348]:
# reload for every python code before executing everytime
%load_ext autoreload
%autoreload 2

In [350]:
import pandas as pd
import glob
import re
import os
from datetime import datetime, timedelta

# Kiwi POS tagger
from kiwipiepy import Kiwi

In [354]:
kiwi = Kiwi()

In [389]:
text_files = glob.glob('../../raw_data/BioNutrion_Dialog/*/*.txt')

In [390]:
len(text_files)

107

In [391]:
# notice: files are not stored as UTF-8, so need to specify it explicitly
idx = 98
with open(text_files[idx], 'r', encoding='utf-8') as fp:
    print(text_files[idx])
    dialog_contents = fp.read()

../../raw_data/BioNutrion_Dialog/2022-06-20 08_28_58/2022-06-20 김수연.txt


In [393]:
dialog_contents = dialog_contents.split('\n')

# Extract participant's name

* extract name from file name
* encrypt or shuffle name for personal info security

In [321]:
filename = os.path.basename(text_files[idx])
dirname = os.path.dirname(text_files[idx])

In [322]:
person_name = filename.split(' ')[-1].replace('.txt', '')

In [323]:
print(dirname)
print(filename)
print(person_name)

../../raw_data/BioNutrion_Dialog/2022-06-20 08_28_58
2022-06-20 김수연.txt
김수연


# Extract Date and Time
* 날짜와 시간이 함께 검출되어야 완전한 발화
* 날짜와 시간이 없으면 이전 발화 내용과 구분되는 문장으로 통합

## Regex for Date
* regular expression = ```^\d{4}-\d{2}-\d{2}```

## Regex for Time
* Regular Expression = ```\d{2}:\d{2}:\d{2}```

In [237]:
date_matched = re.search(r'^\d{4}-\d{2}-\d{2}', dialog_contents[3])

In [238]:
date_regex = re.compile(r'^\d{4}-\d{2}-\d{2}')

In [239]:
time_matched = re.search(r'\d{2}:\d{2}:\d{2}', dialog_contents[3])

In [240]:
time_regex = re.compile(r'\d{2}:\d{2}:\d{2}')

In [367]:
def is_full_talk(sentence: str):
    
    if type(sentence) is not str:
        print(sentence, 'is not a string')
    
#     date_matched = re.search(r'^\d{4}-\d{2}-\d{2}', sentence)
    date_matched = date_regex.search(sentence)
#     time_matched = re.search(r'\d{2}:\d{2}:\d{2}', sentence)
    time_matched = time_regex.search(sentence)
    
    if date_matched and time_matched:
        return True
    else:
        return False

In [364]:
def convert_single_talk(sentences: list, day_sep:str):
    ret_sentences = []
    head_talk = sentences[0]
    for sent in sentences[1:]:
        if is_full_talk(sent):
            ret_sentences.append(head_talk)
            head_talk = sent
        elif date_regex.search(sent.strip()):
            ret_sentences.append(head_talk)
            ret_sentences.append(day_sep)
            head_talk = sent
        else:            
            head_talk += ' ' + sent
            
    return ret_sentences

In [243]:
single_dialog = convert_single_talk(sentences=dialog_contents, day_sep='=' * 20)

In [406]:
def convert_to_dict(sentences:list):
    
    talk_dict = {}
    
    idx = 1
    for sent in sentences:
        if is_full_talk(sentence=sent):
            date_str = date_regex.search(sent).group()
            time_str = time_regex.search(sent).group()
            sent_splits = sent.split(' ')
            utterances = ' '.join(sent_splits[4:])
            utters = kiwi.split_into_sents(utterances)
            _utters = [v.text for v in utters]
            for utter in _utters:
                talk_dict[idx] = {
#                     'date': datetime.strptime(date_str, "%Y-%m-%d"),
#                     'date_time': datetime.strptime(date_str + ' ' + time_str, "%Y-%m-%d %H:%M:%S"),
                    'date': date_str,
                    'date_time': time_str,
                    'participants': sent_splits[2],
                    'utterances': utter
                }
                idx += 1
            
    return talk_dict

In [381]:
talks = convert_to_dict(sentences=single_dialog)

In [382]:
talk_df = pd.DataFrame().from_dict(talks).T

In [383]:
len(talk_df)

752

In [384]:
talk_df.keys()

Index(['date', 'date_time', 'participants', 'utterances'], dtype='object')

In [310]:
def divide_session(sentences:list, delta_mins:int, session_sep:str):
    # divide full dialogs into potential single intent sessions by seperator
    # prerequisites 1. the sentences must be formed with single utterance
    
    # Notice: this algorithm is simple but heuristic
    # The current talk is delayed more than ```delta mins``` -> treat this talk as different session/intent 
    delta_thld = timedelta(minutes=delta_mins)
    ret_sentences = []
    prev_date_time = None
    for sent in sentences:
        # step 1. get talk's date and time info only for full talk
    
        if is_full_talk(sentence=sent):
            date_str = date_regex.search(sent).group()
            time_str = time_regex.search(sent).group()
            cur_date_time = datetime.strptime(date_str + ' ' + time_str, "%Y-%m-%d %H:%M:%S")
            
            delta_time = cur_date_time - prev_date_time if prev_date_time else timedelta(minutes=0)
            
            # step 2. delta time is bigger than delta mins -> insert session separator
            if delta_time >= delta_thld:
                ret_sentences.append(session_sep)
                ret_sentences.append(sent)
            else:
                ret_sentences.append(sent)
            prev_date_time = cur_date_time
            
        else:
            ret_sentences.append(sent)
            
    return ret_sentences

In [315]:
sessioned_dialog = divide_session(sentences=single_dialog, delta_mins=180, session_sep = '*' * 30)

In [324]:
sessioned_dialog[0]

'셔니 님과의 상담 저장한 날짜 : 2022-06-20 08:28:58'

# Extract Alias

In [328]:
alias = sessioned_dialog[0].split(' ')[0]
print(alias)

셔니


# Extract All Participants in Conversation

In [335]:
def extract_talk_party(sentences:list):
    
    party_name = {}
    for sentence in sentences:
        if is_full_talk(sentence=sentence):
            splits = sentence.split(' ')
            party_name[splits[2]] = 1 if splits[2] not in party_name.keys() else 1
            
    
    return party_name.keys()

In [336]:
parties = extract_talk_party(sentences=single_dialog)

In [337]:
print(parties)

dict_keys(['셔니', '고은영', '김*영', '홍*민', '바이오뉴트리온_4', '고*미'])


# Automate Full Workflows

In [407]:
date_regex = re.compile(r'^\d{4}-\d{2}-\d{2}')
time_regex = re.compile(r'\d{2}:\d{2}:\d{2}')

# notice: files are not stored as UTF-8, so need to specify it explicitly
for idx, text_file in enumerate(text_files):
    
    with open(text_file, 'r', encoding='utf-8') as fp:
        dialog_contents = fp.read()
        
    dialog_contents = dialog_contents.split('\n')   
    
    filename = os.path.basename(text_file)
    dirname = os.path.dirname(text_file)
    person_name = filename.split(' ')[-1].replace('.txt', '')
    
    print('processing:', idx, filename, dirname, person_name)
    
    single_dialog = convert_single_talk(sentences=dialog_contents, day_sep='=' * 20)
    talks = convert_to_dict(sentences=single_dialog)
    talk_df = pd.DataFrame().from_dict(talks).T
    
    base_path = '../../data/cleansed_data'
    save_path = os.path.join(base_path, filename.replace('txt', 'csv'))
    talk_df.to_csv(save_path, sep='\t')
    
    print('saving:', save_path)
    

processing: 0 2022-06-20 최선희.txt ../../raw_data/BioNutrion_Dialog/2022-06-20 08_44_40 최선희
saving: ../../data/cleansed_data/2022-06-20 최선희.csv
processing: 1 2022-08-24 조연주.txt ../../raw_data/BioNutrion_Dialog/2022-08-24 13_43_56 조연주
saving: ../../data/cleansed_data/2022-08-24 조연주.csv
processing: 2 2022-08-24 고아라.txt ../../raw_data/BioNutrion_Dialog/2022-08-24 13_37_01 고아라
saving: ../../data/cleansed_data/2022-08-24 고아라.csv
processing: 3 2022-06-20 임혁재.txt ../../raw_data/BioNutrion_Dialog/2022-06-20 08_44_47 임혁재
saving: ../../data/cleansed_data/2022-06-20 임혁재.csv
processing: 4 2022-08-24 이수행.txt ../../raw_data/BioNutrion_Dialog/2022-08-24 13_56_31 이수행
saving: ../../data/cleansed_data/2022-08-24 이수행.csv
processing: 5 2022-06-20 강현정.txt ../../raw_data/BioNutrion_Dialog/2022-06-20 08_45_30 강현정
saving: ../../data/cleansed_data/2022-06-20 강현정.csv
processing: 6 2022-06-20 이수경.txt ../../raw_data/BioNutrion_Dialog/2022-06-20 08_44_54 이수경
saving: ../../data/cleansed_data/2022-06-20 이수경.csv
proces