# In [2]:
import numpy as np
import pandas as pd
import re
from itertools import zip_longest

# ASSUMPTIONS
# a.b. surname -> A.B. Surname
# one-two-three ->  One-Two-Three or One-two-three ? (Instructions unclear)

# LIMITATIONS
# Cannot keep brackets in พระมหาอิษฎามาศ ปภสฺสรวณฺโณ (สุวรรณะ)
# Cannot split 'มาริยา เถาอินปาก   ธีรชัย เนตรถนอมศักดิ์  เพ็ญณี  แนรอท' where no commas are used
# Cannot split names with comma like Jose H. Bergantin, Jr.

# Input CSV read by main().
path = '/kaggle/input/thaijo-researcher-for-code-submission/test.csv'

# Titles/ranks stripped from the start of a name by remove_prefix_suffix().
# The list is scanned in the order written here.
prefixes = [
    'Authors :',
    'ม.ร.ว.',
    'Assoc.Prof.Dr.',
    'Assist. Prof. Dr.',
    'Asst.Prof.Dr.',
    'Mr.',
    'M.L.',
    'อาจารย์ ดร.',
    'อ.ดร.',
    'ผู้ช่วยศาสตราจารย์ ดร.',
    'ผู้ช่วยศาสตราจารย์',
    'รองศาสตราจารย์',
    'ว่าที่ พ.ต. ดร.',
    'ว่าที่ ร.ต.',
    'ร.ต.อ.',
    'พระครู',
    'Phrakru',
    'พระมหา',
    'พันเอก',
    'รองศาสตราจารย์ ดร.',
    'รศ.',
    'พระ',
    'อาจารย์',
    'ดร.',
    'ผศ.ดร.',
    'รศ.ดร.',
    'Dr.',
    'กสทช.',
    'ครู',
]

# Degrees and "and others" markers stripped from the end of a name by
# remove_prefix_suffix().
suffixes = [
    'M.D.',
    'M.R.',
    'Ph.D',
    'และคณะ',
    'And Other',
    'et al',
    'et.al',
    'Jr.',
    'B. Pharm.',
    'M. Pharm.',
    'ดร.',
]

# check_fail_words() discards any entry containing one of these
# (compared case-insensitively).
fail_words = [
    'บรรณาธิการ',
    'สารบัญ',
    'committee',
    'editorial',
    'Cover Vol.',
    'Journal',
    'ผู้ทรงคุณวุฒิ',
    'ผู้เขียน',
    'โรงพยาบาล',
    'วิทยาลัย',
    'วารสาร',
    'Cover Vol.',
    'Thailand',
    'Author ',
]

def detect_language(text: str) -> str:
    """Report which scripts occur in ``text``.

    Returns "Thai and English", "Thai" or "English" depending on whether
    the text contains Thai-block characters, Latin letters, or both.
    Returns None when it contains neither.
    """
    thai_low, thai_high = 0x0E01, 0x0E5B  # Thai Unicode range

    has_thai = any(thai_low <= ord(ch) <= thai_high for ch in text)
    has_english = re.search('[a-zA-ZàâçéèêëîïôùûüÿæœÀÂÇÉÈÊËÎÏÔÙÛÜŸÆŒ]', text) is not None

    if has_thai and has_english:
        return "Thai and English"
    if has_thai:
        return "Thai"
    return "English" if has_english else None
    
def extract_thai(text: str) -> str:
    """Keep only Thai-block characters, spaces, periods and commas from ``text``."""
    kept = (found.group(0) for found in re.finditer(r'[\u0E01-\u0E5B .,]', text))
    return ''.join(kept)

def extract_eng(text: str) -> str:
    """Keep only Latin letters, spaces, periods, commas and hyphens from ``text``."""
    kept = (
        found.group(0)
        for found in re.finditer('[a-zA-ZàâçéèêëîïôùûüÿæœÀÂÇÉÈÊËÎÏÔÙÛÜŸÆŒ .,-]', text)
    )
    return ''.join(kept)

def title_case(text: str) -> str:
    """Capitalise a name: ``a.b. surname`` -> ``A.B. Surname``.

    Each space-separated word is lower-cased and its first character is
    upper-cased; runs of spaces collapse to one. Hyphenated words are
    treated as a single word (``one-two-three`` -> ``One-two-three``).
    Single-letter initials that follow a period are upper-cased as well,
    including the last initial of a chain and initials at the very end of
    the text (``a.b.c. x`` -> ``A.B.C. X``, ``surname a.b.`` ->
    ``Surname A.B.``).

    Returns an empty string for empty or None input.
    """
    if not text:
        return ''

    capitalised = []
    for word in text.split(' '):
        if not word:
            # Consecutive spaces produce empty pieces; drop them.
            continue
        # ABcD -> abcd -> Abcd
        lowered = word.lower()
        capitalised.append(lowered[0].upper() + lowered[1:])
    res = ' '.join(capitalised)

    # Upper-case a lone letter sitting between a period and either another
    # period, whitespace or the end of the text. Lookarounds consume
    # nothing, so every initial in a chain such as ".b.c." is matched.
    res = re.sub(
        r'(?<=\.)[a-z](?=[.\s]|$)',
        lambda found: found.group(0).upper(),
        res,
    )
    return res


def clean_sentence(text: str) -> str:
    """Normalise spacing and punctuation noise around a raw name string.

    Removes email addresses, collapses repeated spaces and repeated
    commas, then trims stray leading punctuation / Thai tone marks and
    surrounding spaces and dashes.

    Returns an empty string for empty or None input.
    """
    if not text:
        return ''
    # Remove email addresses first, so the gap they leave is collapsed by
    # the whitespace pass below. Digits, '_', '%', '+' and '-' are accepted
    # so addresses such as john_123@uni-x.ac.th are removed too.
    text = re.sub(r'[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+', '', text)
    text = text.strip(' ')
    text = re.sub(' +', ' ', text)
    text = re.sub(',+', ',', text)
    # Remove language change fault typos
    text = text.lstrip(" ,.'๋์ื'")
    text = text.strip(' -–')
    return text

def split_sentence(text: str) -> str|list[str]:
    try:
        return text.split(',')
    except:
        return text
    
def remove_prefix_suffix(text: str) -> str:
    # Cannot use str.removeprefix() or str.removesuffix() because it will need the case to change.
    for prefix in prefixes:
        if text.lower().startswith(prefix.lower()):
            text = text[len(prefix):]
    for suffix in suffixes:
        if text.lower().endswith(suffix.lower()):
            text = text[:-len(suffix)]
    return text

def clean_eng(text: str) -> str:
    """Clean a Latin-script name.

    Pipeline: strip titles, keep only Latin characters and basic
    punctuation, tidy spacing, then apply title case.
    """
    untitled = remove_prefix_suffix(text)
    latin_only = extract_eng(untitled)
    return title_case(clean_sentence(latin_only))

def clean_thai(text: str) -> str:
    """Clean a Thai-script name.

    Pipeline: strip titles, keep only Thai characters and basic
    punctuation, then tidy spacing.
    """
    untitled = remove_prefix_suffix(text)
    thai_only = extract_thai(untitled)
    return clean_sentence(thai_only)

def clean_word(text: str) -> str | list[str]:
    """Clean one author entry.

    Returns:
        * the tidied text unchanged when it holds neither Thai nor Latin
          letters;
        * a cleaned string for a single-script entry;
        * a two-item list ``[first, second]`` for an entry mixing Thai and
          Latin script, ordered by whichever script appears first.

    Raises:
        ValueError: if ``detect_language`` returns an unexpected label.
    """
    text = clean_sentence(text)
    language = detect_language(text)
    if language is None:
        return text

    if language == 'English':
        return clean_eng(text)

    if language == 'Thai':
        return clean_thai(text)

    if language != 'Thai and English':
        raise ValueError('Unknown language error')

    # Remove irrelevant parts before judging language order
    text = remove_prefix_suffix(text)

    # The first character that belongs to either script decides the order.
    # Looking only at text[:2] misses entries that start with digits or
    # punctuation, or whose first two characters mix both scripts; those
    # used to be returned uncleaned.
    first_script = next(
        (script for script in map(detect_language, text) if script is not None),
        None,
    )
    if first_script is None:
        # Nothing left after removing titles.
        return text

    # The repeated passes are kept as they were: each clean_* call strips
    # titles again, so a second pass can remove a title exposed by the first.
    thai_text = clean_thai(text)
    thai_text = extract_thai(thai_text)
    thai_text = clean_thai(thai_text)

    eng_text = clean_eng(text)
    eng_text = extract_eng(eng_text)
    eng_text = clean_eng(eng_text)

    if first_script == 'Thai':
        return [thai_text, eng_text]
    return [eng_text, thai_text]

def final_clean(text: str) -> str:
    """Last-pass tidy: drop " ." pairs, trailing commas and ".." pairs."""
    without_spaced_dots = re.sub(r' \.', '', text)
    without_trailing_commas = without_spaced_dots.rstrip(',')
    return without_trailing_commas.replace('..', '')

def check_fail_words(text: str|list[str]) -> str:
    if not bool(text):
        return ''
    
    temp_text = text
    if isinstance(temp_text, list):
        temp_text = ' '.join(temp_text)
    for word in fail_words:
        if word.lower() in temp_text.lower():
            return ''
    return text

def append_names(name_list: list[str], authors: str|list[str]) -> list[str]:
    res = []
    if not bool(authors):
        return []
    if isinstance(authors, str):
        authors = [authors]
    for a in authors:
        a = final_clean(a)
        if a not in name_list:
            res.append(a)
    return res

def main():
    """Build ``submission.csv`` from the input CSV at ``path``.

    For every input row, the author and the comma-separated co-authors are
    cleaned and de-duplicated, and exactly ten ``(id, name)`` rows are
    emitted with ids ``<_id>_1`` .. ``<_id>_10``. Unused slots have a None
    name; names beyond the tenth are dropped.
    """
    # bool(np.nan) is True -> creates problems
    df = pd.read_csv(path).replace({np.nan: None})
    entries = []
    for _, row in df.iterrows():

        keys = [f"{row['_id']}_{i}" for i in range(1, 11)]
        names = []

        # Get author
        author = row['_source.author']
        author = check_fail_words(author)
        author = clean_sentence(author)
        author = clean_word(author)
        names.extend(append_names(names, author))

        # Get Co-authors
        co_authors = row['_source.co-author']
        co_authors = clean_sentence(co_authors)
        for ca in co_authors.split(','):
            ca = clean_word(ca)
            ca = check_fail_words(ca)
            names.extend(append_names(names, ca))

        # If nothing usable was found, emit ten empty slots; otherwise
        # clip to the number of slots.
        names = names[:len(keys)] if any(names) else []

        # zip_longest pads the missing names with None, so every row
        # contributes exactly len(keys) entries.
        entries.extend(zip_longest(keys, names))

    # Passing the column names to the constructor also works when the input
    # has no rows; assigning df.columns afterwards raises ValueError then.
    submission = pd.DataFrame(entries, columns=['id', 'name'])
    submission.to_csv('submission.csv', index=False)

if __name__ == '__main__':
    main()