In [1]:
import re, unicodedata, chinese_converter, requests
from glob import glob
from lxml import html

In [2]:
#text loading
start_page = requests.get("http://spokentaiwanmandarin.nccu.edu.tw/corpus-data.html")
tree0 = html.fromstring(start_page.content)
corpus_pages = tree0.xpath("//a[@class='mod-articles-category-title ' and @href]/@href") #find the link destination

def scraper(url, corpus):
    page = requests.get(url)
    tree = html.fromstring(page.content)
    p_node = tree.xpath("//td[@style='width: 770px;']/p")
    #A consistent and unique feature that marks the field of speech transcription texts is the format of the cell. Therefore, use xpath to find the node with an attribute that specifies the target cell. 
    if p_node:
        dialogue = tree.xpath("//td[@style='width: 770px;']/p/text()")
    else:
        dialogue = tree.xpath("//td[@style='width: 770px;']/text()") #For some reason, the last sample T050 has one missing layer /p. This is to go around the variance.
    corpus += dialogue
    return

total_corpus = []
for page in corpus_pages: 
    scraper("http://spokentaiwanmandarin.nccu.edu.tw"+page, total_corpus)


In [3]:
#saving original text locally for future use if needed
with open("NCCU_spoken_mandarin_corpus.txt", "w") as f: #saving corpus to local file
    for sent in total_corpus:
        f.write(sent + "\n")

In [4]:
#pre-process corpus texts
#"hon" may be used as sentence end particles and is therefore preserved
processed_corpus = []
extend_count = 0 #syllable extension count--save for future projects: may be related to TM dialect prosody pattern
LN_lst = [] #foreign language use to track on the side: any super high fx that can match up to the top 500 mandarin char/word?

for turn in total_corpus:
    de_unicode = unicodedata.normalize("NFKD", turn)
    de_zero = re.sub(r"\(0?\)", "", de_unicode) #resolve byte difference issues in chinese characters
    if re.search(r"=+", de_zero):
        extend_count += len(re.findall(r"=+", de_zero))
    de_extend = re.sub(r"=+", "", de_zero) #remove syllable extension marks
    if re.search(r"<L[1-6].*L[1-6]>", de_extend):
        code_switch = re.findall(r"< ?L ?[1-6].*(?:L ?[1-6] ?>)", de_extend)
        LN_lst += code_switch
    elif re.search(r"[a-zA-Z ]+L ?[1-6] ?>", de_extend):
        code_switch = re.findall(r"[a-zA-Z ]+L ?[1-6] ?>", de_extend)
        LN_lst += code_switch
    de_Ln = re.sub(r"(?:< ?L ?[1-6].*(?:L ?[1-6] ?>)?)|(?:[a-zA-Z ]+L ?[1-6] ?>)", "", de_extend) #remove code switching marks
    delaugh = re.sub(r"[@＠]+", " ", de_Ln) #remove laughing marks
    de_bracket = re.sub(r"[\[\]]", "", delaugh) #remove overlapping utterance marks
    de_parenth = re.sub(r"\(.*\)", "", de_bracket)
    de_exclaim = re.sub(r"uh|huh|um|hm|mhm|TSK", " ", de_parenth) #may be used as sentence separater #remove nonspeech sounds
    de_X = re.sub(r"< ?[xX]|[xX] ?>", "", de_exclaim) #remove unclear char marks
    de_3dots = re.sub(r"…", r"..", de_X) #resolve another byte difference
    shi_variant = re.sub(r"甚", "什", de_3dots)
    ta_variant = re.sub(r"她", "他", shi_variant)
    ni_variant = re.sub(r"妳", "你", ta_variant) #the last 3 to treat chinese character variants as the same char 
    sent_marker = re.sub(r"\.\.\.?| ", ",", ni_variant) #turn pauses into ","
    sent_lst = sent_marker.split(",") #split text by "," into utterances

    for sent in sent_lst:
        if re.search(r"[\u4e00-\u9fff]+", sent):
            processed_corpus.append(sent) #remove utterances w/o any Chinese characters


In [5]:
#saving raw text locally for future use if needed
with open("processed_NCCU_corpus.txt", "w") as f: #saving corpus to local file
    for sent in processed_corpus:
        f.write(sent + "\n")

In [6]:
#making dictionary of {ngrams:frequency}
def n_dict(n, corpus):
    n_dict = {}
    for sent in corpus:
        sent_len = len(sent) 
        if sent_len >= n: #skip sentences shorter than n
            for num in range(sent_len):
                if num+n <= sent_len: #as long as the current n-gram reading window does not exceed the sentence length:
                    n_gram = sent[num:num+n] #for each char in the utterance, collect n chars starting from the current char to make n-gram
                    if n>2 and n_gram == sent[num]*n: #do not collect an ngram that has 3 or more consecutively repeated chars as these are often interjections
                        continue
                    else:
                        if n_gram in n_dict:
                            n_dict[n_gram] +=1
                        else:
                            n_dict[n_gram] =1
                else:
                    continue
    return n_dict
#TM ngram dictionary of {ngram:frequency}:
TM_fourdict = n_dict(4, processed_corpus)
TM_tridict = n_dict(3, processed_corpus) 
TM_bidict = n_dict(2, processed_corpus)
TM_unidict = n_dict(1, processed_corpus)
print(len(TM_tridict), len(TM_bidict), len(TM_unidict)) #n-gram type counts


114925 51605 2873


In [7]:
#sort dict by frequnecy into tuples
def sort_dict2tup(ndict, cutoff):
    tup_lst = []
    for key in ndict:
        if ndict[key] < 5: #skip extremely low frequency ngrams
            continue
        else:
            tup_lst.append((key,ndict[key])) #turn dictionary keys & values into tuple as dictionaries can not be sorted
    tup_lst.sort(key = lambda x: x[1], reverse = True) #order list from highest to lowest freqency
    return tup_lst[:cutoff] #cut off n-grams after xth item (x=cutoff)

#sorted ngram tuples
TM_sort_fourtup = sort_dict2tup(TM_fourdict, len(TM_fourdict)) #decided not to use this information in the end
TM_sort_tritup = sort_dict2tup(TM_tridict, len(TM_tridict))
TM_sort_bitup = sort_dict2tup(TM_bidict, len(TM_bidict))
TM_sort_unitup = sort_dict2tup(TM_unidict, len(TM_unidict))



In [8]:
"""def freq(ntup): #counting total frequency for later use
    freq = 0
    for ngram, f in ntup:
        freq += f
    return freq
print(freq(TM_sort_tritup))
print(freq(TM_sort_bitup))
print(freq(TM_sort_unitup))"""

'def freq(ntup): #counting total frequency for later use\n    freq = 0\n    for ngram, f in ntup:\n        freq += f\n    return freq\nprint(freq(TM_sort_tritup))\nprint(freq(TM_sort_bitup))\nprint(freq(TM_sort_unitup))'

In [9]:
#remove nonword or phrase bigrams
TM_top_unigram = ['是', '就', '我', '他', '的', '那', '有', '不'] #most frequent unigrams that are almost guarantee to be stand-alone real words; [tp[0] for tp in TM_sort_unitup[:7]]
TM_except_bigrams = ["可是", "然後", "還是", "什麼"] #bigrams that contains TM_top_unigram--most likely non words or phrases. This can later be compiled/expanded with a Chinese dictionary
def rm_nonwords_bitup(ntup): #remove except_bigrams
    cp_ntup = ntup.copy() #copy preserves the original ntup, making code edits easier
    for tp in cp_ntup[:]:
        if tp[0][0] in TM_top_unigram or tp[0][1] in TM_top_unigram: #any bigrams that has any unigrams from TM_top_unigram; note: there is ambiguity in some of these bigrams, e.g., 一起 can mean "together" or an incident counter like in 一起 fire/car crash... in which the bigram is actually a phrase, but considering this is a spoken corpus, the colloquial meaning are favored here
            if tp[0] in TM_except_bigrams: #exclude compound words that are legitimate words
                continue
            else:
                cp_ntup.remove(tp) #remove the ngram tuple if it has one of the char in TM_except_bigrams
        else:
            continue
    return cp_ntup
TM_bitup = rm_nonwords_bitup(TM_sort_bitup) #bigram tuples with possible phrases/nonwords removed


In [10]:
#remove nonword or phrase trigrams
TM_top_bigram = [tp[0] for tp in TM_bitup[:50]] #top 50 bigrams are mostly real words
#['然後', '什麼', '覺得', '對啊', '因為', '可是', '這樣', '知道', '他們', '一個', '時候', '所以', '可以', '可能', '對對', '不知', '好像', '不會', '現在', 'XX', '怎麼', '應該', '其實', '而且', '比較', '自己', '還是', '這個', '如果', '為什', '你們', '樣子', '東西', '一直', '不要', '也不', '很多', '還有', '個人', '已經', '反正', '想說', '感覺', '後來', '老師', '嗯嗯', '想要', '之後', '一下']

def overlap_unigram(trigram, TM_top_unigram): #if more than 1 TM_top_unigram in the trigram, mark these trigrams to later reject them as these are likely nonwords
    count = 0
    for char in trigram:
        if char in TM_top_unigram:
            count += 1
    if count > 1:
        return True
    else:
        return False
    
TM_except_trigrams = ["為什麼", "對不起", "怎麼樣"]
def rm_nonwords_tritup(ntup): #remove trigrams that has TM_top_bigram--most likely nonwords or phrases
    cp_ntup = ntup.copy()
    for tp in cp_ntup[:]:
        if tp[0][:2] in TM_top_bigram or tp[0][1:3] in TM_top_bigram:
            if tp[0] in TM_except_trigrams: #exclude exception(s) that are legitimate real words
                continue
            else:
                cp_ntup.remove(tp)
        elif overlap_unigram(tp[0], TM_top_unigram): #trigrams containing more than one common unigrams are likely nonwords/phrases
            if tp[0] == "有沒有" or tp[0] == "是不是": #exclude exception(s) that are legitimate real words
                continue
            else:
                cp_ntup.remove(tp)
        else:
            continue
    return cp_ntup
TM_tritup = rm_nonwords_tritup(TM_sort_tritup) #trigram tuples with possible phrases/nonwords removed


In [11]:
#remove overlapping higher gram counts to avoid inflated counts
sort_tup_list = [TM_sort_unitup, TM_bitup, TM_tritup, TM_sort_fourtup] 
def correct_sort_ntup(n, cutoff): # n<3
    sort_ntup = sort_tup_list[n-1] #sorted (n-gram, frequency)
    sort_n1tup = sort_tup_list[n] #sorted (n+1-gram, frequency)
    correct_ndict = {} 
    for ngram, value in sort_ntup:
        correct_ndict[ngram] = value #collect ngram texts and freq into dictionary for easier frequency editing
        for n1gram, vl in sort_n1tup[:int(len(sort_tup_list[n])*0.0025)]: #if the concurrent ngram in most frequent n+1gram, this needs to be deducted to avoid inflated counts, eg., "什麼(what)" in "為什麼(why)"
            if ngram in n1gram:
                correct_ndict[ngram] -= vl
    correct_sort_ntup = sort_dict2tup(correct_ndict, cutoff) #sort the new dictionary results after the correction
    return correct_sort_ntup
x = 0.02 #adjust for most frequent ___ words. Here I decided to use 2% after reviewing different criteria. At this criteria it yields around 50x occurrence of the n-grams and seems to preserve the most real words and return the least non-words/phrases
TM_top_tritup = TM_tritup[:int(len(TM_tritup)*x)]
TM_top_bitup = correct_sort_ntup(2, int(len(TM_bidict)*0.375*x))
sort_tup_list.pop(1) #replace the old n-tuple with the corrected one. This is only meaningful for uni-tuple counts
sort_tup_list.append(TM_top_bitup)
TM_top_unitup = correct_sort_ntup(1, int(len(TM_unidict)*10*x))


In [17]:
#save unique ngrams in txt files
with open("TM_trigram.txt", "w") as f:
    for tup in TM_top_tritup:
        f.write(tup[0] + "\n")
with open("TM_bigram.txt", "w") as f:
    for tup in TM_top_bitup:
        f.write(tup[0] + "\n")
with open("TM_unigram.txt", "w") as f:
    for tup in TM_top_unitup:
        f.write(tup[0] + "\n")


In [18]:
#read unique CM grams in traditional Chinese and break into list of strings
with open("CM_trigram.txt", "r") as f:
    CM_trigram_txt = chinese_converter.to_traditional(f.read())
with open("CM_bigram.txt", "r") as f:
    CM_bigram_txt = chinese_converter.to_traditional(f.read())
with open("CM_unigram.txt", "r") as f:
    CM_unigram_txt = chinese_converter.to_traditional(f.read())

CM_trigram = CM_trigram_txt.split("\n")[:-1]
CM_bigram = CM_bigram_txt.split("\n")[:-1]
CM_unigram = CM_unigram_txt.split("\n")[:-1]


In [19]:
#uniq CM frequent words
def uniq_gram_count(n, CM_ngram, cutoff): #
    TM_ntup_lst = [TM_sort_unitup, TM_bitup, TM_tritup] #(almost) all n-grams in TM
    uniq_CM = []
    uniq_CM_count = 0
    TM_frequent_ngram = []
    for g, f in TM_ntup_lst[n-1][:int(cutoff*len(TM_ntup_lst[n-1]))]:
        TM_frequent_ngram.append(g) #create a list of n-grams in TM by cutoff proportion (0<=cutoff<=1)
    for ng in CM_ngram:
        if ng not in TM_frequent_ngram:
            uniq_CM.append(ng) #if an n-gram does not exist in TM_frequent_ngram, then treat it as an unique CM n-gram and add to uniq_CM
            uniq_CM_count += 1 #track how many unique n-grams identified in the end
    return uniq_CM, uniq_CM_count

c = 0.9 #0.9 results seem more meaningful than 0.9 or lower
print(uniq_gram_count(3, CM_trigram, c))
#['新西蘭', '挺好的', '比方說', '但是但', '在北京', '肯定是', '在新西', '還挺好', '我之前', '西蘭的', '在國內', '個孩子', '如果你']
print(uniq_gram_count(2, CM_bigram, c))
#['肯定', '還挺', '明白', '英語', '也挺', '方說', '別特', '房東', '在國', '在北', '漢語', '麼著', '後當', '環境', '正好', '估計', '旅遊', '挺好', '咱們', '者說', '別好', '們倆', '吧然', '月份', '經常', '個孩', '屬於', '經歷']
print(uniq_gram_count(1, CM_unigram, c))
#['挺', '啥', '唄', '咱', '倆', '呆', '估']

(['新西蘭', '挺好的', '比方說', '但是但', '在北京', '肯定是', '在新西', '還挺好', '我之前', '西蘭的', '在國內', '個孩子', '如果你'], 13)
(['肯定', '還挺', '明白', '英語', '也挺', '方說', '別特', '房東', '在國', '在北', '漢語', '麼著', '後當', '環境', '正好', '估計', '旅遊', '挺好', '咱們', '者說', '別好', '們倆', '吧然', '月份', '經常', '個孩', '屬於', '經歷'], 28)
(['挺', '啥', '唄', '咱', '倆', '呆', '估'], 7)


In [20]:
#sentence-final particles (collect the last word of each utterance and then sort by frequency)
sent_end = {}
for utter in processed_corpus:
    if utter[-1] in sent_end: #collect last character in each utterance into sent_end and track the frequency
        sent_end[utter[-1]] +=1
    else:
        sent_end[utter[-1]] =1
sent_end_tup = []
for end in sent_end: #turn the sent_end dictionary into tuple list 
    sent_end_tup.append((end, sent_end[end]))

sent_end_tup.sort(key = lambda x: x[1], reverse = True) #sort the tuple list
freq_SFP = ['啊','喔','嗎','了','吧','啦','耶','欸','嘛','呀','呢','哦','噢','吶'] #extract the most frequent SFP in TM
common_SFP = ['啊','嗎','了','吧','嘛','呀','呢','哦'] #common SFPs only
def token_count(tuplst, lst): 
    count = 0
    for tp in tuplst:
        if tp[0] in lst:
            count += tp[1]
    return count
print(token_count(sent_end_tup, freq_SFP))
print(token_count(sent_end_tup, common_SFP)) #token frequency for the 8 common SFPs


12414
8620


In [39]:
common_SFP_omit_le = ['啊','嗎','吧','嘛','呀','呢','哦']
def SFP_count(chi_char):
    SFP_count = 0
    for utt in processed_corpus:
        for char in utt:
            if char == chi_char:
                SFP_count += 1
    return SFP_count
print(SFP_count("了"))

2366


In [35]:
def prob_SFP(corpus, SFP_lst):
    count = 0
    corpus_len = 0
    for utt in corpus:
        corpus_len += len(utt)
        for char in utt:
            if char in SFP_lst:
                count += 1
    return count/(corpus_len-SFP_count("了")), count, corpus_len-SFP_count("了")
print(prob_SFP(processed_corpus, common_SFP_omit_le))

(0.028853727973602437, 9304, 322454)


In [16]:
#chart making only

"""TM_sort_tritup
with open("TM_trigram.csv", "w") as f:
    f.write("phrase,freq\n")
    for tgram in TM_sort_tritup:
        f.write(tgram[0]+f",{tgram[1]}\n")
with open("TM_bigram.csv", "w") as f:
    f.write("phrase,freq\n")
    for bgram in TM_sort_bitup:
        f.write(bgram[0]+f",{bgram[1]}\n")
with open("TM_unigram.csv", "w") as f:
    f.write("phrase,freq\n")
    for ugram in TM_sort_unitup:
        f.write(ugram[0]+f",{ugram[1]}\n")"""

'TM_sort_tritup\nwith open("TM_trigram.csv", "w") as f:\n    f.write("phrase,freq\n")\n    for tgram in TM_sort_tritup:\n        f.write(tgram[0]+f",{tgram[1]}\n")\nwith open("TM_bigram.csv", "w") as f:\n    f.write("phrase,freq\n")\n    for bgram in TM_sort_bitup:\n        f.write(bgram[0]+f",{bgram[1]}\n")\nwith open("TM_unigram.csv", "w") as f:\n    f.write("phrase,freq\n")\n    for ugram in TM_sort_unitup:\n        f.write(ugram[0]+f",{ugram[1]}\n")'