In [1]:
import re, unicodedata, chinese_converter
from glob import glob
from lxml import etree


In [2]:
#text loading
corpus_files = glob("The-spoken-L1-corpus-main/L1-L1 transcripts/*.txt")
CM_txt = ""
for file in corpus_files:
    with open(file, "r", encoding='utf-8-sig') as f:
        CM_txt += f.read()

In [3]:
#pre-process corpus texts
processed_corpus = []
de_space = re.sub(" ", "\n", CM_txt)
de_mark = re.sub(r"<.+> ?", "", de_space)
ta_variant = re.sub(r"她", "他", de_mark)
TA_char = re.sub(r"TA", "他", ta_variant)
de_filler = re.sub(r"(?<![a-zA-Z])eng|erm?(?![a-zA-Z])", "\n", TA_char)
de_er2 = re.sub(r"儿(?!子|童)", "", de_filler)
sents = de_er2.split("\n")
for sent in sents:
    if re.search(r"[\u4e00-\u9fff]+", sent): 
        processed_corpus.append(sent)

In [4]:
#ngrams
def n_dict(n, corpus):
    n_dict = {}
    for sent in corpus:
        sent_len = len(sent)
        if sent_len >= n:
            for num in range(sent_len):
                if num+n <= sent_len:
                    n_gram = sent[num:num+n]
                    if n>2 and n_gram == sent[num]*n:
                        continue
                    else:
                        if n_gram in n_dict:
                            n_dict[n_gram] +=1
                        else:
                            n_dict[n_gram] =1
                else:
                    continue
    return n_dict
CM_fourdict = n_dict(4, processed_corpus)
CM_tridict = n_dict(3, processed_corpus)
CM_bidict = n_dict(2, processed_corpus)
CM_unidict = n_dict(1, processed_corpus)
print(len(CM_tridict), len(CM_bidict), len(CM_unidict))

105217 42055 2312


In [5]:
#sort dict by frequnecy into tuples
def sort_dict2tup(ndict, cutoff):
    tup_lst = []
    for key in ndict:
        if ndict[key] < 5: #skip extremely low frequency ngrams
            continue
        else:
            tup_lst.append((key,ndict[key]))
    tup_lst.sort(key = lambda x: x[1], reverse = True)
    return tup_lst[:cutoff+1]

#CM sorted tuples
CM_sort_fourtup = sort_dict2tup(CM_fourdict, len(CM_fourdict))
CM_sort_tritup = sort_dict2tup(CM_tridict, len(CM_tridict))
CM_sort_bitup = sort_dict2tup(CM_bidict, len(CM_bidict))
CM_sort_unitup = sort_dict2tup(CM_unidict, len(CM_unidict))


In [6]:
#remove nonword or phrase bigrams
CM_top_unigram = [tp[0] for tp in CM_sort_unitup[:7]] 
#['是', '就', '的', '我', '个', '那', '后']
CM_except_bigrams = ["可是", "然后", "还是", "什么"]

def rm_nonwords_bitup(ntup): #remove bigrams that contains CM_top_unigram--most likely non words or phrases
    cp_ntup = ntup.copy() #copy preserves the original ntup, making code edits easier
    for tp in cp_ntup[:]:
        if tp[0][0] in CM_top_unigram or tp[0][1] in CM_top_unigram: #any bigrams that has any unigrams from CM_top_unigram
            if tp[0] in CM_except_bigrams:
                continue
            else:
                cp_ntup.remove(tp)
        else:
            continue
    return cp_ntup
CM_bitup = rm_nonwords_bitup(CM_sort_bitup)


In [7]:
#remove nonword or phrase trigrams
CM_top_bigram = [tp[0] for tp in CM_bitup[:50]] #top 50 bigrams are all real words

def overlap_unigram(trigram, CM_top_unigram): #if more than 1 CM_top_unigram in the trigram, mark these trigrams to later reject them as these are likely nonwords
    count = 0
    for char in trigram:
        if char in CM_top_unigram:
            count += 1
    if count > 1:
        return True
    else:
        return False

CM_except_bigrams = ["为什么", "对不起", "新西兰", "怎么样"]
def rm_nonwords_tritup(ntup): #remove trigrams that has CM_top_bigram--most likely nonwords or phrases
    cp_ntup = ntup.copy()
    for tp in cp_ntup[:]:
        if tp[0][:2] in CM_top_bigram or tp[0][1:3] in CM_top_bigram:
            if tp[0] in CM_except_bigrams: #exclude exception(s) that are legitimate real words
                continue
            else:
                cp_ntup.remove(tp)
        elif overlap_unigram(tp[0], CM_top_unigram): #trigrams containing more than one common unigrams are likely nonwords/phrases
            if tp[0] == "有沒有" or tp[0] == "是不是": #exclude exception(s) that are legitimate real words
                continue
            else:
                cp_ntup.remove(tp)
        else:
            continue
    return cp_ntup
CM_tritup = rm_nonwords_tritup(CM_sort_tritup)


In [8]:
# remove overlapping higher gram counts to avoid inflated counts
sort_tup_list = [CM_sort_unitup, CM_bitup, CM_tritup, CM_sort_fourtup]
def correct_sort_ntup(n, cutoff): # n<3
    sort_ntup = sort_tup_list[n-1]
    sort_n1tup = sort_tup_list[n]
    correct_ndict = {} 
    for ngram, value in sort_ntup:
        correct_ndict[ngram] = value #collect ngram texts and freq into dictionary for easier frequency editing
        for n1gram, vl in sort_n1tup[:int(len(sort_tup_list[n])*0.0025)]: #if the concurrent ngram in most frequent n+1gram, this needs to be deducted to avoid inflated counts, eg., "什麼(what)" in "為什麼(why)"
            if ngram in n1gram:
                correct_ndict[ngram] -= vl
    correct_sort_ntup = sort_dict2tup(correct_ndict, cutoff) #sort the new dictionary results after the correction
    return correct_sort_ntup
x = 0.005
CM_top_tritup = CM_tritup[:int(len(CM_tritup)*x)]
CM_top_bitup = correct_sort_ntup(2, int(len(CM_bidict)*x))
sort_tup_list.pop(1)
sort_tup_list.append(CM_top_bitup)
#replace the old n-tuple with the corrected one. This is only meaningful for uni-tuple counts
CM_top_unitup = correct_sort_ntup(1, int(len(CM_unidict)*(20*x)))


In [9]:
#write the ngrams (text only) into txt file
with open("CM_trigram.txt", "w") as f:
    for tup in CM_top_tritup:
        f.write(tup[0] + "\n")
with open("CM_bigram.txt", "w") as f:
    for tup in CM_top_bitup:
        f.write(tup[0] + "\n")
with open("CM_unigram.txt", "w") as f:
    for tup in CM_top_unitup:
        f.write(tup[0] + "\n")

In [10]:
#read unique CM grams in simplified Chinese and break into list of strings
with open("TM_trigram.txt", "r") as f:
    TM_trigram_txt = chinese_converter.to_simplified(f.read())
with open("TM_bigram.txt", "r") as f:
    TM_bigram_txt = chinese_converter.to_simplified(f.read())
with open("TM_unigram.txt", "r") as f:
    TM_unigram_txt = chinese_converter.to_simplified(f.read())

TM_trigram = TM_trigram_txt.split("\n")[:-1]
TM_bigram = TM_bigram_txt.split("\n")[:-1]
TM_unigram = TM_unigram_txt.split("\n")[:-1]


In [11]:
#uniq TM frequent words
def uniq_gram_count(n, TM_ngram, cutoff):
    CM_ntup_lst = [CM_sort_unitup, CM_bitup, CM_tritup]
    uniq_TM = []
    uniq_TM_count = 0
    CM_frequent_ngram = []
    for g, f in CM_ntup_lst[n-1][:int(cutoff*len(CM_ntup_lst[n-1]))]:
        CM_frequent_ngram.append(g)
    for ng in TM_ngram:
        if ng not in CM_frequent_ngram:
            uniq_TM.append(ng)
            uniq_TM_count += 1
    return uniq_TM, uniq_TM_count

c = 0.8
print(uniq_gram_count(3, TM_trigram, c))
print(uniq_gram_count(2, TM_bigram, c))
print(uniq_gram_count(1, TM_unigram, 0.5*c))

(['真的喔', '后我们', '对不对', '跟你讲', '好不好', '跟他讲', '个礼拜', '跟我讲'], 8)
(['一个', 'XX', '这个', '个人', '后来', '嗯嗯', '之后', '两个', 'ei', '喔喔', '好笑', '讲说', '以后', 'eh', '个小', '哪里', '后你', '整个', '好啦', '喔对', '个什', '夸张', '三个', '个月', '后面', '最后', '很像', '个男', '个礼', '重点', '个啊', '超好', '讲话', '几个', '很久'], 35)
(['喔', '嗯', 'X', '啦', '超', '耶', '笑', '拜', '怪', '张', '睡', '久', '万', '湾', '眼'], 15)


In [12]:
sent_end = {}
for utter in processed_corpus:
    if utter[-1] in sent_end:
        sent_end[utter[-1]] +=1
    else:
        sent_end[utter[-1]] =1
sent_end_tup = []
for end in sent_end:
    sent_end_tup.append((end, sent_end[end]))

sent_end_tup.sort(key = lambda x: x[1], reverse = True)
print(sent_end_tup)

[('的', 757), ('吧', 541), ('了', 453), ('对', 438), ('是', 383), ('啊', 350), ('吗', 274), ('嘛', 250), ('个', 173), ('呀', 147), ('种', 136), ('呢', 126), ('好', 113), ('说', 108), ('样', 100), ('后', 88), ('哦', 88), ('得', 69), ('些', 64), ('多', 63), ('么', 61), ('人', 58), ('话', 58), ('候', 53), ('去', 51), ('来', 45), ('下', 44), ('你', 43), ('行', 43), ('点', 41), ('方', 39), ('子', 38), ('情', 37), ('有', 36), ('以', 36), ('年', 36), ('白', 35), ('边', 34), ('事', 34), ('哈', 34), ('觉', 33), ('西', 32), ('就', 32), ('着', 31), ('大', 31), ('呗', 31), ('少', 30), ('课', 29), ('那', 28), ('在', 26), ('道', 26), ('们', 26), ('欸', 24), ('上', 24), ('他', 24), ('家', 24), ('作', 23), ('题', 23), ('过', 22), ('实', 22), ('动', 22), ('能', 21), ('间', 21), ('我', 21), ('力', 21), ('天', 20), ('学', 20), ('面', 19), ('班', 19), ('钱', 19), ('到', 18), ('语', 18), ('啥', 18), ('看', 18), ('法', 18), ('玩', 18), ('业', 18), ('时', 17), ('为', 17), ('会', 16), ('车', 15), ('用', 15), ('师', 15), ('校', 14), ('吃', 14), ('市', 14), ('思', 14), ('次', 13), ('要', 13), ('错',