## L555  Final Project

Zeping Liu (zepliu@iu.edu) 

StartDate: November 17, 2021  
EndDate: December 8, 2021


The aim of this program is **to obtain key measurements of sentence complexity in elementary school instructional materials from Grade 1 to Grade 6 used in mainland China**. Drawing on studies in applied linguistics (Ortega, 2003; Kreyer, 2006; Cheung & Kemper, 1992; Lu, 2010), we operationalize sentence complexity with two groups of measures:

1. Length of sentences in terms of words and characters; 
2. Syntactic complexity:
    - The depth of syntactic trees;
    - The amount of common relative clauses;
    - The dependency distance.
    
References:
<br>
[1] Cheung, H. & Kemper, S. (1992). Competing complexity metrics and adults’ production of complex sentences. Applied Psycholinguistics, 13(1), 53-76.
<br>
[2] Kreyer, R. (2006). Inversion in Modern Written English: Syntactic Complexity, Information Status and the Creative Writer. Tübingen: Gunter Narr Verlag.
<br>
[3] Lu, X. (2010). Automatic analysis of syntactic complexity in second language writing. International Journal of Corpus Linguistics, 15(4), 474-496.
<br>
[4] Ortega, L. (2003). Syntactic complexity measures and their relationship to L2 proficiency: A research synthesis of college-level L2 writing. Applied Linguistics, 24(4), 492–518.
<br>
[5] Xia, F. (2000). The part-of-speech tagging guidelines for the Penn Chinese Treebank (3.0). IRCS Technical Reports Series, 38.

In [None]:
import stanza
import nltk

In [None]:
# Install CoreNLP
# Local directory used for the Stanford CoreNLP distribution; the same value is
# exported as CORENLP_HOME further down.
corenlp_path = '/Users/zeping/My Drive/Courses/Fall 2021/Programming for Computational Linguistics/Files/LING-L555/stanford-corenlp'
# One-time setup step, kept commented out so that re-running the notebook does not repeat it.
# stanza.install_corenlp(dir=corenlp_path)

### download chinese models: latest is 4.2.0
# NOTE(review): the line above says 4.2.0 but the call below requests 4.3.2 -- confirm which version is intended.
# stanza.download_corenlp_models(model='chinese', version='4.3.2', dir=corenlp_path)

In [None]:
### Import packages

from stanza.server import CoreNLPClient
import os, re

### Export the local CoreNLP directory as CORENLP_HOME so the computer knows where to look for CoreNLP

os.environ["CORENLP_HOME"] = corenlp_path

In [None]:
### Define a client object
# Configured for Chinese, with annotators running from tokenization up to
# constituency parsing, and with results requested as JSON.
client = CoreNLPClient(
    properties='chinese',
    memory='4G', 
    # annotators describe which tool(s) you want to use to process the input sentences
    # see full list: https://stanfordnlp.github.io/CoreNLP/annotators.html
    # ('pos' and 'parse' supply the POS tags and parse trees read by the cells below)
    annotators=['tokenize','ssplit', 'pos', 'lemma', 'ner', 'parse'],
    # address used to reach the CoreNLP server (local machine, port 9002)
    endpoint='http://localhost:9002',
    # json is easier to get the parse tree as a string
    output_format='json',
    # prints out the log
    be_quiet=False
)

In [None]:
# Start the client before any annotation request is sent
# (presumably this launches the CoreNLP backend at the configured endpoint -- confirm).
client.start()

In [None]:
# Try a short sample passage (several sentences, including quoted speech)

text = '''
小壁虎爬呀爬，爬到小河边。他看见小鱼摇着尾巴，在河里游来游去。小壁虎说：“小鱼姐姐，您把尾巴借给我行吗？”小鱼说：“不行啊，如果我没有尾巴，我就没有办法拨水了。”
'''

In [None]:
# Run the CoreNLP annotators on the sample; the result is a python dictionary
document = client.annotate(text)

# One column layout shared by the header row and every token row
row_fmt = "{:12s}\t{:12s}\t{:6s}\t{}"
print(row_fmt.format("Word", "Lemma", "POS", "NER"))
print('-' * 50)
print()

############# 1. Length #############

# Count Chinese words, sentence by sentence
D = []       # per-sentence word lists, reused for the dependency distance analysis
sum_t = 0    # running total of tokens that are not punctuation
for sent_no, sentence in enumerate(document['sentences'], start=1):
    tokens = sentence['tokens']
    print("[Sentence {}]".format(sent_no))
    D.append([tok['word'] for tok in tokens])
    for tok in tokens:
        print(row_fmt.format(tok['word'], tok['lemma'], tok['pos'], tok['ner']))
    # Tokens tagged 'PU' (punctuation) are left out of the word count
    n_words = sum(1 for tok in tokens if tok['pos'] != 'PU')
    print("Number of tokens is", n_words)
    print()
    sum_t += n_words
nSent = len(document['sentences'])

# Count Chinese characters
def hans_count(str):
    """Return how many characters of ``str`` lie in the range U+4E00..U+9FEF.

    Anything outside that range (punctuation, Latin letters, digits,
    whitespace, ...) is not counted.
    """
    return sum(1 for ch in str if '\u4e00' <= ch <= '\u9fef')
    
# Totals for the whole sample text, framed by two divider lines
summary_rows = (
    ("Total number of sentences is", nSent),
    ("Total number of tokens is", sum_t),
    ("Total number of characters is", hans_count(text)),
)
print('=' * 50)
for label, value in summary_rows:
    print(label, value)
print('=' * 50)

In [None]:
############# 2.1: Depth of syntactic trees #############

# Print every constituency parse and keep a one-line bracketed copy of it
trees_str = []
for i, sent in enumerate(document['sentences']):
    print("[Sentence {}]".format(i+1))
    print(sent['parse'])
    # Collapse each newline plus its indentation into a single space.
    # The pattern is a raw string: '\s' inside a plain literal is an invalid
    # escape sequence and triggers a warning on current Python versions.
    tbf = re.sub(r'\n\s+', ' ', sent['parse'])
    trees_str.append(tbf)
    print()

# Convert the bracketed strings to nltk trees
trees_nltk = [nltk.Tree.fromstring(tree_str) for tree_str in trees_str]
print(f'we have {len(trees_nltk)} trees')
print()

# Depth of each tree as reported by Tree.height(), summed over all sentences
total_sd = 0
for k in trees_nltk:
    k.pretty_print()
    kh = k.height()
    total_sd = total_sd + kh
    print("The syntactic depth is:", kh)
    print()

print('='*50)
print("Total number of syntactic depth is", total_sd)
print('='*50)

In [None]:
############# 2.2: Amount of Relative Clauses #############

# Every token whose POS tag is 'DEC' counts as one relative clause
# (presumably the treebank tag for the relativizer -- verify against the tagging guidelines).
nRC = sum(
    1
    for sentence in document['sentences']
    for token in sentence['tokens']
    if token['pos'] == 'DEC'
)
print("The amount of relative clauses is:", nRC)

In [None]:
############# 2.3: Dependency distance #############

nlp_zh = stanza.Pipeline("zh-hans", processors="lemma, tokenize, pos, depparse", tokenize_pretokenized=True)

# D stores the tokenization produced by CoreNLP; it is passed in pre-tokenized
tree = nlp_zh(D).sentences
odd = 0  # sum of the per-sentence average distances
for t in tree:
    # Punctuations are not taken into account
    tdnew = [d for d in t.dependencies if d[1] != 'punct']
    dd_sum = 0
    for dep in tdnew:
        # dep[2] is the word whose `id` and `head` are compared; dep[1] is the relation label.
        # NOTE(review): if the root is encoded with head 0, its "distance" is just the
        # word's position in the sentence -- confirm that including it is intended.
        dd = int(dep[2].id) - dep[2].head
        dd_sum = dd_sum + abs(dd)
        print(dep[0].text, dep[1], dep[2].text, dd)
    # Average once per sentence. A sentence without any countable relation
    # contributes 0 instead of reusing the previous sentence's value (or
    # raising NameError when it is the first sentence).
    ave_dd = dd_sum / len(tdnew) if tdnew else 0.0
    odd = odd + ave_dd
    print(nSent)
    print("The dependency distance of this sentence is:", ave_dd)
    print()
# Guard against a text that produced no sentences at all
print("The overall dependecy distance of this text is:", round(odd/nSent,2) if nSent else 0.0)

In [None]:
####################################################################################################################
################################ Apply these processes for each text in each grade level############################
####################################################################################################################

In [None]:
# Step 1: Import the files
from pathlib import Path
p = Path('/Users/zeping/My Drive/Courses/Fall 2021/Programming for Computational Linguistics/Files/LING-L555/Text/Grade1_A')

# Map every *.txt file in the folder to [raw text, statistics]; the statistics
# dict starts empty and is filled in once the text has been processed.
# read_text() closes the file again, and the encoding is stated explicitly so
# the Chinese text does not depend on the platform default.
# NOTE(review): assumes the corpus files are UTF-8 encoded -- confirm.
texts = {fn: [fn.read_text(encoding="utf-8"), {}] for fn in p.glob("*.txt")}

In [None]:
# Step 2: Define a function to process all texts

_ZH_DEP_PIPELINE = None  # stanza pipeline shared by all get_stats() calls, built on first use


def _zh_dep_pipeline():
    """Return the stanza dependency pipeline, building it only once."""
    global _ZH_DEP_PIPELINE
    if _ZH_DEP_PIPELINE is None:
        _ZH_DEP_PIPELINE = stanza.Pipeline("zh-hans", processors="lemma, tokenize, pos, depparse", tokenize_pretokenized=True)
    return _ZH_DEP_PIPELINE


def _mean_dependency_distance(sentence):
    """Return the mean absolute difference between word id and head id in one parsed sentence."""
    # Punctuations are not taken into account
    deps = [d for d in sentence.dependencies if d[1] != 'punct']
    if not deps:
        # No countable relation: contribute 0 rather than a stale or undefined value
        return 0.0
    # NOTE(review): if the root is encoded with head 0, its "distance" is just the
    # word's position in the sentence -- confirm that including it is intended.
    return sum(abs(int(d[2].id) - d[2].head) for d in deps) / len(deps)


def get_stats(text):
    """Compute the sentence-complexity measures for one text.

    The text is annotated with the module-level CoreNLP ``client``; the
    dependency distance is computed with a stanza pipeline that is fed the
    CoreNLP tokenization.

    Args:
        text: the raw text to analyse.

    Returns:
        A dict with the keys
        ``nSent``  - number of sentences,
        ``nWord``  - number of tokens whose POS tag is not 'PU',
        ``nCha``   - number of characters in the range U+4E00..U+9FEF,
        ``sDepth`` - sum of the parse-tree heights of all sentences,
        ``nRC``    - number of tokens tagged 'DEC',
        ``DD``     - mean of the per-sentence average dependency distances,
                     rounded to 2 decimals (0.0 for a text without sentences).
    """
    # annotation happens here
    document = client.annotate(text)
    sentences = document['sentences']
    nSent = len(sentences)

    # Aim 1: Length for sentences / words / characters
    D = [[t['word'] for t in sent['tokens']] for sent in sentences]
    sum_t = sum(
        1
        for sent in sentences
        for t in sent['tokens']
        if t.get('pos') != 'PU'
    )
    nCha = sum(1 for ch in text if '\u4e00' <= ch <= '\u9fef')

    # Aim 2: Depth of syntactic trees
    total_sd = 0
    for sent in sentences:
        # Flatten the multi-line bracketed parse before handing it to nltk
        tbf = re.sub(r'\n\s+', ' ', sent['parse'])
        total_sd = total_sd + nltk.Tree.fromstring(tbf).height()

    # Aim 3: The number of relative clauses by searching relativizers.
    # 'DEC' matches the tag used in the exploratory cell of this notebook.
    nRC = sum(
        1
        for sent in sentences
        for w in sent['tokens']
        if w['pos'] == 'DEC'
    )

    # Aim 4: Dependency distance
    overall_dd = 0.0
    if D:
        for parsed in _zh_dep_pipeline()(D).sentences:
            overall_dd = overall_dd + _mean_dependency_distance(parsed)

    # Return all measures
    return {'nSent': nSent,
            'nWord': sum_t,
            'nCha': nCha,
            'sDepth': total_sd,
            'nRC': nRC,
            'DD': round(overall_dd / nSent, 2) if nSent else 0.0}

In [None]:
# Step 3: Get the statistics for each text:

# texts maps each file path to [raw text, statistics]; fill in the statistics slot
for entry in texts.values():
    entry[1] = get_stats(entry[0])

# Rows are written in path order so the output is stable between runs
textlist = sorted(texts.items(), key=lambda item: item[0])
outfile = 'results.tsv'
# `with` closes the file even if a row fails to format; the encoding is explicit
# because file names may contain non-ASCII characters.
with open(outfile, 'w', encoding='utf-8') as outfilefd:
    for fn, sent in textlist:
        stats = sent[1]
        # Columns: file name, nSent, nWord, nCha, sDepth, nRC, DD.
        # DD is a float and needs '%.2f'; the previous format pushed it through
        # '%d' (truncating it) and then wrote the literal text '.2f'.
        print('%s\t%d\t%d\t%d\t%d\t%d\t%.2f' % (fn.name, stats['nSent'], stats['nWord'], stats['nCha'], stats['sDepth'], stats['nRC'], stats['DD']), file=outfilefd)

In [None]:
# No further annotation requests follow: stop the client
client.stop()

In [None]:
# Locate the output files
# 'results.tsv' is opened with a relative path, so it is written to the
# current working directory printed here.
import os
print(os.getcwd())

In [None]:
# Merry Christmas
# A 21-column ASCII tree: a star on top, the greeting, ten rows of alternating
# 'o' / '#' ornaments growing from 3 to 21 characters, and a trunk.
print('\n'.join(
    ['*'.center(21), ' ' + "* Merry Christmas *"]
    + [('o#' * c)[:c].center(21) for c in range(3, 22, 2)]
    + ['/|\\'.center(21)]
))