# Table of Contents
 <p>

In [32]:
import os
import spacy
from spacy.parts_of_speech import NOUN, VERB, ADJ, ADV, NUM
from nltk import regexp_tokenize
import numpy as np
import json

In [33]:
# Shared spaCy pipeline; the sentences are run through it further down.
nlp = spacy.load('en')

# Tokens removed by del_stop. Every key maps to None, which del_stop uses as
# its "drop this token" marker.
stopWords = dict.fromkeys(('e1', 'e2', 'ding', 'd', 'j', 'r', 't'))

# Words treated as causal indicators; del_indicator filters them out.
indicator = [
    'lead', 'leading', 'because', 'thus', 'therefore',
    'consequence', 'due', 'result', 'hence', 'cause',
    'induce', 'inducing', 'causing', 'reason', 'effect',
]

def load_data(path):
    """
    Load data from file

    Opens the file at `path` as UTF-8 text and returns its entire contents
    as a single string.
    """
    with open(os.path.join(path), mode='r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents
    
def isNoise(token):
    """
    Check if the token is a noise or not

    A token is noise when its part-of-speech id (`token.pos`) is not one of
    NOUN, VERB, ADJ, ADV or NUM, or when it is flagged as a stop word
    (`token.is_stop`).

    :param token: object exposing `pos` and `is_stop` (a spaCy token in this
                  file)
    :return: True if the token should be discarded, False otherwise
    """
    # Part-of-speech filter first; `is_stop` is only consulted for tokens
    # that pass it.
    if token.pos not in (NOUN, VERB, ADJ, ADV, NUM):
        return True
    return bool(token.is_stop)

def clean(token):
    """
    Clean data

    Returns the `lemma_` attribute of `token`.
    """
    lemma = token.lemma_
    return lemma

def cut(s):
    """
    Word segmentation

    Splits `s` into tokens with a regular expression that recognises, in
    order of priority: dotted upper-case abbreviations, numbers (optionally
    with a decimal part and a trailing percent sign), and words that may
    contain internal hyphens, ampersands or apostrophes.

    :param s: text to tokenize
    :return: list of token strings as returned by `regexp_tokenize`
    """
    # The inline (?x) flag must be the very first thing in the pattern.
    # If anything precedes it -- even the newline/indent that follows the
    # opening triple quote -- Python 3.6+ emits a DeprecationWarning and
    # Python 3.11+ raises re.error.
    pattern = r'''(?x)                   # set flag to allow verbose regexps
              (?:[A-Z]\.)+           # abbreviations, e.g. U.S.A.
              |\d+(?:\.\d+)?%?       # numbers, incl. decimals and percentages
              |\w+(?:[-&']\w+)*      # words w/ optional internal hyphens/apostrophe
           '''
    return regexp_tokenize(s, pattern=pattern)

def find_pn(ws):
    """
    Find paired nominals

    Returns the tokens enclosed by the 'e1' marker pair and by the 'e2'
    marker pair in `ws`.

    :param ws: sequence of tokens in which each nominal is wrapped in a pair
               of identical marker tokens ('e1' ... 'e1', 'e2' ... 'e2')
    :return: tuple `(pn1, pn2)` of token slices. If a marker occurs more
             than twice, the span between its last two occurrences is
             returned. If a marker occurs fewer than twice, an empty slice
             is returned for it instead of failing on an unbound variable.
    """
    def _between_last_pair(marker):
        # One pass to collect the marker positions.
        positions = [idx for idx, word in enumerate(ws) if word == marker]
        if len(positions) < 2:
            # No complete pair: empty slice of the same sequence type.
            return ws[:0]
        return ws[positions[-2] + 1:positions[-1]]

    return _between_last_pair('e1'), _between_last_pair('e2')

def del_stop(ws):
    """
    Delete stopwords

    Each word is looked up, lower-cased, in `stopWords`. A word whose lookup
    yields None is dropped; a word that is not a key is kept unchanged.
    """
    kept = []
    for word in ws:
        mapped = stopWords.get(word.lower(), word)
        if mapped is not None:
            kept.append(mapped)
    return kept

def del_indicator(ws):
    """
    Delete causal indicators

    Returns a new list holding the items of `ws` that do not appear in
    `indicator`, in their original order.
    """
    remaining = []
    for word in ws:
        if word in indicator:
            continue
        remaining.append(word)
    return remaining

def write_to_file(content, name):
    """
    Write data to json file

    Appends `content`, serialised as one JSON document followed by a
    newline, to the UTF-8 file `name`. Non-ASCII characters are written
    as-is rather than escaped.
    """
    # The with-statement closes the file; no explicit close() is needed.
    with open(name, mode='a', encoding='utf-8') as out:
        record = json.dumps(content, ensure_ascii=False)
        out.write(record + '\n')
        
def gen_data(data, causal=True):
    """
    Data generator

    Yields one record per item of `data`, in order.

    :param data: iterable of sentences (any iterable works, not only
                 indexable sequences)
    :param causal: when truthy every record is labelled 1, otherwise 0
    :return: generator of dicts `{'sentence': item, 'label': 0 or 1}`
    """
    # The label is the same for every record, so compute it once.
    label = 1 if causal else 0
    for item in data:
        yield {
            'sentence': item,
            'label': label
        }

In [4]:
data_path = '/Users/lizhn7/Downloads/EXPERIMENT/COPA/FINAL/semeval.txt'
# Split the file on tabs and discard everything before the first tab.
data = load_data(data_path).strip().split('\t')[1:]
# Keep only the first two lines of each tab-delimited chunk.
data = [record.split('\n')[:2] for record in data]
sentence = [record[0] for record in data]
# Label 1 for either direction of the Cause-Effect relation, 0 otherwise.
label = [
    1 if record[-1] in ('Cause-Effect(e2,e1)', 'Cause-Effect(e1,e2)') else 0
    for record in data
]

In [5]:
# Tokenise each sentence, then drop the stop tokens.
sentWords = [del_stop(cut(s)) for s in sentence]
# Re-join the tokens and split the sentences by label.
causalSent = [' '.join(ws) for ws, lab in zip(sentWords, label) if lab == 1]
noncausalSent = [' '.join(ws) for ws, lab in zip(sentWords, label) if lab == 0]

In [28]:
# Causal sentences: parse with spaCy, keep the lemmas of non-noise tokens,
# strip the causal indicator words, then keep lists of at most 12 words.
trueWords = [
    del_indicator([clean(tok) for tok in doc if not isNoise(tok)])
    for doc in [nlp(text) for text in causalSent]
]
trueWords = [ws for ws in trueWords if len(ws) <= 12]

# Non-causal sentences: same pipeline, but only lists of 6 to 12 words are
# kept.
# NOTE(review): the lower bound of 6 applies only to the non-causal set --
# confirm this asymmetry is intentional.
falseWords = [
    del_indicator([clean(tok) for tok in doc if not isNoise(tok)])
    for doc in [nlp(text) for text in noncausalSent]
]
falseWords = [ws for ws in falseWords if 6 <= len(ws) <= 12]

In [34]:
# Append every causal example (label 1) to the output file, one JSON
# document per line.
for i in gen_data(trueWords, causal=True):
    write_to_file(i, '/Users/lizhn7/Downloads/EXPERIMENT/COPA/FINAL/semeval.json')

# Then append the non-causal examples (label 0) to the same file.
# NOTE: write_to_file opens the file in append mode, so running this cell
# again adds the same records a second time.
for i in gen_data(falseWords, causal=False):
    write_to_file(i, '/Users/lizhn7/Downloads/EXPERIMENT/COPA/FINAL/semeval.json')