# Table of Contents
 <p><div class="lev1 toc-item"><a href="#COPA-Preprocess" data-toc-modified-id="COPA-Preprocess-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>COPA Preprocess</a></div><div class="lev1 toc-item"><a href="#CausalNet-Preprocess" data-toc-modified-id="CausalNet-Preprocess-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>CausalNet Preprocess</a></div>

# COPA Preprocess

In [175]:
import json
import os
import spacy
from spacy.parts_of_speech import NOUN, VERB, ADJ, ADV, NUM, PROPN
from itertools import product

In [140]:
nlp = spacy.load('en')
DEV_DATA_DIR = '/Users/lizhn7/Downloads/EXPERIMENT/COPA/LM/data/copa-dev.json'
TEST_DATA_DIR = '/Users/lizhn7/Downloads/EXPERIMENT/COPA/LM/data/copa-test.json'

In [141]:
def load_data(path, name):
    """
    Load date from file
    """
    data = []
    input_file = os.path.join(path)
    with open(input_file) as f:
        lines = f.readlines()
    for line in lines:
        item = json.loads(line)
        data.append(item[name])
    return data

def isNoise(token):
    """
    Check if the token is a noise or not 
    """
    is_noise = False
    pos_tags = []
    if token.pos not in [NOUN, VERB, ADJ, ADV, NUM, PROPN]:
        is_noise = True
    elif token.is_stop == True:
        is_noise = True
    return is_noise

def clean(token):
    """
    Clean data
    """
    return token.lemma_

In [184]:
a=['1','2','3','4'];  
b=['4','5','6'];  
c = []
c = [i for i in product(a, b)]
    

In [185]:
c

[('1', '4'),
 ('1', '5'),
 ('1', '6'),
 ('2', '4'),
 ('2', '5'),
 ('2', '6'),
 ('3', '4'),
 ('3', '5'),
 ('3', '6'),
 ('4', '4'),
 ('4', '5'),
 ('4', '6')]

In [227]:
class Data:
    def __init__(self, path):
        self.premise = load_data(path, 'premise')
        self.ask_for = load_data(path, 'asks-for')
        self.alternative1 = load_data(path, 'alternative1')
        self.alternative2 = load_data(path, 'alternative2')
        self.label = load_data(path, 'most-plausible-alternative')
        
    def clean(self):
        premise = [[clean(i) for i in j if not isNoise(i)] for j in [nlp(i) for i in self.premise]]
        alternative1 = [[clean(i) for i in j if not isNoise(i)] for j in [nlp(i) for i in self.alternative1]]
        alternative2 = [[clean(i) for i in j if not isNoise(i)] for j in [nlp(i) for i in self.alternative2]]
        label = [int(l) for l in self.label]            
        return premise, alternative1, alternative2   
    
    def causalPair(self):
        p, a1, a2 = self.clean()
        cp1, cp2, c, e = [], [], [], []
        for i in range(500):
            if self.ask_for[i] == 'cause':
                cp1.append([cp for cp in product(a1[i], p[i])])
                cp2.append([cp for cp in product(a2[i], p[i])])
                c.append(a1[i]), c.append(a2[i]), e.append(p[i])
            else:
                cp1.append([cp for cp in product(p[i], a1[i])])
                cp2.append([cp for cp in product(p[i], a2[i])])
                c.append(p[i]), e.append(a1[i]), e.append(a2[i])
        return cp1, cp2, c, e

In [252]:
devData = Data(DEV_DATA_DIR)
testData = Data(TEST_DATA_DIR)

In [253]:
devcp1, devcp2, devCwords, devEwords  = devData.causalPair()
testcp1, testcp2, testCwords, testEwords  = testData.causalPair()

In [None]:
copaPairs = list(set(sum(devcp1+devcp2+testcp1+testcp2, [])))
copaCword = list(set(sum(devCwords+testCwords, [])))
copaEword = list(set(sum(devCwords+testEwords, [])))
copaWord = list(set(sum(sum(devData.clean(), []), []) + sum(sum(testData.clean(), []), [])))

In [313]:
len(copaPairs), len(copaCword), len(copaEword), len(copaWord)

(14532, 1901, 1903, 2764)

# CausalNet Preprocess

In [None]:
from tqdm import tqdm

In [None]:
filename = '/Users/lizhn7/Downloads/EXPERIMENT/COPA/CausalNet/CausalNet.txt'
raw_text = [i.split() for i in open(filename).read().split('\n')]

In [None]:
causeWords = []
effectWords = []
cePairs = []
freq = []
for i in tqdm(range(len(raw_text[:-1]))):
    causeWords.append(raw_text[i][0])
    effectWords.append(raw_text[i][1])
    cePairs.append(tuple(raw_text[i][:2]))
    freq.append(raw_text[i][-1])

In [324]:
causalPair = []
for i in tqdm(range(len(raw_text[:-1]))):
    if raw_text[i][0] in copaWord and raw_text[i][1] in copaWord:
        causalPair.append(raw_text[i])

KeyboardInterrupt: 

In [None]:
causeWord = set(causeWords)
effectWord = set(effectWords)
cePair = set(cePairs)

In [275]:
len(causeWord), len(effectWord), len(cePair)

(59411, 59710, 62675002)

In [303]:
ceFreq = {p: freq[cePairs.index(p)] if p in cePairs else 0 for p in tqdm(copaPairs[:10])}
#causeFreq =
#effectFreq = 





  0%|          | 0/10 [00:00<?, ?it/s][A[A[A[A



 10%|█         | 1/10 [00:01<00:15,  1.72s/it][A[A[A[A



 20%|██        | 2/10 [00:04<00:17,  2.21s/it][A[A[A[A



 30%|███       | 3/10 [00:06<00:14,  2.13s/it][A[A[A[A



 40%|████      | 4/10 [00:07<00:11,  2.00s/it][A[A[A[A



 50%|█████     | 5/10 [00:09<00:09,  1.99s/it][A[A[A[A



 60%|██████    | 6/10 [00:10<00:07,  1.77s/it][A[A[A[A



 70%|███████   | 7/10 [00:14<00:06,  2.05s/it][A[A[A[A



 80%|████████  | 8/10 [00:15<00:03,  1.96s/it][A[A[A[A



 90%|█████████ | 9/10 [00:17<00:01,  1.98s/it][A[A[A[A



100%|██████████| 10/10 [00:19<00:00,  1.97s/it][A[A[A[A



[A[A[A[A

In [320]:
#causeFreq = {p: freq[cePairs.index(p)] if p in cePairs else 0 for p in tqdm(copaPairs[:10])}
causeFreq = {c: sum([int(i[-1]) for i in raw_text[:-1] if c == i[0]]) if c in causeWord else 0 for c in tqdm(copaCword[:10])}

KeyboardInterrupt: 

In [1]:
causeFreq = {[int(i[-1]) for i in tqdm(raw_text[:-1]) if 'sun' == i[0]] for w in cause}

effectFreq = 

afreq = [int(i[-1]) for i in tqdm(raw_text[:-1]) if 'sun' == i[0]]

bfreq = [int(i[-1]) for i in tqdm(raw_text[:-1]) if 'body' == i[1]]

cfreq = [int(i[-1]) for i in tqdm(raw_text[:-1]) if 'sun' == i[0] and 'body' == i[1]]

M = 660412209
N = 62675002
ALPHA = 0.66

N = 62675002
M = 660412209