In [19]:
from bs4 import BeautifulSoup as bs
import lxml
import os
import re
from tqdm import tqdm
import pandas as pd
import random
random.seed(2)

In [4]:
# from google.colab import drive
# drive.mount('/content/drive')

In [5]:
# Root folder that the TanakhDictaTEI-1 corpus and Ach.csv are resolved against.
# An empty string makes every path relative to the current working directory.
# The commented-out value is presumably the Google Drive location used on Colab.
# folder_path = '/content/drive/MyDrive/לימודים/NLP mini project/'
folder_path = ''

---
# Preprocessing data:
#### We iterate over every book in TanakhDictaTEI and retrieve only the sentences ("psukim") that contain the word "ACH".


In [6]:
# Locations of the three TEI sub-corpora (Torah, Prophets, Writings) below folder_path.
tei_root = os.path.join(folder_path, 'TanakhDictaTEI-1')
path_torah, path_prohets, path_writings = (
    os.path.join(tei_root, section) for section in ('Torah', 'Prophets', 'Writings')
)
# Order matters: the books are scanned in exactly this order.
paths = [path_torah, path_prohets, path_writings]

<div class="alert alert-block alert-info">
<b>Note:</b>  TEI from "moodle" should be stored in the same directory as the notebook
</div>

In [7]:
# Scan every TEI book and collect each sentence (<s>) that contains the word "ACH".
# A sentence is appended once per matching <w> token, so a sentence holding two ACH
# tokens appears twice in a row in `achs` (presumably so that `achs` lines up with
# one label per occurrence in Ach.csv -- confirm).
achs = []
for path in tqdm(paths):
    # Sorted so that the order of the collected sentences is deterministic.
    for filename in sorted(os.listdir(path)):
        # The corpus is Hebrew text and is matched against a Hebrew literal below, so
        # the encoding is stated explicitly instead of relying on the platform default.
        # NOTE(review): assumes the TEI files are UTF-8 -- confirm.
        with open(os.path.join(path, filename), 'r', encoding='utf-8') as f:
            content = f.read()
        bs_content = bs(content, "lxml")
        for s in bs_content.find_all('s'):
            for w in s.find_all('w'):
                # get_attribute_list yields [None] when the attribute is absent.
                dtoken = w.get_attribute_list('dtoken')[0]
                if dtoken is not None and "אך_אַךְ" in dtoken:
                    achs.append(s)



100%|█████████████████████████████████████████████| 3/3 [00:54<00:00, 18.23s/it]


In [None]:
# Load the 'תיוג' column of Ach.csv as a Series. Later cells index it with the same
# positions as `achs` (presumably one manual label per ACH occurrence -- confirm).
ach_tags = pd.read_csv(os.path.join(folder_path,'Ach.csv'))['תיוג']

---
# Retrieving neighbors in different window sizes:
#### For the word "ACH" in each sentence we wish to get the "window" of words that surrounds it. 
#### We iterate over the tags <w> in each sentence \<s\> and append the tags: <br>
> For window size 2:
    (<t\-1> , <t+1>) <br>
    For window size 4:
    (<t\-2>, <t\-1>, <t+1>, <t+2>) <br>

#### These lists contain the data needed for the feature vectors for our learning models. 

In [119]:
def w_vec(achs=achs):
    """Collect the words surrounding ACH in every sentence, for windows of 2 and 4.

    Args:
        achs: sequence of sentence tags (<s>). A sentence that holds several ACH
            tokens is expected to appear that many times in a row; the k-th
            consecutive appearance is matched to the k-th ACH token. If only a
            later appearance of such a sentence is passed in (e.g. in a subset),
            it is matched to the first ACH token, because the position of the
            entry in the original list is not known here.

    Returns:
        (vec2, vec4): two lists with one entry per element of ``achs``.
        ``vec2`` entries are ``[w(t-1), w(t+1)]`` and ``vec4`` entries are
        ``[w(t-2), w(t-1), w(t+1), w(t+2)]``, where ``t`` is the position of ACH
        among the sentence's <w> tags. Positions that fall outside the sentence
        are filled with ``-1``.

    Raises:
        ValueError: if a sentence contains no ACH token.
    """
    ach_marker = "אך_אַךְ"

    def neighbour(words, pos):
        # Pad with -1 whenever the window reaches past either end of the sentence.
        return words[pos] if 0 <= pos < len(words) else -1

    vec2, vec4 = [], []
    previous = None
    repeat = 0
    for s in achs:
        words = s.find_all('w')
        # Positions of every ACH token; a missing dtoken attribute counts as "not ACH".
        ach_positions = [
            idx for idx, w in enumerate(words)
            if ach_marker in (w.get_attribute_list('dtoken')[0] or "")
        ]
        if not ach_positions:
            raise ValueError("sentence contains no ACH token")

        # The same sentence again right after itself means "next ACH occurrence".
        repeat = repeat + 1 if s == previous else 0
        previous = s
        # Clamp so that an unexpected extra repetition reuses the last occurrence.
        i = ach_positions[min(repeat, len(ach_positions) - 1)]

        vec2.append([neighbour(words, i - 1), neighbour(words, i + 1)])
        vec4.append([
            neighbour(words, i - 2),
            neighbour(words, i - 1),
            neighbour(words, i + 1),
            neighbour(words, i + 2),
        ])
    return vec2, vec4

In [120]:
# Neighbour windows (sizes 2 and 4) for every entry of the full `achs` list.
vec2, vec4 = w_vec() 

---


# Feature vectors:
#### For our classification models, we first need to construct a feature representation of the data

## Base features:

> Vocabulary features <br>
Morphological features <br>
Syntactic features <br>
Lemmatizing features <br>

In [121]:
# vocabulary
def feature_vec_1(achs=achs, vec2 = vec2, vec4=vec4):
    """Build the vocabulary feature for both window sizes.

    Every neighbour word is replaced by the part of its ``dtoken`` attribute that
    precedes the first underscore; ``-1`` padding entries are passed through as is.
    ``achs`` is accepted but not used by this function.

    Returns:
        (vec_vocab2, vec_vocab4): lists parallel to ``vec2`` and ``vec4``.
    """
    def token_prefix(word):
        if word != -1:
            return word.get_attribute_list('dtoken')[0].split('_', 1)[0]
        return word

    vec_vocab2 = [[token_prefix(word) for word in window] for window in vec2]
    vec_vocab4 = [[token_prefix(word) for word in window] for window in vec4]
    return vec_vocab2, vec_vocab4

In [122]:
# morphology
def feature_vec_2(achs=achs, vec2 = vec2, vec4=vec4):
    """Build the morphology feature for both window sizes.

    For every neighbour word the ``dtoken`` attribute is cut after its first
    double underscore (the whole value is kept when there is none), and the text
    up to the next single underscore is used as the feature value. ``-1`` padding
    entries are passed through as is. ``achs`` is accepted but not used.

    Returns:
        (vec_morph2, vec_morph4): lists parallel to ``vec2`` and ``vec4``.
    """
    def morph_field(word):
        if word != -1:
            after_double = word.get_attribute_list('dtoken')[0].split('__', 1)[-1]
            return after_double.split('_', 1)[0]
        return word

    vec_morph2 = [[morph_field(word) for word in window] for window in vec2]
    vec_morph4 = [[morph_field(word) for word in window] for window in vec4]
    return vec_morph2, vec_morph4

In [123]:
#syntax
def feature_vec_3(achs=achs, vec2 = vec2, vec4=vec4):
    """Build the syntax feature for both window sizes.

    For the i-th window, the phrases of ``achs[i]`` (found under
    syntacticinfo -> clause -> phrase) are indexed by their ``id`` attribute.
    Every neighbour word is then replaced by the ``function`` attribute of the
    phrase named by the ``phraseid`` of the word's first <m> child (``None`` when
    the id is unknown). ``-1`` padding entries are passed through as is.

    Returns:
        (vec_syntax2, vec_syntax4): lists parallel to ``vec2`` and ``vec4``.
    """
    def phrase_functions(sentence):
        # Map phrase id -> syntactic function for one sentence.
        by_id = {}
        for info in sentence.find_all('syntacticinfo'):
            for clause in info.find_all('clause'):
                for phrase in clause.find_all('phrase'):
                    by_id[phrase.get_attribute_list('id')[0]] = phrase.get_attribute_list('function')[0]
        return by_id

    def encode(windows):
        encoded = []
        for position, window in enumerate(windows):
            by_id = phrase_functions(achs[position])
            # Two passes, as in the lookup order word -> phrase id -> function.
            phrase_ids = [
                word.find_all('m')[0].get_attribute_list('phraseid')[0] if (word != -1) else word
                for word in window
            ]
            encoded.append([
                by_id.get(phrase_id) if (phrase_id != -1) else phrase_id
                for phrase_id in phrase_ids
            ])
        return encoded

    vec_syntax2 = encode(vec2)
    vec_syntax4 = encode(vec4)
    return vec_syntax2, vec_syntax4

In [124]:
#lemma
def feature_vec_4(achs=achs, vec2 = vec2, vec4=vec4):
    """Build the lemma feature for both window sizes.

    Every neighbour word is replaced by its ``lemma`` attribute with the
    characters ``/``, ``=``, ``'`` and ``[`` removed; ``-1`` padding entries are
    passed through as is. ``achs`` is accepted but not used.

    Returns:
        (vec_lemma2, vec_lemma4): lists parallel to ``vec2`` and ``vec4``.
    """
    # NOTE(review): the character class does not contain ']' -- confirm that is intended.
    unwanted = re.compile("[/='[']")

    def clean_lemma(word):
        if word != -1:
            return unwanted.sub("", word.get_attribute_list('lemma')[0])
        return word

    vec_lemma2 = [[clean_lemma(word) for word in window] for window in vec2]
    vec_lemma4 = [[clean_lemma(word) for word in window] for window in vec4]
    return vec_lemma2, vec_lemma4

## Combinations of features:

#### The combinations we wish to create are:
> (morpho X syntax) <br>
(morpho X lemma) <br>
(syntax X lemma)


For example, we combine (morph X syntax) feature: <br>
> **Morphologic:** (sub, disj) <br>
**Syntactic:** (Conj, Verb) <br>
>>**$\rightarrow$ ((sub * Conj), (disj * Verb))**<br>




In [125]:

def tuple_features(vec_a_2=[], vec_b_2 =[], vec_a_4=[], vec_b_4 = []):
    """Pair two feature vectors position by position.

    For window size 2 a row ``[a, b]`` of the first feature and ``[c, d]`` of the
    second become ``((a, c), (b, d))``; window size 4 works the same way with four
    positions. Rows must have exactly 2 (respectively 4) entries.

    Returns:
        (tup_2, tup_4): lists of the combined rows.
    """
    tup_2 = []
    for first, second in zip(vec_a_2, vec_b_2):
        a, b = first
        c, d = second
        tup_2.append(((a, c), (b, d)))

    tup_4 = []
    for first, second in zip(vec_a_4, vec_b_4):
        a, b, c, d = first
        e, f, g, h = second
        tup_4.append(((a, e), (b, f), (c, g), (d, h)))

    return tup_2, tup_4


In [126]:
# Base features (vocabulary, morphology, syntax, lemma) and the three pairwise
# combinations, each for window sizes 2 and 4. All calls rely on the default
# arguments, i.e. on the module-level `achs`, `vec2` and `vec4`.
vec_vocab2, vec_vocab4 = feature_vec_1()
vec_morph2, vec_morph4 = feature_vec_2()
vec_syntax2, vec_syntax4 = feature_vec_3()
vec_lemma2, vec_lemma4  = feature_vec_4()
morph_and_syntax_2, morph_and_syntax_4 = tuple_features(vec_morph2, vec_syntax2, vec_morph4, vec_syntax4)
morph_and_lemma_2, morph_and_lemma_4 = tuple_features(vec_morph2, vec_lemma2, vec_morph4, vec_lemma4)
syntax_and_lemma_2, syntax_and_lemma_4 = tuple_features(vec_syntax2, vec_lemma2, vec_syntax4, vec_lemma4)

---
# Feature vectors to "one-hot-vector" representation:

#### For each feature set we collect the list of distinct values it takes; every value is then represented by its index in that list.   
#### This will match the requirements of the input for weka as "One-Hot-Vector".

In [127]:
def get_keys(vec = []):
    """Return the distinct values found in the rows of ``vec``.

    Args:
        vec: iterable of rows, each row an iterable of hashable feature values.

    Returns:
        A list of the unique values, ordered by first appearance when the rows
        are read left to right, top to bottom.
    """
    flattened = (value for row in vec for value in row)
    # dict keys keep insertion order, so this de-duplicates in a single pass
    # instead of calling list.index once per distinct value.
    return list(dict.fromkeys(flattened))

In [128]:
# Output folder, file-name stems and (window-2, window-4) vectors of the six exported feature sets.
dir_name = 'inputs'
names = ['vocab', 'morpho', 'syntax', 'morph_syn', 'morph_lemma', 'syn_lemma']
vecs = [
    (vec_vocab2, vec_vocab4),
    (vec_morph2, vec_morph4),
    (vec_syntax2, vec_syntax4),
    (morph_and_syntax_2, morph_and_syntax_4),
    (morph_and_lemma_2, morph_and_lemma_4),
    (syntax_and_lemma_2, syntax_and_lemma_4),
]
# One key list per feature set, built from its window-4 vectors; index_vector
# looks up the window-2 values in the same list.
tags = [get_keys(window4) for _, window4 in vecs]

---
# CSV files:
#### Creating the CSV files of the 12 vectors in the input format Weka requires:



In [151]:
def index_vector(tags=[], vec2=[], vec4 =[], name = "", dir_name_inputs = 'inputs', ach_tags_ = ach_tags):
    """Encode one feature set by key index and write it, with the labels, to CSV.

    Every value in ``vec4`` / ``vec2`` is replaced by its position in ``tags``.
    Two files are written under ``folder_path``:
    ``<dir_name_inputs>/<name>4.csv`` (columns w-2, w-1, w+1, w+2 + label) and
    ``<dir_name_inputs>/<name>2.csv`` (columns w-1, w+1 + label). The encoded
    features alone are also written to ``temps/<name>4`` and ``temps/<name>2``.

    Args:
        tags: list of the distinct feature values; the index in it is the code.
        vec2: rows of two feature values (window size 2).
        vec4: rows of four feature values (window size 4).
        name: file-name stem of the outputs.
        dir_name_inputs: output folder, relative to ``folder_path``.
        ach_tags_: pandas Series of labels, one per row, in row order.

    Raises:
        ValueError: if a feature value is not present in ``tags`` or a row does
            not have the expected width.
    """
    # Create the folders where the files are actually written (under folder_path),
    # not relative to the current working directory.
    out_dir = os.path.join(folder_path, dir_name_inputs)
    tmp_dir = os.path.join(folder_path, 'temps')
    os.makedirs(out_dir, exist_ok=True)
    os.makedirs(tmp_dir, exist_ok=True)

    # Attach labels by row position, whatever index the Series carries.
    labels = ach_tags_.reset_index(drop=True)

    windows = (
        ("4", vec4, ['w-2', 'w-1', 'w+1', 'w+2']),
        ("2", vec2, ['w-1', 'w+1']),
    )
    for suffix, rows, columns in windows:
        encoded = [tuple(tags.index(value) for value in row) for row in rows]
        features = pd.DataFrame(encoded, columns=columns)
        features.to_csv(os.path.join(tmp_dir, name + suffix), index=False)
        labelled = pd.concat([features, labels], axis=1)
        labelled.to_csv(os.path.join(out_dir, name + suffix + ".csv"), index=False)

In [152]:
# Export every feature set of the full data set (labels default to `ach_tags`).
for tag_keys, (window2, window4), feature_name in zip(tags, vecs, names):
    index_vector(tags=tag_keys, vec2=window2, vec4=window4, name=feature_name, dir_name_inputs=dir_name)

---
# Testing modifications on the input

>size - try half the size<br>
regularized - an equal number of examples per tag (at most 24 each)

### Regularized subset:

In [113]:
def get_indices(tags=None, max_per_tag=24):
    """Pick the positions of a class-balanced subset of the labels.

    The labels are read in order and a position is kept while fewer than
    ``max_per_tag`` positions carrying the same label have been kept. Only the
    four labels 'אכן', 'רק', 'אבל' and 'זה עתה' are considered; any other label
    is skipped.

    Args:
        tags: iterable of labels; defaults to the module-level ``ach_tags``.
        max_per_tag: maximum number of positions kept per label.

    Returns:
        List of kept positions, in increasing order.
    """
    if tags is None:
        tags = ach_tags
    # One counter per label, so that every label (including 'זה עתה') is capped.
    kept_per_tag = dict.fromkeys(('אכן', 'רק', 'אבל', 'זה עתה'), 0)
    regs = []
    for i, tag in enumerate(tags):
        if tag in kept_per_tag and kept_per_tag[tag] < max_per_tag:
            kept_per_tag[tag] += 1
            regs.append(i)
    return regs


In [176]:
# Positions of the class-balanced subset, the matching sentences and their labels.
# The labels are wrapped in a fresh DataFrame, so their index restarts at 0;
# column 0 of that frame is what is later passed to index_vector.
achs_regularized_indiced = get_indices()
achs_reg_xml = [achs[i] for i in achs_regularized_indiced]
achs_reg_csv = pd.DataFrame([ach_tags[i] for i in achs_regularized_indiced])

In [153]:
# Same feature construction as for the full data, restricted to the regularized
# (class-balanced) subset: neighbour windows, the four base features and the
# three pairwise combinations, each for window sizes 2 and 4.
vec2_reg, vec4_reg = w_vec(achs=achs_reg_xml) 
vec_vocab2_reg, vec_vocab4_reg = feature_vec_1(achs=achs_reg_xml, vec2=vec2_reg, vec4=vec4_reg)
vec_morph2_reg, vec_morph4_reg = feature_vec_2(achs=achs_reg_xml, vec2=vec2_reg, vec4=vec4_reg)
vec_syntax2_reg, vec_syntax4_reg = feature_vec_3(achs=achs_reg_xml, vec2=vec2_reg, vec4=vec4_reg)
vec_lemma2_reg, vec_lemma4_reg  = feature_vec_4(achs=achs_reg_xml, vec2=vec2_reg, vec4=vec4_reg)
morph_and_syntax_2_reg, morph_and_syntax_4_reg = tuple_features(vec_morph2_reg, vec_syntax2_reg, vec_morph4_reg, vec_syntax4_reg)
morph_and_lemma_2_reg, morph_and_lemma_4_reg = tuple_features(vec_morph2_reg, vec_lemma2_reg, vec_morph4_reg, vec_lemma4_reg)
syntax_and_lemma_2_reg, syntax_and_lemma_4_reg = tuple_features(vec_syntax2_reg, vec_lemma2_reg, vec_syntax4_reg, vec_lemma4_reg)

In [154]:
# Output folder, file-name stems and (window-2, window-4) vectors of the regularized subset.
dir_name_reg = 'inputs_reg'
names_reg = ['vocab', 'morpho', 'syntax', 'morph_syn', 'morph_lemma', 'syn_lemma']
vecs_reg = [
    (vec_vocab2_reg, vec_vocab4_reg),
    (vec_morph2_reg, vec_morph4_reg),
    (vec_syntax2_reg, vec_syntax4_reg),
    (morph_and_syntax_2_reg, morph_and_syntax_4_reg),
    (morph_and_lemma_2_reg, morph_and_lemma_4_reg),
    (syntax_and_lemma_2_reg, syntax_and_lemma_4_reg),
]
# One key list per feature set, built from its window-4 vectors.
tags_reg = [get_keys(window4) for _, window4 in vecs_reg]

In [175]:
# Export every feature set of the regularized subset together with its own labels.
for tag_keys, (window2, window4), feature_name in zip(tags_reg, vecs_reg, names_reg):
    index_vector(tags=tag_keys, vec2=window2, vec4=window4, name=feature_name, dir_name_inputs=dir_name_reg, ach_tags_=achs_reg_csv[0])

### Random subset:

In [177]:
# Random subset holding half of the collected ACH occurrences, kept in corpus order.
# The population size comes from `achs` instead of a hard-coded 161/80, so the cell
# keeps working when the number of collected sentences changes (161 entries still
# give a sample of 80). The draw uses the global `random` state seeded in the
# imports cell, so re-running this cell alone yields a different sample.
achs_rand_indices = sorted(random.sample(range(len(achs)), len(achs) // 2))
achs_rand_xml = [achs[i] for i in achs_rand_indices]
# Fresh DataFrame so that the label index restarts at 0; column 0 is used later.
achs_rand_csv = pd.DataFrame([ach_tags[i] for i in achs_rand_indices])

In [178]:
# Same feature construction as for the full data, restricted to the random
# subset: neighbour windows, the four base features and the three pairwise
# combinations, each for window sizes 2 and 4.
vec2_rand, vec4_rand = w_vec(achs=achs_rand_xml) 
vec_vocab2_rand, vec_vocab4_rand = feature_vec_1(achs=achs_rand_xml, vec2=vec2_rand, vec4=vec4_rand)
vec_morph2_rand, vec_morph4_rand = feature_vec_2(achs=achs_rand_xml, vec2=vec2_rand, vec4=vec4_rand)
vec_syntax2_rand, vec_syntax4_rand = feature_vec_3(achs=achs_rand_xml, vec2=vec2_rand, vec4=vec4_rand)
vec_lemma2_rand, vec_lemma4_rand  = feature_vec_4(achs=achs_rand_xml, vec2=vec2_rand, vec4=vec4_rand)
morph_and_syntax_2_rand, morph_and_syntax_4_rand = tuple_features(vec_morph2_rand, vec_syntax2_rand, vec_morph4_rand, vec_syntax4_rand)
morph_and_lemma_2_rand, morph_and_lemma_4_rand = tuple_features(vec_morph2_rand, vec_lemma2_rand, vec_morph4_rand, vec_lemma4_rand)
syntax_and_lemma_2_rand, syntax_and_lemma_4_rand = tuple_features(vec_syntax2_rand, vec_lemma2_rand, vec_syntax4_rand, vec_lemma4_rand)

In [179]:
# Output folder, file-name stems and (window-2, window-4) vectors of the random subset.
dir_name_rand = 'inputs_rand'
names_rand = ['vocab', 'morpho', 'syntax', 'morph_syn', 'morph_lemma', 'syn_lemma']
vecs_rand = [
    (vec_vocab2_rand, vec_vocab4_rand),
    (vec_morph2_rand, vec_morph4_rand),
    (vec_syntax2_rand, vec_syntax4_rand),
    (morph_and_syntax_2_rand, morph_and_syntax_4_rand),
    (morph_and_lemma_2_rand, morph_and_lemma_4_rand),
    (syntax_and_lemma_2_rand, syntax_and_lemma_4_rand),
]
# One key list per feature set, built from its window-4 vectors.
tags_rand = [get_keys(window4) for _, window4 in vecs_rand]

In [180]:
# Export every feature set of the random subset together with its own labels.
for tag_keys, (window2, window4), feature_name in zip(tags_rand, vecs_rand, names_rand):
    index_vector(tags=tag_keys, vec2=window2, vec4=window4, name=feature_name, dir_name_inputs=dir_name_rand, ach_tags_=achs_rand_csv[0])