In [1]:
##### UNCOMMENT #####

# # Requirements
# !pip install transformers
# !pip install sentence-transformers
# !pip install sklearn

In [2]:
import os
import numpy as np
import re
import matplotlib.pyplot as plt

In [3]:
path = '../Semantic_chunk_identification' ##### MODIFY #####
os.chdir(path)

##### UNCOMMENT #####
# from google.colab import drive
# drive.mount('drive', force_remount=True)

# Chunking

## Getting the data

In [4]:
DATASET = "headlines" # or "images" or "answers-students"

if (DATASET == "headlines" or DATASET == "images") :
    ds1_chunked = f'train_2015_10_22.utf-8/STSint.input.{DATASET}.sent1.chunk.txt'  ##### MODIFY #####
    ds2_chunked = f'train_2015_10_22.utf-8/STSint.input.{DATASET}.sent2.chunk.txt'  ##### MODIFY #####
    ds1 = f'train_2015_10_22.utf-8/STSint.input.{DATASET}.sent1.txt'  ##### MODIFY #####
    ds2 = f'train_2015_10_22.utf-8/STSint.input.{DATASET}.sent2.txt'  ##### MODIFY #####
else :
    ds1_chunked = f'train_students_answers_2015_10_27.utf-8/STSint.input.{DATASET}.sent1.chunk.txt'  ##### MODIFY #####
    ds2_chunked = f'train_students_answers_2015_10_27.utf-8/STSint.input.{DATASET}.sent2.chunk.txt'  ##### MODIFY #####
    ds1 = f'train_students_answers_2015_10_27.utf-8/STSint.input.{DATASET}.sent1.txt'  ##### MODIFY #####
    ds2 = f'train_students_answers_2015_10_27.utf-8/STSint.input.{DATASET}.sent2.txt'  ##### MODIFY #####

In [5]:
# Put the sentences in lists

ds1_lines = [line.strip() for line in open(ds1)]
ds2_lines = [line.strip() for line in open(ds2)]
ds1_lines_chunked = [line.strip() for line in open(ds1_chunked)]
ds2_lines_chunked = [line.strip() for line in open(ds2_chunked)]

In [6]:
# One example

idx = 3
print(ds1_lines[idx])
print(ds1_lines_chunked[idx])

Syria peace plan conditions " unacceptable , " opposition says
[ Syria peace plan conditions ] [ " ] [ unacceptable ] [ , ] [ " ] [ opposition ] [ says ]


Train test split

In [7]:
train_size = int(3/4*len(ds1_lines))

ds1_lines_train = ds1_lines[:train_size]
ds1_lines_gold_train = ds1_lines_chunked[:train_size]
ds1_lines_test = ds1_lines[train_size:]
ds1_lines_gold_test = ds1_lines_chunked[train_size:]

ds2_lines_train = ds2_lines[:train_size]
ds2_lines_gold_train = ds2_lines_chunked[:train_size]
ds2_lines_test = ds2_lines[train_size:]
ds2_lines_gold_test = ds2_lines_chunked[train_size:]

In [8]:
print('Train size:', len(ds1_lines_train))
print('Test size:', len(ds1_lines_test))

Train size: 567
Test size: 189


## Chunking with transformers

Nous utilisons un pipeline qui tokenise une phrase en entrée, utilise un modèle BERT sur les tokens, et renvoie le chunking de la phrase d'entrée

In [9]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

tokenizer = AutoTokenizer.from_pretrained("vblagoje/bert-english-uncased-finetuned-chunk")
model = AutoModelForTokenClassification.from_pretrained("vblagoje/bert-english-uncased-finetuned-chunk")
pipe = pipeline('ner', grouped_entities=True, model=model, tokenizer=tokenizer)



**Méthode 2 pour obtenir les phrases chunkées:** On récupère directement les phrases chunkées en sortie du pipeline.

Cependant, ces dernières sont au format minuscule.

In [10]:
# Cette fonction retourne une phrase chunkée par le modèle mais EN MINUSCULES
def compute_chunk_list(pipe, sentence):
    chunk_dicts = pipe(sentence)
    chunk_list = []
    for chunk_dict in chunk_dicts:
        entity = chunk_dict['word']
        entity_group = chunk_dict['entity_group']
        chunk_list.append([entity, entity_group])
    return chunk_list

def get_gold_chunk_list(sentence): # EN MINUSCULES
    return sentence.lower().strip("[ ").strip(" ]").split(" ] [ ")

In [11]:
chunk_dicts = pipe('Syria peace plan conditions " unacceptable , " opposition says')
chunk_dicts[0]

{'entity_group': 'NP',
 'score': 0.9940616,
 'word': 'syria peace plan conditions',
 'start': 0,
 'end': 27}

**Méthode 1 pour obtenir les phrases chunkées:** On récupère les indexs de début et de fin de chunck pour récupérer les segments de phrase correspondant dans la phrase originale.

In [12]:
# Cette fonction retourne une phrase chunkée par le modèle COMME IL FAUT
def compute_chunk_list2(pipe, sentence):
    chunk_dicts = pipe(sentence)
    chunk_list = []
    for chunk_dict in chunk_dicts:
        entity = sentence[chunk_dict['start']:chunk_dict['end']]
        entity_group = chunk_dict['entity_group']
        chunk_list.append([entity, entity_group])
    return chunk_list

def get_gold_chunk_list2(sentence): # SANS MINUSCULES
    return sentence.strip("[ ").strip(" ]").split(" ] [ ")

In [13]:
ds1_train_chunks2 = [compute_chunk_list2(pipe, sentence) for sentence in ds1_lines_train]

In [14]:
ds1_train_chunks2[0]

[['Former Nazi death camp guard Demjanjuk', 'NP'],
 ['dead', 'ADJP'],
 ['at', 'PP'],
 ['91', 'NP']]

## Evaluation

On compare le chunking du modèle avec les gold chunks

In [15]:
ds1_train_gold_chunks2 = [get_gold_chunk_list2(sentence) for sentence in ds1_lines_gold_train]

In [16]:
print(ds1_train_chunks2[0])
print(ds1_train_gold_chunks2[0])

[['Former Nazi death camp guard Demjanjuk', 'NP'], ['dead', 'ADJP'], ['at', 'PP'], ['91', 'NP']]
['Former Nazi death camp guard Demjanjuk', 'dead', 'at 91']


In [17]:
# La fonction de score du chunking cherche si chaque chunk de l'output a un gold chunk correspondant.

def chunking_score(output_chunks, gold_chunks):
    score = 0
    for i in range(len(output_chunks)):
        sent_score = 0
        for chk in output_chunks[i]:
            if chk[0] in gold_chunks[i]:
                sent_score += 1
        score += sent_score/len(gold_chunks[i])
#         if i%10==0:
#             print(output_chunks[i])
#             print(gold_chunks[i])
    return score/len(output_chunks)

In [18]:
score = chunking_score(ds1_train_chunks2, ds1_train_gold_chunks2)

In [19]:
score

0.5704308282350613

Writing predicted chunking in a .txt file

In [20]:
# Tests
sentence = ds1_train_chunks2[0]
print(sentence)

string = "[ "
for i in range(len(sentence)-1):
    string = string + sentence[i][0] + " ] [ "
string = string + sentence[-1][0] + " ]\n"
print(string)
print(ds1_lines_gold_train[0])

[['Former Nazi death camp guard Demjanjuk', 'NP'], ['dead', 'ADJP'], ['at', 'PP'], ['91', 'NP']]
[ Former Nazi death camp guard Demjanjuk ] [ dead ] [ at ] [ 91 ]

[ Former Nazi death camp guard Demjanjuk ] [ dead ] [ at 91 ]


In [21]:
chunking_file = open(f"{DATASET}_predicted_chunks.txt","w")

for sentence in ds1_train_chunks2:
    string = "[ "
    for i in range(len(sentence)-1):
        string = string + sentence[i][0] + " ] [ "
    string = string + sentence[-1][0] + " ]\n"
    chunking_file.write(string)
    
chunking_file.close()

# Alignement

### Définition des méthodes

In [22]:
from math import *
from scipy.optimize import linear_sum_assignment

def norm(x):
    norm=0
    for elt in x:
        norm+=elt**2
    norm=sqrt(norm)
    return(norm)

def similarity(x,y):
    '''
    Cosine Similarity
    '''
    sim=np.dot(x, y)
    sim=sim/(norm(x)*norm(y))
    return(sim)

Méthode d'alignement : Hungarian Algorithm (a.k.a. the Kuhn-Munkres algorithm)

In [23]:
def alignment_chunks(sentence_1, sentence_2):
    '''
    Hungarian Algorithm : aligner chunks pour meilleur score possible de phrase 

    Idea for improvement : 
    Normalize chunk similarity by number of tokens in shorter chunk 
    such that it assigned higher scores to pairs of chunks such as physician and general physician.
    '''
    cost=[]
    if len(sentence_2)<len(sentence_1): ##If more rows than columns, not every row needs to be assigned to a column, and vice versa.
        s1, s2 = sentence_2, sentence_1 #Always select shorter sentence as first sentence !
        inv=True
    else:
        s1, s2=sentence_1, sentence_2
        inv=False

    for i in range (len(s1)):
        inter=[]
        for j in range(len(s2)):
            sim=alignment_mots(s1[i], s2[j])        
            inter.append(sim)
        cost.append(inter)
    cost=np.array(cost)

    row_ind, col_ind = linear_sum_assignment(-cost) # - car linear_sum_assignment minimise normalement
    
    #Liste de score de similarité pour chaque alignement optimal de chunks
    sim=cost[row_ind, col_ind]
    sim=min_max_scaler(sim)

    if inv:
        row_ind, col_ind=col_ind, row_ind

    list_couples_chunks_et_score=[]
    for i in range(len(sim)):
        list_couples_chunks_et_score.append((row_ind[i], col_ind[i], sim[i]))

    return(list_couples_chunks_et_score)


def alignment_mots(chunk_1, chunk_2):
    '''
    Hungarian Algorithm sur 2 chunks : aligner mots pour meilleur score possible
    '''
    if len(chunk_2)<len(chunk_1): ##If more rows than columns, not every row needs to be assigned to a column, and vice versa.
        c1, c2 = chunk_2, chunk_1 #Always select shorter sentence as first sentence !
    else:
        c1, c2 = chunk_1, chunk_2
    cost=[]
    for i in range (len(c1)):
        inter=[]
        for j in range(len(c2)):
            inter.append(similarity(c1[i], c2[j]))
        cost.append(inter)
    cost=np.array(cost)

    # print(cost.shape)
    # print(chunk_1, chunk_2)
    # print()

    #col_ind, #col_ind donne alignement des mots pour obtenir meilleur score de similarité possible
    row_ind, col_ind = linear_sum_assignment(-cost) #- car linear_sum_assignment minimise normalement
    
    #score de similarité entre 2 chunks
    sim=cost[row_ind, col_ind].sum()

    return(sim)
 

def min_max_scaler(sim):
    scaled=[]
    a, b = 3, 1 #On remarque que les scores non scalés sont tous entre 1 et 3 #np.max(sim), np.min(sim) 
    
    for elt in sim:
        inter = (elt-b)/(a-b)
        inter = inter*5 #Pour avoir des scores entre 0 et 5
        inter = round(inter) #Pour avoir des scores entiers
        scaled.append(inter)
    return(scaled)

In [27]:
from tensorflow.keras.layers import TextVectorization
import tensorflow as tf

vectorizer = TextVectorization(max_tokens=20000, output_sequence_length=200)
text_ds = tf.data.Dataset.from_tensor_slices(ds1_lines).batch(128)
vectorizer.adapt(text_ds)

def create_list_of_embedded_chunks_exploitable_for_alignement(chunked_sentence):
  chunk=''

  list_sentence=[]
  for i in range(len(chunked_sentence)):
    elt = chunked_sentence[i]
    if elt!='[' or elt !=']' or elt!=' ':
      chunk += elt
    if elt==']':
      chunk = removeNoise(chunk)
      list_sentence.append(chunk)
      chunk = ''

  list_sentence_embedded=[]
  for elt in list_sentence:
    inter = elt.split(' ')
    inter_copy=inter[:] #inter without '

    for elt_bis in inter_copy:
      if elt_bis=='':
        inter_copy.remove(elt_bis)

    inter_embedded=[]
    for elt_ter in inter_copy:
      elt = vectorizer(elt_ter)
      elt = elt.numpy()
      elt = list(elt)
      if len(elt)!=0:
        inter_embedded.append(elt)

    list_sentence_embedded.append(inter_embedded)

  return(list_sentence_embedded)

def removeNoise(line):
    noiseTokens = ['[', ']', ',', '.', ':', ';', '"', "'", "''", "`", "-",]
    for token in noiseTokens:
      line = line.replace(token, "")
    return line

# def my_cosine_similarity(chunk1, liste_chunck_comparables):
#   nombre_chunk_comparables = len(liste_chunck_comparables)
#   sentences = [chunk1[0]]
#   sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')
#   for i in range(nombre_chunk_comparables):
#     print(liste_chunck_comparables[i][0])
#     sentences.append(liste_chunck_comparables[i][0])
#   sentence_embeddings = sbert_model.encode(sentences)
#   cos_sim = cosine_similarity(sentence_embeddings)
#   return cos_sim

### Tests préliminaires performance algo

**Remarque:** Notre algorithme d'alignement de chunks ne fonctionne pas pour certaines phrases et nous n'avons pas trouvé d'explications à celà pour le moment.

> Calcul de la proportion de combinaisons de phrases renvoyant un message d'erreur en prenant les 10 premières phrases du corpus.

In [28]:
n= 10#len(testSamplesChunked)

n_failed=0

for i in range(n-1):
  for j in range(i+1,n):
    try:

      sentence_1=ds2_lines_chunked[i]  # testSamplesChunked[i]
      sentence_2=ds2_lines_chunked[j]  # testSamplesChunked[j]

      sentence_a=create_list_of_embedded_chunks_exploitable_for_alignement(sentence_1)
      sentence_b=create_list_of_embedded_chunks_exploitable_for_alignement(sentence_2)

      print(alignment_chunks(sentence_a, sentence_b))
    except:
      n_failed+=1
      pass

nb_couples=n*(n+1)*0.5
print(n_failed/nb_couples) #Proportions de combinaisons ne fonctionnant pas m

[(0, 0, 2), (1, 1, 0), (2, 2, 2), (3, 3, 2)]
[(0, 0, 2), (1, 1, 0), (2, 2, 2)]
[(0, 3, 2), (1, 0, 0), (2, 4, 2), (3, 2, 0)]
[(0, 0, 2), (1, 1, 0), (2, 3, 2), (3, 2, 0)]
[(0, 3, 2), (1, 0, 0), (2, 1, 0), (3, 2, 0)]
[(0, 0, 2), (1, 1, 0), (2, 2, 5)]
[(0, 3, 2), (1, 0, 0), (2, 4, 5), (3, 2, 0)]
[(0, 0, 2), (1, 1, 0), (2, 3, 2), (3, 2, 0)]
[(0, 1, 0), (1, 0, 0), (2, 3, 5), (3, 2, 0)]
[(0, 3, 2), (1, 0, 0), (2, 4, 5)]
[(0, 0, 2), (1, 1, 0), (2, 3, 2)]
[(0, 1, 0), (1, 0, 0), (2, 3, 5)]
[(3, 0, 2), (0, 1, 0), (1, 2, 0), (4, 3, 2)]
[(0, 0, 0), (1, 1, 0), (2, 2, 0), (3, 3, 5)]
[(0, 3, 2), (1, 0, 0), (2, 1, 0), (3, 2, 0)]
0.5454545454545454


Recherche des chunks alignés avec des scores de similarités égaux à 5 pour étudier les performances de notre modèle

In [29]:
n= 100#len(testSamplesChunked)

n_failed=0
list_best_scores=[]

for i in range(1,n-1):
  for j in range(i+1,n):
    try:

      sentence_1=ds2_lines_chunked[i]
      sentence_2=ds2_lines_chunked[j]

      sentence_a=create_list_of_embedded_chunks_exploitable_for_alignement(sentence_1)
      sentence_b=create_list_of_embedded_chunks_exploitable_for_alignement(sentence_2)

      for elt in alignment_chunks(sentence_a, sentence_b)[:][2]:
        if elt==5:
          list_best_scores.append((i,j))

      #print(alignment_chunks(sentence_a, sentence_b), (i,j))
    except:
      n_failed+=1
      pass

In [30]:
for (i,j) in list_best_scores:
    sentence_1=ds2_lines_chunked[i]
    sentence_2=ds2_lines_chunked[j]

    sentence_a=create_list_of_embedded_chunks_exploitable_for_alignement(sentence_1)
    sentence_b=create_list_of_embedded_chunks_exploitable_for_alignement(sentence_2)

    print(alignment_chunks(sentence_a, sentence_b))
    print(sentence_1)
    print(sentence_2)

[(0, 0, 2), (1, 1, 0), (2, 2, 5)]
[ Drone strike ] [ kills ] [ four suspected militants ] [ in Pakistan ]
[ Syria peace ] [ dashed ] [ as deadline passes ]
[(0, 3, 2), (1, 0, 0), (2, 4, 5), (3, 2, 0)]
[ Drone strike ] [ kills ] [ four suspected militants ] [ in Pakistan ]
[ Syria ] [ blames ] [ rebels ] [ for Houla massacre ] [ of over 100 ]
[(0, 1, 0), (1, 0, 0), (2, 3, 5), (3, 2, 0)]
[ Drone strike ] [ kills ] [ four suspected militants ] [ in Pakistan ]
[ China ] [ detains ] [ 7 ] [ for `` house sister '' scandal ]
[(0, 0, 2), (1, 1, 0), (2, 3, 5), (3, 2, 2)]
[ Drone strike ] [ kills ] [ four suspected militants ] [ in Pakistan ]
[ Boeing stock ] [ tumbles ] [ after fire ] [ on 787 Dreamliner ]
[(0, 1, 2), (1, 0, 0), (2, 3, 5), (3, 2, 2)]
[ Drone strike ] [ kills ] [ four suspected militants ] [ in Pakistan ]
[ Palestinians ] [ clash with ] [ security forces ] [ in W. Bank ]
[(0, 2, 2), (1, 0, 0), (2, 3, 5), (3, 1, 0)]
[ Drone strike ] [ kills ] [ four suspected militants ] [ in Pak

[(0, 2, 2), (1, 0, 0), (2, 3, 5)]
[ Syria peace ] [ dashed ] [ as deadline passes ]
[ 10 ] [ killed ] [ in road accident ] [ in NW Pakistan ]
[(0, 0, 2), (1, 1, 0), (2, 3, 5)]
[ Syria peace ] [ dashed ] [ as deadline passes ]
[ Shinzo Abe ] [ is ] [ Japan ] [ 's prime minister ]
[(0, 0, 2), (1, 1, 0), (2, 2, 5)]
[ Syria peace ] [ dashed ] [ as deadline passes ]
[ Michael Schumacher ] [ still ] [ in a coma ] [ fighting ] [ for his life ] [ following ] [ skiing accident ]
[(0, 0, 2), (1, 1, 0), (2, 3, 5)]
[ Syria peace ] [ dashed ] [ as deadline passes ]
[ EU ministers ] [ hold ] [ crisis talks ] [ over horse meat scandal ]
[(0, 2, 2), (1, 0, 0), (2, 4, 5)]
[ Syria peace ] [ dashed ] [ as deadline passes ]
[ Brazil ] [ held ] [ by England ] [ 2-2 ] [ as Maracana re-opens ]
[(0, 2, 2), (1, 1, 0), (2, 0, 5)]
[ Syria peace ] [ dashed ] [ as deadline passes ]
[ George W Bush ] [ weighs into ] [ immigration debate ]
[(0, 1, 0), (1, 0, 0), (2, 3, 5)]
[ Syria peace ] [ dashed ] [ as deadline pa

[(3, 0, 2), (0, 1, 0), (4, 2, 5), (2, 3, 0)]
[ Syria ] [ blames ] [ rebels ] [ for Houla massacre ] [ of over 100 ]
[ Ariel Castro ] [ sentenced ] [ to 1,000 years ] [ in prison ]
[(3, 0, 5), (0, 1, 0), (4, 2, 5)]
[ Syria ] [ blames ] [ rebels ] [ for Houla massacre ] [ of over 100 ]
[ Indian rape victim ] [ dies ] [ in Singapore hospital ]
[(0, 0, 0), (1, 1, 0), (3, 2, 5)]
[ Syria ] [ blames ] [ rebels ] [ for Houla massacre ] [ of over 100 ]
[ Workers ] [ protest ] [ after Bangladesh building collapse ]
[(0, 0, 0), (1, 1, 0), (3, 2, 5)]
[ Syria ] [ blames ] [ rebels ] [ for Houla massacre ] [ of over 100 ]
[ Snowden ] [ releases ] [ first Russia video ]
[(3, 0, 2), (0, 1, 0), (4, 2, 5)]
[ Syria ] [ blames ] [ rebels ] [ for Houla massacre ] [ of over 100 ]
[ Italian journalist ] [ picked ] [ as presidential candidate ]
[(0, 0, 0), (1, 1, 0), (3, 2, 5)]
[ China ] [ detains ] [ 7 ] [ for `` house sister '' scandal ]
[ Egypt ] [ bracing ] [ for massive protests ]
[(0, 0, 0), (1, 1, 0), 

[(2, 0, 5), (0, 1, 0), (3, 2, 5)]
[ 10 ] [ killed ] [ in road accident ] [ in NW Pakistan ]
[ Indian rape victim ] [ dies ] [ in Singapore hospital ]
[(0, 0, 0), (1, 1, 0), (2, 3, 5), (3, 2, 2)]
[ 10 ] [ killed ] [ in road accident ] [ in NW Pakistan ]
[ Israel ] [ agrees ] [ to free ] [ 104 Palestinian prisoners ]
[(0, 0, 0), (1, 1, 0), (2, 4, 5), (3, 3, 2)]
[ 10 ] [ killed ] [ in road accident ] [ in NW Pakistan ]
[ Navy panel ] [ urges ] [ ouster ] [ of officer ] [ who faked death ]
[(0, 0, 0), (1, 1, 0), (2, 3, 5), (3, 2, 2)]
[ 10 ] [ killed ] [ in road accident ] [ in NW Pakistan ]
[ Militants ] [ kill ] [ 6 soldiers ] [ in northwest Pakistan ]
[(0, 0, 0), (1, 1, 0), (2, 2, 5)]
[ 10 ] [ killed ] [ in road accident ] [ in NW Pakistan ]
[ Workers ] [ protest ] [ after Bangladesh building collapse ]
[(0, 0, 0), (1, 1, 0), (2, 2, 5)]
[ 10 ] [ killed ] [ in road accident ] [ in NW Pakistan ]
[ Snowden ] [ releases ] [ first Russia video ]
[(0, 2, 0), (1, 1, 0), (2, 3, 5), (3, 0, 2)]
[ 

[(0, 0, 0), (1, 1, 0), (4, 2, 5)]
[ Brazil ] [ held ] [ by England ] [ 2-2 ] [ as Maracana re-opens ]
[ Snowden ] [ releases ] [ first Russia video ]
[(2, 0, 2), (0, 1, 0), (4, 2, 5)]
[ Brazil ] [ held ] [ by England ] [ 2-2 ] [ as Maracana re-opens ]
[ Italian journalist ] [ picked ] [ as presidential candidate ]
[(0, 3, 5), (1, 4, 2), (2, 5, 2)]
[ George W Bush ] [ weighs into ] [ immigration debate ]
[ US ] [ says ] [ Syria ] [ may have used ] [ sarin gas in ] [ chemical weapons ]
[(0, 0, 0), (1, 1, 0), (3, 2, 5)]
[ Hundreds ] [ fall ] [ sick ] [ in Bangladesh factory ]
[ Rouhani ] [ leads ] [ in initial Iran count ]
[(0, 0, 0), (1, 1, 0), (3, 2, 5)]
[ Hundreds ] [ fall ] [ sick ] [ in Bangladesh factory ]
[ Egypt ] [ bracing ] [ for massive protests ]
[(0, 0, 0), (1, 1, 0), (3, 2, 5)]
[ Hundreds ] [ fall ] [ sick ] [ in Bangladesh factory ]
[ Residents ] [ return ] [ to Texas blast site ]
[(0, 0, 0), (1, 1, 0), (3, 2, 5)]
[ Hundreds ] [ fall ] [ sick ] [ in Bangladesh factory ]
[ T

[(0, 0, 0), (1, 1, 0), (2, 5, 5)]
[ Egypt ] [ bracing ] [ for massive protests ]
[ South Korea ] [ confirms ] [ that ] [ North Korea ] [ has conducted ] [ controversial third nuclear test ]
[(0, 0, 0), (1, 1, 0), (2, 5, 5)]
[ Egypt ] [ bracing ] [ for massive protests ]
[ South Korea ] [ confirms ] [ that ] [ North Korea ] [ has conducted ] [ controversial third nuclear test ]
[(0, 0, 0), (1, 1, 0), (2, 2, 5)]
[ Egypt ] [ bracing ] [ for massive protests ]
[ Ten ] [ killed ] [ in Russia coal mine blast ]
[(0, 2, 0), (1, 1, 0), (2, 0, 5)]
[ Egypt ] [ bracing ] [ for massive protests ]
[ 5.6 magnitude earthquake ] [ shakes ] [ Iran ]
[(0, 0, 0), (1, 1, 0), (2, 3, 5)]
[ Egypt ] [ bracing ] [ for massive protests ]
[ Russian parliament ] [ to consider ] [ ban ] [ on U.S. adoptions ]
[(0, 0, 0), (1, 1, 0), (2, 3, 5)]
[ Egypt ] [ bracing ] [ for massive protests ]
[ Egypt ] [ 's military ] [ takes control ] [ over Muslim Brotherhood , supporters ]
[(0, 0, 0), (1, 1, 0), (2, 2, 5)]
[ Egypt ] 

[(2, 0, 0), (1, 1, 0), (0, 2, 5)]
[ Series of attacks ] [ kill ] [ 10 police ] [ in Afghanistan ]
[ Workers ] [ protest ] [ after Bangladesh building collapse ]
[(2, 0, 0), (1, 1, 0), (0, 2, 5)]
[ Series of attacks ] [ kill ] [ 10 police ] [ in Afghanistan ]
[ Snowden ] [ releases ] [ first Russia video ]
[(2, 0, 2), (1, 1, 0), (0, 2, 5)]
[ Series of attacks ] [ kill ] [ 10 police ] [ in Afghanistan ]
[ Italian journalist ] [ picked ] [ as presidential candidate ]
[(0, 0, 0), (1, 1, 0), (2, 3, 5)]
[ Residents ] [ return ] [ to Texas blast site ]
[ Mali ] [ Counts ] [ Votes ] [ After Presidential Runoff ]
[(0, 0, 0), (1, 1, 0), (2, 5, 8)]
[ Residents ] [ return ] [ to Texas blast site ]
[ South Korea ] [ confirms ] [ that ] [ North Korea ] [ has conducted ] [ controversial third nuclear test ]
[(0, 2, 0), (1, 1, 0), (2, 0, 5)]
[ Residents ] [ return ] [ to Texas blast site ]
[ 5.6 magnitude earthquake ] [ shakes ] [ Iran ]
[(0, 0, 0), (1, 1, 0), (2, 3, 5)]
[ Residents ] [ return ] [ to 

[(0, 0, 0), (1, 1, 0), (2, 2, 5)]
[ Ten ] [ killed ] [ in Russia coal mine blast ]
[ Snowden ] [ releases ] [ first Russia video ]
[(0, 0, 0), (1, 1, 0), (2, 3, 5)]
[ Ten ] [ killed ] [ in Russia coal mine blast ]
[ Car bombing ] [ kills ] [ 14 ] [ in northern Iraq ]
[(0, 0, 0), (1, 1, 0), (2, 3, 5)]
[ Ten ] [ killed ] [ in Russia coal mine blast ]
[ Missing red panda ] [ returned ] [ safely ] [ to National Zoo ]
[(0, 0, 0), (1, 1, 0), (2, 3, 5)]
[ Ten ] [ killed ] [ in Russia coal mine blast ]
[ Google shares ] [ soar ] [ past $ 1,000 ] [ on strong earnings ]
[(0, 0, 0), (1, 1, 0), (2, 2, 5)]
[ Ten ] [ killed ] [ in Russia coal mine blast ]
[ Italian journalist ] [ picked ] [ as presidential candidate ]
[(0, 0, 0), (1, 1, 0), (2, 3, 5)]
[ Ten ] [ killed ] [ in Russia coal mine blast ]
[ Car bombings ] [ kill ] [ 13 civilians ] [ in Iraqi capital ]
[(0, 0, 0), (1, 2, 0), (2, 1, 5)]
[ Ten ] [ killed ] [ in Russia coal mine blast ]
[ Series ] [ of deadly attacks ] [ strikes ] [ Iraq ]
[(

[(3, 0, 2), (0, 1, 0), (5, 2, 2), (4, 3, 5)]
[ US ] [ says ] [ Syria ] [ may have used ] [ sarin gas in ] [ chemical weapons ]
[ Car bombings ] [ kill ] [ 13 civilians ] [ in Iraqi capital ]
[(3, 0, 5), (0, 1, 0), (4, 2, 5), (5, 3, 2), (1, 4, 0)]
[ US ] [ says ] [ Syria ] [ may have used ] [ sarin gas in ] [ chemical weapons ]
[ Former CIA officer ] [ sentenced ] [ to 30 months ] [ in prison ] [ for info leak ]
[(3, 0, 2), (0, 1, 0), (5, 2, 2), (4, 3, 5)]
[ US ] [ says ] [ Syria ] [ may have used ] [ sarin gas in ] [ chemical weapons ]
[ Pakistan-UN report ] [ reveals ] [ alarming levels ] [ of drug use ]
[(0, 3, 2), (1, 0, 0), (2, 4, 5), (3, 2, 0), (4, 1, 0)]
[ Israeli lawmakers ] [ call ] [ for internal probe ] [ into case ] [ of Prisoner X ]
[ Nigeria ] [ announces ] [ plans ] [ to change ] [ polymer Naira notes ]
[(2, 0, 5), (0, 1, 0), (4, 2, 5)]
[ Israeli lawmakers ] [ call ] [ for internal probe ] [ into case ] [ of Prisoner X ]
[ Indian rape victim ] [ dies ] [ in Singapore hosp

### Script

Alignement sur les gold chunks

In [31]:
dataset1_chunked = ds1_lines_chunked
dataset2_chunked = ds2_lines_chunked

alignements_gold = []
for S1, S2 in zip(dataset1_chunked, dataset2_chunked): 

  #Preprocess
  s1 = create_list_of_embedded_chunks_exploitable_for_alignement(S1)
  s2 = create_list_of_embedded_chunks_exploitable_for_alignement(S2)

  print(s1)
  print(s2)

  score_line = alignment_chunks(s1, s2)
  alignements_gold.append(score_line)

#alignements_gold

[[[259, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1291, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

ValueError: expected a matrix (2-D array), got a (0,) array

Alignement sur les chunks prédits

In [None]:
chunkedLines1 = chunkDataset(trainSamples, showErrors=False)
chunkedLines2 = chunkDataset(testSamples, showErrors=False)

alignements_pred = []
for S1, S2 in zip (chunkedLines1, chunkedLines2): 

  # Suppression des mots hors des chunks avant alignement
  s1 = removeWordsOutOfChunks(S1)
  s2 = removeWordsOutOfChunks(S2)

  #Preprocess
  s1 = create_list_of_embedded_chunks_exploitable_for_alignement(s1)
  s2 = create_list_of_embedded_chunks_exploitable_for_alignement(s2)

  score_line = alignment_chunks(s1, s2)
  alignements_pred.append(score_line)

# alignements_pred

ValueError: ignored