In [None]:
##### UNCOMMENT #####

# # Requirements
# !pip install transformers
# !pip install sentence-transformers
# !pip install scikit-learn

In [4]:
import os
import numpy as np
import re
import matplotlib.pyplot as plt

In [3]:
# Move into the project root so that the relative dataset paths built in the
# following cells resolve. Adjust `path` to your own checkout location.
path = '../Semantic_chunk_identification' ##### MODIFY #####
os.chdir(path)

##### UNCOMMENT #####

# from google.colab import drive
# drive.mount('drive', force_remount=True)

# Chunking

## Getting the data

In [5]:
DATASET = "headlines" # or "images" or "answers-students"

# "headlines" and "images" live in one training archive; any other dataset
# name falls back to the student-answers archive.
if DATASET in ("headlines", "images"):
    dataset_dir = 'train_2015_10_22.utf-8'  ##### MODIFY #####
else:
    dataset_dir = 'train_students_answers_2015_10_27.utf-8'  ##### MODIFY #####

# The four input files share one prefix and differ only by sentence side
# (sent1 / sent2) and by the ".chunk" marker of the bracketed gold files.
dataset_prefix = f'{dataset_dir}/STSint.input.{DATASET}'
ds1_chunked = f'{dataset_prefix}.sent1.chunk.txt'
ds2_chunked = f'{dataset_prefix}.sent2.chunk.txt'
ds1 = f'{dataset_prefix}.sent1.txt'
ds2 = f'{dataset_prefix}.sent2.txt'

In [6]:
# Put the sentences in lists

def _read_stripped_lines(file_path):
    """Return every line of `file_path` with surrounding whitespace removed.

    The file is read as UTF-8 (the dataset folders are named `*.utf-8`) and is
    closed as soon as reading finishes, instead of leaking the file handle.
    """
    with open(file_path, encoding='utf-8') as handle:
        return [line.strip() for line in handle]

ds1_lines = _read_stripped_lines(ds1)
ds2_lines = _read_stripped_lines(ds2)
ds1_lines_chunked = _read_stripped_lines(ds1_chunked)
ds2_lines_chunked = _read_stripped_lines(ds2_chunked)

In [8]:
# One example: a raw sentence followed by its bracketed (chunked) version

idx = 3
print(ds1_lines[idx], ds1_lines_chunked[idx], sep="\n")

Syria peace plan conditions " unacceptable , " opposition says
[ Syria peace plan conditions ] [ " ] [ unacceptable ] [ , ] [ " ] [ opposition ] [ says ]


In [18]:
# The first three quarters of the lines form the training split and the rest
# the test split; the same cut index is applied to all four lists.
train_size = int(3/4*len(ds1_lines))

ds1_lines_train, ds1_lines_test = ds1_lines[:train_size], ds1_lines[train_size:]
ds1_lines_gold_train, ds1_lines_gold_test = (
    ds1_lines_chunked[:train_size],
    ds1_lines_chunked[train_size:],
)

ds2_lines_train, ds2_lines_test = ds2_lines[:train_size], ds2_lines[train_size:]
ds2_lines_gold_train, ds2_lines_gold_test = (
    ds2_lines_chunked[:train_size],
    ds2_lines_chunked[train_size:],
)

In [19]:
# Report how many sentences ended up in each split.
print('Train size:', len(ds1_lines_train))
print('Test size:', len(ds1_lines_test))

Train size: 567
Test size: 189


## Chunking with transformers

Nous utilisons un pipeline qui tokenise une phrase en entrée, utilise un modèle BERT sur les tokens, et renvoie le chunking de la phrase d'entrée

In [24]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Checkpoint shared by the tokenizer and the token-classification model.
CHUNKER_MODEL_NAME = "vblagoje/bert-english-uncased-finetuned-chunk"

tokenizer = AutoTokenizer.from_pretrained(CHUNKER_MODEL_NAME)
model = AutoModelForTokenClassification.from_pretrained(CHUNKER_MODEL_NAME)
# `aggregation_strategy="simple"` is the documented replacement for the
# deprecated `grouped_entities=True` flag: consecutive tokens with the same
# label are merged into one entry.
pipe = pipeline('ner', aggregation_strategy="simple", model=model, tokenizer=tokenizer)



In [30]:
def compute_chunk_list(pipe, sentence):
    """Chunk `sentence` with `pipe` and return a list of `[text, label]` pairs.

    The text is taken from each result's 'word' field, which comes back
    LOWERCASED with the uncased model used in this notebook; the label is the
    result's 'entity_group'.
    """
    return [[piece['word'], piece['entity_group']] for piece in pipe(sentence)]

def get_gold_chunk_list(sentence):
    """Split a bracketed gold line into its chunk texts, LOWERCASED.

    A gold line looks like `[ chunk one ] [ chunk two ]`. Only the single
    outermost `[` and `]` are removed before splitting on ` ] [ `.
    `str.strip("[ ")` would instead strip any run of bracket/space
    characters, destroying chunks that are themselves a bracket.

    Returns:
        list of str, one entry per chunk (`['']` for an empty line).
    """
    text = sentence.lower().strip()
    if text.startswith("["):
        text = text[1:]
    if text.endswith("]"):
        text = text[:-1]
    return text.strip().split(" ] [ ")

In [44]:
# Inspect the raw pipeline output for one sentence (a sequence of dicts) and
# display its first entry.
chunk_dicts = pipe('Syria peace plan conditions " unacceptable , " opposition says')
chunk_dicts[0]

{'entity_group': 'NP',
 'score': 0.9940616,
 'word': 'syria peace plan conditions',
 'start': 0,
 'end': 27}

In [45]:
def compute_chunk_list2(pipe, sentence):
    """Chunk `sentence` with `pipe`, keeping the ORIGINAL casing of the text.

    Each chunk text is sliced out of `sentence` using the 'start'/'end'
    character offsets of the pipeline result instead of its 'word' field.
    Returns a list of `[text, label]` pairs, the label being 'entity_group'.
    """
    return [
        [sentence[span['start']:span['end']], span['entity_group']]
        for span in pipe(sentence)
    ]

def get_gold_chunk_list2(sentence):
    """Split a bracketed gold line into its chunk texts, casing preserved.

    A gold line looks like `[ chunk one ] [ chunk two ]`. Only the single
    outermost `[` and `]` are removed before splitting on ` ] [ `.
    `str.strip("[ ")` would instead strip any run of bracket/space
    characters, destroying chunks that are themselves a bracket.

    Returns:
        list of str, one entry per chunk (`['']` for an empty line).
    """
    text = sentence.strip()
    if text.startswith("["):
        text = text[1:]
    if text.endswith("]"):
        text = text[:-1]
    return text.strip().split(" ] [ ")

In [46]:
# Run the chunker on every training sentence of the first file.
ds1_train_chunks2 = [
    compute_chunk_list2(pipe, train_line)
    for train_line in ds1_lines_train
]

In [47]:
# Predicted chunks of the first training sentence, as [text, label] pairs.
ds1_train_chunks2[0]

[['Former Nazi death camp guard Demjanjuk', 'NP'],
 ['dead', 'ADJP'],
 ['at', 'PP'],
 ['91', 'NP']]

Ici, on compare le chunking du modèle avec les gold chunks

In [50]:
# Parse every bracketed gold line of the training split into chunk texts.
ds1_train_gold_chunks2 = [
    get_gold_chunk_list2(gold_line)
    for gold_line in ds1_lines_gold_train
]

In [51]:
# Predicted chunks (with labels) on the first line, gold chunk texts below.
print(ds1_train_chunks2[0], ds1_train_gold_chunks2[0], sep="\n")

[['Former Nazi death camp guard Demjanjuk', 'NP'], ['dead', 'ADJP'], ['at', 'PP'], ['91', 'NP']]
['Former Nazi death camp guard Demjanjuk', 'dead', 'at 91']


In [52]:
# The chunking score checks, for each predicted chunk, whether a gold chunk
# with exactly the same text exists.

def chunking_score(output_chunks, gold_chunks):
    """Average per-sentence ratio of matched predictions to gold chunks.

    For sentence `i`, every predicted chunk whose text (`chunk[0]`) is found
    in `gold_chunks[i]` counts as one hit (a repeated text is counted each
    time). The sentence score is `hits / len(gold_chunks[i])`, and the result
    is the mean of the sentence scores over `output_chunks`.

    Args:
        output_chunks: per-sentence lists of `[text, label]` pairs.
        gold_chunks: per-sentence lists of gold chunk texts, indexed like
            `output_chunks`.

    Returns:
        The mean score as a float; 0.0 when `output_chunks` is empty.
        A sentence whose gold list is empty contributes 0 rather than
        raising ZeroDivisionError.

    Raises:
        IndexError: if `gold_chunks` has fewer entries than `output_chunks`.
    """
    if len(output_chunks) == 0:
        return 0.0

    total = 0
    for i, predicted in enumerate(output_chunks):
        gold = gold_chunks[i]
        if len(gold) == 0:
            # Nothing can match an empty gold list: the sentence scores 0.
            continue
        hits = sum(1 for chunk in predicted if chunk[0] in gold)
        total += hits / len(gold)
    return total / len(output_chunks)

In [55]:
# Score the predicted chunks of the training split against its gold chunks.
score = chunking_score(ds1_train_chunks2, ds1_train_gold_chunks2)

In [56]:
# Display the chunking score.
score

0.5704308282350613

## Writing predicted chunking in a .txt file

In [60]:
# Tests: rebuild the bracketed line of one predicted sentence and compare it
# with the corresponding gold line.
sentence = ds1_train_chunks2[0]
print(sentence)

# str.join places " ] [ " between the chunk texts and also copes with an
# empty chunk list, where indexing sentence[-1] would raise IndexError.
string = "[ " + " ] [ ".join(chunk[0] for chunk in sentence) + " ]\n"
print(string)
print(ds1_lines_gold_train[0])

[['Former Nazi death camp guard Demjanjuk', 'NP'], ['dead', 'ADJP'], ['at', 'PP'], ['91', 'NP']]
[ Former Nazi death camp guard Demjanjuk ] [ dead ] [ at ] [ 91 ]

[ Former Nazi death camp guard Demjanjuk ] [ dead ] [ at 91 ]


In [62]:
# Write the predicted chunking, one bracketed line per sentence
# ("[ chunk ] [ chunk ]"). The `with` block closes the file even if a write
# fails, and UTF-8 is forced so the output does not depend on the platform
# default encoding.
with open(f"{DATASET}_predicted_chunks.txt", "w", encoding="utf-8") as chunking_file:
    for sentence in ds1_train_chunks2:
        # join also handles a sentence with no predicted chunk, which still
        # gets its own line ("[  ]") so line numbers stay aligned with the input.
        string = "[ " + " ] [ ".join(chunk[0] for chunk in sentence) + " ]\n"
        chunking_file.write(string)

# Aligning the chunks

In [None]:
from math import *
from scipy.optimize import linear_sum_assignment

def min_max_scaler(sim):
    """Rescale similarity values into integer scores.

    Each value is min-max scaled using -1 and 1 as bounds (the range of a
    cosine similarity), multiplied by 5, and rounded with the built-in
    `round`, so inputs within [-1, 1] land on integers from 0 to 5.

    Returns:
        A new list with one rounded score per element of `sim`.
    """
    upper, lower = 1, -1  # bounds of cosine similarity
    return [round((value - lower) / (upper - lower) * 5) for value in sim]

def alignment_chunks(cost):
    """Find the one-to-one pairing of rows and columns with maximal total value.

    Args:
        cost: 2-D matrix of similarity values (anything `np.array` accepts);
            entry `[r][c]` is the similarity between row item `r` and
            column item `c`.

    Returns:
        A list of `(row_index, col_index, score)` tuples, indices expressed
        in the orientation of `cost`, where `score` is the pair's similarity
        after rescaling by `min_max_scaler`.
    """
    matrix = np.array(cost)  # accept nested lists as well as arrays

    # Work on a matrix with at least as many rows as columns, remembering
    # whether it was flipped so indices can be mapped back afterwards.
    transposed = matrix.shape[0] < matrix.shape[1]
    if transposed:
        matrix = matrix.T

    # linear_sum_assignment minimises its objective, hence the negation.
    rows, cols = linear_sum_assignment(-matrix)
    # Similarity of each optimally matched pair, rescaled to integer scores.
    scores = min_max_scaler(matrix[rows, cols])

    if transposed:
        rows, cols = cols, rows

    return list(zip(rows, cols, scores))