In [3]:
import spacy
from zss import Node, simple_distance
from functools import partial

# Load the French language model
nlp = spacy.load("fr_core_news_md")

# Function to convert a spaCy token to a zss Node recursively
def token_to_zss_node(mode, token):
    """Recursively convert a token and its descendants into a zss ``Node`` tree.

    Args:
        mode: Selects the token attribute used as the node label:
            ``"dep"`` -> ``token.dep_``, ``"text"`` -> ``token.orth_``,
            ``"pos"`` -> ``token.pos_``.
        token: Token-like object exposing the attribute selected by ``mode``
            and an iterable ``children``.

    Returns:
        A ``Node`` labelled for ``token`` with one child node per entry of
        ``token.children``, added in iteration order.

    Raises:
        ValueError: If ``mode`` is not one of the supported values.
    """
    if mode == "dep":
        label = token.dep_
    elif mode == "text":
        label = token.orth_
    elif mode == "pos":
        label = token.pos_
    else:
        # Fail fast with a clear message; without this branch an unknown mode
        # surfaced only later as an UnboundLocalError on `node`.
        raise ValueError(
            f"Unsupported mode {mode!r}; expected 'dep', 'text' or 'pos'"
        )

    node = Node(label)
    for child in token.children:
        node.addkid(token_to_zss_node(mode=mode, token=child))
    return node

def build_zss_tree(spacy_doc, zss_parser):
    """Build a single zss tree for a document.

    A synthetic node labelled ``"ROOT"`` is created, and the result of
    ``zss_parser(sent.root)`` for every sentence in ``spacy_doc.sents`` is
    attached to it as a child, in sentence order.

    Args:
        spacy_doc: Object exposing ``sents``, each with a ``root`` attribute.
        zss_parser: Callable turning a sentence root into a zss ``Node``.

    Returns:
        The synthetic ``"ROOT"`` node with one child per sentence.
    """
    tree = Node("ROOT")
    sentence_roots = (sentence.root for sentence in spacy_doc.sents)
    # `map` is lazy, so each sentence is converted right before it is attached.
    for subtree in map(zss_parser, sentence_roots):
        tree.addkid(subtree)
    return tree

# Function to compute tree edit distance between two documents
def compute_tree_edit_distance(parsed_doc1, parsed_doc2, zss_parser):
    """Return the zss tree edit distance between two parsed documents.

    Each document is first converted to a zss tree with ``build_zss_tree``
    using the same ``zss_parser``; the two trees are then compared with
    ``simple_distance`` using its default settings.

    Args:
        parsed_doc1: First parsed document.
        parsed_doc2: Second parsed document.
        zss_parser: Callable turning a sentence root into a zss ``Node``.

    Returns:
        Whatever ``simple_distance`` returns for the two trees.
    """
    first_tree, second_tree = (
        build_zss_tree(spacy_doc=document, zss_parser=zss_parser)
        for document in (parsed_doc1, parsed_doc2)
    )
    return simple_distance(first_tree, second_tree)

# Function to compare syntactic similarity (simple example)
def syntactic_similarity(doc1, doc2):
    """Return the share of position-aligned tokens with the same dependency tag.

    Tokens are paired by position (first with first, second with second, ...)
    up to the length of the shorter document, and the number of pairs whose
    ``dep_`` values are equal is divided by that shorter length.

    Args:
        doc1: Sized iterable of token-like objects exposing ``dep_``.
        doc2: Sized iterable of token-like objects exposing ``dep_``.

    Returns:
        A float in ``[0.0, 1.0]``. ``0.0`` is returned when either document is
        empty, since there is nothing to compare.
    """
    shorter_length = min(len(doc1), len(doc2))
    if shorter_length == 0:
        # Guard against ZeroDivisionError for empty input.
        return 0.0

    # zip() stops at the shorter document, matching the denominator above.
    matches = sum(
        first.dep_ == second.dep_ for first, second in zip(doc1, doc2)
    )
    return matches / shorter_length

def _remove_stop_words(doc):
    return " ".join([str(t) for t in doc if not t.is_stop])

def semantic_similarity(doc1, doc2):
    """Return the similarity score of ``doc1`` against ``doc2``.

    Thin wrapper that delegates to ``doc1.similarity(doc2)`` and returns its
    result unchanged.
    """
    score = doc1.similarity(doc2)
    return score
    
# zss parsers: `token_to_zss_node` with its `mode` argument pre-bound, so each
# one can be called with just a token.
zss_dependency_parser, zss_token_text_parser, zss_part_of_speech_parser = (
    partial(token_to_zss_node, mode) for mode in ('dep', 'text', 'pos')
)

###################################################################
# Each case is (banner, human question, generated question).
QUESTION_PAIRS = [
    (
        # syntactically similar but semantically very different
        "==============syntactically similar but semantically very different===============",
        "Je vis en union libre. Quels sont les droits de mon partenaire sur nos enfants nés en union libre ?",
        "Je travaille en tant qu'indépendant. Quels sont les droits de mon associé sur notre entreprise fondée ensemble?",
    ),
    (
        # semantically similar but syntactically very different
        "===============semantically similar but syntactically very different==============",
        "Je vis en union libre. Quels sont les droits de mon partenaire sur nos enfants nés en union libre ?",
        "Concernant les enfants nés en union libre, quels droits reviennent au partenaire non marié ?",
    ),
]


def report_question_similarity(banner, human_text, generated_text):
    """Parse two questions, print their similarity metrics, and return the parses.

    Args:
        banner: Header line printed before the report.
        human_text: Text of the human-written question.
        generated_text: Text of the generated question.

    Returns:
        Tuple ``(human_doc, generated_doc)`` with the parses of the two texts
        as produced by ``nlp``.
    """
    human_doc = nlp(human_text)
    generated_doc = nlp(generated_text)
    # Only the semantic score is computed on the stop-word-free re-parse; the
    # two syntactic metrics use the full parses.
    human_content_doc = nlp(_remove_stop_words(human_doc))
    generated_content_doc = nlp(_remove_stop_words(generated_doc))

    print(banner)
    print(f"Human Question: {human_text}")
    print(f"Generated Question: {generated_text}")
    dependency_distance = compute_tree_edit_distance(
        human_doc, generated_doc, zss_parser=zss_dependency_parser
    )
    syntactic_similarity_count = syntactic_similarity(human_doc, generated_doc)
    semantic_similarity_score = semantic_similarity(
        human_content_doc, generated_content_doc
    )
    print(f"Dependency Tree edit distance between the documents: {dependency_distance}")
    print(f"Syntactic similarity between the documents: {syntactic_similarity_count}")
    print(f"Semantic similarity between the documents: {semantic_similarity_score}")
    return human_doc, generated_doc


# `parsed_doc1` / `parsed_doc2` stay bound at module level (to the last pair)
# because the visualization cells render them.
for banner, doc1_text, doc2_text in QUESTION_PAIRS:
    parsed_doc1, parsed_doc2 = report_question_similarity(banner, doc1_text, doc2_text)

Human Question: Je vis en union libre. Quels sont les droits de mon partenaire sur nos enfants nés en union libre ?
Generated Question: Je travaille en tant qu'indépendant. Quels sont les droits de mon associé sur notre entreprise fondée ensemble?
Dependency Tree edit distance between the documents: 9.0
Syntactic similarity between the documents: 0.1
Semantic similarity between the documents: 0.7520130196301145
Human Question: Je vis en union libre. Quels sont les droits de mon partenaire sur nos enfants nés en union libre ?
Generated Question: Concernant les enfants nés en union libre, quels droits reviennent au partenaire non marié ?
Dependency Tree edit distance between the documents: 18.0
Syntactic similarity between the documents: 0.125
Semantic similarity between the documents: 0.8995861539208164


In [102]:
# Render the dependency parse of `parsed_doc1` inline in the notebook.

# displaCy rendering options
options = dict(
    compact=True,     # more compact layout
    distance=100,     # distance between tokens
    add_lemma=False,  # do not include lemmas
)

spacy.displacy.render(
    parsed_doc1,
    style='dep',
    jupyter=True,
    options=options,
)

In [103]:
# Render the dependency parse of `parsed_doc2` using the `options` dict.
spacy.displacy.render(
    parsed_doc2,
    style='dep',
    jupyter=True,
    options=options,
)

In [7]:
# Load the small English model for an English example parse.
# Distinct names are used so this cell does not overwrite the French pipeline
# (`nlp`) or the French `parsed_doc1`, whichever order the cells are run in.
nlp_en = spacy.load("en_core_web_sm")
parsed_doc_en = nlp_en("I live in a common-law relationship.")
options = {
    'compact': True,  # Makes the layout more compact
    'distance': 100,  # Controls the distance between tokens
    'add_lemma': False,  # Whether to include lemmas
}
spacy.displacy.render(parsed_doc_en, style='dep',jupyter=True, options=options)