<a href="https://colab.research.google.com/github/ym001/distancia/blob/master/notebook/textDistance.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install distancia==0.0.72


Collecting distancia==0.0.72
  Downloading distancia-0.0.72-py3-none-any.whl.metadata (14 kB)
Downloading distancia-0.0.72-py3-none-any.whl (124 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m124.6/124.6 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: distancia
Successfully installed distancia-0.0.72


In [5]:
import math
from typing import List, Dict
from collections import Counter
from distancia import TFIDF
 #Exemple d'utilisation
corpus = [
    "the cat sat on the mat",
    "the dog sat on the mat",
    "the dog chased the cat"
]

text1 = "the cat is sitting on the mat"
text2 = "the dog is sitting on the mat"

tfidf_distance = TFIDF(corpus)
similarity_score: float = tfidf_distance.compute(text1, text2)

print(f"TF-IDF Similarity: {similarity_score}")

TF-IDF Similarity: 0.9293358130759506


In [6]:
from typing import Set
from distancia import OverlapCoefficient

# Exemple d'utilisation
set1 = {"the", "cat", "sat", "on", "the", "mat"}
set2 = {"the", "dog", "sat", "on", "the", "mat"}

overlap = OverlapCoefficient()
similarity_score: float = overlap.compute(set1, set2)

print(f"Overlap Coefficient: {similarity_score}")


Overlap Coefficient: 0.8


In [7]:
from typing import List, Dict
from collections import Counter
import math
from distancia import Euclidean,BagOfWordsDistance




# Exemple d'utilisation
text1 = "the cat sat on the mat"
text2 = "the dog sat on the mat"

bow_distance = BagOfWordsDistance()
similarity_score: float = bow_distance.compute(text1, text2)

print(f"Bag-of-Words Distance: {similarity_score}")


Bag-of-Words Distance: 1.4142135623730951


In [8]:
from distancia import Cosine,FastTextDistance

from typing import List, Dict
from gensim.models import FastText
from math import sqrt



# Exemple d'utilisation
# Supposons que vous avez un modèle FastText pré-entraîné
# model = FastText.load("fasttext_model.bin")  # Charger un modèle FastText pré-entraîné

# Pour l'exemple, nous créons un modèle simple avec un corpus minimal
sentences = [["the", "cat", "sat", "on", "the", "mat"], ["the", "dog", "sat", "on", "the", "mat"]]
model = FastText(sentences, vector_size=100, window=5, min_count=1, sg=1)

text1 = "the cat sat on the mat"
text2 = "the dog sat on the mat"

fasttext_distance = FastTextDistance(model)
distance: float = fasttext_distance.compute(text1, text2)

print(f"FastText Distance: {distance}")


FastText Distance: 0.12081323820146783


In [9]:
from typing import List, Set
from distancia import NgramDistance

# Exemple d'utilisation
ngram_distance = NgramDistance(n=3)  # Tri-grammes (n=3)

text1: str = "The quick brown fox"
text2: str = "The quick brown dog"

distance: float = ngram_distance.compute(text1, text2)
print(f"N-gram Distance: {distance}")


N-gram Distance: 0.3529411764705882


In [10]:
from typing import List, Tuple

from distancia import SmithWaterman

# Exemple d'utilisation
seq1: str = "AGACTG"
seq2: str = "GACTTAC"

sw = SmithWaterman(match_score=2, mismatch_penalty=-1, gap_penalty=-2)

# Calcul de la distance
max_score, score_matrix = sw.compute(seq1, seq2)
print(f"Max Alignment Score: {max_score}")

# Effectuer le traceback
aligned_seq1, aligned_seq2 = sw.traceback(score_matrix, seq1, seq2)
print(f"Aligned Sequence 1: {aligned_seq1}")
print(f"Aligned Sequence 2: {aligned_seq2}")


Max Alignment Score: 8
Aligned Sequence 1: GACT
Aligned Sequence 2: GACT


In [30]:
from distancia import MongeElkanDistance,Levenshtein
from typing import List, Callable


# Exemple d'utilisation
text1: str = "the quick brown fox"
text2: str = "the quick brown dog"

# Convertir les textes en listes de mots
set1: List[str] = text1.split()
set2: List[str] = text2.split()

# Créer une instance de la classe Monge-Elkan avec la distance de Levenshtein comme distance de base
monge_elkan = MongeElkanDistance(base_distance=Levenshtein())

# Calculer la distance Monge-Elkan
distance: float = monge_elkan.compute(set1, set2)
print(f"Monge-Elkan Distance: {distance}")


Monge-Elkan Distance: 0.5


In [31]:
from typing import List, Dict
import math
from collections import Counter

from distancia import JensenShannonDivergence

# Exemple d'utilisation avec des textes
text1: str = "The quick brown fox jumps over the lazy dog"
text2: str = "The fast brown fox leaps over the lazy dog"

# Vocabulaire global (tous les mots apparaissant dans les textes)
vocabulary: List[str] = list(set(text1.split()) | set(text2.split()))

# Créer une instance de la classe Jensen-Shannon Divergence
js_divergence = JensenShannonDivergence()

# Convertir les textes en distributions de probabilités
dist1: List[float] = js_divergence.text_to_distribution(text1, vocabulary)
dist2: List[float] = js_divergence.text_to_distribution(text2, vocabulary)

# Calculer la Jensen-Shannon Divergence entre les deux textes
divergence: float = js_divergence.compute(dist1, dist2)
print(f"Jensen-Shannon Divergence: {divergence}")


Jensen-Shannon Divergence: 0.15403270679109896


In [15]:
from typing import List, Tuple
from collections import Counter
import math
from distancia import BLEUScore

# Exemple d'utilisation
hypothesis: List[str] = "the cat is on the mat".split()
references: List[List[str]] = [
    "the cat is on the mat".split(),
    "there is a cat on the mat".split()
]

# Créer une instance de la classe BLEUScore
bleu = BLEUScore()

# Calculer le BLEU Score
score: float = bleu.compute(hypothesis, references)
print(f"BLEU Score: {score}")


BLEU Score: 1.0


In [16]:
from typing import List, Tuple, Dict
from collections import Counter

from distancia import ROUGEScore

# Example usage:
hypothesis: List[str] = "the cat is on the mat".split()
references: List[List[str]] = [
    "the cat is on the mat".split(),
    "there is a cat on the mat".split()
]

# Create an instance of the ROUGEScore class with bigrams (n=2)
rouge = ROUGEScore(n_gram=2)

# Compute the ROUGE-N score
rouge_n_score: Dict[str, float] = rouge.compute(hypothesis, references)
print(f"ROUGE-N Score: {rouge_n_score}")


ROUGE-N Score: {'recall': 1.0}


In [18]:
from typing import List, Dict, Tuple
import math

from distancia import SoftCosineSimilarity
# Example usage:
term_similarity_matrix: Dict[Tuple[str, str], float] = {
    ("cat", "cat"): 1.0,
    ("cat", "dog"): 0.5,
    ("dog", "dog"): 1.0,
    ("mat", "mat"): 1.0,
    ("on", "on"): 1.0,
    ("is", "is"): 1.0
}

# Create an instance of SoftCosineSimilarity with the term similarity matrix
soft_cosine_sim = SoftCosineSimilarity(term_similarity_matrix=term_similarity_matrix)

# Define two documents as lists of words
doc1: List[str] = ["the", "cat", "is", "on", "the", "mat"]
doc2: List[str] = ["the", "dog", "is", "on", "the", "mat"]

# Compute the Soft Cosine Similarity
similarity_score: float = soft_cosine_sim.compute(doc1, doc2)
print(f"Soft Cosine Similarity: {similarity_score}")


Soft Cosine Similarity: 0.4374999999999999


In [21]:
from typing import List, Dict
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.decomposition import TruncatedSVD as LSA
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from distancia  import TopicModelingDistance
# Example usage:
documents: List[str] = [
    "The cat sat on the mat.",
    "Dogs are great companions.",
    "Cats and dogs are popular pets.",
    "I love my pet cat and dog."
]

# Initialize TopicModelingDistance with LDA and 5 topics
topic_model_distance = TopicModelingDistance(method='LDA', num_topics=5)

# Fit the model to a list of documents
topic_model_distance.fit(documents)

# Compute the distance between two new documents
doc1: str = "The cat sat on the mat."
doc2: str = "Dogs are great companions."
distance: float = topic_model_distance.compute(doc1, doc2)
print(f"Topic Distance (LDA): {distance}")

# You can also use LSA by changing the method
lsa_model_distance = TopicModelingDistance(method='LSA', num_topics=5)
lsa_model_distance.fit(documents)
distance_lsa: float = lsa_model_distance.compute(doc1, doc2)
print(f"Topic Distance (LSA): {distance_lsa}")


Topic Distance (LDA): 1.060007066936174
Topic Distance (LSA): 1.4142135623730954


In [23]:
from typing import List, Tuple

from distancia import AlignmentBasedMeasures
# Example usage:
text1: str = "The quick brown fox jumps over the lazy dog"
text2: str = "The quick fox jumps over a lazy dog"

# Initialize AlignmentBasedMeasures class
alignment_measure = AlignmentBasedMeasures()

# Align the texts and compute the alignment score
aligned_texts: List[Tuple[str, str]] = alignment_measure.align_texts(text1, text2)
score: float = alignment_measure.compute(text1, text2)

# Output the results
print("Aligned Texts:")
for word1, word2 in aligned_texts:
    print(f"{word1:15} {word2:15}")

print(f"\nAlignment Score: {score}")


Aligned Texts:
The             The            
quick           quick          
brown           -              
fox             fox            
jumps           jumps          
over            over           
the             a              
lazy            lazy           
dog             dog            

Alignment Score: 0.7777777777777778


In [25]:
from typing import List, Tuple, Set, Dict

from distancia import GappyNGramDistance
# Example usage:
text1: str = "the quick brown fox jumps over the lazy dog"
text2: str = "the fast brown fox leaps over a sleepy dog"

# Initialize GappyNGramDistance class with n=3 and gap_size=1
gappy_ngram_distance = GappyNGramDistance(n=3, gap_size=1)

# Compute the gappy n-gram similarity
similarity_score: float = gappy_ngram_distance.compute(text1, text2)

# Output the result
print(f"Gappy N-gram Similarity Score: {similarity_score:.4f}")


Gappy N-gram Similarity Score: 0.0000


In [27]:
from typing import List, Set, Tuple

from distancia import SoftJaccardSimilarity
# Example usage:
text1: str = "the quick brown fox jumps over the lazy dog"
text2: str = "the fast brown fox leaps over a sleepy dog"

# Initialize SoftJaccardSimilarity class with a threshold of 0.5 (50% similarity for matching)
soft_jaccard = SoftJaccardSimilarity(threshold=0.5)

# Compute the soft Jaccard similarity
similarity_score: float = soft_jaccard.compute(text1, text2)

# Output the result
print(f"Soft Jaccard Similarity Score: {similarity_score:.4f}")


Soft Jaccard Similarity Score: 0.4167


In [34]:
import zlib
from typing import Tuple

from distancia import NormalizedCompression
# Example usage:
text1: str = "the quick brown fox jumps over the lazy dog"
text2: str = "the fast brown fox leaps over a sleepy dog"

# Initialize the NCD class
ncd_calculator = NormalizedCompression()

# Compute the NCD between two texts
ncd_value: float = ncd_calculator.compute(text1, text2)

# Output the result
print(f"Normalized Compression Distance (NCD): {ncd_value:.4f}")


Normalized Compression Distance (NCD): 0.4200
