In [1]:
import os
import numpy as np

SAMPLE_1 = [os.path.join('out', 'correct_1.txt'), os.path.join('out', 'original_1.txt')]
SAMPLE_2 = [os.path.join('out', 'correct_2.txt'), os.path.join('out', 'original_2.txt')]

In [2]:
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def get_cosine_sim(query, document): 
    text = [query, document]
    count_vec = CountVectorizer()
    count_vec.fit(text)
    arr = count_vec.transform(text).toarray()
    vector1, vector2 = arr
    return cosine_similarity(np.array([vector1]), np.array([vector2]))[0][0]

def get_jaccard_sim(query, document):
    query = query.split()
    document = document.split()
    intersection = set(query).intersection(set(document))
    union = set(query).union(set(document))
    if len(union) == 0:
        print('No similar words')
        return 0
    return len(intersection)/len(union)

def get_levenshtein_sim(query, document):
    size_x = len(query) + 1
    size_y = len(document) + 1
    matrix = np.zeros((size_x, size_y), dtype=int)
    for x in range(size_x):
        matrix [x, 0] = x
    for y in range(size_y):
        matrix [0, y] = y

    for x in range(1, size_x):
        for y in range(1, size_y):
            if query[x-1] == document[y-1]:
                matrix [x,y] = min(
                    matrix[x-1, y] + 1,
                    matrix[x-1, y-1],
                    matrix[x, y-1] + 1
                )
            else:
                matrix [x,y] = min(
                    matrix[x-1, y] + 1,
                    matrix[x-1, y-1] + 1,
                    matrix[x, y-1] + 1
                )
#     print (matrix)
    distance =  matrix[size_x - 1, size_y - 1]
    return (size_x + size_y - distance) / (size_x + size_y)

In [3]:
print("[Sample01]:")
with open(SAMPLE_1[0], 'r') as file1:
    with open(SAMPLE_1[1], 'r') as file2:
        file1_txt = file1.read()
#         print(len(file1_txt.split()))
        file2_txt = file2.read()
        print(f"Jaccard Similarity: {get_jaccard_sim(file2_txt, file1_txt)}")
        print(f"Cosine Similarity: {get_cosine_sim(file2_txt, file1_txt)}")
        print(f"Levenshtein Similarity: {get_levenshtein_sim(file2_txt, file1_txt)}")
        
print("\n[Sample02]:")
with open(SAMPLE_2[0], 'r') as file1:
    with open(SAMPLE_2[1], 'r') as file2:
        file1_txt = file1.read()
#         print(len(file1_txt.split()))
        file2_txt = file2.read()
        print(f"Jaccard Similarity: {get_jaccard_sim(file2_txt, file1_txt)}")
        print(f"Cosine Similarity: {get_cosine_sim(file2_txt, file1_txt)}")
        print(f"Levenshtein Similarity: {get_levenshtein_sim(file2_txt, file1_txt)}")

[Sample01]:
Jaccard Similarity: 0.5263157894736842
Cosine Similarity: 0.8512055557875504
Levenshtein Similarity: 0.6825595984943539

[Sample02]:
Jaccard Similarity: 0.031914893617021274
Cosine Similarity: 0.21698151830137313
Levenshtein Similarity: 0.06382978723404255
