In [20]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
# from transformers import MarianMTModel, MarianTokenizer
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, AutoModel


In [2]:
with open('English_queries.txt', 'r') as file:
    queries_en = [line.strip() for line in file.readlines()]
    
# Print First 5 queries out of the 142 queries
queries_en[:5]

['What is the capital of Argentina?',
 'Who won the Nobel Prize for Literature in 2020?',
 'What is the square root of 144?',
 'How far is the moon from the Earth?',
 'Who painted the Mona Lisa?']

In [3]:
with open('Turkish_doc.txt', 'r') as file:
    doc_tr = [line.strip() for line in file.readlines()]
    
# Print First 5 queries out of the 142 queries
doc_tr[:5]

["Arjantin'in başkenti Buenos Aires'tir.",
 "2020 yılında Edebiyat Nobel Ödülü'nü Amerikalı şair Louise Glück kazanmıştır.",
 "144'ün karekökü 12'dir.",
 "Ay, Dünya'dan ortalama 384.400 kilometre uzaklıktadır.",
 'Mona Lisa tablosu, İtalyan Rönesans sanatçısı Leonardo da Vinci tarafından resmedilmiştir.']

In [4]:
# Vectorizing using TfidVectorizer
vectorizer = TfidfVectorizer()
vecs_en = vectorizer.fit_transform(queries_en)
vecs_tr = vectorizer.fit_transform(doc_tr)

In [5]:
# Computing truncated SVD
svd = TruncatedSVD(n_components=2)
lsi_en = svd.fit_transform(vecs_en)
lsi_tr = svd.fit_transform(vecs_tr)

In [6]:
# Cosine similarity between files
def cosine_sim(a, b):
    dot_product = np.dot(a, b)
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    return dot_product / (norm_a * norm_b)

# Jaccard similarity between files
def jaccard_sim(a, b):
    intersection = len(set(a).intersection(set(b)))
    union = len(set(a).union(set(b)))
    return intersection / union

# Dice similarity between files
def dice_sim(a, b):
    intersection = len(set(a).intersection(set(b)))
    total = len(a) + len(b)
    return (2 * intersection) / total

# Overlap similarity between files
def overlap_sim(a, b):
    intersection = len(set(a).intersection(set(b)))
    min_len = min(len(a), len(b))
    return intersection / min_len

In [7]:
# Compute the similarity matrix between the English and Turkish files using different similarity measures
sim_cosine = np.zeros((len(queries_en), len(doc_tr)))
sim_jaccard = np.zeros((len(queries_en), len(doc_tr)))
sim_dice = np.zeros((len(queries_en), len(doc_tr)))
sim_overlap = np.zeros((len(queries_en), len(doc_tr)))

In [8]:
for i in range(len(queries_en)):
    for j in range(len(doc_tr)):
        sim_cosine[i][j] = cosine_sim(lsi_en[i], lsi_tr[j])
        sim_jaccard[i][j] = jaccard_sim(vecs_en[i].toarray()[0], vecs_tr[j].toarray()[0])
        sim_dice[i][j] = dice_sim(vecs_en[i].toarray()[0], vecs_tr[j].toarray()[0])
        sim_overlap[i][j] = overlap_sim(vecs_en[i].toarray()[0], vecs_tr[j].toarray()[0])

# Print the similarity matrices
print("Cosine similarity:\n", sim_cosine)
print("Jaccard similarity:\n", sim_jaccard)
print("Dice similarity:\n", sim_dice)
print("Overlap similarity:\n", sim_overlap)

Cosine similarity:
 [[ 0.86193947 -0.41642314  0.87596189 ...  0.90901999  0.87163871
  -0.13050392]
 [ 0.40993705  0.94917912  0.38392324 ...  0.31508027  0.39211715
   0.99976784]
 [ 0.97936306 -0.10113143  0.98469743 ...  0.99483637  0.98310916
   0.19604765]
 ...
 [ 0.90206479  0.52138446  0.88947098 ...  0.85351074  0.89349867
   0.74936523]
 [ 0.95240032 -0.20608755  0.96065719 ...  0.97845882  0.95815003
   0.09092957]
 [ 0.86392265  0.58914089  0.84930302 ...  0.80824373  0.85396275
   0.80079749]]
Jaccard similarity:
 [[0.09090909 0.07692308 0.11111111 ... 0.0625     0.08333333 0.07692308]
 [0.08333333 0.07142857 0.1        ... 0.05882353 0.07692308 0.07142857]
 [0.1        0.08333333 0.125      ... 0.06666667 0.09090909 0.08333333]
 ...
 [0.08333333 0.07142857 0.1        ... 0.05882353 0.07692308 0.07142857]
 [0.08333333 0.07142857 0.1        ... 0.05882353 0.07692308 0.07142857]
 [0.08333333 0.07142857 0.1        ... 0.05882353 0.07692308 0.07142857]]
Dice similarity:
 [[0.0

In [9]:
cosine_correct = 0
jaccard_correct = 0
dice_correct = 0
overlap_correct = 0

for i in range(len(queries_en)):
  cosine_max_index = np.argmax(sim_cosine[i])
  jaccard_max_index = np.argmax(sim_jaccard[i])
  dice_max_index = np.argmax(sim_dice[i])
  overlap_max_index = np.argmax(sim_overlap[i])
  if cosine_max_index == i:
    cosine_correct += 1
  if jaccard_max_index == i:
      jaccard_correct += 1
  if dice_max_index == i:
      dice_correct += 1
  if overlap_max_index == i:
      overlap_correct += 1

cosine_accuracy = cosine_correct / len(queries_en)
jaccard_accuracy = jaccard_correct / len(queries_en)
dice_accuracy = dice_correct / len(queries_en)
overlap_accuracy = overlap_correct / len(queries_en)

print(f"\nAccuracy for cosine similarity: {cosine_accuracy}")
print(f"Accuracy for Jaccard similarity: {jaccard_accuracy}")
print(f"Accuracy for Dice similarity: {dice_accuracy}")
print(f"Accuracy for overlap similarity: {overlap_accuracy}")


Accuracy for cosine similarity: 0.007042253521126761
Accuracy for Jaccard similarity: 0.007042253521126761
Accuracy for Dice similarity: 0.007042253521126761
Accuracy for overlap similarity: 0.007042253521126761


In [10]:
avg_sim_sim = sim_cosine + sim_jaccard + sim_dice + sim_overlap
avg_sim_correct = 0
for i in range(len(queries_en)):
  avg_sim_max_index = np.argmax(avg_sim_sim[i])
  if avg_sim_max_index == i:
    avg_sim_correct += 1

avg_sim_accuracy = avg_sim_correct / len(queries_en)
print(f"\nAccuracy for Average Similarity: {avg_sim_accuracy}")


Accuracy for Average Similarity: 0.014084507042253521


In [11]:
# Define the language model and tokenizer for translation
model_name = "Helsinki-NLP/opus-mt-tc-big-en-tr"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

  return self.fget.__get__(instance, owner)()


In [12]:
# Test the model
def translate_en_to_tr(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True)
    translated = model.generate(**inputs)
    return tokenizer.decode(translated[0], skip_special_tokens=True)

# Example usage
english_text = "Hello, how are you? I was wondering if you could lend me a pen"
turkish_text = translate_en_to_tr(english_text)
print(f"English: {english_text}")
print(f"Turkish: {turkish_text}")

English: Hello, how are you? I was wondering if you could lend me a pen
Turkish: Merhaba, nasılsınız? Acaba bana bir kalem ödünç verebilir misiniz?


In [13]:
# Tranrslate the queries to Turkish
queries_tr = []
for query in queries_en:
    input_ids = tokenizer.encode(query, return_tensors='pt')
    translated = model.generate(input_ids=input_ids, num_beams=4, max_length=100, early_stopping=True)
    translated_query = tokenizer.decode(translated[0], skip_special_tokens=True)
    queries_tr.append(translated_query)

In [14]:
# Print First 5 queries of 142
queries_tr[:5]

["Arjantin'in başkenti neresidir?",
 "2020 yılında Nobel Edebiyat Ödülü'nü kim kazandı?",
 "144'ün karekökü nedir?",
 "Ay, Dünya'dan ne kadar uzakta?",
 "Mona Lisa'yı kim boyadı?"]

In [15]:
# Vectorize the documents using the TfidfVectorizer
vectorizer = TfidfVectorizer()
vecs_trans = vectorizer.fit_transform(queries_tr)
vecs_tr = vectorizer.fit_transform(doc_tr)

# Compute the truncated SVD
svd = TruncatedSVD(n_components=2)
lsi_en = svd.fit_transform(vecs_trans)
lsi_tr = svd.fit_transform(vecs_tr)

In [16]:
# Compute the similarity matrix between the English and Spanish documents using different similarity measures
n = len(queries_en)
m = len(doc_tr)
sim_cosine = np.zeros((n, m))
sim_jaccard = np.zeros((n, m))
sim_dice = np.zeros((n, m))
sim_overlap = np.zeros((n, m))

for i in range(n):
    for j in range(m):
        sim_cosine[i][j] = cosine_sim(lsi_en[i], lsi_tr[j])
        sim_jaccard[i][j] = jaccard_sim(set(vecs_tr[j].nonzero()[1]), set(vecs_trans[i].nonzero()[1]))
        sim_dice[i][j] = dice_sim(set(vecs_tr[j].nonzero()[1]), set(vecs_trans[i].nonzero()[1]))
        sim_overlap[i][j] = overlap_sim(set(vecs_tr[j].nonzero()[1]), set(vecs_trans[i].nonzero()[1]))

# Print the similarity matrices
print("Cosine similarity:\n", sim_cosine)
print("Jaccard similarity:\n", sim_jaccard)
print("Dice similarity:\n", sim_dice)
print("Overlap similarity:\n", sim_overlap)

Cosine similarity:
 [[ 0.92005173 -0.30336921  0.92444005 ...  0.94801012  0.93634426
  -0.0154039 ]
 [ 0.43275223  0.93827543  0.42249056 ...  0.3605264   0.39281196
   0.99817864]
 [ 0.38992982  0.9534865   0.37945145 ...  0.31631652  0.34918318
   0.99991054]
 ...
 [ 0.27556667  0.98296901  0.26463664 ...  0.19912787  0.23315714
   0.99417365]
 [ 0.78434825  0.69146056  0.77725596 ...  0.73314398  0.75639795
   0.87055589]
 [ 0.25616064  0.98646908  0.2451711  ...  0.17936107  0.21353511
   0.99180243]]
Jaccard similarity:
 [[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.0625    ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.07692308 0.         0.         ... 0.0625     0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]
Dice similarity:
 [[0. 

In [17]:
cosine_correct = 0
jaccard_correct = 0
dice_correct = 0
overlap_correct = 0

for i in range(n):
  cosine_max_index = np.argmax(sim_cosine[i])
  jaccard_max_index = np.argmax(sim_jaccard[i])
  dice_max_index = np.argmax(sim_dice[i])
  overlap_max_index = np.argmax(sim_overlap[i])
  if cosine_max_index == i:
    cosine_correct += 1
  if jaccard_max_index == i:
      jaccard_correct += 1
  if dice_max_index == i:
      dice_correct += 1
  if overlap_max_index == i:
      overlap_correct += 1

cosine_accuracy = cosine_correct / n
jaccard_accuracy = jaccard_correct / n
dice_accuracy = dice_correct / n
overlap_accuracy = overlap_correct / n

print(f"\nAccuracy for cosine similarity: {cosine_accuracy}")
print(f"Accuracy for Jaccard similarity: {jaccard_accuracy}")
print(f"Accuracy for Dice similarity: {dice_accuracy}")
print(f"Accuracy for overlap similarity: {overlap_accuracy}")


Accuracy for cosine similarity: 0.0
Accuracy for Jaccard similarity: 0.0
Accuracy for Dice similarity: 0.0
Accuracy for overlap similarity: 0.0


In [18]:
avg_sim_sim = sim_cosine + sim_jaccard + sim_dice + sim_overlap
avg_sim_correct = 0
for i in range(n):
  avg_sim_max_index = np.argmax(avg_sim_sim[i])
  if avg_sim_max_index == i:
    avg_sim_correct += 1

avg_sim_accuracy = avg_sim_correct / n
print(f"\nAccuracy for Average Similarity: {avg_sim_accuracy}")


Accuracy for Average Similarity: 0.0


In [21]:
import torch
import torch.nn.functional as F
import numpy as np

# Load the multilingual BERT model and tokenizer
model = AutoModel.from_pretrained('bert-base-multilingual-cased')
tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]



In [22]:
with open('English_queries.txt', 'r') as file:
    queries_en = [line.strip() for line in file.readlines()]
    
# Print First 5 queries out of the 142 queries
queries_en[:5]

['What is the capital of Argentina?',
 'Who won the Nobel Prize for Literature in 2020?',
 'What is the square root of 144?',
 'How far is the moon from the Earth?',
 'Who painted the Mona Lisa?']

In [23]:
with open('Turkish_doc.txt', 'r') as file:
    doc_tr = [line.strip() for line in file.readlines()]
    
# Print First 5 queries out of the 142 queries
doc_tr[:5]

["Arjantin'in başkenti Buenos Aires'tir.",
 "2020 yılında Edebiyat Nobel Ödülü'nü Amerikalı şair Louise Glück kazanmıştır.",
 "144'ün karekökü 12'dir.",
 "Ay, Dünya'dan ortalama 384.400 kilometre uzaklıktadır.",
 'Mona Lisa tablosu, İtalyan Rönesans sanatçısı Leonardo da Vinci tarafından resmedilmiştir.']