In [None]:
import os
import pandas as pd
import numpy as np
import pickle
import sys

sys.path.append('../extractive')

from model import extract_top_n_sentences

# Read Dataset
# The path is relative, so it resolves against the notebook's working directory.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only load this file if it comes from a trusted source.
dataset_path = "../../Dataset/multilingua.pk"
with open(dataset_path, 'rb') as f:
  dataset = pickle.load(f)

# Show the loaded object as the cell output (presumably a pandas DataFrame,
# since a later cell calls .groupby('source') on it).
dataset

In [None]:
# Split the dataset into one frame per value of the 'source' column (the
# language), skipping the languages that are excluded from this evaluation.
excluded_langs = ('Hindi', 'Indonesian', 'Japanese', 'Korean')
group_lang = {
    source: group.reset_index(drop=True)  # each per-language frame gets a fresh 0..n-1 index
    for source, group in dataset.groupby('source')
    if source not in excluded_langs
}

# Number of examples kept per language, shown as the cell output.
counts = {name: len(frame) for name, frame in group_lang.items()}
counts

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from typing import Union
import time
from IPython.display import clear_output

def cosine_similarity_between_texts(
    text1: str,
    text2: str,
    analyzer: str = "word",
    ngram_range: tuple = (1, 1),
) -> float:
    """
    Cosine similarity between the TF-IDF vectors of text1 and text2.

    A fresh TfidfVectorizer is fitted on just these two texts, so the
    vocabulary and the IDF weights come from this pair alone.

    Args:
        text1: First text.
        text2: Second text.
        analyzer: Forwarded to TfidfVectorizer. The default "word" matches
            the vectorizer's own default, so existing calls are unchanged.
            NOTE(review): word-level tokenization relies on separators
            between words; for scripts written without spaces (e.g. Chinese)
            it presumably yields very few shared tokens - consider
            analyzer="char" with a suitable ngram_range for such languages.
        ngram_range: Forwarded to TfidfVectorizer; default (1, 1).

    Returns:
        Similarity as a float in the range 0.0 to 1.0. Returns 0.0 when
        neither text produces any token (e.g. both are empty), instead of
        propagating the vectorizer's "empty vocabulary" error.
    """
    vectorizer = TfidfVectorizer(analyzer=analyzer, ngram_range=ngram_range)
    try:
        tfidf_matrix = vectorizer.fit_transform([text1, text2])  # shape (2, n_features)
    except ValueError as err:
        # Only swallow the "no tokens at all" case; any other ValueError
        # (e.g. a NaN passed instead of a string) is a real input problem.
        if "empty vocabulary" not in str(err):
            raise
        return 0.0
    # Row slices keep the 2-D shape that cosine_similarity expects.
    similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])
    return float(similarity[0][0])

def eval(example):
    """
    Score one dataset row: extractive summary vs. the row's reference summary.

    Intended to be used with DataFrame.apply(..., axis=1), where `example`
    is a row with 'document' and 'summary' fields.

    NOTE(review): this name shadows Python's builtin eval(); it is kept
    because the evaluation cell refers to the function by this name.

    Return: cosine similarity between the extracted summary and
    example['summary'].
    """
    # Progress indicator: wipe the previous output and print the current
    # row label, so the cell shows a single updating line.
    clear_output(wait=True)
    print(example.name)

    # presumably returns the 3 top-ranked sentences as one string - confirm
    # against extract_top_n_sentences in ../extractive/model.py
    extracted = extract_top_n_sentences(example['document'], n=3)
    return cosine_similarity_between_texts(extracted, example['summary'])

In [16]:
# Mean extractive-summary similarity per language, collected column-wise so
# the result can be turned into a DataFrame directly.
evaluate = {
    'lang': [],
    'cosine_sim_score': []
}
# The loop variable is deliberately not named `dataset`: reusing that name
# would overwrite the full dataset loaded in the first cell, and re-running
# the grouping cell afterwards would silently operate on one language only.
for lang, lang_df in group_lang.items():
    print(lang)
    evaluate['lang'].append(lang)

    # axis=1 hands each row to eval(); the result is one score per example.
    sim_scores = lang_df.apply(eval, axis=1).tolist()
    evaluate['cosine_sim_score'].append(sum(sim_scores) / len(sim_scores))

4030


In [17]:
# Render the per-language mean cosine-similarity scores as a table
# (one row per language, columns 'lang' and 'cosine_sim_score').
pd.DataFrame(evaluate)

Unnamed: 0,lang,cosine_sim_score
0,Arabic,0.153302
1,Chinese,0.007025
2,Czech,0.129634
3,Dutch,0.314484
4,English,0.291405
5,French,0.27117
6,German,0.188313
7,Italian,0.203007
8,Portuguese,0.210418
9,Russian,0.102415
