In [1]:
import nltk
import math
import re
import typing as T
from nltk.corpus import stopwords
import pandas as pd
import string
import numpy as np
from tqdm import tqdm

In [None]:
# Seed NumPy's global random number generator so that the random batch
# sampling done later in the notebook is repeatable across runs (pandas'
# DataFrame.sample draws from this generator when no random_state is passed).
np.random.seed(42)

In [2]:
# Tokens dropped when stopword removal is requested: NLTK's Spanish stopword
# list plus every character in string.punctuation.
STOPWORDS = stopwords.words('spanish') + list(string.punctuation)
# Number of batches to sample at random and the number of rows in each batch.
# Both placeholders must be replaced with integers before running the
# notebook: they are passed to range() and to DataFrame.sample(n=...).
num_batches = "<ENTER NUMBER>"
batch_size = "<ENTER NUMBER>"
# Minimum number of rows the final selected dataset should contain.
target_size=50_000

In [3]:
# Source of the following functions: https://github.com/anthonyprinaldi/js-divergence/tree/master

def isNumber(num: str) -> bool:
    """Tell whether a piece of text parses as a number.

    Args:
        num (str): candidate text

    Returns:
        bool: True iff ``float(num)`` succeeds; False when the
            conversion raises ``ValueError``
    """
    try:
        float(num)
    except ValueError:
        # Not parseable as a float, hence not a number.
        return False
    else:
        return True

def computeFreqDistribution(doc: str, stopwords: bool = False) -> nltk.FreqDist:
    """Computes the frequency of each (normalised) word in a document

    The document is split on whitespace. Every token is lower-cased and
    stripped of leading/trailing ``. , ? ! " '`` characters, then normalised:

    * tokens accepted by ``isNumber`` become ``"<NUMBER>"``
    * clock times such as ``5pm``, ``10:30`` or ``10:30am`` become ``"<TIME>"``
    * a word optionally wrapped in parentheses is reduced to the bare word
    * anything else is kept as is

    Tokens that end up empty are always discarded.

    Args:
        doc (str): string containing the entire document
        stopwords (bool): boolean flag indicating whether or not
            to remove the stopwords from the sentence. True indicates
            to remove every token found in the module-level STOPWORDS.
            Note that inside this function the name shadows the imported
            ``nltk.corpus.stopwords`` module.

    Returns:
        nltk.FreqDist: frequency distribution of the normalised tokens
    """
    # All patterns are raw strings: sequences such as "\S" or "\d" in a normal
    # string literal are invalid escapes and trigger a warning on recent Pythons.
    tokens = nltk.regexp_tokenize(doc, r'\S+')

    consolidated_tokens = []
    for token in tokens:
        w = token.lower().strip('.,?!"\'')
        if isNumber(w):
            # NOTE: isNumber relies on float(), so tokens such as "nan" or
            # "inf" are also counted as numbers.
            consolidated_tokens.append("<NUMBER>")
        elif re.match(r"[\d]+(pm|am)$", w) or re.match(r"[\d]+:[\d]+(pm|am)?$", w):
            consolidated_tokens.append("<TIME>")
        else:
            # Drop an optional opening/closing parenthesis around a word;
            # tokens that do not fit that shape are kept unchanged.
            m = re.match(r"\(?(\w+)\)?$", w)
            consolidated_tokens.append(m.group(1) if m else w)

    if stopwords:
        # Set membership keeps the filter linear in the number of tokens.
        excluded = set(STOPWORDS)
        consolidated_tokens = [w for w in consolidated_tokens if w and w not in excluded]
    else:
        consolidated_tokens = [w for w in consolidated_tokens if w]

    return nltk.FreqDist(consolidated_tokens)

def computeUnigramDistribution(doc: str, n_words: T.Optional[int] = None, stopwords: bool = False) -> T.Tuple[dict, float]:
    """
    Computes the relative frequencies (i.e., probs) of the most common unigrams
        in a document

    Args:
        doc (str): string containing the entire document
        n_words (int, optional): Number of most common words to consider.
            Defaults to None, which keeps every word.
        stopwords: boolean flag indicating whether or not
            to remove the stopwords from the sentence. True indicates
            to remove the stopwords.

    Returns:
        dict: relative frequencies of the form dist[word] = prob, normalised
            over the selected words only (so the values sum to 1)
        float: total number of occurrences of the selected words (the
            normalisation constant), 0.0 for an empty document
    """
    fd = computeFreqDistribution(doc, stopwords)
    if n_words is None:
        # Keep every word, in first-seen order.
        selected = list(fd.items())
    else:
        # Slicing fd.keys() would return the first n_words *seen*, not the
        # most frequent ones; most_common() ranks by count.
        selected = fd.most_common(n_words)

    N = float(sum(count for _, count in selected))
    # With no selected words the comprehension is empty, so N == 0.0 never
    # reaches the division.
    dist = {word: float(count) / N for word, count in selected}
    return (dist, N)

def mergeDistributionJS(dist1: dict, dist2: dict) -> dict:
    """
    Builds the mixture distribution used by the JS divergence: every word
    gets half of its probability under dist1 plus half of its probability
    under dist2 (a word missing from one side contributes nothing there).

    Args:
        dist1 (dict): probability distribution of the form dist1[word] = prob
        dist2 (dict): probability distribution of the form dist2[word] = prob

    Returns:
        dict: New merged distribution including all words from both distributions
    """
    # Start from the halved first distribution ...
    mixture = {word: 1/2*prob for word, prob in dist1.items()}
    # ... then fold in the halved second one.
    for word, prob in dist2.items():
        half = 1/2*prob
        if word in mixture:
            mixture[word] = mixture[word] + half
        else:
            mixture[word] = half
    return mixture

def KLDivergence(P: dict, M: dict, log_base: float = math.e) -> float:
    r"""
    Computes the KL divergence for two distributions
        KL(P||M) = \sum_{x \in X}[p(x) * \log(p(x)/m(x))]

    Terms with p(x) == 0 contribute nothing (the usual 0 * log 0 = 0
    convention) instead of failing inside math.log.

    Args:
        P (dict): probability distribution of words
        M (dict): probability distribution of words; must contain every
            word of P that has a non-zero probability
        log_base (float): Base value to use for log.
            Defaults to Euler's constant

    Returns:
        float: KL divergence of two distributions, expressed in units of
            log_base (0.0 when P is empty)

    Raises:
        KeyError: a word with non-zero probability in P is missing from M
        ZeroDivisionError: M assigns probability 0 to such a word
    """
    div = 0.0
    for key, p in P.items():
        if p == 0:
            # math.log(0) raises ValueError; the term is defined as 0.
            continue
        div += p * math.log(p / M[key], log_base)
    return div

def JSDivergence(doc1: str, doc2: str, num_words: T.Optional[int] = None, log_base: float = math.e, stopwords: bool = False) -> float:
    """
    Calculates the JS Divergence value for two corpora
        JS(P||Q) = 1/2 * KL(P||M) + 1/2 * KL(Q||M),  with M = 1/2 * (P + Q)

    Args:
        doc1 (str): string containing the entire document
        doc2 (str): string containing the entire document
        num_words (int): number of most frequent words to
            consider. Defaults to all words.
        log_base (float): Base value to use for log.
            Defaults to Euler's constant
        stopwords (bool): boolean flag indicating whether or not
            to remove the stopwords from the sentence. True indicates
            to remove the stopwords.

    Returns:
        float: the JS divergence of the two corpora, expressed in units
            of log_base
    """
    # Only the distributions are needed; the token totals are discarded.
    P, _ = computeUnigramDistribution(doc1, num_words, stopwords)
    Q, _ = computeUnigramDistribution(doc2, num_words, stopwords)
    M = mergeDistributionJS(P, Q)
    # KLDivergence already takes its logarithms in log_base, so the result must
    # not be divided by ln(log_base) again: that would convert the base twice.
    # For the default base e this division was a no-op (ln(e) == 1).
    return 1/2*KLDivergence(P, M, log_base) + 1/2*KLDivergence(Q, M, log_base)

In [4]:
def make_doc(batch):
    """Join an iterable of strings into one space-separated document."""
    return " ".join(batch)


def open_text_file(file_path):
    """Read a UTF-8 text file and return its lines.

    Args:
        file_path: path of the file to read

    Returns:
        list of the file's lines as produced by ``readlines()``, or None
        when the file is missing or cannot be read (a short message is
        printed in that case).
    """
    try:
        with open(file_path, mode='r', encoding="utf8") as handle:
            return handle.readlines()
    except FileNotFoundError:
        print("File not found.")
    except IOError:
        print("Error reading the file.")
    # Reached only after one of the handlers above printed its message.
    return None

def create_batches_with_replacement(df, num_batches, batch_size):
    """Draw ``num_batches`` random samples of ``batch_size`` rows each.

    Every batch comes from its own ``df.sample(n=batch_size)`` call on the
    full frame, so the same row may show up in more than one batch.

    Args:
        df: DataFrame to sample rows from
        num_batches: how many batches to draw
        batch_size: number of rows per batch

    Returns:
        list of the sampled DataFrames, in the order they were drawn
    """
    return [df.sample(n=batch_size) for _ in range(num_batches)]

In [6]:
# The dataset file holds both languages. COLUMN_NAME picks the column whose
# sentences are compared against the development set, i.e. the column written
# in the development set's language.
COLUMN_NAME = "<COLUMN NAME OF THE LANGUAGE YOU WANT TO CALCULATE JS DIV>"

# Load the dataframe with the data; it should contain both the source and the
# target language.
df = pd.read_csv("<PATH TO THE CSV FILE OF THE DATASET>")

# Load the development set that the sampled sentences are compared against.
with open("<PATH TO THE DEVELOPMENT SET>", mode='r', encoding="utf8") as file:
    es_eval_sentences = file.readlines()

# The JS divergence function works on whole documents, so join the evaluation
# sentences into a single string.
es_doc_eval = make_doc(es_eval_sentences)

# Sample the batches from the dataset and pair each one with its position, so
# a batch can still be found by number once the scores have been sorted.
batches = list(enumerate(create_batches_with_replacement(df, num_batches, batch_size)))

In [8]:
# Score every batch against the development document and collect the results
# column by column.
data = {"batch_num": [], "Js_div": []}
for num, batch in tqdm(batches):
    # Join the batch's sentences of the chosen language into one document.
    train_doc = make_doc(list(batch[COLUMN_NAME].values))
    score = JSDivergence(train_doc, es_doc_eval, stopwords=True)
    data["batch_num"].append(num)
    data["Js_div"].append(score)

# Tabulate the scores, lowest JS divergence first.
js_df = pd.DataFrame(data=data).sort_values(by="Js_div", ascending=True)

In [None]:
# Build the final dataframe: walk through the batches from lowest to highest
# JS divergence (js_df is sorted that way) and keep adding them until the
# de-duplicated selection holds at least target_size rows. Whole batches are
# added, so the result can end up somewhat larger than target_size.
selected_batches = []
final_df = pd.DataFrame()
# Bounding the loop by len(js_df) avoids an IndexError when the batches run
# out before target_size is reached.
for i in range(len(js_df)):
    # Column 0 of js_df is "batch_num", which is also the position of the
    # batch inside `batches`.
    batch_num = js_df.iloc[i, 0]
    num, target_df = batches[batch_num]
    assert num == batch_num
    selected_batches.append(target_df)
    # Batches are sampled independently and may overlap: keep only the first
    # occurrence of each sentence in the configured language column.
    final_df = pd.concat(selected_batches).drop_duplicates(subset=[COLUMN_NAME])
    print(final_df.shape)
    if len(final_df) >= target_size:
        break
else:
    # Reached only when the loop ended without hitting the target.
    print(f"Only {len(final_df)} unique rows are available; target_size={target_size} was not reached.")

In [20]:
# Write the selected rows to disk; index=False leaves the DataFrame index out
# of the CSV file.
final_df.to_csv("<PATH TO THE LOCATION OF THE FINAL OUTPUT>",index=False)