In [None]:
import re
import itertools
from pprint import pprint
import pickle
import pandas as pd
import requests
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from scipy import stats

In [None]:
# Load the Newsela metadata table. Later cells read its `grade_level`,
# `language`, `filename` and `slug` columns.
data = pd.read_csv("") # Path to metadata file provided by Newsela

In [None]:
data.head()

In [None]:
print(data['language'].value_counts())

In [None]:
def get_stats(frame=None):
    """Print the categorical and the continuous distribution of `grade_level`.

    Args:
        frame: DataFrame with a `grade_level` column. Defaults to the
            notebook-level `data` frame, so a bare `get_stats()` still works.

    Returns:
        None. Two tables are printed: per-grade counts with percentages, and
        a one-row summary (count, min, max, mean, std) indexed by name.
    """
    if frame is None:
        frame = data

    # Categorical view: number and share of texts per grade level.
    text_counts = frame['grade_level'].value_counts()

    print('Categorial grade_level')
    print(pd.DataFrame({
        '#text': text_counts,
        '%text': (100 * text_counts / text_counts.sum()).round(2),
    }).sort_index())

    dfs = [frame]

    stats_d = {'name': ['text'],
             '#': [len(df['grade_level']) for df in dfs],
             'min': [df['grade_level'].min() for df in dfs],
             'max': [df['grade_level'].max() for df in dfs],
             'mean': [df['grade_level'].mean() for df in dfs],
             'std': [df['grade_level'].std() for df in dfs]
            }

    # set_index returns a new frame; the result was previously discarded, so
    # the printed table kept a meaningless 0..n index.
    stats_df = pd.DataFrame(stats_d).set_index('name')

    print('Continuous grade_level')
    print(stats_df)


In [None]:
get_stats()

Grade Stats

In [None]:
grade_groups = data.groupby(['grade_level','language'])['filename'].apply(list).reset_index()

In [None]:
# Nested lookup: grade level -> language code -> list of article filenames.
grades_lang = {}
for _, group_row in grade_groups.iterrows():
    per_language = grades_lang.setdefault(group_row['grade_level'], {})
    per_language[group_row['language']] = group_row['filename']

In [None]:
import collections
import functools
import os

path = "" # Path to articles

def iterate_data(data):
    """Yield items from a text file path or from any iterable.

    Args:
        data: Either a path to a text file (str) or an iterable.

    Yields:
        For a path: every line whose stripped form is non-empty, unmodified
        (the trailing newline is kept). For an iterable: its elements in
        order. Any other input yields nothing.

    Raises:
        AssertionError: if `data` is a str that is not an existing path.
    """
    # `collections.Iterable` was removed in Python 3.10; the ABC lives in
    # `collections.abc`.
    from collections.abc import Iterable

    if isinstance(data, str):
        assert os.path.exists(data), f"path `{data}` does not exist!"
        # Read as UTF-8 explicitly rather than with the locale default; the
        # article files are read as UTF-8 elsewhere in this notebook.
        with open(data, "r", encoding="utf-8") as f:
            for line in f:
                if line.strip():
                    yield line

    elif isinstance(data, Iterable):
        for x in data:
            yield x

def get_stats_data(ids):
    """Tokenize the given article files and count their tokens.

    Args:
        ids: Iterable of filenames, each appended to the notebook-level `path`.

    Returns:
        (vocab, lines): a Counter of lowercased whitespace-separated tokens
        and a list holding the token list of every line produced by
        `iterate_data`.
    """
    token_counts = collections.Counter()
    tokenized_lines = []
    for article in ids:
        source = path + article
        for raw_line in iterate_data(source):
            words = raw_line.lower().split()
            token_counts.update(words)
            tokenized_lines.append(words)
    return token_counts, tokenized_lines

In [None]:
# Per-grade corpus statistics for one language (grades 2 through 12).
lang="en"
for i in range(2, 13):
    article_ids = grades_lang[i][lang]
    _vocab, _data = get_stats_data(article_ids)
    # Token count of every line read for this grade.
    length_stats = list(map(len, _data))
    print("Grade: ", i, sep="")
    print("Number of Articles: ", len(article_ids), sep="")
    print("Average length of the sentence:", sum(length_stats)/len(length_stats), sep="")
    print("Number of unique tokens: ", len(_vocab), sep="")

Creating data groups by language

In [None]:
def clean_slug(x):
    """Return `x` with every "-spanish" occurrence removed.

    Presumably this lets the Spanish and English versions of an article share
    one slug when grouping — verify against the metadata.
    """
    return x.replace("-spanish", "")

data["slug"] = data["slug"].apply(clean_slug)

In [None]:
news_groups = data.groupby(['slug','language'])['filename'].apply(list).reset_index()

In [None]:
news_groups.head()

Translating Spanish articles to English

In [None]:
def get_lang_ids(flang):
    """Collect the filenames of all `news_groups` rows written in `flang`.

    Args:
        flang: Language code compared against the `language` column.

    Returns:
        A set with every filename listed in the matching rows.
    """
    matching_rows = news_groups[news_groups['language'] == flang]
    lang_ids = set()
    for filenames in matching_rows['filename']:
        lang_ids.update(filenames)
    return lang_ids

In [None]:
import pickle
from time import sleep
from google_api_translate import Translator, TextUtils

creds_path = "" # Google Translate Credentials json file path


def create_translation(src_file, output_file):
    """Translate `src_file` into English line by line and write `output_file`.

    Nothing is done when `output_file` already exists. Otherwise each
    non-empty source line is sent to the translator and its text is written
    without a separator; empty source lines (and empty translations) are
    written as a blank-line break ("\\n\\n").

    The result is first written to `output_file + ".part"` and renamed once
    complete, so an interrupted run does not leave a truncated file that a
    later run would skip as finished.

    Args:
        src_file: Path of the UTF-8 source article.
        output_file: Path of the translated article to create.

    Returns:
        True.
    """
    if os.path.exists(output_file):
        return True

    # Context manager so the source handle is closed (it was leaked before).
    with open(src_file, encoding='utf-8') as src:
        lines = src.read().strip().split('\n')

    tmp_file = output_file + ".part"
    try:
        with open(tmp_file, "w", encoding='utf-8') as f:
            for line in lines:
                if line != "":
                    # Retry until the API call succeeds, waiting between tries.
                    while True:
                        try:
                            trans = Translator(creds_path=creds_path).translate(text=line, target_language='en')
                        except Exception:
                            # `Exception`, not a bare except, so Ctrl-C can
                            # still stop the otherwise endless retry loop.
                            sleep(100)
                            continue
                        f.write(trans.text if trans.text != "" else "\n\n")
                        break
                else:
                    f.write("\n\n")
        os.replace(tmp_file, output_file)
    finally:
        # Remove the partial file if the run was interrupted.
        if os.path.exists(tmp_file):
            os.remove(tmp_file)
    return True

In [None]:
spanish_ids = get_lang_ids("es")
print("# of Spanish Articles: " + str(len(spanish_ids)))

In [None]:

# Translate every Spanish article that has no "<name>_trans.txt" file yet.
for id1 in tqdm(spanish_ids):
    # id1[:-4] drops the last four characters (presumably the ".txt" suffix).
    translated_path = path + id1[:-4] + "_trans.txt"
    if os.path.exists(translated_path):
        continue
    create_translation(path + id1, translated_path)

Create Slug dict and save

In [None]:
# Nested lookup: slug -> language code -> list of article filenames.
slugs_esen = {}
for _, group_row in news_groups.iterrows():
    per_language = slugs_esen.setdefault(group_row['slug'], {})
    per_language[group_row['language']] = group_row['filename']

In [None]:
# Preview a handful of entries instead of rendering the whole mapping,
# which floods the cell output and bloats the saved notebook.
dict(itertools.islice(slugs_esen.items(), 5))

In [None]:
import random

# The test set is created from the news articles (original and simplified) that have both a Spanish and an English rewrite.

def create_test(k=0.8):
    """Split the articles of slugs available in both languages into held-out ids.

    Every slug in `slugs_esen` that has both an "en" and an "es" entry
    contributes all of its filenames to the candidate lists.

    Args:
        k: Fraction applied to the number of such slugs to obtain the cut
            point `n`.

    Returns:
        (test_ids_es, dev_ids_es, test_ids_en, dev_ids_en)
    """
    test_ids_es = []
    test_ids_en = []
    count = 0
    for by_lang in slugs_esen.values():
        if "en" in by_lang and "es" in by_lang:
            count += 1
            test_ids_es.extend(by_lang["es"])
            test_ids_en.extend(by_lang["en"])

    # Honour the caller's fraction; it used to be hard-coded to 0.8 and the
    # `k` argument was silently ignored.
    n = int(k * count)

    # NOTE(review): `n` is derived from the number of slugs but slices lists of
    # article ids, and the Spanish dev split is "everything after n" while the
    # English one is "the last n" (which is the whole list when n == 0 and
    # overlaps the test slice for short lists). Kept as-is; confirm the
    # intended split before relying on it.
    return test_ids_es[:n], test_ids_es[n:], test_ids_en[:n], test_ids_en[-n:]

In [None]:
test_ids_es, dev_ids_es, test_ids_en, dev_ids_en = create_test()

Create pairs

In [None]:
def create_pairs():
    """Enumerate article pairs that belong to the same slug.

    Returns:
        (es_pairs, en_pairs, esen_pairs): all 2-combinations of the Spanish
        filenames, all 2-combinations of the English filenames, and every
        (Spanish, English) filename combination, accumulated over
        `slugs_esen`.
    """
    es_pairs, en_pairs, esen_pairs = [], [], []
    for versions in slugs_esen.values():
        has_en = "en" in versions
        has_es = "es" in versions
        if has_en:
            en_pairs += itertools.combinations(versions["en"], 2)
        if has_es:
            es_pairs += itertools.combinations(versions["es"], 2)
        if has_en and has_es:
            esen_pairs += itertools.product(versions["es"], versions["en"])
    return es_pairs, en_pairs, esen_pairs

In [None]:
es_pairs, en_pairs, esen_pairs = create_pairs()

Also available in our CrossLingualAlignmentTool: https://github.com/sweta20/ComplexityControlledMT/tree/master/CrossLingualAlignment

In [None]:
import sys
""" Path to Massalign """
# NOTE(review): machine-specific absolute path; adjust it to the local
# MASSAlign installation before running.
sys.path.append("/usr/local/lib/python3.6/dist-packages/massalign")

# NOTE(review): the star import presumably supplies MASSAligner, TFIDFModel and
# the VicinityDriven* aligners used by the cells below — confirm against
# massalign.core and consider importing those names explicitly.
from massalign.core import *
from gach import sentence_align
from nltk.tokenize.toktok import ToktokTokenizer
# Tokenizer instance shared by create_spanish_english_alignments.
toktok = ToktokTokenizer()

# Aligner instance shared by get_massalign_sentence_pairs.
m = MASSAligner()


In [None]:
def get_massalign_sentence_pairs(file1, file2):
    """Align the sentences of two documents with MASSAlign.

    A TF-IDF model is built from the two files, paragraphs are aligned first,
    and sentences are then aligned inside every aligned paragraph pair.

    Args:
        file1: Path of the first document.
        file2: Path of the second document.

    Returns:
        A flat list with the aligned sentences of all aligned paragraph pairs,
        in the form returned by `getSentenceAlignments`.
    """
    # Similarity model fitted on just these two documents.
    tfidf = TFIDFModel([file1, file2], 'https://ghpaetzold.github.io/massalign_data/stop_words.txt')

    # Paragraph-level and sentence-level aligners share the same model.
    para_aligner = VicinityDrivenParagraphAligner(similarity_model=tfidf, acceptable_similarity=0.3)
    sent_aligner = VicinityDrivenSentenceAligner(similarity_model=tfidf, acceptable_similarity=0.21, similarity_slack=0.05)

    paragraphs_1 = m.getParagraphsFromDocument(file1)
    paragraphs_2 = m.getParagraphsFromDocument(file2)
    _, paired_paragraphs = m.getParagraphAlignments(paragraphs_1, paragraphs_2, para_aligner)

    # Only the aligned text is kept; the alignment paths are not used.
    collected = []
    for paragraph_pair in paired_paragraphs:
        _, paired_sentences = m.getSentenceAlignments(paragraph_pair[0], paragraph_pair[1], sent_aligner)
        collected.extend(paired_sentences)
    return collected


def create_spanish_english_alignments(id1, id2):
    """Pair Spanish sentences of `id1` with English sentences of `id2`.

    The machine-translated version of the Spanish article
    (`id1[:-4] + "_trans.txt"`) is aligned to the English article with
    MASSAlign. Each translated sentence of such a pair is then mapped back to
    a Spanish sentence via `sentence_align(spa_file, spa_trans_file, ...)`:
    the first translation pair whose English tokens overlap the MASSAlign
    sentence by more than 50% (relative to the translation pair's own token
    set) and whose Spanish side is non-empty is used.

    Args:
        id1: Path of the Spanish article.
        id2: Path of the English article.

    Returns:
        A list of `[spanish_sentence, english_sentence]` pairs.
    """
    spa_file = id1
    eng_file = id2
    spa_trans_file = id1[:-4] + "_trans.txt"

    massalign_sentence_pairs = get_massalign_sentence_pairs(spa_trans_file, eng_file)
    translation_sentence_pairs = sentence_align(spa_file, spa_trans_file, 0.97, 1.8)

    # Tokenize every translated sentence once, not once per MASSAlign pair.
    translation_tokens = [
        (spa, set(toktok.tokenize(eng))) for spa, eng in translation_sentence_pairs
    ]

    pairs = []
    for eng_trans, eng_org in massalign_sentence_pairs:
        target_tokens = set(toktok.tokenize(eng_trans))

        spanish = ''
        for spa, candidate_tokens in translation_tokens:
            # Explicit guard for the empty-sentence case; a bare `except:`
            # used to swallow the resulting ZeroDivisionError (and anything
            # else, including KeyboardInterrupt).
            if not candidate_tokens:
                continue
            percent_overlap = len(candidate_tokens & target_tokens) / len(candidate_tokens)
            # NOTE(review): the previous `spa != prev_spa` test always compared
            # against '' because `prev_spa` was reset for every MASSAlign pair;
            # that behavior is preserved here. If de-duplicating consecutive
            # Spanish sentences was intended, it never took effect — confirm.
            if percent_overlap > 0.5 and spa != '':
                spanish = spa
                break
        if spanish != '':
            pairs.append([spanish, eng_org])
    return pairs

def create_mono_alignments(id1, id2):
    """Return the MASSAlign sentence pairs of two same-language documents.

    Thin wrapper: `id1` and `id2` are passed unchanged as the two file
    arguments of `get_massalign_sentence_pairs`.
    """
    return get_massalign_sentence_pairs(id1, id2)

In [None]:
# Align every English article pair. Alignment is best-effort: a pair that
# fails is recorded and skipped rather than aborting the whole run.
en_sentence_pairs = {}
en_failed_pairs = []
for pair in tqdm(en_pairs):
    try:
        en_sentence_pairs[pair] = create_mono_alignments(path + pair[0], path + pair[1])
    except Exception as err:
        # `Exception` instead of a bare except so the loop can be interrupted;
        # failures are kept so they are visible instead of vanishing silently.
        en_failed_pairs.append((pair, repr(err)))

lengths = {key:len(value) for key,value in en_sentence_pairs.items()}
print("Sentence pairs", sum(lengths.values()))
print("Article pairs that failed to align:", len(en_failed_pairs))

In [None]:
# Make sure the output directory exists before writing into it.
os.makedirs("data", exist_ok=True)
with open("data/en_pairs.pkl", "wb") as f:
    pickle.dump(en_sentence_pairs, f)

In [None]:
# Align every Spanish article pair. Alignment is best-effort: a pair that
# fails is recorded and skipped rather than aborting the whole run.
es_sentence_pairs = {}
es_failed_pairs = []
for pair in tqdm(es_pairs):
    try:
        # The article directory prefix was missing here (the English loop has
        # it), so every pair failed and the bare except hid it.
        es_sentence_pairs[pair] = create_mono_alignments(path + pair[0], path + pair[1])
    except Exception as err:
        # `Exception` instead of a bare except so the loop can be interrupted.
        es_failed_pairs.append((pair, repr(err)))

lengths = {key:len(value) for key,value in es_sentence_pairs.items()}
print("Sentence pairs", sum(lengths.values()))
print("Article pairs that failed to align:", len(es_failed_pairs))

In [None]:
# Make sure the output directory exists before writing into it.
os.makedirs("data", exist_ok=True)
with open("data/es_pairs.pkl", "wb") as f:
    pickle.dump(es_sentence_pairs, f)

In [None]:
# Align every (Spanish, English) article pair. Alignment is best-effort: a
# pair that fails is recorded and skipped rather than aborting the whole run.
esen_sentence_pairs = {}
esen_failed_pairs = []
for pair in tqdm(esen_pairs):
    try:
        # The article directory prefix was missing here (the English loop and
        # the translation step both use it), so the files were never found.
        esen_sentence_pairs[pair] = create_spanish_english_alignments(path + pair[0], path + pair[1])
    except Exception as err:
        # `Exception` instead of a bare except so the loop can be interrupted.
        esen_failed_pairs.append((pair, repr(err)))

lengths = {key:len(value) for key,value in esen_sentence_pairs.items()}
print("Sentence pairs", sum(lengths.values()))
print("Article pairs that failed to align:", len(esen_failed_pairs))

In [None]:
# Persist the cross-lingual pairs. This cell used to repeat the summary print
# of the previous cell, while "data/esen_pairs.pkl" — loaded further down —
# was never written.
os.makedirs("data", exist_ok=True)
with open("data/esen_pairs.pkl", "wb") as f:
    pickle.dump(esen_sentence_pairs, f)

Load all sentence pairs:

In [None]:
# Reload the three alignment dictionaries from disk.
# NOTE: pickle.load can execute arbitrary code; only load files that were
# produced by this notebook.
with open("data/en_pairs.pkl", "rb") as f:
    en_sentence_pairs = pickle.load(f)
with open("data/es_pairs.pkl", "rb") as f:
    es_sentence_pairs = pickle.load(f)
with open("data/esen_pairs.pkl", "rb") as f:
    esen_sentence_pairs = pickle.load(f)

Create train/dev/test splits as required by the repository

In [None]:
from sklearn.model_selection import train_test_split

# Grade level (as a string) -> special token prepended to each source line.
token = {'2': '<TWO>', '3': '<THREE>' , '4': '<FOUR>', '5': '<FIVE>', '6' : '<SIX>',
        '7': '<SEVEN>', '8':'<EIGHT>', '9' : '<NINE>', '10': '<TEN>', '11': '<ELEVEN>', '12' : '<TWELVE>'}
inv_map = {v: k for k, v in token.items()}

def write_to_file(data, split, f_prefix, path="data/"):
    """Write one split as parallel `.src` / `.tgt` files.

    Files are named `<path><split>_<f_prefix>.src` and `.tgt`. Each source
    line is `<grade token>\\t<source sentence>`, each target line is the
    target sentence.

    Args:
        data: Iterable of records where index 1 is the grade whose token
            prefixes the source line, index 2 the source sentence and index 3
            the target sentence.
        split: Split name used in the file name (e.g. "train").
        f_prefix: Dataset name used in the file name (e.g. "en").
        path: Directory prefix, concatenated as-is (keep the trailing "/").

    Raises:
        KeyError: if `str(record[1])` is not a key of `token`.
    """
    src_path = path + split + "_" + f_prefix + ".src"
    tgt_path = path + split + "_" + f_prefix + ".tgt"

    # `with` closes both files even if a record is malformed; UTF-8 is set
    # explicitly because the sentences contain non-ASCII (Spanish) text.
    with open(src_path, "w", encoding="utf-8") as src_file, \
            open(tgt_path, "w", encoding="utf-8") as dst_file:
        for record in data:
            src_file.write(token[str(record[1])] + "\t" + record[2] + "\n")
            dst_file.write(record[3] + "\n")
    
def create_train_test(sent_pairs, f_prefix, col="y", id_col="id"):
    """Turn aligned article pairs into train/dev/test records and write them.

    For every article pair `(id1, id2)` the grades `y1`, `y2` are looked up in
    the notebook-level `data` frame. Depending on `f_prefix`:

    * "esen_neq": only pairs with `int(y2) < int(y1)`; records are
      `(y1, y2, sentence[0], sentence[1])`; grade pairs are counted.
    * "esen_eq": only pairs with `int(y2) == int(y1)`; same record layout;
      nothing is counted.
    * "es": records are `(y2, y1, sentence[1], sentence[0])`; split by the
      Spanish held-out ids; nothing is counted.
    * anything else: same layout as "es"; split by the English held-out ids;
      grade pairs are counted.

    A pair goes to test if `id2` is a held-out test id, to dev if it is a
    held-out dev id, and to train otherwise. The three splits are written
    with `write_to_file`.

    Args:
        sent_pairs: Mapping `(id1, id2) -> list of aligned sentence pairs`.
        f_prefix: Dataset name; selects the behavior above and the file names.
        col: Column of `data` that holds the grade.
        id_col: Column of `data` that holds the article id.

    Returns:
        (train_data, dev_data, test_data, all_data), where `all_data` maps a
        grade pair to its number of sentence pairs (empty when not counted).

    Raises:
        KeyError: if an article id is not present in `data[id_col]`.

    NOTE(review): the other cells of this notebook read `filename` and
    `grade_level` from `data`, not `id` / `y`. If the metadata has no such
    columns, call this with `col="grade_level", id_col="filename"` — confirm
    against the metadata file.
    """
    # One lookup table instead of two full DataFrame scans per article pair.
    # drop_duplicates keeps the first row per id, like the old `.values[0]`.
    grade_of = data.drop_duplicates(subset=id_col).set_index(id_col)[col].to_dict()

    # Sets give constant-time membership; lists were scanned per sentence.
    if f_prefix == "es":
        held_out_test, held_out_dev = set(test_ids_es), set(dev_ids_es)
    else:
        held_out_test, held_out_dev = set(test_ids_en), set(dev_ids_en)

    cross_lingual = f_prefix in ("esen_neq", "esen_eq")
    count_grades = f_prefix not in ("esen_eq", "es")

    train_data = []
    test_data = []
    dev_data = []
    all_data = {}

    for pair in tqdm(sent_pairs):
        id1, id2 = pair
        y1 = grade_of[id1]
        y2 = grade_of[id2]

        # Grade filters apply to the cross-lingual datasets only.
        if f_prefix == "esen_neq" and not int(y2) < int(y1):
            continue
        if f_prefix == "esen_eq" and int(y2) != int(y1):
            continue

        if id2 in held_out_test:
            bucket = test_data
        elif id2 in held_out_dev:
            bucket = dev_data
        else:
            bucket = train_data

        for sentence_pair in sent_pairs[pair]:
            if cross_lingual:
                record = (y1, y2, sentence_pair[0], sentence_pair[1])
            else:
                # Monolingual pairs are stored reversed: second article first.
                record = (y2, y1, sentence_pair[1], sentence_pair[0])
            if count_grades:
                grade_pair = record[:2]
                all_data[grade_pair] = all_data.get(grade_pair, 0) + 1
            bucket.append(record)

    write_to_file(train_data, "train", f_prefix)
    write_to_file(dev_data, "dev", f_prefix)
    write_to_file(test_data, "test", f_prefix)

    print("# of train_data: {}, # of dev data: {}, # of test_data: {}".format(len(train_data), len(dev_data), len(test_data)))

    return train_data, dev_data, test_data, all_data

In [None]:
train_data, dev_data, test_data, all_data = create_train_test(en_sentence_pairs, "en")

In [None]:
# create_train_test returns four values; unpacking into three names raised
# ValueError. The grade-pair counts are not needed for this dataset.
train_data, dev_data, test_data, _unused_counts = create_train_test(es_sentence_pairs, "es")

In [None]:
train_data, dev_data, test_data, all_data = create_train_test(esen_sentence_pairs, "esen_neq")

In [None]:
# create_train_test returns four values; unpacking into three names raised
# ValueError. The counts are discarded so `all_data` keeps its current value.
train_data, dev_data, test_data, _unused_counts = create_train_test(esen_sentence_pairs, "esen_eq")

In [None]:
def get_grade_stats(file_name):
    """Count how often each grade token prefixes a line of a `.src` file.

    Every non-blank line is expected to look like `<grade token>\\t<sentence>`;
    the text before the first tab is the grade. Blank lines are skipped.

    Args:
        file_name: Path of the tab-separated source file.

    Returns:
        (grade_stats, n): a dict mapping grade token to line count, and the
        number of distinct grade tokens.
    """
    grade_stats = {}
    # Stream the file instead of loading it whole; read as UTF-8 explicitly
    # rather than with the locale default.
    with open(file_name, encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                # A blank line used to raise IndexError through an unused
                # lookup of the sentence field.
                continue
            tgt_grade = line.rstrip("\n").split("\t", 1)[0]
            grade_stats[tgt_grade] = grade_stats.get(tgt_grade, 0) + 1

    return grade_stats, len(grade_stats)


In [None]:
grade_stats, _ = get_grade_stats("data/dev_es.src")

In [None]:
# Print the sentence-pair count of every (i, j) grade pair present in all_data.
for i in range(3, 13):
    for j in range(2, 12):
        pair_count = all_data.get((i, j))
        if pair_count is not None:
            print(i, j, pair_count)

Creating ARI dataset

In [None]:
import sys
sys.path.append("/home/sweta/Work/Simplification/readability")

from compute_ari_accuracy import get_text_ari_grade_score

In [None]:
def clip_val(x, min_val=2, max_val=12):
    """Clamp `x` to `[min_val, max_val]`, truncating in-range values to int.

    Returns `min_val` when `x` is below it, `max_val` when `x` is above it,
    and `int(x)` otherwise.
    """
    if x < min_val:
        return min_val
    if x > max_val:
        return max_val
    return int(x)

def get_ari_file(src_file, tgt_file, out_file):
    """Rewrite a `.src` file with grade tokens derived from the targets' ARI.

    For line `i`, the ARI grade score of the stripped target line is clipped
    with `clip_val`, mapped through `token`, and written in front of the
    sentence part (everything after the first tab) of source line `i`.

    Args:
        src_file: Path of the existing `<grade token>\\t<sentence>` file.
        tgt_file: Path of the parallel target file (at least as many lines).
        out_file: Path of the file to write.
    """
    with open(src_file, encoding="utf-8") as src_f:
        src_data = src_f.readlines()
    with open(tgt_file, encoding="utf-8") as tgt_f:
        tgt_data = tgt_f.readlines()

    with open(out_file, "w", encoding="utf-8") as f:
        for i, src_line in enumerate(src_data):
            tgt_ari_grade = token[str(clip_val(get_text_ari_grade_score(tgt_data[i].strip())))]
            # Write through the handle `f` (the code called `.write` on the
            # path string) and split the i-th line (it called `.split` on the
            # whole list). The source line keeps its trailing newline.
            f.write(tgt_ari_grade + "\t" + src_line.split("\t", 1)[1])

In [None]:
src_file = "data/train_en.src"
tgt_file = "data/train_en.tgt"
out_file = "data/train_en_ari.src"

get_ari_file(src_file, tgt_file, out_file)