Warning: it takes more than 6 hours to annotate
    `muss_mined_paraphrases/en_mined_paraphrases/train.complex`

In [1]:
import sys
sys.path.append("../") # go to parent dir

from multiprocessing import Pool
import os
import time

import pandas as pd
import json
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm

from muss_utils import (
    compute_features, 
    fetch_preprocessor_used_in_muss_model_training, 
)

from utils import read_lines

In [2]:
# Number of worker processes for the feature computation: one per CPU.
# os.cpu_count() returns None when the count cannot be determined, so fall
# back to a single worker to keep n_jobs a usable positive integer.
n_jobs = os.cpu_count() or 1
print(n_jobs)

32


In [3]:
# Corpus split to annotate.
split = 'train'
# When True, the feature-computation cell recomputes and rewrites `anno_file`.
overwrite = True

# Destination of the per-pair feature annotations (written by a later cell).
anno_file = f'../resources/data/en/muss_mined_paraphrases/en_mined_paraphrases/analysis/{split}.tsv'

# Parallel sides of the corpus: complex sources and their simple targets.
srcs = read_lines(f'../resources/data/en/muss_mined_paraphrases/en_mined_paraphrases/{split}.complex')
tgts = read_lines(f'../resources/data/en/muss_mined_paraphrases/en_mined_paraphrases/{split}.simple')

# Both files must contain the same number of lines so they can be zipped pairwise.
n_srcs, n_tgts = len(srcs), len(tgts)
print(n_srcs, n_tgts)
assert n_srcs == n_tgts

1194945 1194945


In [None]:
# Fetch the preprocessors once, up front; the wrapper below captures this list
# as its default argument.
preprocessors = fetch_preprocessor_used_in_muss_model_training()

def compute_features_para(src, tgt, preprocessors=preprocessors):
    """Compute features for a single (src, tgt) pair.

    Thin wrapper around ``compute_features`` whose ``preprocessors`` argument
    defaults to the list fetched above, so callers such as ``pool.starmap``
    only need to supply the two sentences.

    Args:
        src: One item from ``srcs`` (the complex side of the pair).
        tgt: The aligned item from ``tgts`` (the simple side of the pair).
        preprocessors: Preprocessors forwarded unchanged to ``compute_features``.

    Returns:
        Whatever ``compute_features`` returns for this pair (presumably a
        JSON-serialisable mapping of feature names to values -- TODO confirm).
    """
    features = compute_features(src, tgt, preprocessors)
    return features

# Recompute when explicitly requested, or when no annotation file exists yet
# (otherwise the cells that read `anno_file` would fail on a fresh run).
if overwrite or not os.path.exists(anno_file):
    # Create the output directory *before* the multi-hour computation so that a
    # missing directory cannot make the write fail after all the work is done.
    anno_dir = os.path.dirname(anno_file)
    if anno_dir:
        os.makedirs(anno_dir, exist_ok=True)

    print(f'running {n_jobs} jobs...')
    t0 = time.time()
    with Pool(processes=n_jobs) as pool:
        # One task per (src, tgt) pair, dispatched to the workers in chunks of 1000.
        result = pool.starmap(compute_features_para, zip(srcs, tgts), chunksize=1000)
    t1 = time.time()
    print(f'time taken: {t1-t0:.2f} seconds')

    # NOTE: despite the .tsv extension, the file holds one JSON object per line.
    print(f'writing to {anno_file}...')
    with open(anno_file, 'w', encoding='utf8') as f:
        for item in result:
            f.write(f'{json.dumps(item, ensure_ascii=False)}\n')
    print('done!')
else:
    print(f'reusing existing annotations in {anno_file}')

Loaded preprocessors: [LengthRatioPreprocessor(target_ratio=0.8, use_short_name=False), ReplaceOnlyLevenshteinPreprocessor(bucket_size=0.05, noise_std=0, target_ratio=0.8, use_short_name=False), WordRankRatioPreprocessor(language='en', target_ratio=0.8, use_short_name=False), DependencyTreeDepthRatioPreprocessor(language='en', target_ratio=0.8, use_short_name=False)]
running 32 jobs...


In [None]:
# Load the annotation file (one JSON object per line) into a DataFrame and
# display it as the cell output.
df = pd.read_json(path_or_buf=anno_file, lines=True)
df

In [None]:
# Feature columns to plot, one per panel, in row-major order of the 2x2 grid.
feature_columns = [
    'LENGTHRATIO',
    'WORDRANKRATIO',
    'REPLACEONLYLEVENSHTEIN',
    'DEPENDENCYTREEDEPTHRATIO',
]

fig, axes = plt.subplots(2, 2, figsize=(16,14), sharey=True)

# Shared bin edges: 0.00, 0.05, ..., 2.00. The same values are used as x ticks.
bins = np.arange(0.0, 2.01, 0.05)
print(len(bins), bins)

tick_labels = [f'{edge:.2f}' for edge in bins]

# sns hist
for ax, column in zip(axes.flat, feature_columns):
    # Pass only `bins`: seaborn's `binwidth` overrides `bins`, which would
    # recompute the edges from the data range and misalign bars and ticks.
    # Values outside [bins[0], bins[-1]] are therefore not part of the histogram.
    sns.histplot(df, x=column, kde=True, stat='density', bins=bins, ax=ax)
    ax.set_xlim(bins[0], bins[-1])
    ax.set_xticks(bins)
    ax.set_xticklabels(tick_labels, rotation=90, ha='right', rotation_mode='anchor')

fig.suptitle('ACCESS Features on en web-mined paraphrases from MUSS')
plt.tight_layout()

# Make sure the destination directory exists so savefig cannot fail on a fresh checkout.
plot_file = f'../results/plots/access_features_on_en_muss_mined_{split}_hist.png'
os.makedirs(os.path.dirname(plot_file), exist_ok=True)
plt.savefig(plot_file, dpi='figure')

In [None]:
# fig, axes = plt.subplots(2, 2, figsize=(16,14), sharey=True)
# bins = np.arange(0.0, 2.01, 0.05)
# print(len(bins), bins)

# # lr_binned = pd.cut(df['length_ratio'], bins=bins, include_lowest=True).apply(lambda x: x.right)
# # word_binned = pd.cut(df['lex_complexity'], bins=bins, include_lowest=True).apply(lambda x: x.right)
# # lev_binned = pd.cut(df['levenshtein'], bins=bins, include_lowest=True).apply(lambda x: x.right)
# # dtd_binned = pd.cut(df['dep_tree_depth'], bins=bins, include_lowest=True).apply(lambda x: x.right)

# # bar
# df['LENGTHRATIO'].value_counts(sort=False).plot(kind='bar', rot=90, ax=axes[0][0], legend=True)
# df['WORDRANKRATIO'].value_counts(sort=False).plot(kind='bar', rot=90, ax=axes[0][1], legend=True)
# df['REPLACEONLYLEVENSHTEIN'].value_counts(sort=False).plot(kind='bar', rot=90, ax=axes[1][0], legend=True)
# df['DEPENDENCYTREEDEPTHRATIO'].value_counts(sort=False).plot(kind='bar', rot=90, ax=axes[1][1], legend=True)

# # kde
# # lr_binned.value_counts(sort=False).plot(kind='kde', rot=90, ax=axes[0][0], legend=True)
# # word_binned.value_counts(sort=False).plot(kind='kde', rot=90, ax=axes[0][1], legend=True)
# # lev_binned.value_counts(sort=False).plot(kind='kde', rot=90, ax=axes[1][0], legend=True)
# # dtd_binned.value_counts(sort=False).plot(kind='kde', rot=90, ax=axes[1][1], legend=True)

# # hist
# # lr_binned.hist(bins=len(bins), ax=axes[0][0], legend=True)
# # word_binned.hist(bins=len(bins), ax=axes[0][1], legend=True)
# # lev_binned.hist(bins=len(bins), ax=axes[1][0], legend=True)
# # dtd_binned.hist(bins=len(bins), ax=axes[1][1], legend=True)

# # hist plt
# # axes[0][0].hist(lr_binned, color = 'blue', edgecolor = 'black', bins = len(bins))
# # axes[0][1].hist(word_binned, color = 'blue', edgecolor = 'black', bins = len(bins))
# # axes[1][0].hist(lev_binned, color = 'blue', edgecolor = 'black', bins = len(bins))
# # axes[1][1].hist(dtd_binned, color = 'blue', edgecolor = 'black', bins = len(bins))

# # ax = out.value_counts(sort=False).plot.bar(rot=90, color="b", figsize=(10,6))
# # ax.set_xticklabels([c[1:-1].replace(","," to") for c in out.cat.categories])

# fig.suptitle(f'ACCESS Features on en web-mined paraphrases from MUSS')
# plt.tight_layout()
# plt.savefig(f'../results/plots/access_features_on_en_muss_mined_{split}_bar.png', dpi='figure')