# incom.py - A Toolbox for Calculating Linguistic Distances and Asymmetries between Related Languages

## Visualizations

This notebook shows how to generate visualizations based on pre-computed Levenshtein distance and word adaptation surprisal.

---

In [None]:
import os

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

DPI = 150
%matplotlib inline
%run ../utils.py
%pwd

Read the **pre-computed** data from an Excel file.

In [None]:
foreign, native = ('BG', 'RU')
results = f'../outputs/results/{native}_{foreign}_120_results.xlsx'
results_mod = f'../outputs/results/{native}_{foreign}_120_mod_results.xlsx'

# Specify names of sheets in excel file
sheets = [
        f'{foreign}-{native}', 
        f'{foreign}-char-entropy', 
        f'{native}-char-entropy', 
        f'{foreign}-{native}-surprisals',
        f'{foreign}-{native}-mod-surprisals',
        f'{foreign}-{native}-probabilities',
        f'{native}-{foreign}',
        f'{native}-{foreign}-surprisals',
        f'{native}-{foreign}-mod-surprisals',
        f'{native}-{foreign}-probabilities',
        'costs'
    ]

# Read data
dfs = read_data(results, sheets, index_col=0)
dfs_mod = read_data(results_mod, sheets, index_col=0)

In [None]:
# Peek at the first two rows of the BG-RU sheet.
dfs['BG-RU'].head(n=2)

In [None]:
# Peek at the first two rows of the RU-BG sheet.
dfs['RU-BG'].head(n=2)

Compute **summary statistics** of our data.

In [None]:
# Summary statistics of the BG-RU sheet as produced by DataFrame.describe().
dfs['BG-RU'].describe()

In [None]:
# Summary statistics of the RU-BG sheet as produced by DataFrame.describe().
dfs['RU-BG'].describe()

---

## Basic Visualizations

In [None]:
# Short aliases for the two reading directions, first from the workbook with
# unmodified surprisals, then from the one with modified surprisals.
data_BGRU, data_RUBG = dfs['BG-RU'], dfs['RU-BG']
mod_data_BGRU, mod_data_RUBG = dfs_mod['BG-RU'], dfs_mod['RU-BG']

In [None]:
# Directory that receives every figure written by this notebook.
prefix = '../outputs/plots/ru_bg'
# savefig() does not create missing directories, so make sure the target
# exists before the first plot is saved.
os.makedirs(prefix, exist_ok=True)

Create histograms for **edit distances**.

In [None]:
# Overlaid histograms of the normalized Levenshtein distance for both reading
# directions. Both series are binned on the SAME 15 bin edges (computed from
# their combined range); separately computed bins would differ in width and
# position, which makes overlaid bar heights incomparable.
ld_BGRU = data_BGRU['normalized LD']
ld_RUBG = data_RUBG['normalized LD']
ld_bins = np.histogram_bin_edges(np.concatenate([ld_BGRU, ld_RUBG]), bins=15)

fig, axes = plt.subplots(1, 1, figsize=(6, 6))
axes.hist(ld_BGRU, bins=ld_bins, color='red', alpha=.2, label='{:s} (foreign) to {:s} (native)'.format(foreign, native))
axes.hist(ld_RUBG, bins=ld_bins, color='blue', alpha=.2, label='{:s} (foreign) to {:s} (native)'.format(native, foreign))
fig.suptitle('Normalized Levenshtein distance')
axes.grid()
axes.set_xlabel(r"Normalized LD")
axes.set_ylabel(r"Number of word pairs")
# Legend sits in a two-column strip just above the axes.
axes.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc=3, ncol=2, mode="expand", borderaxespad=0.)
fig.savefig(prefix + '/120_normalized_ld.pdf', dpi=DPI)
fig.savefig(prefix + '/120_normalized_ld.png', dpi=DPI);

Create histograms for **word adaptation surprisal**

In [None]:
# Overlaid histograms of the normalized word adaptation surprisal (WAS) for
# both reading directions. Both series are binned on the SAME 15 bin edges
# (computed from their combined range); separately computed bins would differ
# in width and position, which makes overlaid bar heights incomparable.
was_BGRU = data_BGRU['normalized WAS']
was_RUBG = data_RUBG['normalized WAS']
was_bins = np.histogram_bin_edges(np.concatenate([was_BGRU, was_RUBG]), bins=15)

fig, axes = plt.subplots(1, 1, figsize=(8, 8))
axes.hist(was_BGRU, bins=was_bins, color='red', alpha=.2, label='{:s} (foreign) to {:s} (native)'.format(foreign, native))
axes.hist(was_RUBG, bins=was_bins, color='blue', alpha=.2, label='{:s} (foreign) to {:s} (native)'.format(native, foreign))
fig.suptitle('Normalized word adaptation surprisal')
axes.set_xlabel("Normalized WAS")
axes.set_ylabel("Number of word pairs")
axes.grid()
# Legend sits in a two-column strip just above the axes.
axes.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc=3, ncol=2, mode="expand", borderaxespad=0.)
fig.savefig(prefix + '/120_normalized_was.pdf', dpi=DPI)
fig.savefig(prefix + '/120_normalized_was.png', dpi=DPI);

Create histogram for **word adaptation surprisal** with **unmodified** and **modified surprisals**.

In [None]:
# Side-by-side histograms of normalized WAS: left panel from the workbook with
# unmodified surprisals, right panel from the one with modified surprisals.
# All four series share one set of 15 bin edges so that bar heights are
# comparable within a panel and, because the y axis is shared, across panels.
was_panels = [
    ('unmodified', data_BGRU, data_RUBG),
    ('modified', mod_data_BGRU, mod_data_RUBG),
]
was_bins = np.histogram_bin_edges(
    np.concatenate([frame['normalized WAS'] for _, *frames in was_panels for frame in frames]),
    bins=15,
)

fig, axes = plt.subplots(1, 2, figsize=(12, 6), sharey=True)
for ax, (panel_title, panel_BGRU, panel_RUBG) in zip(axes, was_panels):
    ax.hist(panel_BGRU['normalized WAS'], bins=was_bins, color='red', alpha=.2, label='{:s} (foreign) to {:s} (native)'.format(foreign, native))
    ax.hist(panel_RUBG['normalized WAS'], bins=was_bins, color='blue', alpha=.2, label='{:s} (foreign) to {:s} (native)'.format(native, foreign))
    ax.grid()
    ax.legend()
    ax.set_title(panel_title)
    ax.set_xlabel('Normalized WAS')
# The y axis is shared, so only the left panel carries the label.
axes[0].set_ylabel('Number of word pairs')

fig.suptitle('Normalized word adaptation surprisal')
fig.savefig(prefix + '/120_normalized_modified_was.pdf', dpi=DPI)
fig.savefig(prefix + '/120_normalized_modified_was.png', dpi=DPI);

## Advanced Visualizations

In [None]:
fig, axes = plt.subplots(2, 2, figsize=(16, 12), sharex=True, sharey=True)
sns.stripplot(x="alignment length", y="normalized WAS", palette="Set3", data=data_BGRU, jitter=0.1, ax=axes[0][0])
sns.boxplot(x="alignment length", y="normalized WAS", data=data_BGRU, palette="Set3", ax=axes[1][0])
axes[0][0].set_title('BG foreign RU native')
sns.stripplot(x="alignment length", y="normalized WAS", palette="Set3", data=data_RUBG, jitter=0.1, ax=axes[0][1])
sns.boxplot(x="alignment length", y="normalized WAS", data=data_RUBG, palette="Set3", ax=axes[1][1])
axes[0][1].set_title('RU foreign BG native')
plt.savefig(prefix + '/120_was_boxplots.png', dpi=DPI)
plt.savefig(prefix + '/120_was_boxplots.pdf', dpi=DPI)

In [None]:
fig, axes = plt.subplots(2, 2, figsize=(16, 12), sharex=True, sharey=True)
sns.stripplot(x="alignment length", y="normalized LD", palette="Set3", data=data_BGRU, jitter=0.1, ax=axes[0][0])
sns.boxplot(x="alignment length", y="normalized LD", data=data_BGRU, palette="Set3", ax=axes[1][0])
axes[0][0].set_title('BG foreign RU native')
sns.stripplot(x="alignment length", y="normalized LD", palette="Set3", data=data_RUBG, jitter=0.1, ax=axes[0][1])
sns.boxplot(x="alignment length", y="normalized LD", data=data_RUBG, palette="Set3", ax=axes[1][1])
axes[0][1].set_title('RU foreign BG native')
plt.savefig(prefix + '/120_ld_boxplots.png', dpi=DPI)
plt.savefig(prefix + '/120_ld_boxplots.pdf', dpi=DPI)

In [None]:
# Joint plot with regression fit of normalized LD against normalized WAS for
# the BG-foreign / RU-native direction.
# x and y are passed by keyword: seaborn >= 0.12 rejects positional data
# arguments with a TypeError, while keywords work on older releases too.
g = sns.jointplot(x="normalized LD", y="normalized WAS", data=data_BGRU, kind="reg", color='red', xlim=(0.0, 0.6), ylim=(0.0, 1.8), space=0)
g.fig.subplots_adjust(top=0.9)  # leave room for the figure title
g.fig.suptitle('BG foreign RU native')
g.fig.savefig(prefix + '/120_jointplot_BGRU.png', dpi=DPI)
g.fig.savefig(prefix + '/120_jointplot_BGRU.pdf', dpi=DPI);

In [None]:
# Joint plot with regression fit of normalized LD against normalized WAS for
# the RU-foreign / BG-native direction.
# x and y are passed by keyword: seaborn >= 0.12 rejects positional data
# arguments with a TypeError, while keywords work on older releases too.
g = sns.jointplot(x="normalized LD", y="normalized WAS", data=data_RUBG, kind="reg", color='blue', xlim=(0.0, 0.6), ylim=(0.0, 1.8), space=0)
g.fig.subplots_adjust(top=0.9)  # leave room for the figure title
g.fig.suptitle('RU foreign BG native')
g.fig.savefig(prefix + '/120_jointplot_RUBG.png', dpi=DPI)
g.fig.savefig(prefix + '/120_jointplot_RUBG.pdf', dpi=DPI);

In [None]:
# Joint plot with regression fit of normalized LD against the intelligibility
# scores for the BG-foreign / RU-native direction.
# x and y are passed by keyword: seaborn >= 0.12 rejects positional data
# arguments with a TypeError, while keywords work on older releases too.
g = sns.jointplot(x="normalized LD", y="intelligibility scores", data=data_BGRU, kind="reg", color='red', xlim=(0.0, 0.6), ylim=(0.0, 1.2), space=0)
g.fig.subplots_adjust(top=0.9)  # leave room for the figure title
g.fig.suptitle('BG foreign RU native')
g.fig.savefig(prefix + '/120_jointplot_BGRU_nld_intell.png', dpi=DPI)
g.fig.savefig(prefix + '/120_jointplot_BGRU_nld_intell.pdf', dpi=DPI);

In [None]:
# Joint plot with regression fit of normalized WAS against the intelligibility
# scores for the BG-foreign / RU-native direction.
# x and y are passed by keyword: seaborn >= 0.12 rejects positional data
# arguments with a TypeError, while keywords work on older releases too.
g = sns.jointplot(x="normalized WAS", y="intelligibility scores", data=data_BGRU, kind="reg", color='red', xlim=(0.0, 1.8), ylim=(0.0, 1.2), space=0)
g.fig.subplots_adjust(top=0.9)  # leave room for the figure title
g.fig.suptitle('BG foreign RU native')
g.fig.savefig(prefix + '/120_jointplot_BGRU_nwas_intell.png', dpi=DPI)
g.fig.savefig(prefix + '/120_jointplot_BGRU_nwas_intell.pdf', dpi=DPI);

In [None]:
# Joint plot with regression fit of normalized LD against the intelligibility
# scores for the RU-foreign / BG-native direction.
# x and y are passed by keyword: seaborn >= 0.12 rejects positional data
# arguments with a TypeError, while keywords work on older releases too.
g = sns.jointplot(x="normalized LD", y="intelligibility scores", data=data_RUBG, kind="reg", color='blue', xlim=(0.0, 0.6), ylim=(0.0, 1.2), space=0)
g.fig.subplots_adjust(top=0.9)  # leave room for the figure title
g.fig.suptitle('RU foreign BG native')
g.fig.savefig(prefix + '/120_jointplot_RUBG_nld_intell.png', dpi=DPI)
g.fig.savefig(prefix + '/120_jointplot_RUBG_nld_intell.pdf', dpi=DPI);

In [None]:
# Joint plot with regression fit of normalized WAS against the intelligibility
# scores for the RU-foreign / BG-native direction.
# x and y are passed by keyword: seaborn >= 0.12 rejects positional data
# arguments with a TypeError, while keywords work on older releases too.
g = sns.jointplot(x="normalized WAS", y="intelligibility scores", data=data_RUBG, kind="reg", color='blue', xlim=(0.0, 1.8), ylim=(0.0, 1.2), space=0)
g.fig.subplots_adjust(top=0.9)  # leave room for the figure title
g.fig.suptitle('RU foreign BG native')
g.fig.savefig(prefix + '/120_jointplot_RUBG_nwas_intell.png', dpi=DPI)
g.fig.savefig(prefix + '/120_jointplot_RUBG_nwas_intell.pdf', dpi=DPI);

## Additional Visualizations

In [None]:
# Scatter plot of normalized LD (x) against normalized WAS (y) for the BG-RU
# data; marker colour encodes the intelligibility score.
fig, axes = plt.subplots(1, 1, figsize=(7, 6))
cax = axes.scatter(data_BGRU['normalized LD'], data_BGRU['normalized WAS'], c=data_BGRU['intelligibility scores'], s=200, cmap='viridis_r', alpha=0.8)
axes.set_title('Intelligibility score depending on normalized LD and normalized WAS')
axes.set_xlabel("Normalized LD")
axes.set_ylabel("Normalized WAS")
axes.grid(alpha=0.3)
cbar = fig.colorbar(cax)
cbar.set_label('intelligibility score', rotation=270, labelpad=25)
# Apply the tight layout BEFORE saving; calling it afterwards only affects the
# inline rendering and leaves the files on disk with the default layout.
fig.tight_layout()
fig.savefig(prefix + '/120_scatter_BGRU_nld_nwas_intell.png', dpi=DPI)
fig.savefig(prefix + '/120_scatter_BGRU_nld_nwas_intell.pdf', dpi=DPI);

In [None]:
# Scatter plot of normalized LD (x) against normalized WAS (y) for the RU-BG
# data; marker colour encodes the intelligibility score.
fig, axes = plt.subplots(1, 1, figsize=(7, 6))
cax = axes.scatter(data_RUBG['normalized LD'], data_RUBG['normalized WAS'], c=data_RUBG['intelligibility scores'], s=200, cmap='viridis_r', alpha=0.8)
axes.set_title('Intelligibility score depending on normalized LD and normalized WAS')
axes.set_xlabel("Normalized LD")
axes.set_ylabel("Normalized WAS")
axes.grid(alpha=0.3)
cbar = fig.colorbar(cax)
cbar.set_label('intelligibility score', rotation=270, labelpad=25)
# Apply the tight layout BEFORE saving; calling it afterwards only affects the
# inline rendering and leaves the files on disk with the default layout.
fig.tight_layout()
fig.savefig(prefix + '/120_scatter_RUBG_nld_nwas_intell.png', dpi=DPI)
fig.savefig(prefix + '/120_scatter_RUBG_nld_nwas_intell.pdf', dpi=DPI);

In [None]:
# BG character entropies: drop rows whose entropy is exactly zero and sort the
# rest in ascending order of entropy.
char_entropy_BG = (
    dfs['BG-char-entropy']
    .loc[lambda table: table['entropy (per character)'] != 0]
    .sort_values(by='entropy (per character)')
)
char_entropy_BG

In [None]:
# Index labels of the filtered BG table, used as bar labels further down.
BG_chars = [*char_entropy_BG.index]
# NOTE(review): position 4 is hard-coded and therefore depends on the entropy
# sort order of char_entropy_BG - confirm it still targets the intended label.
BG_chars[4] = 'ᴓ'
print(BG_chars)

In [None]:
# RU character entropies: drop rows whose entropy is exactly zero and sort the
# rest in ascending order of entropy.
char_entropy_RU = (
    dfs['RU-char-entropy']
    .loc[lambda table: table['entropy (per character)'] != 0]
    .sort_values(by='entropy (per character)')
)
char_entropy_RU

In [None]:
# Index labels of the filtered RU table, used as bar labels further down.
RU_chars = [*char_entropy_RU.index]
print(RU_chars)

In [None]:
# Bar charts of per-character entropy: BG characters on the left, RU
# characters on the right, on a shared y axis.
fig, axes = plt.subplots(1, 2, figsize=(12, 5), sharey=True)

axes[0].bar(BG_chars, char_entropy_BG['entropy (per character)'], color='red')
axes[0].set_title('BG for RU readers')
axes[0].set_ylabel('character entropy')

axes[1].bar(RU_chars, char_entropy_RU['entropy (per character)'])
axes[1].set_title('RU for BG readers')
axes[1].set_ylim([0.0, 2.2])  # y axis is shared, so this limit applies to both panels

# Apply the tight layout BEFORE saving; calling it afterwards only affects the
# inline rendering and leaves the files on disk with the default layout.
fig.tight_layout()
fig.savefig(prefix + '/120_char_entropy.png', dpi=DPI)
# Also export a PDF, like every other figure in this notebook.
fig.savefig(prefix + '/120_char_entropy.pdf', dpi=DPI);