# incom.py - A Toolbox for Calculating Linguistic Distances and Asymmetries between Related Languages

## Compute Levenshtein distance, word adaptation surprisal, and conditional entropy

This notebook shows how to compute Levenshtein distance and word adaptation surprisal for a given set of word pairs. It is meant as a template that can be easily modified and adapted for different language pairs.  

---

In [None]:
import pandas as pd
%run ../utils.py
%pwd

Read the **costs matrix file** for a given language pair.

In [None]:
# Load the RU/BG cost matrix from the Excel workbook. The BG/RU matrix is
# derived by transposing it instead of reading a second file.
costs = '../inputs/costs/RU_BG_costs.xlsx'
costs_RUBG = read_cost_matrix(pair='RU/BG', file=costs)
costs_BGRU = costs_RUBG.transpose()

In [None]:
costs_RUBG.head(2)  # preview the first two rows of the RU/BG cost matrix

In [None]:
costs_BGRU.head(2)  # preview the first two rows of the transposed (BG/RU) cost matrix

Read the word pairs given in an **Excel file**.

In [None]:
# Read the word-pair table from sheet 'Tabelle1', dropping duplicate rows
# and stripping whitespace as requested via the keyword flags.
word_list = '../inputs/wordlists/120_RU_BG.xlsx'
df = read_data(
    file=word_list,
    sheets='Tabelle1',
    drop_duplicates=True,
    remove_whitespace=True,
)

In [None]:
df.head(2)  # preview the first two rows of the loaded word-pair table

Compute the **Levenshtein distance** for all word pairs.

In [None]:
# Both directions use the same word-pair table; only the foreign/native roles
# and the matching cost matrix are swapped.

# BG stimuli from the perspective of a RU native reader
levensthein_BGRU = levenshtein_distance(
    df,
    foreign='BG',
    native='RU',
    costs=costs_BGRU,
)

# RU stimuli from the perspective of a BG native reader
levensthein_RUBG = levenshtein_distance(
    df,
    foreign='RU',
    native='BG',
    costs=costs_RUBG,
)

In [None]:
levensthein_BGRU.head(2)  # preview the first two rows of the BG-foreign/RU-native Levenshtein results

In [None]:
levensthein_RUBG.head(2)  # preview the first two rows of the RU-foreign/BG-native Levenshtein results

Make sure that the Levenshtein distance is symmetric, i.e. identical for every word pair in both reading directions.

In [None]:
# Sanity check: the Levenshtein distance must be identical in both reading
# directions for every word pair.
#
# The columns are compared element-wise. Comparing `Series.all()` of each side
# would only compare two single booleans ("are all values truthy?") and would
# pass for any two columns that are merely all non-zero.
#
# `.values` compares rows by position (both frames are derived from the same
# word list `df`) and avoids pandas' index-alignment rules.
assert (levensthein_BGRU['LD'].values == levensthein_RUBG['LD'].values).all(), \
    'LD differs between the BG/RU and RU/BG directions'

# The normalized distance is compared with a small tolerance so that harmless
# rounding differences do not trigger a failure.
assert (abs(levensthein_BGRU['normalized LD'].values
            - levensthein_RUBG['normalized LD'].values) <= 1e-9).all(), \
    'normalized LD differs between the BG/RU and RU/BG directions'

Compute the **character surprisals** and **probabilities** based on the alignments computed in the previous step. 

In [None]:
# character_surprisals is unpacked into a (probabilities, surprisals) pair for
# each reading direction, based on the alignments from the previous step.
probs_BGRU, surprisals_BGRU = character_surprisals(
    levensthein_BGRU,
    foreign='BG',
    native='RU',
)
probs_RUBG, surprisals_RUBG = character_surprisals(
    levensthein_RUBG,
    foreign='RU',
    native='BG',
)

In [None]:
surprisals_BGRU.head(2)  # preview the first two rows of the BG/RU character surprisals

In [None]:
surprisals_RUBG.head(2)  # preview the first two rows of the RU/BG character surprisals

In [None]:
probs_BGRU.head(2)  # preview the first two rows of the BG/RU character probabilities

In [None]:
probs_RUBG.head(2)  # preview the first two rows of the RU/BG character probabilities

Compute the **transformation entropy** for each character.

In [None]:
# Per-character transformation entropy, one table per reading direction.

# from the perspective of a RU native reader
char_entropy_BG = character_entropy(
    surprisals_BGRU,
    probs_BGRU,
)

# from the perspective of a BG native reader
char_entropy_RU = character_entropy(
    surprisals_RUBG,
    probs_RUBG,
)

In [None]:
char_entropy_BG.head(2)  # preview the first two rows of the BG character entropies

In [None]:
char_entropy_RU.head(2)  # preview the first two rows of the RU character entropies

Compute **full conditional entropies** between the language pairs.

In [None]:
# Full conditional entropy for each direction, printed right after each is
# computed.
# NOTE(review): each call combines the alignments of one direction with the
# surprisals/probabilities of the opposite direction (RUBG alignments with BGRU
# surprisals and vice versa). Presumably intentional for the conditional
# direction -- confirm against full_conditional_entropy in utils.py.
H_RU_BG = full_conditional_entropy(
    'RU',
    'BG',
    levensthein_RUBG,
    surprisals_BGRU,
    probs_BGRU,
)
H_BG_RU = full_conditional_entropy(
    'BG',
    'RU',
    levensthein_BGRU,
    surprisals_RUBG,
    probs_RUBG,
)

print('H(RU|BG): ', H_RU_BG)
print('H(BG|RU): ', H_BG_RU)

Compute the **word adaptation surprisal** based on the alignments and character surprisal values.

In [None]:
# Word adaptation surprisal from the alignments plus the character-level
# surprisals and probabilities of the same direction.

# from the perspective of a RU native reader
was_BGRU = word_adaptation_surprisal(
    levensthein_BGRU,
    surprisals_BGRU,
    probs_BGRU,
)

# from the perspective of a BG native reader
was_RUBG = word_adaptation_surprisal(
    levensthein_RUBG,
    surprisals_RUBG,
    probs_RUBG,
)

In [None]:
was_BGRU.head(2)  # preview the first two rows of the BG/RU word adaptation surprisals

In [None]:
was_RUBG.head(2)  # preview the first two rows of the RU/BG word adaptation surprisals

(Optional) Create a **modified version** of the surprisal matrix with zeros on the diagonal.

In [None]:
# Derive modified surprisal matrices with the diagonal value set to 0.0;
# the results are stored under new names (mod_*).
mod_surprisals_BGRU = modify_character_surprisals(
    surprisals_BGRU,
    diag_value=0.0,
)
mod_surprisals_RUBG = modify_character_surprisals(
    surprisals_RUBG,
    diag_value=0.0,
)

----

(Optional) Write the (intermediate) results to disk. **This step is only needed if you want to further manually modify the surprisals.**

In [None]:
# Uncomment the following lines only if you want to edit the surprisals manually in Excel

# output_path = '../outputs/results/RU_BG_120_results_tmp.xlsx'
# store_results(output_path, 'BG', 'RU', was_BGRU, was_RUBG, char_entropy_BG, char_entropy_RU, surprisals_BGRU, surprisals_RUBG, mod_surprisals_BGRU, mod_surprisals_RUBG, probs_BGRU, probs_RUBG, costs_BGRU)

(Optional) Modify the surprisals directly in the output file.

(Optional) Read the modified surprisals from disk.

In [None]:
# Uncomment the following lines only if you edited the surprisals manually in Excel

# df = read_data(output_path, sheets=['BG-RU-mod-surprisals', 'RU-BG-mod-surprisals'], index_col=0)
# mod_surprisals_BGRU = df['BG-RU-mod-surprisals']
# mod_surprisals_RUBG = df['RU-BG-mod-surprisals']

----

Recompute the **full conditional entropies** based on modified surprisals.

In [None]:
# Recompute both conditional entropies with the modified surprisals; this
# overwrites the H_RU_BG / H_BG_RU values computed from the unmodified ones.
# NOTE(review): as in the first computation, the alignments of one direction
# are combined with the surprisals/probabilities of the opposite direction --
# confirm against full_conditional_entropy in utils.py.
H_RU_BG = full_conditional_entropy(
    'RU',
    'BG',
    levensthein_RUBG,
    mod_surprisals_BGRU,
    probs_BGRU,
)
H_BG_RU = full_conditional_entropy(
    'BG',
    'RU',
    levensthein_BGRU,
    mod_surprisals_RUBG,
    probs_RUBG,
)

print('H(RU|BG): ', H_RU_BG)
print('H(BG|RU): ', H_BG_RU)

Recompute the **word adaptation surprisals** based on modified surprisals.

In [None]:
# Same computation as the unmodified WAS, but fed with the modified surprisal
# matrices; results are kept separately under mod_was_*.

# from the perspective of a RU native reader
mod_was_BGRU = word_adaptation_surprisal(
    levensthein_BGRU,
    mod_surprisals_BGRU,
    probs_BGRU,
)

# from the perspective of a BG native reader
mod_was_RUBG = word_adaptation_surprisal(
    levensthein_RUBG,
    mod_surprisals_RUBG,
    probs_RUBG,
)

In [None]:
mod_was_BGRU.head(2)  # preview the first two rows of the BG/RU WAS based on modified surprisals

In [None]:
mod_was_RUBG.head(2)  # preview the first two rows of the RU/BG WAS based on modified surprisals

Read and add scores from human **intelligibility experiments**.

In [None]:
# Intelligibility scores for RU native readers with BG as the foreign language
# (per the file name), read from sheet 'Tabelle1' with duplicate rows dropped.
intelligibility_scores_BGRU = '../inputs/intelligibility/RU_native_BG_foreign_120_items.xlsx'
intelligibility_scores_RU = read_data(
    intelligibility_scores_BGRU,
    sheets='Tabelle1',
    drop_duplicates=True,
)

In [None]:
# Attach the RU readers' intelligibility scores to both the original and the
# modified WAS tables; each table is rebound to the helper's return value.
was_BGRU = append_intelligibility_scores(
    was_BGRU,
    intelligibility_scores_RU,
)
mod_was_BGRU = append_intelligibility_scores(
    mod_was_BGRU,
    intelligibility_scores_RU,
)

In [None]:
# Report the total of the 'intelligibility scores' column for the BG/RU table.
print(
    'Summed intelligibility score: ',
    np.sum(was_BGRU['intelligibility scores']),
)

In [None]:
was_BGRU.head(2)  # preview the first two rows after appending the intelligibility scores

In [None]:
mod_was_BGRU.head(2)  # preview the first two rows after appending the intelligibility scores

In [None]:
# Intelligibility scores for BG native readers with RU as the foreign language
# (per the file name), read from sheet 'Tabelle1' with duplicate rows dropped.
intelligibility_scores_RUBG = '../inputs/intelligibility/BG_native_RU_foreign_120_items.xlsx'
intelligibility_scores_BG = read_data(
    intelligibility_scores_RUBG,
    sheets='Tabelle1',
    drop_duplicates=True,
)

In [None]:
# Attach the BG readers' intelligibility scores to both the original and the
# modified WAS tables; each table is rebound to the helper's return value.
was_RUBG = append_intelligibility_scores(
    was_RUBG,
    intelligibility_scores_BG,
)
mod_was_RUBG = append_intelligibility_scores(
    mod_was_RUBG,
    intelligibility_scores_BG,
)

In [None]:
# Report the total of the 'intelligibility scores' column for the RU/BG table.
print(
    'Summed intelligibility score: ',
    np.sum(was_RUBG['intelligibility scores']),
)

In [None]:
was_RUBG.head(2)  # preview the first two rows after appending the intelligibility scores

In [None]:
mod_was_RUBG.head(2)  # preview the first two rows after appending the intelligibility scores

**Write the final results to disk**.

In [None]:
# Write the results based on the unmodified surprisals. The WAS tables passed
# here are the ones that were rebound after appending intelligibility scores.
output_path = '../outputs/results/RU_BG_120_results.xlsx'
store_results(
    output_path,
    'BG',
    'RU',
    was_BGRU,
    was_RUBG,
    char_entropy_BG,
    char_entropy_RU,
    surprisals_BGRU,
    surprisals_RUBG,
    mod_surprisals_BGRU,
    mod_surprisals_RUBG,
    probs_BGRU,
    probs_RUBG,
    costs_BGRU,
)

In [None]:
# Write a second workbook that differs from the first only in the WAS tables:
# here the versions computed from the modified surprisals are stored.
output_path = '../outputs/results/RU_BG_120_mod_results.xlsx'
store_results(
    output_path,
    'BG',
    'RU',
    mod_was_BGRU,
    mod_was_RUBG,
    char_entropy_BG,
    char_entropy_RU,
    surprisals_BGRU,
    surprisals_RUBG,
    mod_surprisals_BGRU,
    mod_surprisals_RUBG,
    probs_BGRU,
    probs_RUBG,
    costs_BGRU,
)

---

## Compute (multiple) linear regression with sklearn

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Fit linear regressions that predict the human intelligibility scores from
# (1) normalized LD, (2) normalized WAS and (3) both predictors together, and
# report R squared and the coefficients for each model and reading direction.
datasets = [
    ('BG stimuli, RU native readers', was_BGRU),
    ('RU stimuli, BG native readers', was_RUBG),
]

for label, data in datasets:
    # Label the output so the two reading directions can be told apart.
    print(f'=== {label} ===')

    # Prepare input matrices of shape (n_samples, n_features)
    X1 = np.asarray(data['normalized LD']).reshape(-1, 1)
    X2 = np.asarray(data['normalized WAS']).reshape(-1, 1)
    X3 = np.asarray([data['normalized LD'], data['normalized WAS']]).T
    y = data['intelligibility scores'].values.reshape(-1, 1)
    X = [
        (X1, 'intelligibility score ~ normalized LD'),
        (X2, 'intelligibility score ~ normalized WAS'),
        (X3, 'intelligibility score ~ normalized LD + normalized WAS'),
    ]

    # All three models are fitted in turn: each single predictor on its own,
    # then both predictors together.
    for (x, desc) in X:
        # The former `normalize=True` argument is omitted on purpose: it was
        # deprecated in scikit-learn 1.0 and removed in 1.2, where passing it
        # raises a TypeError. For an unpenalized least-squares fit the option
        # only rescaled the features internally and reported the coefficients
        # back on the original scale, so R squared and coefficients are
        # unaffected by dropping it.
        reg = LinearRegression(fit_intercept=True).fit(x, y)
        r_squared = reg.score(x, y)
        print(desc)
        print(f'R squared: {r_squared}')
        # y is 2-D, so coef_ has one row per target; row 0 holds the
        # coefficients of the single target.
        print(f'Coefficients: {reg.coef_[0]} \n')