### Import dependencies

In [31]:
# Dependencies
import pandas as pd
import numpy as np
import textdistance as td
import panphon as pp

# Raise the row display cap to 4000 so long outputs are not truncated
pd.set_option("display.max_rows", 4000)

### Data setup

In [32]:
# Store filepath in a variable, then load the CSV into a DataFrame
input_csv_path = "Resources/all_data_pvm_acc2.csv"
df = pd.read_csv(input_csv_path)

# Preview the first rows
df.head()

Unnamed: 0,PID,Target,Production,Prod_Word_Dur,NOTES,Prod_Arpabet,Word_ID,Session_ID,Prod_Word_N,Prod_Phon_N,...,post-alveolar_Acc,dental_Acc,palatal_Acc,glottal_Acc,stop_Acc,fricative_Acc,affricate_Acc,glide_Acc,FeatureWeighted_PhonAcc,PVMWeighted_PhonAcc
0,15,book,B UH K,0.295646,Article (É) before word,B,1,0,1,1,...,1,1,1,1,1,1,1,1,1.0,1.0
1,15,book,B UH K,0.295646,Article (É) before word,UH,1,0,1,2,...,1,1,1,1,1,1,1,1,1.0,1.0
2,15,book,B UH K,0.295646,Article (É) before word,K,1,0,1,3,...,1,1,1,1,1,1,1,1,1.0,1.0
3,15,ball,B AO L,0.397365,,B,2,0,2,1,...,1,1,1,1,1,1,1,1,1.0,1.0
4,15,ball,B AO L,0.397365,,AO,2,0,2,2,...,1,1,1,1,1,1,1,1,1.0,1.0


In [33]:
# List every column name as a plain Python list
list(df.columns)

['PID',
 'Target',
 'Production',
 'Prod_Word_Dur',
 'NOTES',
 'Prod_Arpabet',
 'Word_ID',
 'Session_ID',
 'Prod_Word_N',
 'Prod_Phon_N',
 'PIDSESS_Code',
 'WordPhon_Code',
 'Phon_Sess_Code',
 'Word_Sess_Code',
 'Prod_Last_Phon',
 'Prod_Word_IPA',
 'Prod_Phon_IPA',
 'Prod_Phoneme_ID',
 'Target_Arpabet',
 'Target_N_Tot_Words',
 'Target_Word_IPA',
 'Target_N_Tot_Syllables',
 'Target_N_Tot_Characters',
 'Target_Phon_Arpabet',
 'Target_Phoneme_ID',
 'Target_Syll_Env',
 'Target_Word_Pos',
 'Syllable_NumID',
 'Target_Word_NumID',
 'Target_Con_Cluster',
 'Target_Clust_ID',
 'Target_Clus_Type',
 'Target_Clust_Phon_Pos',
 'Target_Clust_Phon_Env',
 'Target_Phon_IPA',
 'Prod_syllabic',
 'Prod_consonantal',
 'Prod_sonorant',
 'Prod_continuant',
 'Prod_delayed release',
 'Prod_approximant',
 'Prod_tap',
 'Prod_nasal',
 'Prod_voice',
 'Prod_spread gl',
 'Prod_constr gl',
 'Prod_labial',
 'Prod_round',
 'Prod_labiodental',
 'Prod_coronal',
 'Prod_anterior',
 'Prod_distributed',
 'Prod_strident',
 'Pr

### Damerau-Levenshtein Edit Distance

In [34]:
# Number of IPA characters in each produced word. This is used as the phoneme
# count, which holds only while every phoneme is written as a single character
# (no diacritics or multi-character symbols) -- TODO confirm for this data set.
# `.str.len()` is vectorized and yields NaN for a missing transcription, where
# a row-wise `len(...)` would raise a TypeError.
df['Prod_N_Tot_Phonemes'] = df['Prod_Word_IPA'].str.len()

# Same count for each target word. NOTE(review): this overwrites the
# 'Target_N_Tot_Characters' column that already exists in the loaded data
# instead of creating a 'Target_N_Tot_Phonemes' column to mirror the production
# one; the name is kept because a later cell reads it -- confirm the overwrite
# is intended.
df['Target_N_Tot_Characters'] = df['Target_Word_IPA'].str.len()


In [35]:
def _target_vs_production_distance(row):
    """Return the Damerau-Levenshtein distance between a row's target and produced IPA strings."""
    # Both values are coerced with str() before comparison, so a missing value
    # would be compared as the literal text 'nan'.
    target_ipa = str(row['Target_Word_IPA'])
    produced_ipa = str(row['Prod_Word_IPA'])
    return td.damerau_levenshtein(target_ipa, produced_ipa)


# One distance per row, stored alongside the existing columns
df['Damerau_Levenshtein'] = df.apply(_target_vs_production_distance, axis=1)

In [36]:
# Average edit distance across all rows
df.loc[:, 'Damerau_Levenshtein'].mean()

1.9748512709572743

In [37]:
# Show the distinct values, in ascending order, of the target length and the edit distance
for column_name in ('Target_N_Tot_Characters', 'Damerau_Levenshtein'):
    distinct_values = df[column_name].unique()
    print(sorted(distinct_values))

[2, 3, 4, 5, 6, 7, 8, 9]
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]


### Using PanPhon

In [38]:
# Check IPA symbols for fit with PanPhon: list each distinct target symbol once
target_phon_ipa = df['Target_Phon_IPA']
target_phon_ipa.unique()

array(['b', 'ʊ', 'k', 'ɔ', 'l', 'n', 'ɐ', 'f', 'ʌ', 'p', 's', 'e', 't',
       'i', 'h', 'æ', 'm', 'ɚ', 'u', 'θ', 'r', 'ə', 'ɑ', nan, 'ɛ', 'd',
       'w', 'ʧ', 'o', 'ɪ', 'ʃ', 'z', 'v'], dtype=object)

In [39]:
# Rows with no target phoneme, shown with the word- and phoneme-level IPA columns
ipa_columns = ['Prod_Word_IPA', 'Target_Word_IPA', 'Prod_Phon_IPA', 'Target_Phon_IPA']
missing_target_phon = df['Target_Phon_IPA'].isna()
df.loc[missing_target_phon, ipa_columns]

Unnamed: 0,Prod_Word_IPA,Target_Word_IPA,Prod_Phon_IPA,Target_Phon_IPA
88,lɑkɚ,lɑk,ɚ,
579,kɔfi,kʌp,i,
583,kɔfi,kʌp,i,
799,sioɛn,kom,ɛ,
800,sioɛn,kom,n,
844,kəsʌmθɪŋ,spun,m,
845,kəsʌmθɪŋ,spun,θ,
846,kəsʌmθɪŋ,spun,ɪ,
847,kəsʌmθɪŋ,spun,ŋ,
890,sɪgərɛt,mæʧəz,ɛ,


### Save file

In [40]:
# Write the augmented table to a new CSV without the index column
output_csv_path = 'Resources/all_data_pvm_acc3.csv'
df.to_csv(output_csv_path, index=False)