In [1]:
# Dependencies
import pandas as pd
import numpy as np
import textdistance as td

# Make sure you can see all output
pd.options.display.max_rows = 4000

### Data set up

In [2]:
# Store filepath in a variable
PVMdat = pd.read_csv("Resources/all_data_pvm_acc4.csv")

#Translate the ARPABET codes to IPA codes
dictionary = (
    pd.read_csv("Resources/dict.csv")
    .set_index("Arpabet")
)

display(PVMdat.head(), dictionary)

Unnamed: 0.1,Unnamed: 0,PID,Target,Production,Prod_Word_Dur,Prod_Arpabet,Prod_Phon_Dur,NOTES,Word_ID,Session_ID,...,Height_Acc,Frontness_Acc,Tenseness_Acc,Roundness_Acc,wab1_aq,wab1_nwf_total,Session_Type,Improvement_Group,NWF_Improvement_Group,wabaq_start
0,0,15,book,B UH K,0.295646,B,0.024363,Article (ÃÂ) before word,1.0,0.0,...,1.0,1.0,1.0,1.0,67.8,3.8,Baseline,No_Improved,NWF_No_Improved,61-70
1,1,15,book,B UH K,0.295646,UH,0.163408,Article (ÃÂ) before word,1.0,0.0,...,1.0,1.0,1.0,1.0,67.8,3.8,Baseline,No_Improved,NWF_No_Improved,61-70
2,2,15,book,B UH K,0.295646,K,0.107875,Article (ÃÂ) before word,1.0,0.0,...,1.0,1.0,1.0,1.0,67.8,3.8,Baseline,No_Improved,NWF_No_Improved,61-70
3,3,15,ball,B AO L,0.397365,B,0.014197,,2.0,0.0,...,1.0,1.0,1.0,1.0,67.8,3.8,Baseline,No_Improved,NWF_No_Improved,61-70
4,4,15,ball,B AO L,0.397365,AO,0.211006,,2.0,0.0,...,1.0,1.0,1.0,1.0,67.8,3.8,Baseline,No_Improved,NWF_No_Improved,61-70


Unnamed: 0_level_0,IPA_singles
Arpabet,Unnamed: 1_level_1
H,h
R,r
W,w
Y,j
B,b
CH,ʧ
D,d
DH,ð
DX,ɾ
F,f


In [3]:
# Check columns
PVMdat.columns.tolist()

['Unnamed: 0',
 'PID',
 'Target',
 'Production',
 'Prod_Word_Dur',
 'Prod_Arpabet',
 'Prod_Phon_Dur',
 'NOTES',
 'Word_ID',
 'Session_ID',
 'Prod_Word_N',
 'Prod_Phon_N',
 'Code',
 'Phon_Sess_Code',
 'Word_Sess_Code',
 'Prod_Last_Phon',
 'Prod_Word_IPA',
 'Prod_Phon_IPA',
 'Prod_Phoneme_ID',
 'Prod_Prev_Phon',
 'Prod_Next_Phon',
 'Target_Arpabet',
 'Target_N_Tot_Words',
 'Target_Word_IPA',
 'Target_N_Tot_Syllables',
 'Target_N_Tot_Phonemes',
 'Target_Phon_Arpabet',
 'Target_Phoneme_ID',
 'Target_Syll_Env',
 'Target_Word_Pos',
 'Syllable_NumID',
 'Target_Word_NumID',
 'Target_Con_Cluster',
 'Target_Clust_ID',
 'Target_Clus_Type',
 'Target_Clust_Phon_Pos',
 'Target_Clust_Phon_Env',
 'Target_Phon_IPA',
 'Target_Prev_Phon',
 'Target_Next_Phon',
 'Prod_syllabic',
 'Prod_consonantal',
 'Prod_sonorant',
 'Prod_continuant',
 'Prod_delayed release',
 'Prod_approximant',
 'Prod_tap',
 'Prod_nasal',
 'Prod_voice',
 'Prod_spread gl',
 'Prod_constr gl',
 'Prod_labial',
 'Prod_round',
 'Prod_labiode

In [4]:
# The IPA codes we have stored are corrupted, so we need to make them again
df = PVMdat[[
    'PID',
    'Target',
    'Production',
    'Prod_Word_Dur',
    'Prod_Arpabet',
    'Prod_Phon_Dur',
    'NOTES',
    'Word_ID',
    'Session_ID',
    'Prod_Word_N',
    'Prod_Phon_N',
    'Code',
    'Phon_Sess_Code',
    'Word_Sess_Code',
    'Prod_Last_Phon',
    'Prod_Phoneme_ID',
    'Target_Arpabet',
    'Target_N_Tot_Words',
    'Target_N_Tot_Syllables',
    'Target_N_Tot_Phonemes',
    'Target_Phon_Arpabet',
    'Target_Phoneme_ID',
    'Target_Syll_Env',
    'Target_Word_Pos',
    'Syllable_NumID',
    'Target_Word_NumID',
    'Target_Con_Cluster',
    'Target_Clust_ID',
    'Target_Clus_Type',
    'Target_Clust_Phon_Pos',
    'Target_Clust_Phon_Env',
    'Target_Prev_Phon',
    'Target_Next_Phon',
    'Prod_syllabic',
    'Prod_consonantal',
    'Prod_sonorant',
    'Prod_continuant',
    'Prod_delayed release',
    'Prod_approximant',
    'Prod_tap',
    'Prod_nasal',
    'Prod_voice',
    'Prod_spread gl',
    'Prod_constr gl',
    'Prod_labial',
    'Prod_round',
    'Prod_labiodental',
    'Prod_coronal',
    'Prod_anterior',
    'Prod_distributed',
    'Prod_strident',
    'Prod_lateral',
    'Prod_dorsal',
    'Prod_high',
    'Prod_low',
    'Prod_front',
    'Prod_back',
    'Prod_tense',
    'Prod_lax',
    'Prod_vowel',
    'Prod_consonant',
    'Prod_diphthong',
    'Prod_monophthong',
    'Prod_velar',
    'Prod_alveolar',
    'Prod_post-alveolar',
    'Prod_dental',
    'Prod_palatal',
    'Prod_glottal',
    'Prod_stop',
    'Prod_fricative',
    'Prod_affricate',
    'Prod_glide',
    'Prod_Place',
    'Prod_Manner',
    'Prod_Height',
    'Prod_Frontness',
    'Target_syllabic',
    'Target_consonantal',
    'Target_sonorant',
    'Target_continuant',
    'Target_delayed release',
    'Target_approximant',
    'Target_tap',
    'Target_nasal',
    'Target_voice',
    'Target_spread gl',
    'Target_constr gl',
    'Target_labial',
    'Target_round',
    'Target_labiodental',
    'Target_coronal',
    'Target_anterior',
    'Target_distributed',
    'Target_strident',
    'Target_lateral',
    'Target_dorsal',
    'Target_high',
    'Target_low',
    'Target_front',
    'Target_back',
    'Target_tense',
    'Target_lax',
    'Target_vowel',
    'Target_consonant',
    'Target_diphthong',
    'Target_monophthong',
    'Target_velar',
    'Target_alveolar',
    'Target_post-alveolar',
    'Target_dental',
    'Target_palatal',
    'Target_glottal',
    'Target_stop',
    'Target_fricative',
    'Target_affricate',
    'Target_glide',
    'Target_Place',
    'Target_Manner',
    'Target_Height',
    'Target_Frontness',
    'Phon_Acc',
    'Voicing_Acc',
    'Place_Acc',
    'Manner_Acc',
    'Height_Acc',
    'Frontness_Acc',
    'Tenseness_Acc',
    'Roundness_Acc',
    'wab1_aq',
    'wab1_nwf_total',
    'Session_Type',
    'wabaq_start'
]]

df.head()

Unnamed: 0,PID,Target,Production,Prod_Word_Dur,Prod_Arpabet,Prod_Phon_Dur,NOTES,Word_ID,Session_ID,Prod_Word_N,...,Place_Acc,Manner_Acc,Height_Acc,Frontness_Acc,Tenseness_Acc,Roundness_Acc,wab1_aq,wab1_nwf_total,Session_Type,wabaq_start
0,15,book,B UH K,0.295646,B,0.024363,Article (ÃÂ) before word,1.0,0.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,67.8,3.8,Baseline,61-70
1,15,book,B UH K,0.295646,UH,0.163408,Article (ÃÂ) before word,1.0,0.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,67.8,3.8,Baseline,61-70
2,15,book,B UH K,0.295646,K,0.107875,Article (ÃÂ) before word,1.0,0.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,67.8,3.8,Baseline,61-70
3,15,ball,B AO L,0.397365,B,0.014197,,2.0,0.0,2.0,...,1.0,1.0,1.0,1.0,1.0,1.0,67.8,3.8,Baseline,61-70
4,15,ball,B AO L,0.397365,AO,0.211006,,2.0,0.0,2.0,...,1.0,1.0,1.0,1.0,1.0,1.0,67.8,3.8,Baseline,61-70


In [5]:
#Check that target words are correctly written
df['Target_Arpabet'].unique()

array(['B UH K', 'B AO L', 'N AY F', 'K AH P', 'S EY F T IY P IH N',
       'H AE M AXR', 'T UW TH B R AX SH', 'AX R EY S AXR ', 'L AA K',
       'P EH N S AX L', 'S K R UW D R AY V AXR', 'K IY',
       'P EY P AXR K L IH P', 'W AA CH', 'K OW M', 'R AH B AXR B AE N D',
       'S P UW N', 'T EY P', 'F AO R K', 'M AE CH AX Z'], dtype=object)

In [6]:
#Replace incorrect target words
df['Target_Arpabet'] = (
    df['Target_Arpabet']
    .replace(
        {
        'AX R EY S AXR ':'AX R EY S AXR'
        }
    )
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Target_Arpabet'] = (


In [7]:
#Check what each phoneme is being registered as
results = (
    # trans is a series, so use string accessor to split value strings
    df["Target_Arpabet"].str.split(" ")
    # turn each item in split string into own row maintaining index value
    .explode()
)

results.unique()

array(['B', 'UH', 'K', 'AO', 'L', 'N', 'AY', 'F', 'AH', 'P', 'S', 'EY',
       'T', 'IY', 'IH', 'H', 'AE', 'M', 'AXR', 'UW', 'TH', 'R', 'AX',
       'SH', 'AA', 'EH', 'D', 'V', 'W', 'CH', 'OW', 'Z'], dtype=object)

In [8]:
#Check that productions are correctly written
df['Production'].unique()

array(['B UH K', 'B AO L', 'SH', 'N', 'N AY F', 'K AH P', 'S',
       'S AY F T IY', 'H AE M AXR', 'T UW SH B OW N', 'T UW', 'B UH SH',
       'T UH', 'T IY', 'T UW TH P IY S', 'T P IY', 'T IY S', 'P IY', 'P',
       'IY', 'R AH B', 'R AH B AXR N', 'R AH B AXR', 'L AA K AXR',
       'T EH M', 'T', 'P EH N S AX L', 'S T R UW', 'S T R',
       'S T R UW DX OW', 'S K R UW', 'S K R UW CH', 'S K R UW S', 'K',
       'ZH', 'K IY', 'T S', 'S IH M', 'P EY P AXR', 'W AA CH', 'OW',
       'R AH B AXR L', 'R AH B AXR N AX Q', 'R AH', 'K AH', 'Q', 'S P UW',
       'S P UH SH', 'S P', 'S B AH', 'S UW P', 'T EY', 'SH AO R', 'B AX',
       'AY F', 'AY', 'K AO', 'K AH F', 'K AO Q', 'K AO F', 'K AH Q',
       'S T EY Q', 'S T EY', 'S T AY', 'H', 'H AE', 'T IH TH', 'P EY S',
       'B AA', 'P AH', 'B', 'B AH', 'L', 'L AA K', 'P EH N CH R',
       'P EH N S IH', 'P EH N', 'P EH', 'P IH N', 'K OW', 'K AX',
       'S T IH F', 'S P UW N', 'T AE', 'P AX', 'P IH', 'P T S', 'F AX',
       'F', 'S EY F T IY', '

In [9]:
df = df.dropna(subset=['Production'])

In [10]:
df["Target_Word_IPA"] = (
    # The production column is a series, so use string accessor to split value strings
    df["Target_Arpabet"].str.split(" ")
    # turn each item in split string into own row maintaining index value
    .explode()
    # perform the lookup in the dictionary of each individual value
    .apply(lambda v: dictionary.loc[v])
    # group them by the original index
    .groupby(level=0)
    # "sum" them, which for string, concatonates them without any spaces
    .sum()
)

df.columns.to_list()


['PID',
 'Target',
 'Production',
 'Prod_Word_Dur',
 'Prod_Arpabet',
 'Prod_Phon_Dur',
 'NOTES',
 'Word_ID',
 'Session_ID',
 'Prod_Word_N',
 'Prod_Phon_N',
 'Code',
 'Phon_Sess_Code',
 'Word_Sess_Code',
 'Prod_Last_Phon',
 'Prod_Phoneme_ID',
 'Target_Arpabet',
 'Target_N_Tot_Words',
 'Target_N_Tot_Syllables',
 'Target_N_Tot_Phonemes',
 'Target_Phon_Arpabet',
 'Target_Phoneme_ID',
 'Target_Syll_Env',
 'Target_Word_Pos',
 'Syllable_NumID',
 'Target_Word_NumID',
 'Target_Con_Cluster',
 'Target_Clust_ID',
 'Target_Clus_Type',
 'Target_Clust_Phon_Pos',
 'Target_Clust_Phon_Env',
 'Target_Prev_Phon',
 'Target_Next_Phon',
 'Prod_syllabic',
 'Prod_consonantal',
 'Prod_sonorant',
 'Prod_continuant',
 'Prod_delayed release',
 'Prod_approximant',
 'Prod_tap',
 'Prod_nasal',
 'Prod_voice',
 'Prod_spread gl',
 'Prod_constr gl',
 'Prod_labial',
 'Prod_round',
 'Prod_labiodental',
 'Prod_coronal',
 'Prod_anterior',
 'Prod_distributed',
 'Prod_strident',
 'Prod_lateral',
 'Prod_dorsal',
 'Prod_high',
 

In [11]:
df["Prod_Word_IPA"] = (
    # The production column is a series, so use string accessor to split value strings
    df["Production"].str.split(" ")
    # turn each item in split string into own row maintaining index value
    .explode()
    # perform the lookup in the dictionary of each individual value
    .apply(lambda v: dictionary.loc[v])
    # group them by the original index
    .groupby(level=0)
    # "sum" them, which for string, concatonates them without any spaces
    .sum()
)

df.head()

Unnamed: 0,PID,Target,Production,Prod_Word_Dur,Prod_Arpabet,Prod_Phon_Dur,NOTES,Word_ID,Session_ID,Prod_Word_N,...,Height_Acc,Frontness_Acc,Tenseness_Acc,Roundness_Acc,wab1_aq,wab1_nwf_total,Session_Type,wabaq_start,Target_Word_IPA,Prod_Word_IPA
0,15,book,B UH K,0.295646,B,0.024363,Article (ÃÂ) before word,1.0,0.0,1.0,...,1.0,1.0,1.0,1.0,67.8,3.8,Baseline,61-70,bʊk,bʊk
1,15,book,B UH K,0.295646,UH,0.163408,Article (ÃÂ) before word,1.0,0.0,1.0,...,1.0,1.0,1.0,1.0,67.8,3.8,Baseline,61-70,bʊk,bʊk
2,15,book,B UH K,0.295646,K,0.107875,Article (ÃÂ) before word,1.0,0.0,1.0,...,1.0,1.0,1.0,1.0,67.8,3.8,Baseline,61-70,bʊk,bʊk
3,15,ball,B AO L,0.397365,B,0.014197,,2.0,0.0,2.0,...,1.0,1.0,1.0,1.0,67.8,3.8,Baseline,61-70,bɔl,bɔl
4,15,ball,B AO L,0.397365,AO,0.211006,,2.0,0.0,2.0,...,1.0,1.0,1.0,1.0,67.8,3.8,Baseline,61-70,bɔl,bɔl


In [1]:
#Get number of phonemes for each production
df['Prod_N_Tot_Phonemes'] = (
    df
    .apply(
        lambda x:
        len(x['Prod_Word_IPA']),
        axis=1
    )
)

#Get number of phonemes for each target
df['Target_N_Tot_Phonemes'] = (
    df
    .apply(
        lambda x:
        len(x['Target_Word_IPA']),
        axis=1
    )
)


NameError: name 'df' is not defined

In [13]:
#Replace incorrectly coded words
df['Prod_Word_IPA'] = (
    df['Prod_Word_IPA']
    .replace(
        {
        'i':'ə', 
        'ir':'ər', 
        'ɪ':'ə', 
        'ɛresɚ':'əresɚ',
        'ires':'əresɚ',
        'ɪresɚ':'əresɚ', 
        'iresɚ':'əresɚ'
        }
    )
)

In [14]:
#Import phonetic feature identifies
phon_dist_features = (
    pd.read_csv("Resources/phon_dist_features.csv")
)
phon_dist_features = phon_dist_features.dropna()
phon_dist_features['Phoneme_ID'] = phon_dist_features['Phoneme_ID'].astype('int')

In [15]:
#Create a dictionary for phoneme ID number

Prod_phon_ID = phon_dist_features[['IPA_singles','Phoneme_ID']].copy()
Target_phon_ID = phon_dist_features[['IPA_singles','Phoneme_ID']].copy()


# Creat dictionary for Prod_Phoneme_ID
Prod_phon_ID.rename(
    columns={
       'IPA_singles':'Prod_Phoneme_IPA', 
       'Phoneme_ID':'Prod_Phoneme_ID'
       }, inplace=True)

Target_phon_ID.rename(
    columns={
       'IPA_singles':'Target_Phoneme_IPA', 
       'Phoneme_ID':'Target_Phoneme_ID'
       }, inplace=True)

Prod_phon_ID
Target_phon_ID

Unnamed: 0,Target_Phoneme_IPA,Target_Phoneme_ID
0,h,1
1,r,2
2,w,3
3,j,4
4,b,5
5,ʧ,6
6,d,7
7,ð,8
8,ɾ,9
9,f,10


In [16]:
# Merge with original dataset
df = df.merge(Prod_phon_ID, on='Prod_Phoneme_ID', how='left').merge(Target_phon_ID, on='Target_Phoneme_ID', how='left')
df.head()

Unnamed: 0,PID,Target,Production,Prod_Word_Dur,Prod_Arpabet,Prod_Phon_Dur,NOTES,Word_ID,Session_ID,Prod_Word_N,...,Roundness_Acc,wab1_aq,wab1_nwf_total,Session_Type,wabaq_start,Target_Word_IPA,Prod_Word_IPA,Prod_N_Tot_Phonemes,Prod_Phoneme_IPA,Target_Phoneme_IPA
0,15,book,B UH K,0.295646,B,0.024363,Article (ÃÂ) before word,1.0,0.0,1.0,...,1.0,67.8,3.8,Baseline,61-70,bʊk,bʊk,3,b,b
1,15,book,B UH K,0.295646,UH,0.163408,Article (ÃÂ) before word,1.0,0.0,1.0,...,1.0,67.8,3.8,Baseline,61-70,bʊk,bʊk,3,ʊ,ʊ
2,15,book,B UH K,0.295646,K,0.107875,Article (ÃÂ) before word,1.0,0.0,1.0,...,1.0,67.8,3.8,Baseline,61-70,bʊk,bʊk,3,k,k
3,15,ball,B AO L,0.397365,B,0.014197,,2.0,0.0,2.0,...,1.0,67.8,3.8,Baseline,61-70,bɔl,bɔl,3,b,b
4,15,ball,B AO L,0.397365,AO,0.211006,,2.0,0.0,2.0,...,1.0,67.8,3.8,Baseline,61-70,bɔl,bɔl,3,ɔ,ɔ


In [17]:
# Calculate the Damerau_Levenshtein 
df['Damerau_Levenshtein'] = (
    df
    .apply(
        lambda x:
        td.damerau_levenshtein(str(x['Target_Word_IPA']),str(x['Prod_Word_IPA'])),
        axis=1
        )
)

In [18]:
df['Damerau_Levenshtein'].mean()

2.110030395136778

In [19]:
df.columns.to_list()

['PID',
 'Target',
 'Production',
 'Prod_Word_Dur',
 'Prod_Arpabet',
 'Prod_Phon_Dur',
 'NOTES',
 'Word_ID',
 'Session_ID',
 'Prod_Word_N',
 'Prod_Phon_N',
 'Code',
 'Phon_Sess_Code',
 'Word_Sess_Code',
 'Prod_Last_Phon',
 'Prod_Phoneme_ID',
 'Target_Arpabet',
 'Target_N_Tot_Words',
 'Target_N_Tot_Syllables',
 'Target_N_Tot_Phonemes',
 'Target_Phon_Arpabet',
 'Target_Phoneme_ID',
 'Target_Syll_Env',
 'Target_Word_Pos',
 'Syllable_NumID',
 'Target_Word_NumID',
 'Target_Con_Cluster',
 'Target_Clust_ID',
 'Target_Clus_Type',
 'Target_Clust_Phon_Pos',
 'Target_Clust_Phon_Env',
 'Target_Prev_Phon',
 'Target_Next_Phon',
 'Prod_syllabic',
 'Prod_consonantal',
 'Prod_sonorant',
 'Prod_continuant',
 'Prod_delayed release',
 'Prod_approximant',
 'Prod_tap',
 'Prod_nasal',
 'Prod_voice',
 'Prod_spread gl',
 'Prod_constr gl',
 'Prod_labial',
 'Prod_round',
 'Prod_labiodental',
 'Prod_coronal',
 'Prod_anterior',
 'Prod_distributed',
 'Prod_strident',
 'Prod_lateral',
 'Prod_dorsal',
 'Prod_high',
 

In [20]:
AccDat = pd.read_csv("Resources/AllAccScores_080123.csv")
AccDat.head()

Unnamed: 0,Phon_Sess_Code,syllabic_Acc,consonantal_Acc,sonorant_Acc,continuant_Acc,delayed release_Acc,approximant_Acc,tap_Acc,nasal_Acc,voice_Acc,...,post-alveolar_Acc,dental_Acc,palatal_Acc,glottal_Acc,stop_Acc,fricative_Acc,affricate_Acc,glide_Acc,FeatureWeighted_PhonAcc,PVMWeighted_PhonAcc
0,15_0_1_1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1.0,1.0
1,15_0_1_2,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1.0,1.0
2,15_0_1_3,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1.0,1.0
3,15_0_2_1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1.0,1.0
4,15_0_2_2,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1.0,1.0


In [21]:
df = df.merge(AccDat, on='Phon_Sess_Code', how='left')
df.head()

Unnamed: 0,PID,Target,Production,Prod_Word_Dur,Prod_Arpabet,Prod_Phon_Dur,NOTES,Word_ID,Session_ID,Prod_Word_N,...,post-alveolar_Acc,dental_Acc,palatal_Acc,glottal_Acc,stop_Acc,fricative_Acc,affricate_Acc,glide_Acc,FeatureWeighted_PhonAcc,PVMWeighted_PhonAcc
0,15,book,B UH K,0.295646,B,0.024363,Article (ÃÂ) before word,1.0,0.0,1.0,...,1,1,1,1,1,1,1,1,1.0,1.0
1,15,book,B UH K,0.295646,UH,0.163408,Article (ÃÂ) before word,1.0,0.0,1.0,...,1,1,1,1,1,1,1,1,1.0,1.0
2,15,book,B UH K,0.295646,K,0.107875,Article (ÃÂ) before word,1.0,0.0,1.0,...,1,1,1,1,1,1,1,1,1.0,1.0
3,15,ball,B AO L,0.397365,B,0.014197,,2.0,0.0,2.0,...,1,1,1,1,1,1,1,1,1.0,1.0
4,15,ball,B AO L,0.397365,AO,0.211006,,2.0,0.0,2.0,...,1,1,1,1,1,1,1,1,1.0,1.0


In [22]:
df.columns.to_list()

['PID',
 'Target',
 'Production',
 'Prod_Word_Dur',
 'Prod_Arpabet',
 'Prod_Phon_Dur',
 'NOTES',
 'Word_ID',
 'Session_ID',
 'Prod_Word_N',
 'Prod_Phon_N',
 'Code',
 'Phon_Sess_Code',
 'Word_Sess_Code',
 'Prod_Last_Phon',
 'Prod_Phoneme_ID',
 'Target_Arpabet',
 'Target_N_Tot_Words',
 'Target_N_Tot_Syllables',
 'Target_N_Tot_Phonemes',
 'Target_Phon_Arpabet',
 'Target_Phoneme_ID',
 'Target_Syll_Env',
 'Target_Word_Pos',
 'Syllable_NumID',
 'Target_Word_NumID',
 'Target_Con_Cluster',
 'Target_Clust_ID',
 'Target_Clus_Type',
 'Target_Clust_Phon_Pos',
 'Target_Clust_Phon_Env',
 'Target_Prev_Phon',
 'Target_Next_Phon',
 'Prod_syllabic',
 'Prod_consonantal',
 'Prod_sonorant',
 'Prod_continuant',
 'Prod_delayed release',
 'Prod_approximant',
 'Prod_tap',
 'Prod_nasal',
 'Prod_voice',
 'Prod_spread gl',
 'Prod_constr gl',
 'Prod_labial',
 'Prod_round',
 'Prod_labiodental',
 'Prod_coronal',
 'Prod_anterior',
 'Prod_distributed',
 'Prod_strident',
 'Prod_lateral',
 'Prod_dorsal',
 'Prod_high',
 

In [23]:
print(sorted(df['Target_N_Tot_Phonemes'].unique()))
print(sorted(df['Damerau_Levenshtein'].unique()))

[2, 3, 4, 5, 6, 7, 8, 9]
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]


In [24]:
df2 = df[[
    'Phon_Sess_Code',
    'Target_Word_IPA',
    'Prod_Word_IPA',
    'Target_Phoneme_IPA',
    'Prod_Phoneme_IPA',
    'Prod_N_Tot_Phonemes',
    'Damerau_Levenshtein',
    'syllabic_Acc',
    'consonantal_Acc',
    'sonorant_Acc',
    'continuant_Acc',
    'delayed release_Acc',
    'approximant_Acc',
    'tap_Acc',
    'nasal_Acc',
    'voice_Acc',
    'spread gl_Acc',
    'constr gl_Acc',
    'labial_Acc',
    'round_Acc',
    'labiodental_Acc',
    'coronal_Acc',
    'anterior_Acc',
    'distributed_Acc',
    'strident_Acc',
    'lateral_Acc',
    'dorsal_Acc',
    'high_Acc',
    'low_Acc',
    'front_Acc',
    'back_Acc',
    'tense_Acc',
    'lax_Acc',
    'vowel_Acc',
    'consonant_Acc',
    'diphthong_Acc',
    'monophthong_Acc',
    'velar_Acc',
    'alveolar_Acc',
    'post-alveolar_Acc',
    'dental_Acc',
    'palatal_Acc',
    'glottal_Acc',
    'stop_Acc',
    'fricative_Acc',
    'affricate_Acc',
    'glide_Acc',
    'FeatureWeighted_PhonAcc',
    'PVMWeighted_PhonAcc'
]].copy()
df2.head()

Unnamed: 0,Phon_Sess_Code,Target_Word_IPA,Prod_Word_IPA,Target_Phoneme_IPA,Prod_Phoneme_IPA,Prod_N_Tot_Phonemes,Damerau_Levenshtein,syllabic_Acc,consonantal_Acc,sonorant_Acc,...,post-alveolar_Acc,dental_Acc,palatal_Acc,glottal_Acc,stop_Acc,fricative_Acc,affricate_Acc,glide_Acc,FeatureWeighted_PhonAcc,PVMWeighted_PhonAcc
0,15_0_1_1,bʊk,bʊk,b,b,3,0,1,1,1,...,1,1,1,1,1,1,1,1,1.0,1.0
1,15_0_1_2,bʊk,bʊk,ʊ,ʊ,3,0,1,1,1,...,1,1,1,1,1,1,1,1,1.0,1.0
2,15_0_1_3,bʊk,bʊk,k,k,3,0,1,1,1,...,1,1,1,1,1,1,1,1,1.0,1.0
3,15_0_2_1,bɔl,bɔl,b,b,3,0,1,1,1,...,1,1,1,1,1,1,1,1,1.0,1.0
4,15_0_2_2,bɔl,bɔl,ɔ,ɔ,3,0,1,1,1,...,1,1,1,1,1,1,1,1,1.0,1.0


In [26]:
df2.columns.to_list()

['Phon_Sess_Code',
 'Target_Word_IPA',
 'Prod_Word_IPA',
 'Target_Phoneme_IPA',
 'Prod_Phoneme_IPA',
 'Prod_N_Tot_Phonemes',
 'Damerau_Levenshtein',
 'syllabic_Acc',
 'consonantal_Acc',
 'sonorant_Acc',
 'continuant_Acc',
 'delayed release_Acc',
 'approximant_Acc',
 'tap_Acc',
 'nasal_Acc',
 'voice_Acc',
 'spread gl_Acc',
 'constr gl_Acc',
 'labial_Acc',
 'round_Acc',
 'labiodental_Acc',
 'coronal_Acc',
 'anterior_Acc',
 'distributed_Acc',
 'strident_Acc',
 'lateral_Acc',
 'dorsal_Acc',
 'high_Acc',
 'low_Acc',
 'front_Acc',
 'back_Acc',
 'tense_Acc',
 'lax_Acc',
 'vowel_Acc',
 'consonant_Acc',
 'diphthong_Acc',
 'monophthong_Acc',
 'velar_Acc',
 'alveolar_Acc',
 'post-alveolar_Acc',
 'dental_Acc',
 'palatal_Acc',
 'glottal_Acc',
 'stop_Acc',
 'fricative_Acc',
 'affricate_Acc',
 'glide_Acc',
 'FeatureWeighted_PhonAcc',
 'PVMWeighted_PhonAcc']

In [27]:
df2.Damerau_Levenshtein.mean()

2.110030395136778

In [28]:
df.wabaq_start.unique()

array(['61-70', '91-100', '71-80', '41-50', '81-90'], dtype=object)

In [96]:
df.to_csv('Resources/all_data_pvm_acc5.csv', index=False)
df2.to_csv('Resources/AllAccScores_DamLev_080123.csv', index=False)