### Import dependencies

In [1]:
# Dependencies
import pandas as pd
import numpy as np
import textdistance as td
import panphon as pp

# Raise the row display limit so long outputs are not truncated
pd.set_option("display.max_rows", 4000)

### Data set up

In [2]:
# Read the baseline accuracy CSV into a DataFrame and preview the first rows
baseline_csv = "Resources/baseline_data_pvm_acc2.csv"
df = pd.read_csv(baseline_csv)

df.head()

Unnamed: 0,RA,Project,PID,Arm,Week,Target,Production,Prod_Word_Dur,Prod_Arpabet,Word_ID,...,post-alveolar_Acc,dental_Acc,palatal_Acc,glottal_Acc,stop_Acc,fricative_Acc,affricate_Acc,glide_Acc,FeatureWeighted_PhonAcc,PVMWeighted_PhonAcc
0,DS,rPPA,15,1,Baseline,book,B UH K,0.295646,B,1,...,1,1,1,1,1,1,1,1,1.0,1.0
1,DS,rPPA,15,1,Baseline,book,B UH K,0.295646,UH,1,...,1,1,1,1,1,1,1,1,1.0,1.0
2,DS,rPPA,15,1,Baseline,book,B UH K,0.295646,K,1,...,1,1,1,1,1,1,1,1,1.0,1.0
3,DS,rPPA,15,1,Baseline,ball,B AO L,0.397365,B,2,...,1,1,1,1,1,1,1,1,1.0,1.0
4,DS,rPPA,15,1,Baseline,ball,B AO L,0.397365,AO,2,...,1,1,1,1,1,1,1,1,1.0,1.0


In [3]:
# List every column name in the loaded data
list(df.columns)

['RA',
 'Project',
 'PID',
 'Arm',
 'Week',
 'Target',
 'Production',
 'Prod_Word_Dur',
 'Prod_Arpabet',
 'Word_ID',
 'Session_ID',
 'Prod_Word_N',
 'Prod_Phon_N',
 'PIDSESS_Code',
 'WordPhon_Code',
 'Phon_Sess_Code',
 'Word_Sess_Code',
 'Prod_Last_Phon',
 'Prod_Word_IPA',
 'Prod_Phon_IPA',
 'Prod_Phoneme_ID',
 'Target_Arpabet',
 'Target_N_Tot_Words',
 'Target_Word_IPA',
 'Target_N_Tot_Syllables',
 'Target_N_Tot_Characters',
 'Target_Phon_Arpabet',
 'Target_Phoneme_ID',
 'Target_Syll_Env',
 'Target_Word_Pos',
 'Syllable_NumID',
 'Target_Word_NumID',
 'Target_Con_Cluster',
 'Target_Clust_ID',
 'Target_Clus_Type',
 'Target_Clust_Phon_Pos',
 'Target_Clust_Phon_Env',
 'Target_Phon_IPA',
 'Prod_syllabic',
 'Prod_consonantal',
 'Prod_sonorant',
 'Prod_continuant',
 'Prod_delayed release',
 'Prod_approximant',
 'Prod_tap',
 'Prod_nasal',
 'Prod_voice',
 'Prod_spread gl',
 'Prod_constr gl',
 'Prod_labial',
 'Prod_round',
 'Prod_labiodental',
 'Prod_coronal',
 'Prod_anterior',
 'Prod_distribute

### Damerau Levenshtein Edit Distance

In [4]:
# Inspect the distinct target-word IPA transcriptions
df['Target_Word_IPA'].unique()

array(['bʊk', 'bɔl', 'nɐf', 'kʌp', 'seftipɪn', 'hæmɚ', 'tuθbrəʃ', 'ɪresɚ',
       'lɑk', 'pɛnsəl', 'skrudrɐvɚ', 'ki', 'pepɚklɪp', 'wɑʧ', 'kom',
       'rʌbɚbænd', 'spun', 'tep', 'fɔrk', 'mæʧəz'], dtype=object)

In [5]:
# Show the IPA columns for any rows whose target-word transcription is missing
ipa_cols = ['Prod_Word_IPA', 'Target_Word_IPA', 'Prod_Phon_IPA', 'Target_Phon_IPA']
df.loc[df['Target_Word_IPA'].isna(), ipa_cols]

Unnamed: 0,Prod_Word_IPA,Target_Word_IPA,Prod_Phon_IPA,Target_Phon_IPA


In [6]:
# Length of each produced word's IPA transcription (one count per character of
# the string). The vectorised .str.len() replaces a row-wise apply(len) and
# yields NaN for a missing transcription instead of raising TypeError.
df['Prod_N_Tot_Phonemes'] = df['Prod_Word_IPA'].str.len()

# Length of each target word's IPA transcription.
# NOTE(review): 'Target_N_Tot_Characters' already exists in the loaded data, so
# this overwrites it with the IPA-string length — confirm that is intended
# rather than writing to a new 'Target_N_Tot_Phonemes' column.
df['Target_N_Tot_Characters'] = df['Target_Word_IPA'].str.len()


In [7]:
def _word_edit_distance(row):
    """Return the Damerau-Levenshtein distance between a row's target and produced word IPA strings."""
    target_ipa = str(row['Target_Word_IPA'])
    produced_ipa = str(row['Prod_Word_IPA'])
    return td.damerau_levenshtein(target_ipa, produced_ipa)


# Word-level edit distance, computed for every row of df
df['Damerau_Levenshtein'] = df.apply(_word_edit_distance, axis=1)

In [8]:
# Mean edit distance over all rows of df.
# NOTE(review): df appears to hold one row per produced phoneme, so each word
# contributes once per phoneme to this mean — confirm that weighting is intended.
df.loc[:, 'Damerau_Levenshtein'].mean()

1.9319055464030752

In [9]:
# Print the sorted distinct values of the target length and the edit distance
for column in ('Target_N_Tot_Characters', 'Damerau_Levenshtein'):
    print(sorted(df[column].unique()))

[2, 3, 4, 5, 6, 7, 8, 9]
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]


### Using PanPhon

In [10]:
# List the distinct target phoneme symbols to check them against PanPhon
df.Target_Phon_IPA.unique()

array(['b', 'ʊ', 'k', 'ɔ', 'l', 'n', 'ɐ', 'f', 'ʌ', 'p', 's', 'e', 't',
       'i', 'h', 'æ', 'm', 'ɚ', 'u', 'θ', 'r', 'ə', 'ɑ', nan, 'ɛ', 'd',
       'w', 'ʧ', 'ʃ', 'ɪ', 'o', 'z', 'v'], dtype=object)

In [11]:
# Show the IPA columns for rows that have no target phoneme
ipa_cols = ['Prod_Word_IPA', 'Target_Word_IPA', 'Prod_Phon_IPA', 'Target_Phon_IPA']
df.loc[df['Target_Phon_IPA'].isna(), ipa_cols]

Unnamed: 0,Prod_Word_IPA,Target_Word_IPA,Prod_Phon_IPA,Target_Phon_IPA
88,lɑkɚ,lɑk,ɚ,
283,sefsefti,seftipɪn,s,
284,sefsefti,seftipɪn,e,
285,sefsefti,seftipɪn,f,
299,ssefti,seftipɪn,s,
480,sioɛn,kom,ɛ,
481,sioɛn,kom,n,
715,brʌʃɚtiθ,tuθbrəʃ,ɚ,
749,srudrovɚdrɐvɚ,skrudrɐvɚ,d,
750,srudrovɚdrɐvɚ,skrudrɐvɚ,r,


In [12]:
# List the column names again after the new columns were added
list(df.columns)

['RA',
 'Project',
 'PID',
 'Arm',
 'Week',
 'Target',
 'Production',
 'Prod_Word_Dur',
 'Prod_Arpabet',
 'Word_ID',
 'Session_ID',
 'Prod_Word_N',
 'Prod_Phon_N',
 'PIDSESS_Code',
 'WordPhon_Code',
 'Phon_Sess_Code',
 'Word_Sess_Code',
 'Prod_Last_Phon',
 'Prod_Word_IPA',
 'Prod_Phon_IPA',
 'Prod_Phoneme_ID',
 'Target_Arpabet',
 'Target_N_Tot_Words',
 'Target_Word_IPA',
 'Target_N_Tot_Syllables',
 'Target_N_Tot_Characters',
 'Target_Phon_Arpabet',
 'Target_Phoneme_ID',
 'Target_Syll_Env',
 'Target_Word_Pos',
 'Syllable_NumID',
 'Target_Word_NumID',
 'Target_Con_Cluster',
 'Target_Clust_ID',
 'Target_Clus_Type',
 'Target_Clust_Phon_Pos',
 'Target_Clust_Phon_Env',
 'Target_Phon_IPA',
 'Prod_syllabic',
 'Prod_consonantal',
 'Prod_sonorant',
 'Prod_continuant',
 'Prod_delayed release',
 'Prod_approximant',
 'Prod_tap',
 'Prod_nasal',
 'Prod_voice',
 'Prod_spread gl',
 'Prod_constr gl',
 'Prod_labial',
 'Prod_round',
 'Prod_labiodental',
 'Prod_coronal',
 'Prod_anterior',
 'Prod_distribute

# Create bins based on phonological accuracy

In [13]:
# Per-participant (PID) means of the accuracy measures and the edit distance,
# restricted to rows where Session_ID == 0
baseline_rows = df.loc[df['Session_ID'] == 0]
mean_columns = [
    'Phon_Acc',             # phonological accuracy
    'Manner_Acc',           # manner of articulation accuracy
    'Place_Acc',            # place of articulation accuracy
    'Voicing_Acc',          # voicing accuracy
    'Damerau_Levenshtein',  # word-level edit distance
]
BaseAcc = (
    baseline_rows
    .groupby('PID')
    .agg({column: 'mean' for column in mean_columns})
    .reset_index()
)

# Show the summary table
BaseAcc

Unnamed: 0,PID,Phon_Acc,Manner_Acc,Place_Acc,Voicing_Acc,Damerau_Levenshtein
0,1,0.609091,0.724242,0.721212,0.875758,2.972727
1,4,0.860759,0.911392,0.898734,0.93038,2.056962
2,5,0.825,0.875,0.858333,0.941667,1.15
3,7,0.966667,0.988889,0.988889,0.988889,0.255556
4,8,0.932692,0.971154,0.951923,0.961538,0.634615
5,9,0.7,0.775,0.775,0.883333,2.108333
6,12,0.779817,0.926606,0.899083,0.917431,2.174312
7,13,0.915789,0.936842,0.947368,0.936842,0.926316
8,14,0.742647,0.816176,0.808824,0.889706,2.132353
9,15,0.636364,0.782609,0.73913,0.889328,3.837945


In [14]:
# Bin edges for the baseline accuracy scores: 0 to 1 in steps of 0.05 (21 edges)
bins = [step / 20 for step in range(21)]

# One label per bin, running from '1-5%' up to '96-100%'
group_labels = ['{}-{}%'.format(low, low + 4) for low in range(1, 100, 5)]

In [15]:
# Assign each participant's mean accuracy to one of the labelled bins.
# pd.cut uses right-closed intervals by default, so without include_lowest=True
# a score of exactly 0 would fall outside the first bin and become NaN.
# The column order (Phon, Place, Manner, Voicing) is kept as before.
for measure in ('Phon_Acc', 'Place_Acc', 'Manner_Acc', 'Voicing_Acc'):
    BaseAcc[measure + '_Group'] = pd.cut(
        BaseAcc[measure], bins, labels=group_labels, include_lowest=True
    )

BaseAcc

Unnamed: 0,PID,Phon_Acc,Manner_Acc,Place_Acc,Voicing_Acc,Damerau_Levenshtein,Phon_Acc_Group,Place_Acc_Group,Manner_Acc_Group,Voicing_Acc_Group
0,1,0.609091,0.724242,0.721212,0.875758,2.972727,61-65%,71-75%,71-75%,86-90%
1,4,0.860759,0.911392,0.898734,0.93038,2.056962,86-90%,86-90%,91-95%,91-95%
2,5,0.825,0.875,0.858333,0.941667,1.15,81-85%,86-90%,86-90%,91-95%
3,7,0.966667,0.988889,0.988889,0.988889,0.255556,96-100%,96-100%,96-100%,96-100%
4,8,0.932692,0.971154,0.951923,0.961538,0.634615,91-95%,96-100%,96-100%,96-100%
5,9,0.7,0.775,0.775,0.883333,2.108333,66-70%,76-80%,76-80%,86-90%
6,12,0.779817,0.926606,0.899083,0.917431,2.174312,76-80%,86-90%,91-95%,91-95%
7,13,0.915789,0.936842,0.947368,0.936842,0.926316,91-95%,91-95%,91-95%,91-95%
8,14,0.742647,0.816176,0.808824,0.889706,2.132353,71-75%,81-85%,81-85%,86-90%
9,15,0.636364,0.782609,0.73913,0.889328,3.837945,61-65%,71-75%,76-80%,86-90%


In [16]:
# Write the per-participant baseline summary to CSV without the index column
base_acc_csv = 'Resources/lvPhonBaseAcc.csv'
BaseAcc.to_csv(base_acc_csv, index=False)

In [17]:
# Keep only the participant ID and the four accuracy-group columns
group_columns = [
    'PID',
    'Phon_Acc_Group',
    'Place_Acc_Group',
    'Manner_Acc_Group',
    'Voicing_Acc_Group',
]
BaseAcc2 = BaseAcc[group_columns]
BaseAcc2

Unnamed: 0,PID,Phon_Acc_Group,Place_Acc_Group,Manner_Acc_Group,Voicing_Acc_Group
0,1,61-65%,71-75%,71-75%,86-90%
1,4,86-90%,86-90%,91-95%,91-95%
2,5,81-85%,86-90%,86-90%,91-95%
3,7,96-100%,96-100%,96-100%,96-100%
4,8,91-95%,96-100%,96-100%,96-100%
5,9,66-70%,76-80%,76-80%,86-90%
6,12,76-80%,86-90%,91-95%,91-95%
7,13,91-95%,91-95%,91-95%,91-95%
8,14,71-75%,81-85%,81-85%,86-90%
9,15,61-65%,71-75%,76-80%,86-90%


In [18]:
# Attach each participant's baseline accuracy groups to every row of df
# (left join on PID, so all rows of df are kept)
df2 = pd.merge(df, BaseAcc2, how='left', on='PID')
df2.head()

Unnamed: 0,RA,Project,PID,Arm,Week,Target,Production,Prod_Word_Dur,Prod_Arpabet,Word_ID,...,affricate_Acc,glide_Acc,FeatureWeighted_PhonAcc,PVMWeighted_PhonAcc,Prod_N_Tot_Phonemes,Damerau_Levenshtein,Phon_Acc_Group,Place_Acc_Group,Manner_Acc_Group,Voicing_Acc_Group
0,DS,rPPA,15,1,Baseline,book,B UH K,0.295646,B,1,...,1,1,1.0,1.0,3,0,61-65%,71-75%,76-80%,86-90%
1,DS,rPPA,15,1,Baseline,book,B UH K,0.295646,UH,1,...,1,1,1.0,1.0,3,0,61-65%,71-75%,76-80%,86-90%
2,DS,rPPA,15,1,Baseline,book,B UH K,0.295646,K,1,...,1,1,1.0,1.0,3,0,61-65%,71-75%,76-80%,86-90%
3,DS,rPPA,15,1,Baseline,ball,B AO L,0.397365,B,2,...,1,1,1.0,1.0,3,0,61-65%,71-75%,76-80%,86-90%
4,DS,rPPA,15,1,Baseline,ball,B AO L,0.397365,AO,2,...,1,1,1.0,1.0,3,0,61-65%,71-75%,76-80%,86-90%


### Save file

In [19]:
# Write the data with the accuracy groups attached to CSV without the index column
grouped_csv = 'Resources/baseline_data_pvm_acc3.csv'
df2.to_csv(grouped_csv, index=False)

# Merge MRI data with the Accuracy Data

In [20]:
# Read the MRI measurements into a DataFrame and preview the first rows
mri_csv = "Resources/lvPPA-MRI-data.csv"
MRIdat = pd.read_csv(mri_csv)
MRIdat.head()

Unnamed: 0,PID,Study,Session_ID,system,label,name,measure,metric,value
0,1,rPPA,0,brainnetome246ext,1,A8m_left,volume,numeric,1523.7121
1,1,rPPA,0,brainnetome246ext,2,A8m_right,volume,numeric,2060.2881
2,1,rPPA,0,brainnetome246ext,3,A8dl_left,volume,numeric,2237.4401
3,1,rPPA,0,brainnetome246ext,4,A8dl_right,volume,numeric,2172.9281
4,1,rPPA,0,brainnetome246ext,5,A9l_left,volume,numeric,1989.6321


In [21]:
# List the participant IDs present in the MRI data
MRIdat['PID'].unique()

array([ 1,  4,  5,  8,  9, 12, 13, 14, 15, 16, 22, 28])

In [22]:
# Add the baseline accuracy summary to every MRI row, matching on PID.
# This is a left join: every MRI row is kept, and a PID with no entry in
# BaseAcc gets NaN in the accuracy columns.
MRIdat2 = pd.merge(MRIdat, BaseAcc, how='left', on='PID')
MRIdat2.head()

Unnamed: 0,PID,Study,Session_ID,system,label,name,measure,metric,value,Phon_Acc,Manner_Acc,Place_Acc,Voicing_Acc,Damerau_Levenshtein,Phon_Acc_Group,Place_Acc_Group,Manner_Acc_Group,Voicing_Acc_Group
0,1,rPPA,0,brainnetome246ext,1,A8m_left,volume,numeric,1523.7121,0.609091,0.724242,0.721212,0.875758,2.972727,61-65%,71-75%,71-75%,86-90%
1,1,rPPA,0,brainnetome246ext,2,A8m_right,volume,numeric,2060.2881,0.609091,0.724242,0.721212,0.875758,2.972727,61-65%,71-75%,71-75%,86-90%
2,1,rPPA,0,brainnetome246ext,3,A8dl_left,volume,numeric,2237.4401,0.609091,0.724242,0.721212,0.875758,2.972727,61-65%,71-75%,71-75%,86-90%
3,1,rPPA,0,brainnetome246ext,4,A8dl_right,volume,numeric,2172.9281,0.609091,0.724242,0.721212,0.875758,2.972727,61-65%,71-75%,71-75%,86-90%
4,1,rPPA,0,brainnetome246ext,5,A9l_left,volume,numeric,1989.6321,0.609091,0.724242,0.721212,0.875758,2.972727,61-65%,71-75%,71-75%,86-90%


In [23]:
# Write the merged MRI and accuracy data to CSV without the index column
mri_acc_csv = 'Resources/lvPhon-MRI-Acc-dat.csv'
MRIdat2.to_csv(mri_acc_csv, index=False)