# PVM Analysis

## Set-up

In [17]:
# Dependencies
import pandas as pd
import numpy as np
import textdistance as td

# Make sure you can see all output
pd.options.display.max_rows = 4000
# pd.options.display.max_columns = 4000

In [18]:
# Store filepath in a variable
df = pd.read_csv("Resources/all_data_pvm_acc4.csv")

#Translate the ARPABET codes to IPA codes
dictionary = (
    pd.read_csv("Resources/dict.csv")
    .set_index("Arpabet")
)

display(df.head(), dictionary)


Unnamed: 0.1,Unnamed: 0,PID,Target,Production,Prod_Word_Dur,Prod_Arpabet,Prod_Phon_Dur,NOTES,Word_ID,Session_ID,...,Height_Acc,Frontness_Acc,Tenseness_Acc,Roundness_Acc,wab1_aq,wab1_nwf_total,Session_Type,Improvement_Group,NWF_Improvement_Group,wabaq_start
0,0,15,book,B UH K,0.295646,B,0.024363,Article (ÃÂ) before word,1.0,0.0,...,1.0,1.0,1.0,1.0,67.8,3.8,Baseline,No_Improved,NWF_No_Improved,61-70
1,1,15,book,B UH K,0.295646,UH,0.163408,Article (ÃÂ) before word,1.0,0.0,...,1.0,1.0,1.0,1.0,67.8,3.8,Baseline,No_Improved,NWF_No_Improved,61-70
2,2,15,book,B UH K,0.295646,K,0.107875,Article (ÃÂ) before word,1.0,0.0,...,1.0,1.0,1.0,1.0,67.8,3.8,Baseline,No_Improved,NWF_No_Improved,61-70
3,3,15,ball,B AO L,0.397365,B,0.014197,,2.0,0.0,...,1.0,1.0,1.0,1.0,67.8,3.8,Baseline,No_Improved,NWF_No_Improved,61-70
4,4,15,ball,B AO L,0.397365,AO,0.211006,,2.0,0.0,...,1.0,1.0,1.0,1.0,67.8,3.8,Baseline,No_Improved,NWF_No_Improved,61-70


Unnamed: 0_level_0,IPA_singles
Arpabet,Unnamed: 1_level_1
H,h
R,r
W,w
Y,j
B,b
CH,ʧ
D,d
DH,ð
DX,ɾ
F,f


### Fixing IPA

In [19]:
# The IPA codes we have stored are corrupted, so we need to make them again
df = df.drop(
    [
    'Unnamed: 0',
    'NOTES',
    'Prod_Word_Dur',
    'Prod_Phon_Dur',
    'Prod_Word_IPA',
    'Prod_Phon_IPA',
    'Target_Word_IPA',
    'Target_Phon_IPA'
    ], 
    axis='columns')

In [20]:
#Check that target words are correctly written
df['Target_Arpabet'].unique()

array(['B UH K', 'B AO L', 'N AY F', 'K AH P', 'S EY F T IY P IH N',
       'H AE M AXR', 'T UW TH B R AX SH', 'AX R EY S AXR ', 'L AA K',
       'P EH N S AX L', 'S K R UW D R AY V AXR', 'K IY',
       'P EY P AXR K L IH P', 'W AA CH', 'K OW M', 'R AH B AXR B AE N D',
       'S P UW N', 'T EY P', 'F AO R K', 'M AE CH AX Z'], dtype=object)

In [21]:
#Replace incorrect target words
df['Target_Arpabet'] = (
    df['Target_Arpabet']
    .replace(
        {
        'AX R EY S AXR ':'AX R EY S AXR'
        }
    )
)

In [22]:
#Check what each phoneme is being registered as
results = (
    # trans is a series, so use string accessor to split value strings
    df["Target_Arpabet"].str.split(" ")
    # turn each item in split string into own row maintaining index value
    .explode()
)

results.unique()

array(['B', 'UH', 'K', 'AO', 'L', 'N', 'AY', 'F', 'AH', 'P', 'S', 'EY',
       'T', 'IY', 'IH', 'H', 'AE', 'M', 'AXR', 'UW', 'TH', 'R', 'AX',
       'SH', 'AA', 'EH', 'D', 'V', 'W', 'CH', 'OW', 'Z'], dtype=object)

In [23]:
#Check that productions are correctly written
df['Production'].unique()

array(['B UH K', 'B AO L', 'SH', 'N', 'N AY F', 'K AH P', 'S',
       'S AY F T IY', 'H AE M AXR', 'T UW SH B OW N', 'T UW', 'B UH SH',
       'T UH', 'T IY', 'T UW TH P IY S', 'T P IY', 'T IY S', 'P IY', 'P',
       'IY', 'R AH B', 'R AH B AXR N', 'R AH B AXR', 'L AA K AXR',
       'T EH M', 'T', 'P EH N S AX L', 'S T R UW', 'S T R',
       'S T R UW DX OW', 'S K R UW', 'S K R UW CH', 'S K R UW S', 'K',
       'ZH', 'K IY', 'T S', 'S IH M', 'P EY P AXR', 'W AA CH', 'OW',
       'R AH B AXR L', 'R AH B AXR N AX Q', 'R AH', 'K AH', 'Q', 'S P UW',
       'S P UH SH', 'S P', 'S B AH', 'S UW P', 'T EY', 'SH AO R', 'B AX',
       'AY F', 'AY', 'K AO', 'K AH F', 'K AO Q', 'K AO F', 'K AH Q',
       'S T EY Q', 'S T EY', 'S T AY', 'H', 'H AE', 'T IH TH', 'P EY S',
       'B AA', 'P AH', 'B', 'B AH', 'L', 'L AA K', 'P EH N CH R',
       'P EH N S IH', 'P EH N', 'P EH', 'P IH N', 'K OW', 'K AX',
       'S T IH F', 'S P UW N', 'T AE', 'P AX', 'P IH', 'P T S', 'F AX',
       'F', 'S EY F T IY', '

In [24]:
df = df.dropna(subset=['Production'])

In [25]:
df["Target_Word_IPA"] = (
    # The production column is a series, so use string accessor to split value strings
    df["Target_Arpabet"].str.split(" ")
    # turn each item in split string into own row maintaining index value
    .explode()
    # perform the lookup in the dictionary of each individual value
    .apply(lambda v: dictionary.loc[v])
    # group them by the original index
    .groupby(level=0)
    # "sum" them, which for string, concatonates them without any spaces
    .sum()
)

df.columns.to_list()


['PID',
 'Target',
 'Production',
 'Prod_Arpabet',
 'Word_ID',
 'Session_ID',
 'Prod_Word_N',
 'Prod_Phon_N',
 'Code',
 'Phon_Sess_Code',
 'Word_Sess_Code',
 'Prod_Last_Phon',
 'Prod_Phoneme_ID',
 'Prod_Prev_Phon',
 'Prod_Next_Phon',
 'Target_Arpabet',
 'Target_N_Tot_Words',
 'Target_N_Tot_Syllables',
 'Target_N_Tot_Phonemes',
 'Target_Phon_Arpabet',
 'Target_Phoneme_ID',
 'Target_Syll_Env',
 'Target_Word_Pos',
 'Syllable_NumID',
 'Target_Word_NumID',
 'Target_Con_Cluster',
 'Target_Clust_ID',
 'Target_Clus_Type',
 'Target_Clust_Phon_Pos',
 'Target_Clust_Phon_Env',
 'Target_Prev_Phon',
 'Target_Next_Phon',
 'Prod_syllabic',
 'Prod_consonantal',
 'Prod_sonorant',
 'Prod_continuant',
 'Prod_delayed release',
 'Prod_approximant',
 'Prod_tap',
 'Prod_nasal',
 'Prod_voice',
 'Prod_spread gl',
 'Prod_constr gl',
 'Prod_labial',
 'Prod_round',
 'Prod_labiodental',
 'Prod_coronal',
 'Prod_anterior',
 'Prod_distributed',
 'Prod_strident',
 'Prod_lateral',
 'Prod_dorsal',
 'Prod_high',
 'Prod_lo

In [26]:
df["Prod_Word_IPA"] = (
    # The production column is a series, so use string accessor to split value strings
    df["Production"].str.split(" ")
    # turn each item in split string into own row maintaining index value
    .explode()
    # perform the lookup in the dictionary of each individual value
    .apply(lambda v: dictionary.loc[v])
    # group them by the original index
    .groupby(level=0)
    # "sum" them, which for string, concatonates them without any spaces
    .sum()
)

df.head()

Unnamed: 0,PID,Target,Production,Prod_Arpabet,Word_ID,Session_ID,Prod_Word_N,Prod_Phon_N,Code,Phon_Sess_Code,...,Tenseness_Acc,Roundness_Acc,wab1_aq,wab1_nwf_total,Session_Type,Improvement_Group,NWF_Improvement_Group,wabaq_start,Target_Word_IPA,Prod_Word_IPA
0,15,book,B UH K,B,1.0,0.0,1.0,1.0,1_1,15_0_1_1,...,1.0,1.0,67.8,3.8,Baseline,No_Improved,NWF_No_Improved,61-70,bʊk,bʊk
1,15,book,B UH K,UH,1.0,0.0,1.0,2.0,1_2,15_0_1_2,...,1.0,1.0,67.8,3.8,Baseline,No_Improved,NWF_No_Improved,61-70,bʊk,bʊk
2,15,book,B UH K,K,1.0,0.0,1.0,3.0,1_3,15_0_1_3,...,1.0,1.0,67.8,3.8,Baseline,No_Improved,NWF_No_Improved,61-70,bʊk,bʊk
3,15,ball,B AO L,B,2.0,0.0,2.0,1.0,2_1,15_0_2_1,...,1.0,1.0,67.8,3.8,Baseline,No_Improved,NWF_No_Improved,61-70,bɔl,bɔl
4,15,ball,B AO L,AO,2.0,0.0,2.0,2.0,2_2,15_0_2_2,...,1.0,1.0,67.8,3.8,Baseline,No_Improved,NWF_No_Improved,61-70,bɔl,bɔl


In [27]:
#Get number of phonemes for each production
df['Prod_N_Tot_Phonemes'] = (
    df
    .apply(
        lambda x:
        len(x['Prod_Word_IPA']),
        axis=1
    )
)

#Get number of phonemes for each target
df['Target_N_Tot_Phonemes'] = (
    df
    .apply(
        lambda x:
        len(x['Target_Word_IPA']),
        axis=1
    )
)


In [28]:
#Replace incorrectly coded words
df['Prod_Word_IPA'] = (
    df['Prod_Word_IPA']
    .replace(
        {
        'i':'ə', 
        'ir':'ər', 
        'ɪ':'ə', 
        'ɛresɚ':'əresɚ',
        'ires':'əresɚ',
        'ɪresɚ':'əresɚ', 
        'iresɚ':'əresɚ'
        }
    )
)

In [29]:
#Import phonetic feature identifies
phon_dist_features = (
    pd.read_csv("Resources/phon_dist_features.csv")
)
phon_dist_features = phon_dist_features.dropna()
phon_dist_features['Phoneme_ID'] = phon_dist_features['Phoneme_ID'].astype('int')

In [30]:
#Create a dictionary for phoneme ID number

Prod_phon_ID = phon_dist_features[['IPA_singles','Phoneme_ID']].copy()
Target_phon_ID = phon_dist_features[['IPA_singles','Phoneme_ID']].copy()


# Creat dictionary for Prod_Phoneme_ID
Prod_phon_ID.rename(
    columns={
       'IPA_singles':'Prod_Phoneme_IPA', 
       'Phoneme_ID':'Prod_Phoneme_ID'
       }, inplace=True)

Target_phon_ID.rename(
    columns={
       'IPA_singles':'Target_Phoneme_IPA', 
       'Phoneme_ID':'Target_Phoneme_ID'
       }, inplace=True)

Prod_phon_ID
Target_phon_ID

Unnamed: 0,Target_Phoneme_IPA,Target_Phoneme_ID
0,h,1
1,r,2
2,w,3
3,j,4
4,b,5
5,ʧ,6
6,d,7
7,ð,8
8,ɾ,9
9,f,10


In [31]:
# Merge with original dataset
df = df.merge(Prod_phon_ID, on='Prod_Phoneme_ID', how='left').merge(Target_phon_ID, on='Target_Phoneme_ID', how='left')
df.head()

Unnamed: 0,PID,Target,Production,Prod_Arpabet,Word_ID,Session_ID,Prod_Word_N,Prod_Phon_N,Code,Phon_Sess_Code,...,wab1_nwf_total,Session_Type,Improvement_Group,NWF_Improvement_Group,wabaq_start,Target_Word_IPA,Prod_Word_IPA,Prod_N_Tot_Phonemes,Prod_Phoneme_IPA,Target_Phoneme_IPA
0,15,book,B UH K,B,1.0,0.0,1.0,1.0,1_1,15_0_1_1,...,3.8,Baseline,No_Improved,NWF_No_Improved,61-70,bʊk,bʊk,3,b,b
1,15,book,B UH K,UH,1.0,0.0,1.0,2.0,1_2,15_0_1_2,...,3.8,Baseline,No_Improved,NWF_No_Improved,61-70,bʊk,bʊk,3,ʊ,ʊ
2,15,book,B UH K,K,1.0,0.0,1.0,3.0,1_3,15_0_1_3,...,3.8,Baseline,No_Improved,NWF_No_Improved,61-70,bʊk,bʊk,3,k,k
3,15,ball,B AO L,B,2.0,0.0,2.0,1.0,2_1,15_0_2_1,...,3.8,Baseline,No_Improved,NWF_No_Improved,61-70,bɔl,bɔl,3,b,b
4,15,ball,B AO L,AO,2.0,0.0,2.0,2.0,2_2,15_0_2_2,...,3.8,Baseline,No_Improved,NWF_No_Improved,61-70,bɔl,bɔl,3,ɔ,ɔ


In [34]:
# Check out columns
df.columns.tolist()

['PID',
 'Target',
 'Production',
 'Prod_Arpabet',
 'Word_ID',
 'Session_ID',
 'Prod_Word_N',
 'Prod_Phon_N',
 'Code',
 'Phon_Sess_Code',
 'Word_Sess_Code',
 'Prod_Last_Phon',
 'Prod_Phoneme_ID',
 'Prod_Prev_Phon',
 'Prod_Next_Phon',
 'Target_Arpabet',
 'Target_N_Tot_Words',
 'Target_N_Tot_Syllables',
 'Target_N_Tot_Phonemes',
 'Target_Phon_Arpabet',
 'Target_Phoneme_ID',
 'Target_Syll_Env',
 'Target_Word_Pos',
 'Syllable_NumID',
 'Target_Word_NumID',
 'Target_Con_Cluster',
 'Target_Clust_ID',
 'Target_Clus_Type',
 'Target_Clust_Phon_Pos',
 'Target_Clust_Phon_Env',
 'Target_Prev_Phon',
 'Target_Next_Phon',
 'Prod_syllabic',
 'Prod_consonantal',
 'Prod_sonorant',
 'Prod_continuant',
 'Prod_delayed release',
 'Prod_approximant',
 'Prod_tap',
 'Prod_nasal',
 'Prod_voice',
 'Prod_spread gl',
 'Prod_constr gl',
 'Prod_labial',
 'Prod_round',
 'Prod_labiodental',
 'Prod_coronal',
 'Prod_anterior',
 'Prod_distributed',
 'Prod_strident',
 'Prod_lateral',
 'Prod_dorsal',
 'Prod_high',
 'Prod_lo

In [38]:
# Create a shortened dataset
dfShort = df.drop(
    [
    'Production',
    'Prod_Arpabet',
    'Code',
    'Prod_Last_Phon',
    'Prod_Prev_Phon',
    'Prod_Next_Phon',
    'Target_Arpabet',
    'Target_Phon_Arpabet',
    'Target_Prev_Phon',
    'Target_Next_Phon',
    'Target_Clus_Type',
    'Target_Clust_Phon_Pos',
    'Target_Clust_Phon_Env',
    'Improvement_Group',
    'NWF_Improvement_Group'
    ], 
    axis='columns')

In [37]:
dfShort.columns.tolist()

['PID',
 'Target',
 'Word_ID',
 'Session_ID',
 'Prod_Word_N',
 'Prod_Phon_N',
 'Phon_Sess_Code',
 'Word_Sess_Code',
 'Prod_Phoneme_ID',
 'Target_N_Tot_Words',
 'Target_N_Tot_Syllables',
 'Target_N_Tot_Phonemes',
 'Target_Phoneme_ID',
 'Target_Syll_Env',
 'Target_Word_Pos',
 'Syllable_NumID',
 'Target_Word_NumID',
 'Target_Con_Cluster',
 'Target_Clust_ID',
 'Target_Clus_Type',
 'Target_Clust_Phon_Pos',
 'Target_Clust_Phon_Env',
 'Prod_syllabic',
 'Prod_consonantal',
 'Prod_sonorant',
 'Prod_continuant',
 'Prod_delayed release',
 'Prod_approximant',
 'Prod_tap',
 'Prod_nasal',
 'Prod_voice',
 'Prod_spread gl',
 'Prod_constr gl',
 'Prod_labial',
 'Prod_round',
 'Prod_labiodental',
 'Prod_coronal',
 'Prod_anterior',
 'Prod_distributed',
 'Prod_strident',
 'Prod_lateral',
 'Prod_dorsal',
 'Prod_high',
 'Prod_low',
 'Prod_front',
 'Prod_back',
 'Prod_tense',
 'Prod_lax',
 'Prod_vowel',
 'Prod_consonant',
 'Prod_diphthong',
 'Prod_monophthong',
 'Prod_velar',
 'Prod_alveolar',
 'Prod_post-alv

In [59]:
# Testing center to make sure functions worked correctly in identifying the phonological process
dfShort.iloc[844:848].to_dict()

{'PID': {844: 1, 845: 1, 846: 1, 847: 1},
 'Target': {844: 'spoon', 845: 'spoon', 846: 'spoon', 847: 'spoon'},
 'Word_ID': {844: 17.0, 845: 17.0, 846: 17.0, 847: 17.0},
 'Session_ID': {844: 0.0, 845: 0.0, 846: 0.0, 847: 0.0},
 'Prod_Word_N': {844: 86.0, 845: 86.0, 846: 86.0, 847: 86.0},
 'Prod_Phon_N': {844: 1.0, 845: 2.0, 846: 3.0, 847: 4.0},
 'Phon_Sess_Code': {844: '1_0_86_1',
  845: '1_0_86_2',
  846: '1_0_86_3',
  847: '1_0_86_4'},
 'Word_Sess_Code': {844: '1_0_86',
  845: '1_0_86',
  846: '1_0_86',
  847: '1_0_86'},
 'Prod_Phoneme_ID': {844: 19.0, 845: 18.0, 846: 38.0, 847: 18.0},
 'Target_N_Tot_Words': {844: 1.0, 845: 1.0, 846: 1.0, 847: 1.0},
 'Target_N_Tot_Syllables': {844: 1.0, 845: 1.0, 846: 1.0, 847: 1.0},
 'Target_N_Tot_Phonemes': {844: 4, 845: 4, 846: 4, 847: 4},
 'Target_Phoneme_ID': {844: 19.0, 845: 18.0, 846: 38.0, 847: 16.0},
 'Target_Syll_Env': {844: '#_V', 845: '#_V', 846: 'C_C', 847: 'V_#'},
 'Target_Word_Pos': {844: '#_V', 845: 'C_V', 846: 'C_C', 847: 'V_#'},
 'Sy

## PVM Analyses

In [60]:
# Apply function to determine if change constituted gliding
# Gliding: When /r/ or /l/ are produced as a /w/ or /j/, such as “wabbit" for "rabbit" or "yeyow" for "yellow”)
# 1 = yes; 0 = no
df['gliding'] = (
    df
    .apply(
        lambda x: 
        1 
        if x['Manner_Acc'] == 0
        and x['Target_approximant'] == 1
        and x['Prod_glide'] == 1
        else 
        0, 
        axis=1
    )
)

In [61]:
# Apply function to determine if change constituted stopping
# Stopping: When a fricative (e.g., /f/ or /s/) or affricate (/ʧ/ or /ʤ/) is substituted with a stop consonant, such as “pan" for "fan" or "dump for "jump”)
# 1 = yes; 0 = no
df['stopping'] = (
    df
    .apply(
        lambda x: 
            1 
            if x['Manner_Acc'] == 0
            and (
                x['Target_affricate'] == 1 
                or 
                x['Target_fricative'] == 1
                )
            and x['Prod_stop'] == 1
            else 
            0, 
        axis=1
    )
)

In [62]:
# Apply function to determine if change constituted affrication
# Affrication: When a nonaffricate is replaced with an affricate, such as “joor" for "door” 
# 1 = yes; 0 = no
df['affrication'] = (
    df
    .apply(
        lambda x: 
        1 
        if x['Manner_Acc'] == 0
        and x['Prod_affricate'] == 1
        and x['Target_vowel'] == -1
        else 
        0, 
        axis=1
    )
)

In [63]:
# Apply function to determine if change constituted deaffrication
# Deaffrication: When an affricate is replaced with a stop or fricative, such as “ships” for “chips”
# 1 = yes; 0 = no
df['deaffrication'] = (
    df
    .apply(
        lambda x: 
            1 
            if x['Manner_Acc'] == 0
            and x['Target_affricate'] == 1 
            and (
                x['Prod_stop'] == 1
                or
                x['Prod_fricative'] == 1
                )
            else 
            0, 
        axis=1
    )
)

In [64]:
# Apply function to determine if change constituted denasalization
# Denasalization: When a nasal consonant changes to a non-nasal consonant, such as “doze” for “nose”
# 1 = yes; 0 = no
df['denasalization'] = (
    df
    .apply(
        lambda x: 
            1 
            if x['Manner_Acc'] == 0
            and x['Place_Acc'] == 1
            and x['Target_nasal'] == 1
            else 
            0, 
        axis=1
    )
)

In [65]:
# Apply function to determine if change constituted nasalization
# Nasalization: When a non-nasal consonant changes to a nasal consonant, such as "nose" for "doze”
# 1 = yes; 0 = no
df['nasalization'] = (
    df
    .apply(
        lambda x: 
            1 
            if x['Manner_Acc'] == 0
            and x['Place_Acc'] == 1
            and x['Prod_nasal'] == 1
            else 
            0, 
        axis=1
    )
)

In [66]:
# Apply function to determine if change constituted backing
# Backing: When sounds produced forward in the mouth are substituted with sounds produced farther back in the mouth; e.g., alveolar for velar, such as “got” for “dot”
# Note: Only accounts for when consonants replace consonants or vowels replace vowels. Not consonant becomes vowel or vice versa.
# 1 = yes; 0 = no
df['backing'] = (
    df
    .apply(
        lambda x: 
            1 
            if 
                (x['Target_vowel'] == 1
                and x['Prod_vowel'] == 1
                and x['Target_front'] == 1
                and x['Prod_front'] == -1)
            or
                (x['Target_vowel'] == -1
                and x['Prod_vowel'] == -1
                and x['Target_Place_N'] < x['Prod_Place_N'])
            else 
                0, 
        axis=1
    )
)

In [67]:
# Apply function to determine if change constituted fronting
# Fronting: When sounds produced in the backward in the mouth are substituted with sounds produced more forward in the mouth; e.g., alveolar for bilabial, such as "bot" for "dot”
# Note: Only accounts for when consonants replace consonants or vowels replace vowels. Not consonant becomes vowel or vice versa.
# 1 = yes; 0 = no
df['fronting'] = (
    df
    .apply(
        lambda x: 
            1 
            if 
                (x['Target_vowel'] == 1
                and x['Prod_vowel'] == 1
                and x['Target_back'] == 1
                and x['Prod_back'] == -1)
            or
                (x['Target_vowel'] == -1
                and x['Prod_vowel'] == -1
                and x['Target_Place_N'] > x['Prod_Place_N'])
            else 
                0, 
        axis=1
    )
)

In [68]:
# Apply function to determine if change constituted alveolarization
# Alveolarization: When a nonalveolar sound is substituted with an alveolar sound, such as “tu" for "shoe”
# 1 = yes; 0 = no
df['alveolarization'] = (
    df
    .apply(
        lambda x: 
        1 
        if x['Place_Acc'] == 0
        and x['Prod_alveolar'] == 1
        and x['Target_vowel'] == -1
        else 
        0, 
        axis=1
    )
)

In [69]:
# Apply function to determine if change constituted labialization
# Labialization: When a non-bilabial sound is replaced with a bilabial sound, such as “pie" for "tie”
# 1 = yes; 0 = no
df['labialization'] = (
    df
    .apply(
        lambda x: 
        1 
        if x['Place_Acc'] == 0
        and x['Prod_labial'] == 1
        and x['Target_vowel'] == -1
        else 
        0, 
        axis=1
    )
)

In [70]:
# Apply function to determine if change constituted velarization
# Velarization: When a non-velar sound is replaced with a velar sound, such as “kite” for “light”
# 1 = yes; 0 = no
df['velarization'] = (
    df
    .apply(
        lambda x: 
        1 
        if x['Place_Acc'] == 0
        and x['Prod_velar'] == 1
        and x['Target_vowel'] == -1
        else 
        0, 
        axis=1
    )
)

In [71]:
# Apply function to determine if change constituted post-alveolarization
# Post-alveolarization: When a non-post-alveolar sound is replaced with a post-alveolar sound, such as “chair” for “care
# 1 = yes; 0 = no
df['post-alveolarization'] = (
    df
    .apply(
        lambda x: 
        1 
        if x['Place_Acc'] == 0
        and x['Prod_post-alveolar'] == 1
        and x['Target_vowel'] == -1
        else 
        0, 
        axis=1
    )
)

In [72]:
# Apply function to determine if change constituted dentalization
# Dentalization: When a non-dental sound is replaced with a dental sound, such as “teeth” for “thief”
# 1 = yes; 0 = no
df['dentalization'] = (
    df
    .apply(
        lambda x: 
        1 
        if x['Place_Acc'] == 0
        and x['Prod_dental'] == 1
        and x['Target_vowel'] == -1
        else 
        0, 
        axis=1
    )
)

In [73]:
# Apply function to determine if change constituted palatalization
# Palatalization: When a non-palatal sound is replaced with a palatal sound, such as “year” for “rear”
# 1 = yes; 0 = no
df['palatalization'] = (
    df
    .apply(
        lambda x: 
        1 
        if x['Place_Acc'] == 0
        and x['Prod_palatal'] == 1
        and x['Target_vowel'] == -1
        else 
        0, 
        axis=1
    )
)

In [74]:
# Apply function to determine if change constituted glottalization
# Glottalization: When a non-glottal sound is replaced with a glottal sound, such as “here” for “fear”
# 1 = yes; 0 = no
df['glottalization'] = (
    df
    .apply(
        lambda x: 
        1 
        if x['Place_Acc'] == 0
        and x['Prod_glottal'] == 1
        and x['Target_vowel'] == -1
        else 
        0, 
        axis=1
    )
)

In [75]:
# Apply function to determine if change constituted devoicing
# Devoicing: When a voiced production is substituted for a voiceless production, such as “pin” for “bin”
# 1 = yes; 0 = no
df['devoicing'] = (
    df
    .apply(
        lambda x: 
        1 
        if x['Voicing_Acc'] == 0
        and x['Prod_voice'] == -1
        and x['Target_vowel'] == -1
        and x['Prod_vowel'] == -1
        else 
        0, 
        axis=1
    )
)

In [76]:
# Apply function to determine if change constituted voicing
# Voicing: When a voiceless production is substituted for a voiced production, such as “bin” for “pin”
# 1 = yes; 0 = no
df['voicing'] = (
    df
    .apply(
        lambda x: 
        1 
        if x['Voicing_Acc'] == 0
        and x['Prod_voice'] == 1
        and x['Target_vowel'] == -1
        and x['Prod_vowel'] == -1
        else 
        0, 
        axis=1
    )
)

In [77]:
# Apply function to determine if change constituted prevocalic voicing
# Prevocalic Voicing: When a voiceless consonant at the preceding a vowel in a syllable like /k/ or /f/ is substituted with a voiced consonant like /g/ or /v/, such as “gup” for "cup”
# 1 = yes; 0 = no
df['prevocalic_voicing'] = (
    df
    .apply(
        lambda x: 
        1 
        if x['Voicing_Acc'] == 0
        and x['Target_Syll_Env'] == '#_V'
        and x['Prod_vowel'] == -1
        and x['Prod_voice'] == 1
        else 
        0, 
        axis=1
    )
)

In [78]:
# Apply function to determine if change constituted postvocalic voicing
# Postvocalic Voicing: When a voiceless consonant following a vowel in a syllable like /k/ or /f/ is substituted with a voiced consonant like /g/ or /v/, such as “pod” for "pot”
# 1 = yes; 0 = no
df['postvocalic_voicing'] = (
    df
    .apply(
        lambda x: 
        1 
        if x['Voicing_Acc'] == 0
        and x['Target_Syll_Env'] == 'V_#'
        and x['Prod_vowel'] == -1
        and x['Prod_voice'] == 1
        else 
        0, 
        axis=1
    )
)

In [79]:
# Apply function to determine if change constituted final consonant devoicing
# Final Consonant Devoicing: When a voiced consonant at the end of a word like /b/ or /d/ is substituted with a voiceless consonant like /p/ or /t/, such as "pick" for "pig”
# 1 = yes; 0 = no
df['final_consonant_devoicing'] = (
    df
    .apply(
        lambda x: 
        1 
        if x['Voicing_Acc'] == 0
        and '_#' in x['Target_Syll_Env']
        and x['Prod_vowel'] == -1
        and x['Prod_voice'] == 1
        else 
        0, 
        axis=1
    )
)

In [80]:
# Apply function to determine if change constituted final consonant deletion
# Final Consonant Deletion: When the final consonant in a word is left off, such as “toe” for ”toad”
# 1 = yes; 0 = no
df['final_consonant_deletion'] = (
    df
    .apply(
        lambda x: 
        1 
        if x['Prod_Last_Phon'] == 1
        and x['Target_Word_Pos'].find('_#') == -1  
        and x['Target_Word_Pos'].find('addition') == -1
        else 
        0, 
        axis=1
    )
)

In [81]:
# Apply function to determine if change constituted initial consonant deletion
# Initial Consonant Deletion: When the initial consonant in a word is left off, such as “ode” for ”toad”
# 1 = yes; 0 = no
df['initial_consonant_deletion'] = (
    df
    .apply(
        lambda x: 
        1 
        if x['Prod_Phon_N'] == 1
        and x['Phon_Acc'] == 0
        and x['Prod_Phoneme_ID'] == x['Target_Next_Phon']
        else 
        0, 
        axis=1
    )
)


In [82]:
# Apply function to determine if change constituted epenthesis
# Epenthesis: When an extra sound is added to a word, such as “bu-lue" for "blue”
# 1 = yes; 0 = no

df['epenthesis'] = (
    df
    .apply(
        lambda x: 
        1 
        if len(x['Prod_Word_IPA']) > len(x['Target_Word_IPA'])
        else 
        0, 
        axis=1
    )
)

In [83]:
# Create a column to calculate the total number of extra phonemes produced by the participant
df['Tot_Additions'] = (
    df
    .apply(
        lambda x: 
        len(x['Prod_Word_IPA']) - len(x['Target_Word_IPA'])
        if len(x['Prod_Word_IPA']) > len(x['Target_Word_IPA'])
        else 
        0, 
        axis=1
    )
)

In [84]:
# Create a column to calculate the total number of missing phonemes from the production based on the target
df['Tot_Deletions'] = (
    df
    .apply(
        lambda x: 
        len(x['Target_Word_IPA']) - len(x['Prod_Word_IPA'])
        if len(x['Prod_Word_IPA']) < len(x['Target_Word_IPA'])
        else 
        0, 
        axis=1
    )
)

In [86]:
# Apply function to determine if change constituted assimilation
# Assimilation: When a consonant sound starts to sound like another sound in the word, such as “bub" for "bus”
# 1 = yes; 0 = no

df['Target_Word_IPA']=df['Target_Word_IPA'].astype('str')
df['Prod_Word_IPA']=df['Prod_Word_IPA'].astype('str')
df['Target_Phoneme_IPA']=df['Target_Phoneme_IPA'].astype('str')

df['assimilation'] = (
    df
    .apply(
        lambda x: 
        1
        if (
            x['Target_Word_IPA'].count(x['Target_Phoneme_IPA']) <
            x['Prod_Word_IPA'].count(x['Target_Phoneme_IPA'])
        )
        else 
        0, 
        axis=1
    )
)

In [None]:
# Apply function to determine if change constituted postvocalic assimilation
# Postvocalic Assimilation: When a consonant borrows features from a vowel that follows it in the word production (e.g., becomes more fronted or backed due to frontness of the vowel), such as “school” for “spool”
# 1 = yes; 0 = no

# First, need to determine if the target was a consonant using the 'Target_consonantal' column
# And if it was produced in error based on the 'Phon_Acc' column
# And if its expected to have a vowel following it based on the 'Target_Syll_Env' column (C_V, #_V, or V_V)

# Then, need to determine if the produced consonant has any features that match the following target vowel based on the vowel's height and frontness


In [None]:
# Apply function to determine if change constituted prevocalic assimilation
# Prevocalic Assimilation: When a consonant borrows features from a vowel that precedes it in the word production (e.g., becomes more fronted or backed due to frontness of the vowel), such as “leap for “leak”
# 1 = yes; 0 = no

# First, need to determine if the target was a consonant using the 'Target_consonantal' column
# And if it was produced in error based on the 'Phon_Acc' column
# And if its expected to have a vowel preceding it based on the 'Target_Syll_Env' column (V_C, V_#, or V_V)

# Then, need to determine if the produced consonant has any features that match the preceding target vowel based on the vowel's height and frontness


In [None]:
# Apply function to determine if change constituted coalescence
# Coalescence: When two phonemes are substituted with a different phoneme that still has similar features, such as “fort” for “sport”
# 1 = yes; 0 = no

# Would need to see if target was produced inaccurately and if preceding or succeeding sound is deleted


In [None]:
# Apply function to determine if change constituted reduplication
# Reduplication: When a complete or incomplete syllable is repeated, such as “baba" for "battle”
# 1 = yes; 0 = no

# Not sure our current dataset could do this
# Would need to identify syllable boundaries for each word first


In [None]:
# Apply function to determine if change constituted cluster reduction
# Cluster Reduction: When a consonant cluster is reduced to a single consonant, such as “soon” for “spoon”
# 1 = yes; 0 = no

# First need to determine if target is a part of a cluster
# Then need to determine if the target was deleted


In [None]:
# Apply function to determine if change constituted weak syllable deletion
# Weak Syllable Deletion: When the weak syllable in a word is deleted, such as “nana" for "banana”
# 1 = yes; 0 = no

# Not sure if we can do this with the way the data is currently set up. 
# Would need to identify strong an weak syllables for each word, then tie those syllables to the phonemes

In [None]:
# Apply function to determine if change constituted anticipation
# Anticipation: When a speech sound that occurs later in a word/sentence is produced earlier, such as “cork” for “take my bike”
# 1 = yes; 0 = no

# Would need to identify each word based on its collection of sounds 
# (e.g., fork would be [10,30,2,13], cork would be [13,20,2,13])
# Then, you would need to determine if one of the sounds was repeated (e.g., /k/ is expected to occur once, but it occurs twice)
# And whether the repeated sound happens earlier in the list then expected (position 1, when it should be in position 4)


In [None]:
# Apply function to determine if change constituted preservation
# Preservation: When a speech sound that occurs earlier in a word/sentence is produced later, such as “nine” for “knife”
# 1 = yes; 0 = no

# Would need to identify each word based on its collection of sounds 
# (e.g., knife would be [16,40,10], nine would be [16,40,16])
# Then, you would need to determine if one of the sounds was repeated (e.g., /n/ is expected to occur once, but it occurs twice)
# And whether the repeated sound happens later in the list then expected (position 3, when it should be in position 1)


In [87]:
# Apply function to determine if change constituted a shift
# Shift: When a speech sound that is supposed to occur in one part of the word/sentence is produced at a different part of the word/sentence, such as “poons” for “spoon”
# 1 = yes; 0 = no

def identify_shift(row):
    target_phonemes = list(row['Target_Word_IPA'])  # Assuming IPA representation is a string where each character represents a phoneme
    prod_phonemes = list(row['Prod_Word_IPA'])

    if row['Phon_Acc'] == 0:  # Check if there is an error in production
        target_phoneme = row['Target_Phoneme_IPA']

        # Check if the target phoneme appears in the produced word
        if target_phoneme in prod_phonemes:
            target_position = row['Target_Phoneme_ID']
            prod_position = prod_phonemes.index(target_phoneme)

            # Check if the target phoneme appears at a different position in the produced word
            if target_position != prod_position:
                return 1

    return 0

df['shift'] = df.apply(identify_shift, axis=1)

# # Would need to identify each word based on its collection of sounds 
# (e.g., spoon would be [19,18,38,16], poons would be [18,38,16,19])
# Then, would need to see if all sounds that should be present are present regardless of position
# Then, would determine whether the order of sounds shifted position, so if +/- 1 position would result in a series of correct positions for more than one sound in the word

In [None]:
# Apply function to determine if change constituted an exchange
# Exchange: When a sound in one part of a word/sentence trades places with a sound in another part of the word/sentence, such as 'call' for 'lock'
# 1 = yes; 0 = no

# # Would need to identify each word based on its collection of sounds 
# (e.g., spoon would be [19,18,38,16], poons would be [18,38,16,19])
# Then, would need to see if all sounds that should be present are present regardless of position



In [None]:
# Apply function to determine if change constituted compound word reduction
# Compound Word Reduction: When a compound word is reduced to a single root word or syllable, such as “lunch” for “lunchbox”
# 1 = yes; 0 = no


In [90]:
# Testing center to make sure functions worked correctly in identifying the phonological process
df[df['shift']==1][['Prod_Word_IPA','Target_Word_IPA','Prod_Phoneme_IPA', 'Target_Phoneme_IPA']]

Unnamed: 0,Prod_Word_IPA,Target_Word_IPA,Prod_Phoneme_IPA,Target_Phoneme_IPA
72,ə,əresɚ,i,ə
74,rʌb,əresɚ,ʌ,r
77,rʌbɚn,əresɚ,ʌ,r
80,rʌbɚn,əresɚ,n,ɚ
82,rʌbɚ,əresɚ,ʌ,r
194,rʌbɚl,rʌbɚbænd,l,b
203,rʌbɚnəʔ,rʌbɚbænd,n,b
205,rʌbɚnəʔ,rʌbɚbænd,ʔ,n
210,rʌbɚn,rʌbɚbænd,n,b
262,ɐf,nɐf,f,ɐ
