# Baseline lvPPA WAB Naming Data Cleaning Script

### Load dependencies and data

In [80]:
# Dependencies
import pandas as pd
import numpy as np
from collections import defaultdict
import re

# Make sure you can see all output
pd.options.display.max_rows = 4000

In [81]:
# Store filepath in a variable
dfAll = pd.read_csv("Resources/baseline_data-v032924.csv", encoding="ISO-8859-1")
dfAll.head()

Unnamed: 0,RA,Project,PID,Arm,Week,Target,Production,PRODUCTION_START,PRODUCTION_END,PRODUCTION_DURATION
0,DS,rPPA,15,1,Baseline,book,B UH K,1900.200674,1900.496319,0.295646
1,DS,rPPA,15,1,Baseline,ball,B AO L,1905.496713,1905.894079,0.397365
2,DS,rPPA,15,1,Baseline,knife,SH,1928.584784,1928.765644,0.18086
3,DS,rPPA,15,1,Baseline,knife,SH,1939.241356,1939.408798,0.167443
4,DS,rPPA,15,1,Baseline,knife,N,1946.663867,1948.174319,1.510452


In [82]:
dfAll.columns.to_list()

['RA',
 'Project',
 'PID',
 'Arm',
 'Week',
 'Target',
 'Production',
 'PRODUCTION_START',
 'PRODUCTION_END',
 'PRODUCTION_DURATION']

In [83]:
# Drop duplicates and NaNs
df = dfAll.dropna(subset=['Target','Production']).drop_duplicates()

# Check data
df.head()

Unnamed: 0,RA,Project,PID,Arm,Week,Target,Production,PRODUCTION_START,PRODUCTION_END,PRODUCTION_DURATION
0,DS,rPPA,15,1,Baseline,book,B UH K,1900.200674,1900.496319,0.295646
1,DS,rPPA,15,1,Baseline,ball,B AO L,1905.496713,1905.894079,0.397365
2,DS,rPPA,15,1,Baseline,knife,SH,1928.584784,1928.765644,0.18086
3,DS,rPPA,15,1,Baseline,knife,SH,1939.241356,1939.408798,0.167443
4,DS,rPPA,15,1,Baseline,knife,N,1946.663867,1948.174319,1.510452


In [84]:
# Check how many participants we have
df['PID'].unique()

# Note: 28 & 22 won't have pause data

array([15,  1,  8, 12,  9,  5, 13, 16,  7,  4, 14, 28, 22])

## Clean target items & transcription misspellings

In [85]:
#Check that target words are correctly written
print(df['Target'].nunique())
df['Target'].unique() # Should be 20 items

22


array(['book', 'ball', 'knife', 'cup', 'safety pin', 'hammer',
       'toothbrush', 'eraser', 'lock', 'pencil', 'screwdriver', 'key',
       'paper clip', 'watch', 'comb', 'rubber band', 'spoon', 'tape',
       'fork', 'matches', 'pen', 'clip'], dtype=object)

In [86]:
#Replace incorrect target words
df['Target'] = (
    df['Target']
    .replace(
        {
        'pen':'pencil', 
        'clip':'paper clip'
        }
    )
)

#Check to see if they all look good now
print(df['Target'].nunique())
df['Target'].unique()

20


array(['book', 'ball', 'knife', 'cup', 'safety pin', 'hammer',
       'toothbrush', 'eraser', 'lock', 'pencil', 'screwdriver', 'key',
       'paper clip', 'watch', 'comb', 'rubber band', 'spoon', 'tape',
       'fork', 'matches'], dtype=object)

In [87]:
# Check the transcriptions for errors
productions = sorted(df['Production'].unique().tolist())

In [88]:
# Replace extra substrings and spaces
df['Production'] = (
    df['Production']
    .replace(
            {
            '\.|\..':'', # Can leave this line out if you want to maintain the syllable markers (i.e., '.')
            'UY':'UW',
            'EI':'EY',
            '\s\s|\s\s\s':' '
            },
        regex=True)
    .str.strip()
)

In [89]:
#Some of the words still have some extra spaces (determined by error message when trying to convert the productions to IPA), so going to delete all spaces and then add them back in

df["Production"] = (
    df["Production"].str.split(" ")  # Split the string by space
    .explode()  # Turn each item in split string into own row maintaining index value
    .dropna()  # Remove NaN values
    .str.strip()  # Strip leading and trailing spaces from each exploded string
    .loc[lambda x: x != ""]  # Keep only non-empty strings after stripping
    .groupby(level=0)  # Group by the original index
    .agg(" ".join)  # Aggregate them with a space
)


#Check
productions = sorted(df['Production'].unique().tolist())

In [90]:
#Replace problematic transcriptions
df['Production'] = (
    df['Production'].replace({
        'AX R EY S AX':'AX R EY S AXR',
        'AX R IY S':'AX R EY S',
        'AX R IY S AXR':'AX R EY S AXR',
        'AX IH R EY S AXR':'AX R EY S AXR',
        'AX R EY Z AXR':'AX R EY S AXR',
        'EH R EY S AXR':'AX R EY S AXR',
        'AXR H AE M AXR':'H AE M AXR', # Preamble included in transcription
        'B R AH SH':'B R AX SH',
        'B R UH SH':'B R AX SH',
        'B R UH SH T IY TH':'B R AH SH T IY TH',
        'B R UH CH AXR T IY TH':'B R AH SH AXR T IY TH',
        'F AO R W EY ':'F AO R W EY',
        'F AO R K ':'F AO R K',
        'F AO RD EY':'F AO R D EY',
        'F OA R K':'F AO R K',
        'F OW R K':'F AO R K',
        'F S EY F T IY P IH N':'S EY F T IY P IH N', # This should have been separated into two production attempts in Praat
        'H AE M AX':'H AE M AXR',
        'H AE M AE R':'H AE M AXR',
        'H AE M AX R':'H AE M AXR',
        'HAE':'H AE',
        'IH R EY S':'AX R EY S',
        'IH R EY S AXR':'AX R EY S AXR',
        'IH R EY S AXR ':'AX R EY S AXR',
        'IY EY IY IY R EY S AXR':'AX R EY S AXR', # This should have been separated into four production attempts in Praat
        'IY R EY S':'AX R EY S',
        'IY R EY S AXR':'AX R EY S AXR',
        'J UH M P IH NG B UW':'JH UH M P IH NG B UW',
        'J UH M P IH NG':'JH UH M P IH NG',
        'K OW M B':'K OW M',
        'K AO M':'K OW M',
        'K AO P':'K OW P',
        'K AO L':'K OW L',
        'L AAK':'L AA K',
        'M AE CH AX S':'M AE CH AX Z',
        'M AE CH IH S':'M AE CH AX Z',
        'M AE CH IH Z':'M AE CH AX Z',
        'M AE ZH IH Z':'M AE CH AX Z',
        'M AE S T AXR L AA K':'L AA K', # Technically a correct production, so accounting for that in the transcription
        'P EH N S L':'P EH N S AX L',
        'P EH N Z AX L':'P EH N S AX L',
        'P EY P ER K L IH ':'P EY P AXR K L IH',
        'P EY P ER K L IH P':'P EY P AXR K L IH P',
        'PAA':'P AA',
        'P EY P AXRR':'P EY P AXR',
        'P EH N S AXL':'P EH N S AX L',
        'N IX ':'N IH',
        'P EY P AXR K LIH P':'P EY P AXR K L IH P',
        'P EY P ER':'P EY P AXR',
        'P EY P AXR  K L IH P':'P EY P AXR K L IH P',
        'P EY P ER D EY':'P EY P AXR D EY',
        'P EY P ER K IH':'P EY P AXR K IH',
        'P EY P ER K L IH ':'P EY P AXR K L IH',
        'P EY P ER K L IH P':'P EY P AXR K L IH P',
        'R AX B AXR B AE N':'R AH B AXR B AE N D',
        'R AX B AXR B AE N D':'R AH B AXR B AE N D',
        'R AX B AXR P IH':'R AH B AXR B P IH',
        'R AX B AXR':'R AH B AXR',
        'R AH B B AE N D Z':'R AH B AE N D Z',
        'R AH AH AH R':'R AH R', # continuance of vowel
        'R EH D D':'R EH D',
        'R EH S S':'R EH S',
        'R IY S S AX':'R IY S AX',
        'R IH S T W AO CH':'W AA CH', # Technically a correct production, so accounting for that in the transcription
        'R UH B ER B A N D':'R AH B AXR B AE N D',
        'R UH B ER B AE N D':'R AH B AXR B AE N D',
        'R UH B ER R UH B ER B AE N D':'R AH B ER R AH B ER B AE N D',
        'R UH B AXR B AE N D':'R AH B AXR B AE N D',
        'S AY F T IY':'S EY F T IY',
        'S AY F T IY P IH N':'S EY F T IY P IH N',
        'S EI F T IY':'S EY F T IY',
        #'K K':'K', # false start
        #'K K N':'K N', # false start
        #'K K UH M':'K UH M', # false start
        #'L L AA K':'L AA K', # false start
        #'S S EY F T IY':'S EY F T IY',  # false start
        #'S S EY F T IY P IH N':'S EY F T IY P IH N', # false start
        #'S S K R UW':'S K R UW', # false start
        #'S S K R UW D R AY V AXR':'S K R UW D R AY V AXR', # false start
        #'S S K R UW S':'S K R UW S', # false start
        #'F F AO':'F AO', # false start
        #'F F AO R K':'F AO R K', # false start
        #'H H AE M AXR':'H AE M AXR', # false start
        'SEYF SEYF T IY':'S EY F S EY F T IY', # reduplication
        'SEYF T IY':'S EY F T IY',
        'SEYF':'S EY F',
        'SN SN SN':'S N',
        'T ':'T',
        'S K R U JH AXR':'S K R UW JH AXR',
        'SK R UW D R AY V AXR':'S K R UW D R AY V AXR',
        'S K UW D R AY V ER':'S K UW D R AY V AXR',
        'S K R UW D R D RAY V AXR':'S K R UW D R D R AY V AXR',
        'T UX':'T UW',
        'T UX SH B OW N':'T UW SH B OW N',
        'T UY': 'T UW',
        'T UY TH P IY S':'T UW TH P IY S',
        'T UW TH B R AH SH':'T UW TH B R AX SH',
        'T UW TH B R UH SH':'T UW TH B R AX SH',
        'T UW TH W IH Z':'T UW TH W IH Z',
        'W AA CH ':'W AA CH',
        'WIH':'W IH'
        }))

In [91]:
# Define a list of Arpabet transcriptions to exclude
excluded_transcriptions = [
    'B AA B IY P IH N', #synonym
    'B AO B IY P IH N', #synonym
    'B EY B IY',
    'B EY B IY S P IH N', #phonological error, but for synonym
    'D AY P ER', #Semantic error
    'D UH B UW EY',
    'K AO F IY',
    'K AX S AH M TH IH NG',
    'P IH GP IH',
    'S IH G AX R EH T',
    'S IH NG IH T P IH NG IH M AE',
    'S K IH S K AA S K IH',
    'S T IH K IY TH AE N G',
    'L AE S T AXR M AE S T AXR',
    'SH UH AO AO'
]

# Filter out the rows where 'Production' is in the list of excluded transcriptions
df = df.loc[~df["Production"].isin(excluded_transcriptions)]

In [92]:
#Check what each phoneme is being registered as
results = (
    # trans is a series, so use string accessor to split value strings
    df["Production"].str.split(" ")
    # turn each item in split string into own row maintaining index value
    .explode()
)

print(results.nunique()) # amount should be <= 43
results.unique()

39


array(['B', 'UH', 'K', 'AO', 'L', 'SH', 'N', 'AY', 'F', 'AH', 'P', 'S',
       'EY', 'T', 'IY', 'H', 'AE', 'M', 'ER', 'UW', 'OW', 'TH', 'R',
       'AXR', 'AA', 'EH', 'AX', 'DX', 'CH', 'ZH', 'IH', 'W', 'Q', 'JH',
       'D', 'Z', 'V', 'G', 'Y'], dtype=object)

In [93]:
df.reset_index(inplace=True,drop=True)
df.head()

Unnamed: 0,RA,Project,PID,Arm,Week,Target,Production,PRODUCTION_START,PRODUCTION_END,PRODUCTION_DURATION
0,DS,rPPA,15,1,Baseline,book,B UH K,1900.200674,1900.496319,0.295646
1,DS,rPPA,15,1,Baseline,ball,B AO L,1905.496713,1905.894079,0.397365
2,DS,rPPA,15,1,Baseline,knife,SH,1928.584784,1928.765644,0.18086
3,DS,rPPA,15,1,Baseline,knife,SH,1939.241356,1939.408798,0.167443
4,DS,rPPA,15,1,Baseline,knife,N,1946.663867,1948.174319,1.510452


## Add in phoneme & word information

In [94]:
Phoneme = (
    # Production is a series, so use string accessor to split value strings
    df["Production"].str.split(" ")
    # turn each item in split string into own row maintaining index value
    .explode()
    )

In [95]:
# Merge phonemes with data
df2 = pd.merge(df, Phoneme, left_index=True, right_index=True)

#Reset index
df2.reset_index(inplace=True,drop=True)

df2 = df2.rename(columns={'Production_x':'Production','Production_y':'Phoneme'})

In [96]:
df2.head()

Unnamed: 0,RA,Project,PID,Arm,Week,Target,Production,PRODUCTION_START,PRODUCTION_END,PRODUCTION_DURATION,Phoneme
0,DS,rPPA,15,1,Baseline,book,B UH K,1900.200674,1900.496319,0.295646,B
1,DS,rPPA,15,1,Baseline,book,B UH K,1900.200674,1900.496319,0.295646,UH
2,DS,rPPA,15,1,Baseline,book,B UH K,1900.200674,1900.496319,0.295646,K
3,DS,rPPA,15,1,Baseline,ball,B AO L,1905.496713,1905.894079,0.397365,B
4,DS,rPPA,15,1,Baseline,ball,B AO L,1905.496713,1905.894079,0.397365,AO


In [97]:
#Create function to assign a unique ID to words
word_id = defaultdict(lambda: len(word_id))
df2['Word_ID'] = [word_id[i]+1 for i in df2['Target']]

df2.head()

Unnamed: 0,RA,Project,PID,Arm,Week,Target,Production,PRODUCTION_START,PRODUCTION_END,PRODUCTION_DURATION,Phoneme,Word_ID
0,DS,rPPA,15,1,Baseline,book,B UH K,1900.200674,1900.496319,0.295646,B,1
1,DS,rPPA,15,1,Baseline,book,B UH K,1900.200674,1900.496319,0.295646,UH,1
2,DS,rPPA,15,1,Baseline,book,B UH K,1900.200674,1900.496319,0.295646,K,1
3,DS,rPPA,15,1,Baseline,ball,B AO L,1905.496713,1905.894079,0.397365,B,2
4,DS,rPPA,15,1,Baseline,ball,B AO L,1905.496713,1905.894079,0.397365,AO,2


In [98]:
#Assign a numeric ID to each session

df2['Session_ID'] = df2["Arm"].astype(str) + df2["Week"].astype(str)

df2['Session_ID'] = (
    df2['Session_ID']
    .replace(
        {
        '1Baseline':'0', 
        '112':'1',
        '212':'2'
        }
    )
)
df2['Session_ID'].unique()

array(['0'], dtype=object)

In [99]:
#Assign production number to label when each production attempt was made for each word
df2['Prod_Word_N'] = df2.groupby(['PID','Session_ID'])['PRODUCTION_START'].rank('dense').astype(int)

# Reset index and drop unneeded one
df2.reset_index(inplace=True,drop=True)
df2.head()

Unnamed: 0,RA,Project,PID,Arm,Week,Target,Production,PRODUCTION_START,PRODUCTION_END,PRODUCTION_DURATION,Phoneme,Word_ID,Session_ID,Prod_Word_N
0,DS,rPPA,15,1,Baseline,book,B UH K,1900.200674,1900.496319,0.295646,B,1,0,1
1,DS,rPPA,15,1,Baseline,book,B UH K,1900.200674,1900.496319,0.295646,UH,1,0,1
2,DS,rPPA,15,1,Baseline,book,B UH K,1900.200674,1900.496319,0.295646,K,1,0,1
3,DS,rPPA,15,1,Baseline,ball,B AO L,1905.496713,1905.894079,0.397365,B,2,0,2
4,DS,rPPA,15,1,Baseline,ball,B AO L,1905.496713,1905.894079,0.397365,AO,2,0,2


In [100]:
#Add a new column for repeated index
df2.reset_index(inplace=True)

In [101]:
#Assign production number to label when each production attempt was made for each phoneme
df2['Prod_Phon_N'] = df2.groupby(['PID','Session_ID','PRODUCTION_START'])['index'].rank('dense').astype(int)

#drop unneeded index column
df2 = df2.drop(columns={'index'})

#check
df2.head()

Unnamed: 0,RA,Project,PID,Arm,Week,Target,Production,PRODUCTION_START,PRODUCTION_END,PRODUCTION_DURATION,Phoneme,Word_ID,Session_ID,Prod_Word_N,Prod_Phon_N
0,DS,rPPA,15,1,Baseline,book,B UH K,1900.200674,1900.496319,0.295646,B,1,0,1,1
1,DS,rPPA,15,1,Baseline,book,B UH K,1900.200674,1900.496319,0.295646,UH,1,0,1,2
2,DS,rPPA,15,1,Baseline,book,B UH K,1900.200674,1900.496319,0.295646,K,1,0,1,3
3,DS,rPPA,15,1,Baseline,ball,B AO L,1905.496713,1905.894079,0.397365,B,2,0,2,1
4,DS,rPPA,15,1,Baseline,ball,B AO L,1905.496713,1905.894079,0.397365,AO,2,0,2,2


In [102]:
#Create a code to tie each participant to their session
df2['PIDSESS_Code'] = (df2["Project"]
                       .astype(str) + 
                       '-' + 
                       df2["PID"].astype(str).str.zfill(2) + 
                       '_' + 
                       df2["Session_ID"].astype(str)
                       )

df2['PIDSESS_Code'].unique()

array(['rPPA-15_0', 'rPPA-01_0', 'rPPA-08_0', 'rPPA-12_0', 'dPPA-09_0',
       'rPPA-05_0', 'rPPA-13_0', 'rPPA-16_0', 'rPPA-07_0', 'rPPA-04_0',
       'rPPA-14_0', 'rPPA-28_0', 'rPPA-22_0'], dtype=object)

In [103]:
#Create a code to tie each word and phoneme together based on phoneme position
df2['WordPhon_Code'] = df2["Word_ID"].astype(str) + str('_') + df2["Prod_Phon_N"].astype(str)

#Create a separate code to tie each participant to their word-phoneme production pairs
df2['Phon_Sess_Code'] = (
    df2["PID"].astype(str) + 
    str('_') + 
    df2["Session_ID"].astype(str) + 
    str('_') + 
    df2["Prod_Word_N"].astype(str) +
    str('_') + 
    df2["Prod_Phon_N"].astype(str)
    .astype(str)
)

#Create a separate code to tie each participant to their word productions
df2['Word_Sess_Code'] = (
    df2["PID"].astype(str) + 
    str('_') + 
    df2["Session_ID"].astype(str) + 
    str('_') + 
    df2["Prod_Word_N"].astype(str)
    .astype(str)
)

df2.head()

Unnamed: 0,RA,Project,PID,Arm,Week,Target,Production,PRODUCTION_START,PRODUCTION_END,PRODUCTION_DURATION,Phoneme,Word_ID,Session_ID,Prod_Word_N,Prod_Phon_N,PIDSESS_Code,WordPhon_Code,Phon_Sess_Code,Word_Sess_Code
0,DS,rPPA,15,1,Baseline,book,B UH K,1900.200674,1900.496319,0.295646,B,1,0,1,1,rPPA-15_0,1_1,15_0_1_1,15_0_1
1,DS,rPPA,15,1,Baseline,book,B UH K,1900.200674,1900.496319,0.295646,UH,1,0,1,2,rPPA-15_0,1_2,15_0_1_2,15_0_1
2,DS,rPPA,15,1,Baseline,book,B UH K,1900.200674,1900.496319,0.295646,K,1,0,1,3,rPPA-15_0,1_3,15_0_1_3,15_0_1
3,DS,rPPA,15,1,Baseline,ball,B AO L,1905.496713,1905.894079,0.397365,B,2,0,2,1,rPPA-15_0,2_1,15_0_2_1,15_0_2
4,DS,rPPA,15,1,Baseline,ball,B AO L,1905.496713,1905.894079,0.397365,AO,2,0,2,2,rPPA-15_0,2_2,15_0_2_2,15_0_2


In [104]:
#Before we can do anything with the syllables we need to identify the last phoneme produced within each production attempt

last_prod_phon = df2.groupby('Word_Sess_Code')['Prod_Phon_N'].last().to_frame()

last_prod_phon['Prod_Last_Phon'] = 1

last_prod_phon = last_prod_phon.reset_index()

last_prod_phon['Phon_Sess_Code'] = (
    last_prod_phon["Word_Sess_Code"].astype(str) + 
    str('_') + 
    last_prod_phon["Prod_Phon_N"].astype(str)
)

last_prod_phon = last_prod_phon.drop(columns=['Word_Sess_Code','Prod_Phon_N'])

last_prod_phon.head()

Unnamed: 0,Prod_Last_Phon,Phon_Sess_Code
0,1,12_0_1_3
1,1,12_0_10_5
2,1,12_0_11_6
3,1,12_0_12_13
4,1,12_0_13_2


In [105]:
#Add the identification for the last phoneme produced to the main dataset
df2 = pd.merge(df2, last_prod_phon, on='Phon_Sess_Code', how='left')
df2['Prod_Last_Phon'] = df2['Prod_Last_Phon'].fillna(0).astype('int')
df2.head(5)

Unnamed: 0,RA,Project,PID,Arm,Week,Target,Production,PRODUCTION_START,PRODUCTION_END,PRODUCTION_DURATION,Phoneme,Word_ID,Session_ID,Prod_Word_N,Prod_Phon_N,PIDSESS_Code,WordPhon_Code,Phon_Sess_Code,Word_Sess_Code,Prod_Last_Phon
0,DS,rPPA,15,1,Baseline,book,B UH K,1900.200674,1900.496319,0.295646,B,1,0,1,1,rPPA-15_0,1_1,15_0_1_1,15_0_1,0
1,DS,rPPA,15,1,Baseline,book,B UH K,1900.200674,1900.496319,0.295646,UH,1,0,1,2,rPPA-15_0,1_2,15_0_1_2,15_0_1,0
2,DS,rPPA,15,1,Baseline,book,B UH K,1900.200674,1900.496319,0.295646,K,1,0,1,3,rPPA-15_0,1_3,15_0_1_3,15_0_1,1
3,DS,rPPA,15,1,Baseline,ball,B AO L,1905.496713,1905.894079,0.397365,B,2,0,2,1,rPPA-15_0,2_1,15_0_2_1,15_0_2,0
4,DS,rPPA,15,1,Baseline,ball,B AO L,1905.496713,1905.894079,0.397365,AO,2,0,2,2,rPPA-15_0,2_2,15_0_2_2,15_0_2,0


## Add in IPA transcriptions

In [106]:
#Translate the ARPABET codes to IPA codes 
# NOTE:An error at this step will tell you if any other transcriptions are messed up
dictionary = (
    pd.read_csv("Resources/dict.csv")
    .set_index("Arpabet")
)

df2["Production_IPA"] = (
    # The production column is a series, so use string accessor to split value strings
    df2["Production"].str.split(" ")
    # turn each item in split string into own row maintaining index value
    .explode()
    # perform the lookup in the dictionary of each individual value
    .apply(lambda v: dictionary.loc[v])
    # group them by the original index
    .groupby(level=0)
    # "sum" them, which for string, concatonates them without any spaces
    .sum()
)

df2.head()

Unnamed: 0,RA,Project,PID,Arm,Week,Target,Production,PRODUCTION_START,PRODUCTION_END,PRODUCTION_DURATION,...,Word_ID,Session_ID,Prod_Word_N,Prod_Phon_N,PIDSESS_Code,WordPhon_Code,Phon_Sess_Code,Word_Sess_Code,Prod_Last_Phon,Production_IPA
0,DS,rPPA,15,1,Baseline,book,B UH K,1900.200674,1900.496319,0.295646,...,1,0,1,1,rPPA-15_0,1_1,15_0_1_1,15_0_1,0,bʊk
1,DS,rPPA,15,1,Baseline,book,B UH K,1900.200674,1900.496319,0.295646,...,1,0,1,2,rPPA-15_0,1_2,15_0_1_2,15_0_1,0,bʊk
2,DS,rPPA,15,1,Baseline,book,B UH K,1900.200674,1900.496319,0.295646,...,1,0,1,3,rPPA-15_0,1_3,15_0_1_3,15_0_1,1,bʊk
3,DS,rPPA,15,1,Baseline,ball,B AO L,1905.496713,1905.894079,0.397365,...,2,0,2,1,rPPA-15_0,2_1,15_0_2_1,15_0_2,0,bɔl
4,DS,rPPA,15,1,Baseline,ball,B AO L,1905.496713,1905.894079,0.397365,...,2,0,2,2,rPPA-15_0,2_2,15_0_2_2,15_0_2,0,bɔl


In [107]:
# Check the transcriptions
df2.Production_IPA.unique()

array(['bʊk', 'bɔl', 'ʃ', 'n', 'nɐf', 'kʌp', 's', 'sefti', 'hæmɝ',
       'tuʃbon', 'tu', 'bʊʃ', 'tʊ', 'ti', 'tuθpis', 'tpi', 'tis', 'pi',
       'p', 'i', 'rʌb', 'rʌbɚn', 'rʌbɚ', 'lɑkɚ', 'tɛm', 't', 'pɛnsəl',
       'stru', 'str', 'struɾo', 'skru', 'skruʧ', 'skrus', 'k', 'ʒ', 'ki',
       'ts', 'sɪm', 'pepɚ', 'wɑʧ', 'o', 'rʌbɚl', 'rʌbɚnəʔ', 'rʌ', 'kʌ',
       'ʔ', 'spu', 'spʊʃ', 'sp', 'sbʌ', 'sup', 'te', 'ʃɔr', 'sefsefti',
       'sef', 'ssefti', 'kə', 'si', 'kon', 'hæt', 'brəʃ', 'tuθbrəʃ', 'ɛs',
       'bi', 'bʌs', 'bis', 'resɚ', 'lʌk', 'ʤe', 'ʤi', 'skrubə', 'skrubɐ',
       'skrubɪt', 'skrub', 'pepɚklɪp', 'sio', 'sioɛn', 'hɛr', 'kot',
       'kom', 'rop', 'rʌbɚbænd', 'tri', 'spup', 'skup', 'spun', 'biʧ',
       'tep', 'fʊt', 'fɔrk', 'mɛt', 'mæʔ', 'mæ', 'mæʃ', 'mæʧəz',
       'seftipɪn', 'hæmɚ', 'əresɚ', 'lɑk', 'skrudrɐvɚ', 'f', 'bɑl', 'kʊp',
       'brʌʃɚtiθ', 'brʌʃtiθ', 'ɛresɝ', 'srudrovɚdrɐvɚ', 'ko', 'tæk',
       'tæt', 'frɑg', 'ɑrju', 'bɑ', 'kʊ', 'pɛn', 'pɛ', 'skudrɐvɚ',
       

In [108]:
#Import phonetic feature identifies
phon_dist_features = (
    pd.read_csv("Resources/phon_dist_features.csv")
)
phon_dist_features = phon_dist_features.dropna()
phon_dist_features['Phoneme_ID'] = phon_dist_features['Phoneme_ID'].astype('int')

In [109]:
#Create a dictionary for phoneme ID number

phon_ID = phon_dist_features[['IPA_singles','Arpabet','Phoneme_ID']].copy()

phon_ID.rename(
    columns={
       'IPA_singles':'IPA_singles', 
       'Arpabet':'Phoneme'
       }, inplace=True)

phon_ID

Unnamed: 0,IPA_singles,Phoneme,Phoneme_ID
0,h,H,1
1,r,R,2
2,w,W,3
3,j,Y,4
4,b,B,5
5,ʧ,CH,6
6,d,D,7
7,ð,DH,8
8,ɾ,DX,9
9,f,F,10


In [110]:
#Merge the dictionary to the main dataset on Phoneme, so that corresponding IPA symbols are added as a column
df3 = pd.merge(df2, phon_ID, on='Phoneme', how='left')
df3.head()

Unnamed: 0,RA,Project,PID,Arm,Week,Target,Production,PRODUCTION_START,PRODUCTION_END,PRODUCTION_DURATION,...,Prod_Word_N,Prod_Phon_N,PIDSESS_Code,WordPhon_Code,Phon_Sess_Code,Word_Sess_Code,Prod_Last_Phon,Production_IPA,IPA_singles,Phoneme_ID
0,DS,rPPA,15,1,Baseline,book,B UH K,1900.200674,1900.496319,0.295646,...,1,1,rPPA-15_0,1_1,15_0_1_1,15_0_1,0,bʊk,b,5
1,DS,rPPA,15,1,Baseline,book,B UH K,1900.200674,1900.496319,0.295646,...,1,2,rPPA-15_0,1_2,15_0_1_2,15_0_1,0,bʊk,ʊ,36
2,DS,rPPA,15,1,Baseline,book,B UH K,1900.200674,1900.496319,0.295646,...,1,3,rPPA-15_0,1_3,15_0_1_3,15_0_1,1,bʊk,k,13
3,DS,rPPA,15,1,Baseline,ball,B AO L,1905.496713,1905.894079,0.397365,...,2,1,rPPA-15_0,2_1,15_0_2_1,15_0_2,0,bɔl,b,5
4,DS,rPPA,15,1,Baseline,ball,B AO L,1905.496713,1905.894079,0.397365,...,2,2,rPPA-15_0,2_2,15_0_2_2,15_0_2,0,bɔl,ɔ,30


In [111]:
#Add in word information file for target items
word_info = (
    pd.read_csv("Resources/word_info.csv")
)

#Drop unneeded columns and rename other columns
word_info = (
    word_info
    .drop(columns=['N_Tot_Words.1','Target','Target_Word_Struc','Target_Syll_Struc'])
    .rename(columns={
        'Arpabet':'Target_Arpabet',
        'N_Tot_Syllables':'Target_N_Tot_Syllables',
        'N_Tot_Target_Phonemes':'Target_N_Tot_Characters',
        'N_Tot_Words':'Target_N_Tot_Words'
        })
)

word_info.head()

Unnamed: 0,Target_Arpabet,Word_ID,Target_N_Tot_Words,Target_IPA,Target_N_Tot_Syllables,Target_N_Tot_Characters
0,B UH K,1,1,bʊk,1,3
1,B AO L,2,1,bɔl,1,3
2,N AY F,3,1,nɐf,1,3
3,K AH P,4,1,kʌp,1,3
4,S EY F T IY P IH N,5,2,seftipɪn,3,8


In [112]:
# Get rid of syllable symbols
word_info['Target_IPA'] = (
    word_info['Target_IPA']
    .replace(
            {
            '\.|\..':'' # Can leave this line out if you want to maintain the syllable markers (i.e., '.')
            },
        regex=True)
    .str.strip()
)

word_info.Target_IPA.unique()

array(['bʊk', 'bɔl', 'nɐf', 'kʌp', 'seftipɪn', 'hæmɚ', 'tuθbrəʃ', 'ɪresɚ',
       'lɑk', 'pɛnsəl', 'skrudrɐvɚ', 'ki', 'pepɚklɪp', 'wɑʧ', 'kom',
       'rʌbɚbænd', 'spun', 'tep', 'fɔrk', 'mæʧəz'], dtype=object)

In [113]:
# Merge word information with the main dataset
df4 = pd.merge(df3, word_info, on='Word_ID', how='left')
df4.head()

Unnamed: 0,RA,Project,PID,Arm,Week,Target,Production,PRODUCTION_START,PRODUCTION_END,PRODUCTION_DURATION,...,Word_Sess_Code,Prod_Last_Phon,Production_IPA,IPA_singles,Phoneme_ID,Target_Arpabet,Target_N_Tot_Words,Target_IPA,Target_N_Tot_Syllables,Target_N_Tot_Characters
0,DS,rPPA,15,1,Baseline,book,B UH K,1900.200674,1900.496319,0.295646,...,15_0_1,0,bʊk,b,5,B UH K,1,bʊk,1,3
1,DS,rPPA,15,1,Baseline,book,B UH K,1900.200674,1900.496319,0.295646,...,15_0_1,0,bʊk,ʊ,36,B UH K,1,bʊk,1,3
2,DS,rPPA,15,1,Baseline,book,B UH K,1900.200674,1900.496319,0.295646,...,15_0_1,1,bʊk,k,13,B UH K,1,bʊk,1,3
3,DS,rPPA,15,1,Baseline,ball,B AO L,1905.496713,1905.894079,0.397365,...,15_0_2,0,bɔl,b,5,B AO L,1,bɔl,1,3
4,DS,rPPA,15,1,Baseline,ball,B AO L,1905.496713,1905.894079,0.397365,...,15_0_2,0,bɔl,ɔ,30,B AO L,1,bɔl,1,3


In [114]:
#Load the phonological information for the target items
word_phon_info = (
    pd.read_csv("Resources/word_phon_info.csv")
)

#Drop unneeded columns and rename other columns
word_phon_info = (
    word_phon_info
    .drop(columns=[
        'Word',
        'Word_ID',
        'Production_N'
        ])
    .rename(columns={
        'Phoneme':'Target_Phon_Arpabet',
        'Phoneme_ID':'Target_Phoneme_ID',
        'Word_NumID':'Target_Word_NumID',
        'Con_Cluster':'Target_Con_Cluster',
        'Clust_ID':'Target_Clust_ID',
        'Clus_Type':'Target_Clus_Type',
        'Clust_Phon_Pos':'Target_Clust_Phon_Pos',
        'Clust_Phon_Env':'Target_Clust_Phon_Env',
        'PhonClus_Syll_Env':'Target_Syll_Env',
        'Phoneme_Pos_Word':'Target_Word_Pos',
        'Target_IPA':'Target_Phon_IPA',
        'Code':'WordPhon_Code'
        })
)

#Get target IPA symbols
word_phon_info["Target_Phon_IPA"] = (
    # trans is a series, so use string accessor to split value strings
    word_phon_info["Target_Phon_Arpabet"]
    # perform the lookup in the dictionary of each individual value
    .apply(lambda v: dictionary.loc[v])
)

word_phon_info.head()

Unnamed: 0,Target_Phon_Arpabet,WordPhon_Code,Target_Phoneme_ID,Target_Syll_Env,Target_Word_Pos,Syllable_NumID,Target_Word_NumID,Target_Con_Cluster,Target_Clust_ID,Target_Clus_Type,Target_Clust_Phon_Pos,Target_Clust_Phon_Env,Target_Phon_IPA
0,B,1_1,5,#_V,#_V,1,1,0,,,,,b
1,UH,1_2,36,C_C,C_C,1,1,0,,,,,ʊ
2,K,1_3,13,V_#,V_#,1,1,0,,,,,k
3,B,2_1,5,#_V,#_V,1,1,0,,,,,b
4,AO,2_2,30,C_C,C_C,1,1,0,,,,,ɔ


In [115]:
word_phon_info.Target_Phon_IPA.unique()

array(['b', 'ʊ', 'k', 'ɔ', 'l', 'n', 'ɐ', 'f', 'ʌ', 'p', 's', 'e', 't',
       'i', 'ɪ', 'h', 'æ', 'm', 'ɚ', 'u', 'θ', 'r', 'ə', 'ʃ', 'ɑ', 'ɛ',
       'd', 'v', 'w', 'ʧ', 'o', 'z'], dtype=object)

In [116]:
# Merge target phonological information with the main dataset
df5 = pd.merge(df4, word_phon_info, on='WordPhon_Code', how='left')
df5.head()

Unnamed: 0,RA,Project,PID,Arm,Week,Target,Production,PRODUCTION_START,PRODUCTION_END,PRODUCTION_DURATION,...,Target_Syll_Env,Target_Word_Pos,Syllable_NumID,Target_Word_NumID,Target_Con_Cluster,Target_Clust_ID,Target_Clus_Type,Target_Clust_Phon_Pos,Target_Clust_Phon_Env,Target_Phon_IPA
0,DS,rPPA,15,1,Baseline,book,B UH K,1900.200674,1900.496319,0.295646,...,#_V,#_V,1.0,1.0,0.0,,,,,b
1,DS,rPPA,15,1,Baseline,book,B UH K,1900.200674,1900.496319,0.295646,...,C_C,C_C,1.0,1.0,0.0,,,,,ʊ
2,DS,rPPA,15,1,Baseline,book,B UH K,1900.200674,1900.496319,0.295646,...,V_#,V_#,1.0,1.0,0.0,,,,,k
3,DS,rPPA,15,1,Baseline,ball,B AO L,1905.496713,1905.894079,0.397365,...,#_V,#_V,1.0,1.0,0.0,,,,,b
4,DS,rPPA,15,1,Baseline,ball,B AO L,1905.496713,1905.894079,0.397365,...,C_C,C_C,1.0,1.0,0.0,,,,,ɔ


In [117]:
#Clean up main dataset's columns and rename things
df6 = (
    df5
    .drop(columns=[
        'PRODUCTION_START',
        'PRODUCTION_END'
        ])
    .rename(columns={
        'Phoneme':'Prod_Arpabet',
        'IPA_singles':'Prod_Phon_IPA',
        'Production_IPA':'Prod_Word_IPA',
        'Phoneme_ID':'Prod_Phoneme_ID',
        'Target_IPA':'Target_Word_IPA',
        'PRODUCTION_DURATION':'Prod_Word_Dur'
        })
)

#Fill in NAs with integer or string values, depending on column
df6['Target_Phoneme_ID'] = df6['Target_Phoneme_ID'].fillna(44).astype('int')
df6['Target_Syll_Env'] = df6['Target_Syll_Env'].fillna('addition').astype('str')
df6['Target_Word_Pos'] = df6['Target_Word_Pos'].fillna('addition').astype('str')

# Check
df6[27:33][['Prod_Word_IPA','Target_Word_IPA','Prod_Phon_IPA', 'Target_Phon_IPA']]


Unnamed: 0,Prod_Word_IPA,Target_Word_IPA,Prod_Phon_IPA,Target_Phon_IPA
27,hæmɝ,hæmɚ,h,h
28,hæmɝ,hæmɚ,æ,æ
29,hæmɝ,hæmɚ,m,m
30,hæmɝ,hæmɚ,ɝ,ɚ
31,tuʃbon,tuθbrəʃ,t,t
32,tuʃbon,tuθbrəʃ,u,u


# WARNING: 
### If you changed anything in the script above or are using a new dataset, you will need to update the row numbers referenced in the next section accordingly

# Reduplication and shift check

The following items have reduplicated or shifted phonemes/syllables
* 'srudrovɚdrɐvɚ'
* 'rʌbɝrʌbɝbænd'
* 'brʌʃtiθ'
* 'brʌʃɚtiθ'
* 'sefsefti'
* 'ssefti'
* 'sskrus'

In [118]:
# Make a copy of the dataset
df7 = df6.copy()

In [119]:
# Identify the columns to be moved
columns_to_move = df7.columns[df7.columns.get_loc('Target_Phon_Arpabet'):].tolist()
columns_to_move

['Target_Phon_Arpabet',
 'Target_Phoneme_ID',
 'Target_Syll_Env',
 'Target_Word_Pos',
 'Syllable_NumID',
 'Target_Word_NumID',
 'Target_Con_Cluster',
 'Target_Clust_ID',
 'Target_Clus_Type',
 'Target_Clust_Phon_Pos',
 'Target_Clust_Phon_Env',
 'Target_Phon_IPA']

In [120]:
# Show rows for srudrovɚdrɐvɚ
df6[
    df6['Prod_Word_IPA'].isin([
        'srudrovɚdrɐvɚ'
        ])][[
            'Word_Sess_Code',
            'Prod_Word_IPA',
            'Target_Word_IPA',
            'Prod_Phon_IPA',
            'Target_Phon_Arpabet',
            'Target_Phoneme_ID',
            'Target_Syll_Env',
            'Target_Word_Pos',
            'Syllable_NumID',
            'Target_Word_NumID',
            'Target_Con_Cluster',
            'Target_Clust_ID',
            'Target_Clus_Type',
            'Target_Clust_Phon_Pos',
            'Target_Clust_Phon_Env',
            'Target_Phon_IPA']]

Unnamed: 0,Word_Sess_Code,Prod_Word_IPA,Target_Word_IPA,Prod_Phon_IPA,Target_Phon_Arpabet,Target_Phoneme_ID,Target_Syll_Env,Target_Word_Pos,Syllable_NumID,Target_Word_NumID,Target_Con_Cluster,Target_Clust_ID,Target_Clus_Type,Target_Clust_Phon_Pos,Target_Clust_Phon_Env,Target_Phon_IPA
741,12_0_12,srudrovɚdrɐvɚ,skrudrɐvɚ,s,S,19,#_V,#_V,1.0,1.0,1.0,skr,r,1.0,#_C,s
742,12_0_12,srudrovɚdrɐvɚ,skrudrɐvɚ,r,K,13,#_V,C_C,1.0,1.0,1.0,skr,r,2.0,C_C,k
743,12_0_12,srudrovɚdrɐvɚ,skrudrɐvɚ,u,R,2,#_V,C_V,1.0,1.0,1.0,skr,r,3.0,C_V,r
744,12_0_12,srudrovɚdrɐvɚ,skrudrɐvɚ,d,UW,38,C_#,C_C,1.0,1.0,0.0,,,,,u
745,12_0_12,srudrovɚdrɐvɚ,skrudrɐvɚ,r,D,7,#_V,V_C,2.0,1.0,1.0,dr,r,1.0,#_C,d
746,12_0_12,srudrovɚdrɐvɚ,skrudrɐvɚ,o,R,2,#_V,C_V,2.0,1.0,1.0,dr,r,2.0,C_V,r
747,12_0_12,srudrovɚdrɐvɚ,skrudrɐvɚ,v,AY,40,C_C,C_C,2.0,1.0,0.0,,,,,ɐ
748,12_0_12,srudrovɚdrɐvɚ,skrudrɐvɚ,ɚ,V,23,#_V,V_V,3.0,1.0,0.0,,,,,v
749,12_0_12,srudrovɚdrɐvɚ,skrudrɐvɚ,d,AXR,41,C_#,C_#,3.0,1.0,0.0,,,,,ɚ
750,12_0_12,srudrovɚdrɐvɚ,skrudrɐvɚ,r,,44,addition,addition,,,,,,,,


In [121]:
# Fix row numbers for srudrovɚdrɐvɚ
# CONFIRM ROW NUMBERS BEFORE RUNNING

# Shift the data down
df7.loc[742:748, columns_to_move] = df6.loc[743:749, columns_to_move].values

# Add in the information for the additions
df7.loc[749:753, columns_to_move] = df6.loc[750, columns_to_move].values

# Check
df7[df7['Prod_Word_IPA'] == 'srudrovɚdrɐvɚ'][[
    'Word_Sess_Code',
    'Prod_Word_IPA',
    'Target_Word_IPA',
    'Prod_Phon_IPA',
    'Target_Phon_Arpabet',
    'Target_Phoneme_ID',
    'Target_Syll_Env',
    'Target_Word_Pos',
    'Syllable_NumID',
    'Target_Word_NumID',
    'Target_Con_Cluster',
    'Target_Clust_ID',
    'Target_Clus_Type',
    'Target_Clust_Phon_Pos',
    'Target_Clust_Phon_Env',
    'Target_Phon_IPA']]

Unnamed: 0,Word_Sess_Code,Prod_Word_IPA,Target_Word_IPA,Prod_Phon_IPA,Target_Phon_Arpabet,Target_Phoneme_ID,Target_Syll_Env,Target_Word_Pos,Syllable_NumID,Target_Word_NumID,Target_Con_Cluster,Target_Clust_ID,Target_Clus_Type,Target_Clust_Phon_Pos,Target_Clust_Phon_Env,Target_Phon_IPA
741,12_0_12,srudrovɚdrɐvɚ,skrudrɐvɚ,s,S,19,#_V,#_V,1.0,1.0,1.0,skr,r,1.0,#_C,s
742,12_0_12,srudrovɚdrɐvɚ,skrudrɐvɚ,r,R,2,#_V,C_V,1.0,1.0,1.0,skr,r,3.0,C_V,r
743,12_0_12,srudrovɚdrɐvɚ,skrudrɐvɚ,u,UW,38,C_#,C_C,1.0,1.0,0.0,,,,,u
744,12_0_12,srudrovɚdrɐvɚ,skrudrɐvɚ,d,D,7,#_V,V_C,2.0,1.0,1.0,dr,r,1.0,#_C,d
745,12_0_12,srudrovɚdrɐvɚ,skrudrɐvɚ,r,R,2,#_V,C_V,2.0,1.0,1.0,dr,r,2.0,C_V,r
746,12_0_12,srudrovɚdrɐvɚ,skrudrɐvɚ,o,AY,40,C_C,C_C,2.0,1.0,0.0,,,,,ɐ
747,12_0_12,srudrovɚdrɐvɚ,skrudrɐvɚ,v,V,23,#_V,V_V,3.0,1.0,0.0,,,,,v
748,12_0_12,srudrovɚdrɐvɚ,skrudrɐvɚ,ɚ,AXR,41,C_#,C_#,3.0,1.0,0.0,,,,,ɚ
749,12_0_12,srudrovɚdrɐvɚ,skrudrɐvɚ,d,,44,addition,addition,,,,,,,,
750,12_0_12,srudrovɚdrɐvɚ,skrudrɐvɚ,r,,44,addition,addition,,,,,,,,


In [122]:
# Show rows for rʌbɝrʌbɝbænd
df6[
    df6['Prod_Word_IPA'].isin([
        'rʌbɝrʌbɝbænd'
        ])][[
            'Word_Sess_Code',
            'Prod_Word_IPA',
            'Target_Word_IPA',
            'Prod_Phon_IPA',
            'Target_Phon_Arpabet',
            'Target_Phoneme_ID',
            'Target_Syll_Env',
            'Target_Word_Pos',
            'Syllable_NumID',
            'Target_Word_NumID',
            'Target_Con_Cluster',
            'Target_Clust_ID',
            'Target_Clus_Type',
            'Target_Clust_Phon_Pos',
            'Target_Clust_Phon_Env',
            'Target_Phon_IPA']]

Unnamed: 0,Word_Sess_Code,Prod_Word_IPA,Target_Word_IPA,Prod_Phon_IPA,Target_Phon_Arpabet,Target_Phoneme_ID,Target_Syll_Env,Target_Word_Pos,Syllable_NumID,Target_Word_NumID,Target_Con_Cluster,Target_Clust_ID,Target_Clus_Type,Target_Clust_Phon_Pos,Target_Clust_Phon_Env,Target_Phon_IPA
877,9_0_23,rʌbɝrʌbɝbænd,rʌbɚbænd,r,R,2,#_V,#_V,1.0,1.0,0.0,,,,,r
878,9_0_23,rʌbɝrʌbɝbænd,rʌbɚbænd,ʌ,AH,29,C_#,C_C,1.0,1.0,0.0,,,,,ʌ
879,9_0_23,rʌbɝrʌbɝbænd,rʌbɚbænd,b,B,5,#_V,V_V,2.0,1.0,0.0,,,,,b
880,9_0_23,rʌbɝrʌbɝbænd,rʌbɚbænd,ɝ,AXR,41,C_#,C_#,2.0,1.0,0.0,,,,,ɚ
881,9_0_23,rʌbɝrʌbɝbænd,rʌbɚbænd,r,B,5,#_V,#_V,3.0,2.0,0.0,,,,,b
882,9_0_23,rʌbɝrʌbɝbænd,rʌbɚbænd,ʌ,AE,28,C_C,C_C,3.0,2.0,0.0,,,,,æ
883,9_0_23,rʌbɝrʌbɝbænd,rʌbɚbænd,b,N,16,V_#,V_C,3.0,2.0,1.0,nd,n,1.0,V_C,n
884,9_0_23,rʌbɝrʌbɝbænd,rʌbɚbænd,ɝ,D,7,V_#,V_#,3.0,2.0,1.0,nd,n,2.0,C_#,d
885,9_0_23,rʌbɝrʌbɝbænd,rʌbɚbænd,b,,44,addition,addition,,,,,,,,
886,9_0_23,rʌbɝrʌbɝbænd,rʌbɚbænd,æ,,44,addition,addition,,,,,,,,


In [123]:
# Fix row numbers for rʌbɝrʌbɝbænd
# CONFIRM ROW NUMBERS BEFORE RUNNING

# Shift the data down
df7.loc[881:888, columns_to_move] = df6.loc[877:884, columns_to_move].values

# Add in the information for the additions
df7.loc[877:880, columns_to_move] = df6.loc[750, columns_to_move].values

# Check
df7[df7['Prod_Word_IPA'] == 'rʌbɝrʌbɝbænd'][[
    'Word_Sess_Code',
    'Prod_Word_IPA',
    'Target_Word_IPA',
    'Prod_Phon_IPA',
    'Target_Phon_Arpabet',
    'Target_Phoneme_ID',
    'Target_Syll_Env',
    'Target_Word_Pos',
    'Syllable_NumID',
    'Target_Word_NumID',
    'Target_Con_Cluster',
    'Target_Clust_ID',
    'Target_Clus_Type',
    'Target_Clust_Phon_Pos',
    'Target_Clust_Phon_Env',
    'Target_Phon_IPA']]

Unnamed: 0,Word_Sess_Code,Prod_Word_IPA,Target_Word_IPA,Prod_Phon_IPA,Target_Phon_Arpabet,Target_Phoneme_ID,Target_Syll_Env,Target_Word_Pos,Syllable_NumID,Target_Word_NumID,Target_Con_Cluster,Target_Clust_ID,Target_Clus_Type,Target_Clust_Phon_Pos,Target_Clust_Phon_Env,Target_Phon_IPA
877,9_0_23,rʌbɝrʌbɝbænd,rʌbɚbænd,r,,44,addition,addition,,,,,,,,
878,9_0_23,rʌbɝrʌbɝbænd,rʌbɚbænd,ʌ,,44,addition,addition,,,,,,,,
879,9_0_23,rʌbɝrʌbɝbænd,rʌbɚbænd,b,,44,addition,addition,,,,,,,,
880,9_0_23,rʌbɝrʌbɝbænd,rʌbɚbænd,ɝ,,44,addition,addition,,,,,,,,
881,9_0_23,rʌbɝrʌbɝbænd,rʌbɚbænd,r,R,2,#_V,#_V,1.0,1.0,0.0,,,,,r
882,9_0_23,rʌbɝrʌbɝbænd,rʌbɚbænd,ʌ,AH,29,C_#,C_C,1.0,1.0,0.0,,,,,ʌ
883,9_0_23,rʌbɝrʌbɝbænd,rʌbɚbænd,b,B,5,#_V,V_V,2.0,1.0,0.0,,,,,b
884,9_0_23,rʌbɝrʌbɝbænd,rʌbɚbænd,ɝ,AXR,41,C_#,C_#,2.0,1.0,0.0,,,,,ɚ
885,9_0_23,rʌbɝrʌbɝbænd,rʌbɚbænd,b,B,5,#_V,#_V,3.0,2.0,0.0,,,,,b
886,9_0_23,rʌbɝrʌbɝbænd,rʌbɚbænd,æ,AE,28,C_C,C_C,3.0,2.0,0.0,,,,,æ


In [124]:
# Show rows for brʌʃtiθ
df6[
    df6['Prod_Word_IPA'].isin([
        'brʌʃtiθ'
        ])][[
            'Word_Sess_Code',
            'Prod_Word_IPA',
            'Target_Word_IPA',
            'Prod_Phon_IPA',
            'Target_Phon_Arpabet',
            'Target_Phoneme_ID',
            'Target_Syll_Env',
            'Target_Word_Pos',
            'Syllable_NumID',
            'Target_Word_NumID',
            'Target_Con_Cluster',
            'Target_Clust_ID',
            'Target_Clus_Type',
            'Target_Clust_Phon_Pos',
            'Target_Clust_Phon_Env',
            'Target_Phon_IPA']]

Unnamed: 0,Word_Sess_Code,Prod_Word_IPA,Target_Word_IPA,Prod_Phon_IPA,Target_Phon_Arpabet,Target_Phoneme_ID,Target_Syll_Env,Target_Word_Pos,Syllable_NumID,Target_Word_NumID,Target_Con_Cluster,Target_Clust_ID,Target_Clus_Type,Target_Clust_Phon_Pos,Target_Clust_Phon_Env,Target_Phon_IPA
719,12_0_8,brʌʃtiθ,tuθbrəʃ,b,T,21,#_V,#_V,1.0,1.0,0.0,,,,,t
720,12_0_8,brʌʃtiθ,tuθbrəʃ,r,UW,38,C_C,C_C,1.0,1.0,0.0,,,,,u
721,12_0_8,brʌʃtiθ,tuθbrəʃ,ʌ,TH,22,V_#,V_C,1.0,1.0,0.0,,,,,θ
722,12_0_8,brʌʃtiθ,tuθbrəʃ,ʃ,B,5,#_V,C_C,2.0,1.0,1.0,br,r,1.0,#_C,b
723,12_0_8,brʌʃtiθ,tuθbrəʃ,t,R,2,#_V,C_V,2.0,1.0,1.0,br,r,2.0,C_V,r
724,12_0_8,brʌʃtiθ,tuθbrəʃ,i,AX,31,C_C,C_C,2.0,1.0,0.0,,,,,ə
725,12_0_8,brʌʃtiθ,tuθbrəʃ,θ,SH,20,V_#,V_#,2.0,1.0,0.0,,,,,ʃ


In [125]:
# Fix row numbers for brʌʃtiθ
# CONFIRM ROW NUMBERS BEFORE RUNNING

# Shift the data down
df7.loc[719:722, columns_to_move] = df6.loc[722:725, columns_to_move].values
df7.loc[723:725, columns_to_move] = df6.loc[719:721, columns_to_move].values

# Check
df7[df7['Prod_Word_IPA'] == 'brʌʃtiθ'][[
    'Word_Sess_Code',
    'Prod_Word_IPA',
    'Target_Word_IPA',
    'Prod_Phon_IPA',
    'Target_Phon_Arpabet',
    'Target_Phoneme_ID',
    'Target_Syll_Env',
    'Target_Word_Pos',
    'Syllable_NumID',
    'Target_Word_NumID',
    'Target_Con_Cluster',
    'Target_Clust_ID',
    'Target_Clus_Type',
    'Target_Clust_Phon_Pos',
    'Target_Clust_Phon_Env',
    'Target_Phon_IPA']]

Unnamed: 0,Word_Sess_Code,Prod_Word_IPA,Target_Word_IPA,Prod_Phon_IPA,Target_Phon_Arpabet,Target_Phoneme_ID,Target_Syll_Env,Target_Word_Pos,Syllable_NumID,Target_Word_NumID,Target_Con_Cluster,Target_Clust_ID,Target_Clus_Type,Target_Clust_Phon_Pos,Target_Clust_Phon_Env,Target_Phon_IPA
719,12_0_8,brʌʃtiθ,tuθbrəʃ,b,B,5,#_V,C_C,2.0,1.0,1.0,br,r,1.0,#_C,b
720,12_0_8,brʌʃtiθ,tuθbrəʃ,r,R,2,#_V,C_V,2.0,1.0,1.0,br,r,2.0,C_V,r
721,12_0_8,brʌʃtiθ,tuθbrəʃ,ʌ,AX,31,C_C,C_C,2.0,1.0,0.0,,,,,ə
722,12_0_8,brʌʃtiθ,tuθbrəʃ,ʃ,SH,20,V_#,V_#,2.0,1.0,0.0,,,,,ʃ
723,12_0_8,brʌʃtiθ,tuθbrəʃ,t,T,21,#_V,#_V,1.0,1.0,0.0,,,,,t
724,12_0_8,brʌʃtiθ,tuθbrəʃ,i,UW,38,C_C,C_C,1.0,1.0,0.0,,,,,u
725,12_0_8,brʌʃtiθ,tuθbrəʃ,θ,TH,22,V_#,V_C,1.0,1.0,0.0,,,,,θ


In [126]:
# Show rows for brʌʃɚtiθ
df6[
    df6['Prod_Word_IPA'].isin([
        'brʌʃɚtiθ'
        ])][[
            'Word_Sess_Code',
            'Prod_Word_IPA',
            'Target_Word_IPA',
            'Prod_Phon_IPA',
            'Target_Phon_Arpabet',
            'Target_Phoneme_ID',
            'Target_Syll_Env',
            'Target_Word_Pos',
            'Syllable_NumID',
            'Target_Word_NumID',
            'Target_Con_Cluster',
            'Target_Clust_ID',
            'Target_Clus_Type',
            'Target_Clust_Phon_Pos',
            'Target_Clust_Phon_Env',
            'Target_Phon_IPA']]

Unnamed: 0,Word_Sess_Code,Prod_Word_IPA,Target_Word_IPA,Prod_Phon_IPA,Target_Phon_Arpabet,Target_Phoneme_ID,Target_Syll_Env,Target_Word_Pos,Syllable_NumID,Target_Word_NumID,Target_Con_Cluster,Target_Clust_ID,Target_Clus_Type,Target_Clust_Phon_Pos,Target_Clust_Phon_Env,Target_Phon_IPA
711,12_0_7,brʌʃɚtiθ,tuθbrəʃ,b,T,21,#_V,#_V,1.0,1.0,0.0,,,,,t
712,12_0_7,brʌʃɚtiθ,tuθbrəʃ,r,UW,38,C_C,C_C,1.0,1.0,0.0,,,,,u
713,12_0_7,brʌʃɚtiθ,tuθbrəʃ,ʌ,TH,22,V_#,V_C,1.0,1.0,0.0,,,,,θ
714,12_0_7,brʌʃɚtiθ,tuθbrəʃ,ʃ,B,5,#_V,C_C,2.0,1.0,1.0,br,r,1.0,#_C,b
715,12_0_7,brʌʃɚtiθ,tuθbrəʃ,ɚ,R,2,#_V,C_V,2.0,1.0,1.0,br,r,2.0,C_V,r
716,12_0_7,brʌʃɚtiθ,tuθbrəʃ,t,AX,31,C_C,C_C,2.0,1.0,0.0,,,,,ə
717,12_0_7,brʌʃɚtiθ,tuθbrəʃ,i,SH,20,V_#,V_#,2.0,1.0,0.0,,,,,ʃ
718,12_0_7,brʌʃɚtiθ,tuθbrəʃ,θ,,44,addition,addition,,,,,,,,


In [127]:
# Fix row numbers for brʌʃɚtiθ
# CONFIRM ROW NUMBERS BEFORE RUNNING

# Shift the data down
df7.loc[711:714, columns_to_move] = df6.loc[714:717, columns_to_move].values
df7.loc[716:718, columns_to_move] = df6.loc[711:713, columns_to_move].values

# Add in the information for the additions
df7.loc[715, columns_to_move] = df6.loc[750, columns_to_move].values

# Check
df7[df7['Prod_Word_IPA'] == 'brʌʃɚtiθ'][[
    'Word_Sess_Code',
    'Prod_Word_IPA',
    'Target_Word_IPA',
    'Prod_Phon_IPA',
    'Target_Phon_Arpabet',
    'Target_Phoneme_ID',
    'Target_Syll_Env',
    'Target_Word_Pos',
    'Syllable_NumID',
    'Target_Word_NumID',
    'Target_Con_Cluster',
    'Target_Clust_ID',
    'Target_Clus_Type',
    'Target_Clust_Phon_Pos',
    'Target_Clust_Phon_Env',
    'Target_Phon_IPA']]

Unnamed: 0,Word_Sess_Code,Prod_Word_IPA,Target_Word_IPA,Prod_Phon_IPA,Target_Phon_Arpabet,Target_Phoneme_ID,Target_Syll_Env,Target_Word_Pos,Syllable_NumID,Target_Word_NumID,Target_Con_Cluster,Target_Clust_ID,Target_Clus_Type,Target_Clust_Phon_Pos,Target_Clust_Phon_Env,Target_Phon_IPA
711,12_0_7,brʌʃɚtiθ,tuθbrəʃ,b,B,5,#_V,C_C,2.0,1.0,1.0,br,r,1.0,#_C,b
712,12_0_7,brʌʃɚtiθ,tuθbrəʃ,r,R,2,#_V,C_V,2.0,1.0,1.0,br,r,2.0,C_V,r
713,12_0_7,brʌʃɚtiθ,tuθbrəʃ,ʌ,AX,31,C_C,C_C,2.0,1.0,0.0,,,,,ə
714,12_0_7,brʌʃɚtiθ,tuθbrəʃ,ʃ,SH,20,V_#,V_#,2.0,1.0,0.0,,,,,ʃ
715,12_0_7,brʌʃɚtiθ,tuθbrəʃ,ɚ,,44,addition,addition,,,,,,,,
716,12_0_7,brʌʃɚtiθ,tuθbrəʃ,t,T,21,#_V,#_V,1.0,1.0,0.0,,,,,t
717,12_0_7,brʌʃɚtiθ,tuθbrəʃ,i,UW,38,C_C,C_C,1.0,1.0,0.0,,,,,u
718,12_0_7,brʌʃɚtiθ,tuθbrəʃ,θ,TH,22,V_#,V_C,1.0,1.0,0.0,,,,,θ


In [128]:
# Show rows for sefsefti
df6[
    df6['Prod_Word_IPA'].isin([
        'sefsefti'
        ])][[
            'Word_Sess_Code',
            'Prod_Word_IPA',
            'Target_Word_IPA',
            'Prod_Phon_IPA',
            'Target_Phon_Arpabet',
            'Target_Phoneme_ID',
            'Target_Syll_Env',
            'Target_Word_Pos',
            'Syllable_NumID',
            'Target_Word_NumID',
            'Target_Con_Cluster',
            'Target_Clust_ID',
            'Target_Clus_Type',
            'Target_Clust_Phon_Pos',
            'Target_Clust_Phon_Env',
            'Target_Phon_IPA']]

Unnamed: 0,Word_Sess_Code,Prod_Word_IPA,Target_Word_IPA,Prod_Phon_IPA,Target_Phon_Arpabet,Target_Phoneme_ID,Target_Syll_Env,Target_Word_Pos,Syllable_NumID,Target_Word_NumID,Target_Con_Cluster,Target_Clust_ID,Target_Clus_Type,Target_Clust_Phon_Pos,Target_Clust_Phon_Env,Target_Phon_IPA
283,1_0_9,sefsefti,seftipɪn,s,S,19,#_V,#_V,1.0,1.0,0.0,,,,,s
284,1_0_9,sefsefti,seftipɪn,e,EY,33,C_C,C_C,1.0,1.0,0.0,,,,,e
285,1_0_9,sefsefti,seftipɪn,f,F,10,V_#,V_C,1.0,1.0,0.0,,,,,f
286,1_0_9,sefsefti,seftipɪn,s,T,21,#_V,C_V,2.0,1.0,0.0,,,,,t
287,1_0_9,sefsefti,seftipɪn,e,IY,35,C_#,C_C,2.0,1.0,0.0,,,,,i
288,1_0_9,sefsefti,seftipɪn,f,P,18,#_V,#_V,3.0,2.0,0.0,,,,,p
289,1_0_9,sefsefti,seftipɪn,t,IH,34,C_C,C_C,3.0,2.0,0.0,,,,,ɪ
290,1_0_9,sefsefti,seftipɪn,i,N,16,V_#,V_#,3.0,2.0,0.0,,,,,n


In [129]:
# Fix row numbers for sefsefti
# CONFIRM ROW NUMBERS BEFORE RUNNING

# Shift the data down
df7.loc[286:290, columns_to_move] = df6.loc[283:287, columns_to_move].values

# Add in the information for the additions
df7.loc[283:285, columns_to_move] = df6.loc[750, columns_to_move].values

# Check
df7[df7['Prod_Word_IPA'] == 'sefsefti'][[
    'Word_Sess_Code',
    'Prod_Word_IPA',
    'Target_Word_IPA',
    'Prod_Phon_IPA',
    'Target_Phon_Arpabet',
    'Target_Phoneme_ID',
    'Target_Syll_Env',
    'Target_Word_Pos',
    'Syllable_NumID',
    'Target_Word_NumID',
    'Target_Con_Cluster',
    'Target_Clust_ID',
    'Target_Clus_Type',
    'Target_Clust_Phon_Pos',
    'Target_Clust_Phon_Env',
    'Target_Phon_IPA']]

Unnamed: 0,Word_Sess_Code,Prod_Word_IPA,Target_Word_IPA,Prod_Phon_IPA,Target_Phon_Arpabet,Target_Phoneme_ID,Target_Syll_Env,Target_Word_Pos,Syllable_NumID,Target_Word_NumID,Target_Con_Cluster,Target_Clust_ID,Target_Clus_Type,Target_Clust_Phon_Pos,Target_Clust_Phon_Env,Target_Phon_IPA
283,1_0_9,sefsefti,seftipɪn,s,,44,addition,addition,,,,,,,,
284,1_0_9,sefsefti,seftipɪn,e,,44,addition,addition,,,,,,,,
285,1_0_9,sefsefti,seftipɪn,f,,44,addition,addition,,,,,,,,
286,1_0_9,sefsefti,seftipɪn,s,S,19,#_V,#_V,1.0,1.0,0.0,,,,,s
287,1_0_9,sefsefti,seftipɪn,e,EY,33,C_C,C_C,1.0,1.0,0.0,,,,,e
288,1_0_9,sefsefti,seftipɪn,f,F,10,V_#,V_C,1.0,1.0,0.0,,,,,f
289,1_0_9,sefsefti,seftipɪn,t,T,21,#_V,C_V,2.0,1.0,0.0,,,,,t
290,1_0_9,sefsefti,seftipɪn,i,IY,35,C_#,C_C,2.0,1.0,0.0,,,,,i


In [130]:
# Show rows for ssefti
df6[
    df6['Prod_Word_IPA'].isin([
        'ssefti'
        ])][[
            'Word_Sess_Code',
            'Prod_Word_IPA',
            'Target_Word_IPA',
            'Prod_Phon_IPA',
            'Target_Phon_Arpabet',
            'Target_Phoneme_ID',
            'Target_Syll_Env',
            'Target_Word_Pos',
            'Syllable_NumID',
            'Target_Word_NumID',
            'Target_Con_Cluster',
            'Target_Clust_ID',
            'Target_Clus_Type',
            'Target_Clust_Phon_Pos',
            'Target_Clust_Phon_Env',
            'Target_Phon_IPA']]

Unnamed: 0,Word_Sess_Code,Prod_Word_IPA,Target_Word_IPA,Prod_Phon_IPA,Target_Phon_Arpabet,Target_Phoneme_ID,Target_Syll_Env,Target_Word_Pos,Syllable_NumID,Target_Word_NumID,Target_Con_Cluster,Target_Clust_ID,Target_Clus_Type,Target_Clust_Phon_Pos,Target_Clust_Phon_Env,Target_Phon_IPA
299,1_0_12,ssefti,seftipɪn,s,S,19,#_V,#_V,1.0,1.0,0.0,,,,,s
300,1_0_12,ssefti,seftipɪn,s,EY,33,C_C,C_C,1.0,1.0,0.0,,,,,e
301,1_0_12,ssefti,seftipɪn,e,F,10,V_#,V_C,1.0,1.0,0.0,,,,,f
302,1_0_12,ssefti,seftipɪn,f,T,21,#_V,C_V,2.0,1.0,0.0,,,,,t
303,1_0_12,ssefti,seftipɪn,t,IY,35,C_#,C_C,2.0,1.0,0.0,,,,,i
304,1_0_12,ssefti,seftipɪn,i,P,18,#_V,#_V,3.0,2.0,0.0,,,,,p


In [131]:
# Fix row numbers for ssefti
# CONFIRM ROW NUMBERS BEFORE RUNNING

# Shift the data down
df7.loc[300:304, columns_to_move] = df6.loc[299:303, columns_to_move].values

# Add in the information for the additions
df7.loc[299, columns_to_move] = df6.loc[750, columns_to_move].values

# Check
df7[df7['Prod_Word_IPA'] == 'ssefti'][[
    'Word_Sess_Code',
    'Prod_Word_IPA',
    'Target_Word_IPA',
    'Prod_Phon_IPA',
    'Target_Phon_Arpabet',
    'Target_Phoneme_ID',
    'Target_Syll_Env',
    'Target_Word_Pos',
    'Syllable_NumID',
    'Target_Word_NumID',
    'Target_Con_Cluster',
    'Target_Clust_ID',
    'Target_Clus_Type',
    'Target_Clust_Phon_Pos',
    'Target_Clust_Phon_Env',
    'Target_Phon_IPA']]

Unnamed: 0,Word_Sess_Code,Prod_Word_IPA,Target_Word_IPA,Prod_Phon_IPA,Target_Phon_Arpabet,Target_Phoneme_ID,Target_Syll_Env,Target_Word_Pos,Syllable_NumID,Target_Word_NumID,Target_Con_Cluster,Target_Clust_ID,Target_Clus_Type,Target_Clust_Phon_Pos,Target_Clust_Phon_Env,Target_Phon_IPA
299,1_0_12,ssefti,seftipɪn,s,,44,addition,addition,,,,,,,,
300,1_0_12,ssefti,seftipɪn,s,S,19,#_V,#_V,1.0,1.0,0.0,,,,,s
301,1_0_12,ssefti,seftipɪn,e,EY,33,C_C,C_C,1.0,1.0,0.0,,,,,e
302,1_0_12,ssefti,seftipɪn,f,F,10,V_#,V_C,1.0,1.0,0.0,,,,,f
303,1_0_12,ssefti,seftipɪn,t,T,21,#_V,C_V,2.0,1.0,0.0,,,,,t
304,1_0_12,ssefti,seftipɪn,i,IY,35,C_#,C_C,2.0,1.0,0.0,,,,,i


In [132]:
# Show rows for sskrus
df6[
    df6['Prod_Word_IPA'].isin([
        'sskrus'
        ])][[
            'Word_Sess_Code',
            'Prod_Word_IPA',
            'Target_Word_IPA',
            'Prod_Phon_IPA',
            'Target_Phon_Arpabet',
            'Target_Phoneme_ID',
            'Target_Syll_Env',
            'Target_Word_Pos',
            'Syllable_NumID',
            'Target_Word_NumID',
            'Target_Con_Cluster',
            'Target_Clust_ID',
            'Target_Clus_Type',
            'Target_Clust_Phon_Pos',
            'Target_Clust_Phon_Env',
            'Target_Phon_IPA']]

Unnamed: 0,Word_Sess_Code,Prod_Word_IPA,Target_Word_IPA,Prod_Phon_IPA,Target_Phon_Arpabet,Target_Phoneme_ID,Target_Syll_Env,Target_Word_Pos,Syllable_NumID,Target_Word_NumID,Target_Con_Cluster,Target_Clust_ID,Target_Clus_Type,Target_Clust_Phon_Pos,Target_Clust_Phon_Env,Target_Phon_IPA
1382,4_0_22,sskrus,skrudrɐvɚ,s,S,19,#_V,#_V,1.0,1.0,1.0,skr,r,1.0,#_C,s
1383,4_0_22,sskrus,skrudrɐvɚ,s,K,13,#_V,C_C,1.0,1.0,1.0,skr,r,2.0,C_C,k
1384,4_0_22,sskrus,skrudrɐvɚ,k,R,2,#_V,C_V,1.0,1.0,1.0,skr,r,3.0,C_V,r
1385,4_0_22,sskrus,skrudrɐvɚ,r,UW,38,C_#,C_C,1.0,1.0,0.0,,,,,u
1386,4_0_22,sskrus,skrudrɐvɚ,u,D,7,#_V,V_C,2.0,1.0,1.0,dr,r,1.0,#_C,d
1387,4_0_22,sskrus,skrudrɐvɚ,s,R,2,#_V,C_V,2.0,1.0,1.0,dr,r,2.0,C_V,r


In [133]:
# Fix row numbers for sskrus
# CONFIRM ROW NUMBERS BEFORE RUNNING

# Shift the data down
df7.loc[1383:1387, columns_to_move] = df6.loc[1382:1386, columns_to_move].values

# Add in the information for the additions
df7.loc[1382, columns_to_move] = df6.loc[750, columns_to_move].values

# Check
df7[df7['Prod_Word_IPA'] == 'sskrus'][[
    'Word_Sess_Code',
    'Prod_Word_IPA',
    'Target_Word_IPA',
    'Prod_Phon_IPA',
    'Target_Phon_Arpabet',
    'Target_Phoneme_ID',
    'Target_Syll_Env',
    'Target_Word_Pos',
    'Syllable_NumID',
    'Target_Word_NumID',
    'Target_Con_Cluster',
    'Target_Clust_ID',
    'Target_Clus_Type',
    'Target_Clust_Phon_Pos',
    'Target_Clust_Phon_Env',
    'Target_Phon_IPA']]

Unnamed: 0,Word_Sess_Code,Prod_Word_IPA,Target_Word_IPA,Prod_Phon_IPA,Target_Phon_Arpabet,Target_Phoneme_ID,Target_Syll_Env,Target_Word_Pos,Syllable_NumID,Target_Word_NumID,Target_Con_Cluster,Target_Clust_ID,Target_Clus_Type,Target_Clust_Phon_Pos,Target_Clust_Phon_Env,Target_Phon_IPA
1382,4_0_22,sskrus,skrudrɐvɚ,s,,44,addition,addition,,,,,,,,
1383,4_0_22,sskrus,skrudrɐvɚ,s,S,19,#_V,#_V,1.0,1.0,1.0,skr,r,1.0,#_C,s
1384,4_0_22,sskrus,skrudrɐvɚ,k,K,13,#_V,C_C,1.0,1.0,1.0,skr,r,2.0,C_C,k
1385,4_0_22,sskrus,skrudrɐvɚ,r,R,2,#_V,C_V,1.0,1.0,1.0,skr,r,3.0,C_V,r
1386,4_0_22,sskrus,skrudrɐvɚ,u,UW,38,C_#,C_C,1.0,1.0,0.0,,,,,u
1387,4_0_22,sskrus,skrudrɐvɚ,s,D,7,#_V,V_C,2.0,1.0,1.0,dr,r,1.0,#_C,d


## Assign Phonetic Features

In [134]:
#Assign target phonetic features
target_phon_dist_features = (
    phon_dist_features
    .drop(columns=[
        'IPA_singles',
        'Arpabet'
        ])
    .add_prefix("Target_")
)
target_phon_dist_features.head()

Unnamed: 0,Target_Phoneme_ID,Target_syllabic,Target_consonantal,Target_sonorant,Target_continuant,Target_delayed release,Target_approximant,Target_tap,Target_nasal,Target_voice,...,Target_velar,Target_alveolar,Target_post-alveolar,Target_dental,Target_palatal,Target_glottal,Target_stop,Target_fricative,Target_affricate,Target_glide
0,1,-1,-1,-1,1,1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,1,-1,1,-1,-1
1,2,-1,-1,1,1,0,1,-1,-1,1,...,-1,1,-1,-1,-1,-1,-1,-1,-1,-1
2,3,-1,-1,1,1,0,1,-1,-1,1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,1
3,4,-1,-1,1,1,0,1,-1,-1,1,...,-1,-1,-1,-1,1,-1,-1,-1,-1,1
4,5,-1,1,-1,-1,-1,-1,-1,-1,1,...,-1,-1,-1,-1,-1,-1,1,-1,-1,-1


In [135]:
#Assign production phonetic features
prod_phon_dist_features = (
    phon_dist_features
    .drop(columns=[
        'IPA_singles',
        'Arpabet'
        ])
    .add_prefix("Prod_")
)
prod_phon_dist_features.head()

Unnamed: 0,Prod_Phoneme_ID,Prod_syllabic,Prod_consonantal,Prod_sonorant,Prod_continuant,Prod_delayed release,Prod_approximant,Prod_tap,Prod_nasal,Prod_voice,...,Prod_velar,Prod_alveolar,Prod_post-alveolar,Prod_dental,Prod_palatal,Prod_glottal,Prod_stop,Prod_fricative,Prod_affricate,Prod_glide
0,1,-1,-1,-1,1,1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,1,-1,1,-1,-1
1,2,-1,-1,1,1,0,1,-1,-1,1,...,-1,1,-1,-1,-1,-1,-1,-1,-1,-1
2,3,-1,-1,1,1,0,1,-1,-1,1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,1
3,4,-1,-1,1,1,0,1,-1,-1,1,...,-1,-1,-1,-1,1,-1,-1,-1,-1,1
4,5,-1,1,-1,-1,-1,-1,-1,-1,1,...,-1,-1,-1,-1,-1,-1,1,-1,-1,-1


In [136]:
#Load place-voicing-manner information
phon_pvm = pd.read_csv("Resources/phon_pvm.csv")
phon_pvm.head()

Unnamed: 0,IPA_singles,Arpabet,Phoneme_ID,Place,Manner,Place_N,Manner_N,Height,Frontness,Height_N,Frontness_N
0,h,H,1,glottal,fricative,8,4,consonant,consonant,0,0
1,r,R,2,alveolar,retroflex approximant,4,6,consonant,consonant,0,0
2,w,W,3,bilabial,glide,1,8,consonant,consonant,0,0
3,j,Y,4,palatal,glide,6,8,consonant,consonant,0,0
4,b,B,5,bilabial,stop,1,1,consonant,consonant,0,0


In [137]:
#Assign pvm features for productions
prod_phon_pvm = (
    phon_pvm
    .drop(columns=[
        'IPA_singles',
        'Arpabet'
        ])
    .add_prefix("Prod_")
)
prod_phon_pvm.head()

Unnamed: 0,Prod_Phoneme_ID,Prod_Place,Prod_Manner,Prod_Place_N,Prod_Manner_N,Prod_Height,Prod_Frontness,Prod_Height_N,Prod_Frontness_N
0,1,glottal,fricative,8,4,consonant,consonant,0,0
1,2,alveolar,retroflex approximant,4,6,consonant,consonant,0,0
2,3,bilabial,glide,1,8,consonant,consonant,0,0
3,4,palatal,glide,6,8,consonant,consonant,0,0
4,5,bilabial,stop,1,1,consonant,consonant,0,0


In [138]:
#Assign pvm features for targets
target_phon_pvm = (
    phon_pvm
    .drop(columns=[
        'IPA_singles',
        'Arpabet'
        ])
    .add_prefix("Target_")
)
target_phon_pvm.head()

Unnamed: 0,Target_Phoneme_ID,Target_Place,Target_Manner,Target_Place_N,Target_Manner_N,Target_Height,Target_Frontness,Target_Height_N,Target_Frontness_N
0,1,glottal,fricative,8,4,consonant,consonant,0,0
1,2,alveolar,retroflex approximant,4,6,consonant,consonant,0,0
2,3,bilabial,glide,1,8,consonant,consonant,0,0
3,4,palatal,glide,6,8,consonant,consonant,0,0
4,5,bilabial,stop,1,1,consonant,consonant,0,0


In [139]:
#Merge all the feature information with the main dataset
df8 = (
    df7.merge(
            prod_phon_dist_features, 
            on='Prod_Phoneme_ID', 
            how='left')
        .merge(
            prod_phon_pvm, 
            on='Prod_Phoneme_ID', 
            how='left')
        .merge( 
            target_phon_dist_features, 
            on='Target_Phoneme_ID', 
            how='left')
        .merge( 
            target_phon_pvm, 
            on='Target_Phoneme_ID', 
            how='left')
        
        )
df7.columns.tolist()

['RA',
 'Project',
 'PID',
 'Arm',
 'Week',
 'Target',
 'Production',
 'Prod_Word_Dur',
 'Prod_Arpabet',
 'Word_ID',
 'Session_ID',
 'Prod_Word_N',
 'Prod_Phon_N',
 'PIDSESS_Code',
 'WordPhon_Code',
 'Phon_Sess_Code',
 'Word_Sess_Code',
 'Prod_Last_Phon',
 'Prod_Word_IPA',
 'Prod_Phon_IPA',
 'Prod_Phoneme_ID',
 'Target_Arpabet',
 'Target_N_Tot_Words',
 'Target_Word_IPA',
 'Target_N_Tot_Syllables',
 'Target_N_Tot_Characters',
 'Target_Phon_Arpabet',
 'Target_Phoneme_ID',
 'Target_Syll_Env',
 'Target_Word_Pos',
 'Syllable_NumID',
 'Target_Word_NumID',
 'Target_Con_Cluster',
 'Target_Clust_ID',
 'Target_Clus_Type',
 'Target_Clust_Phon_Pos',
 'Target_Clust_Phon_Env',
 'Target_Phon_IPA']

## Add WAB Scores

In [156]:
# Import the WAB dataset
WAB = pd.read_csv("Resources/Baseline_WABs.csv", encoding="ISO-8859-1")
WAB.head()

Unnamed: 0,Project,PID,redcap_event_name,Session_ID,wab1_aq,wab1_ss_total,wab1_avc_total,wab1_r_total,wab1_nwf_total
0,rPPA,1,arm_1_baseline_arm_1,0,64.7,14,8.25,6.0,4.1
1,rPPA,2,arm_1_baseline_arm_1,0,57.6,13,7.6,2.7,5.5
2,rPPA,3,arm_1_baseline_arm_1,0,89.2,18,10.0,6.8,9.8
3,rPPA,4,arm_1_baseline_arm_1,0,73.7,14,8.25,7.1,7.5
4,rPPA,5,arm_1_baseline_arm_1,0,78.0,15,9.0,6.7,8.3


In [157]:
#see what participants are in the dataset
WAB['PID'].unique()

array([ 1,  2,  3,  4,  5,  7,  8, 10, 12, 13, 14, 15, 16, 18, 20, 22, 23,
       25, 26, 28, 29, 30, 31, 32, 33, 34, 35, 36, 38,  9])

In [158]:
#Create a code to tie each participant to their session
WAB['PIDSESS_Code'] = (WAB["Project"]
                       .astype(str) + 
                       '-' +
                       WAB["PID"].astype(str).str.zfill(2) + 
                       '_' + 
                       WAB["Session_ID"].astype(str)
                       )

WAB['PIDSESS_Code'].unique()

array(['rPPA-01_0', 'rPPA-02_0', 'rPPA-03_0', 'rPPA-04_0', 'rPPA-05_0',
       'rPPA-07_0', 'rPPA-08_0', 'rPPA-10_0', 'rPPA-12_0', 'rPPA-13_0',
       'rPPA-14_0', 'rPPA-15_0', 'rPPA-16_0', 'rPPA-18_0', 'rPPA-20_0',
       'rPPA-22_0', 'rPPA-23_0', 'rPPA-25_0', 'rPPA-26_0', 'rPPA-28_0',
       'rPPA-29_0', 'rPPA-30_0', 'rPPA-31_0', 'rPPA-32_0', 'rPPA-33_0',
       'rPPA-34_0', 'rPPA-35_0', 'rPPA-36_0', 'rPPA-38_0', 'dPPA-09_0'],
      dtype=object)

In [159]:
#Check column names
WAB.columns.to_list()

['Project',
 'PID',
 'redcap_event_name',
 'Session_ID',
 'wab1_aq',
 'wab1_ss_total',
 'wab1_avc_total',
 'wab1_r_total',
 'wab1_nwf_total',
 'PIDSESS_Code']

In [160]:
#Create a list of subject WAB-AQ scores at Baseline
wabaq = WAB[WAB['Session_ID']==0].groupby('PID')['wab1_aq'].mean().to_frame().reset_index()
wabaq

Unnamed: 0,PID,wab1_aq
0,1,64.7
1,2,57.6
2,3,89.2
3,4,73.7
4,5,78.0
5,7,94.0
6,8,91.8
7,9,41.7
8,10,92.6
9,12,73.0


In [161]:
# Create bins in which to place values based on Baseline WAB-AQ scores
bins = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]

# Create labels for these bins
group_labels = ['1-10', '11-20', '21-30', '31-40','41-50','51-60','61-70','71-80','81-90','91-100']

In [162]:
# Slice the data and place it into bins
wabaq['wabaq_start']=pd.cut(wabaq["wab1_aq"], bins, labels=group_labels)
wabaq=wabaq.drop(columns=['wab1_aq'])
wabaq

Unnamed: 0,PID,wabaq_start
0,1,61-70
1,2,51-60
2,3,81-90
3,4,71-80
4,5,71-80
5,7,91-100
6,8,91-100
7,9,41-50
8,10,91-100
9,12,71-80


In [163]:
#Merge in starting WAB-AQ score
WAB = WAB.merge(wabaq, on='PID', how='left')
WAB['wabaq_start'] = WAB['wabaq_start'].astype(str)
WAB.head()

Unnamed: 0,Project,PID,redcap_event_name,Session_ID,wab1_aq,wab1_ss_total,wab1_avc_total,wab1_r_total,wab1_nwf_total,PIDSESS_Code,wabaq_start
0,rPPA,1,arm_1_baseline_arm_1,0,64.7,14,8.25,6.0,4.1,rPPA-01_0,61-70
1,rPPA,2,arm_1_baseline_arm_1,0,57.6,13,7.6,2.7,5.5,rPPA-02_0,51-60
2,rPPA,3,arm_1_baseline_arm_1,0,89.2,18,10.0,6.8,9.8,rPPA-03_0,81-90
3,rPPA,4,arm_1_baseline_arm_1,0,73.7,14,8.25,7.1,7.5,rPPA-04_0,71-80
4,rPPA,5,arm_1_baseline_arm_1,0,78.0,15,9.0,6.7,8.3,rPPA-05_0,71-80


In [164]:
#Drop unneeded columns
WAB = WAB.drop(['PID','Session_ID','Project'], axis=1)
WAB.head()

Unnamed: 0,redcap_event_name,wab1_aq,wab1_ss_total,wab1_avc_total,wab1_r_total,wab1_nwf_total,PIDSESS_Code,wabaq_start
0,arm_1_baseline_arm_1,64.7,14,8.25,6.0,4.1,rPPA-01_0,61-70
1,arm_1_baseline_arm_1,57.6,13,7.6,2.7,5.5,rPPA-02_0,51-60
2,arm_1_baseline_arm_1,89.2,18,10.0,6.8,9.8,rPPA-03_0,81-90
3,arm_1_baseline_arm_1,73.7,14,8.25,7.1,7.5,rPPA-04_0,71-80
4,arm_1_baseline_arm_1,78.0,15,9.0,6.7,8.3,rPPA-05_0,71-80


In [165]:
# Check that the codes are correct
WAB['PIDSESS_Code'].unique()

array(['rPPA-01_0', 'rPPA-02_0', 'rPPA-03_0', 'rPPA-04_0', 'rPPA-05_0',
       'rPPA-07_0', 'rPPA-08_0', 'rPPA-10_0', 'rPPA-12_0', 'rPPA-13_0',
       'rPPA-14_0', 'rPPA-15_0', 'rPPA-16_0', 'rPPA-18_0', 'rPPA-20_0',
       'rPPA-22_0', 'rPPA-23_0', 'rPPA-25_0', 'rPPA-26_0', 'rPPA-28_0',
       'rPPA-29_0', 'rPPA-30_0', 'rPPA-31_0', 'rPPA-32_0', 'rPPA-33_0',
       'rPPA-34_0', 'rPPA-35_0', 'rPPA-36_0', 'rPPA-38_0', 'dPPA-09_0'],
      dtype=object)

In [166]:
#merge the WAB data in with the main data
df9 = df8.merge(WAB, how = 'left', on = 'PIDSESS_Code').drop_duplicates()
df9.head()

Unnamed: 0,RA,Project,PID,Arm,Week,Target,Production,Prod_Word_Dur,Prod_Arpabet,Word_ID,...,Target_Frontness,Target_Height_N,Target_Frontness_N,redcap_event_name,wab1_aq,wab1_ss_total,wab1_avc_total,wab1_r_total,wab1_nwf_total,wabaq_start
0,DS,rPPA,15,1,Baseline,book,B UH K,0.295646,B,1,...,consonant,0,0,arm_1_baseline_arm_1,67.8,15,9.8,5.3,3.8,61-70
1,DS,rPPA,15,1,Baseline,book,B UH K,0.295646,UH,1,...,back,1,3,arm_1_baseline_arm_1,67.8,15,9.8,5.3,3.8,61-70
2,DS,rPPA,15,1,Baseline,book,B UH K,0.295646,K,1,...,consonant,0,0,arm_1_baseline_arm_1,67.8,15,9.8,5.3,3.8,61-70
3,DS,rPPA,15,1,Baseline,ball,B AO L,0.397365,B,2,...,consonant,0,0,arm_1_baseline_arm_1,67.8,15,9.8,5.3,3.8,61-70
4,DS,rPPA,15,1,Baseline,ball,B AO L,0.397365,AO,2,...,back,4,3,arm_1_baseline_arm_1,67.8,15,9.8,5.3,3.8,61-70


In [167]:
#Check what groups we have
df9.wabaq_start.unique()

array(['61-70', '91-100', '71-80', '41-50', '81-90'], dtype=object)

In [168]:
# Check our subjects
df9.PID.unique()

array([15,  1,  8, 12,  9,  5, 13, 16,  7,  4, 14, 28, 22])

In [169]:
#Save as pvm_all_data.csv
df9.to_csv("Resources/cleaned_data.csv", index=False, header=True)