In [1]:
import pandas as pd
import re 
import numpy as np

1. Lining up and mapping between CELEX and IPA syllables

In [None]:
subtlex = pd.read_csv('/home/neel/Desktop/MOUS_hierarchical-representations/subtlex_v3_IPA_syllables.csv')
subtlex

In [None]:
subtlex_IPA = subtlex[['Word','Syllables','FREQcount','Zipf']]
subtlex_IPA.head()

In [None]:
# Load the CSV file
celex = pd.read_csv('/home/neel/Desktop/MOUS_hierarchical-representations/dutch_celex_database_updatedv2.csv')

# Check the columns of the DataFrame
print(celex.columns)

# Assuming the correct column name is 'phone_full', if it is not, replace 'phone_full' with the correct column name
celex_syllables = celex[['Head', 'phone_full']]   

# Rename columns
celex_syllables = celex_syllables.rename(columns={'Head': 'Word', 'phone_full': 'CELEX'})

# Remove apostrophes (stress marks) from every entry in 'CELEX'
celex_syllables['CELEX'] = celex_syllables['CELEX'].str.replace("'", "")

# Display the first few rows
celex_syllables.head()

In [None]:
#merge the two dataframes on Word
merged = pd.merge(subtlex_IPA, celex_syllables, on='Word', how='inner')
#insert a space before and after every dash in the phone_full column
merged['CELEX'] = merged['CELEX'].str.replace("-", " - ")
#rename Syllables to IPA
merged = merged.rename(columns={'Syllables': 'IPA'})
merged.to_csv('/home/neel/Desktop/MOUS_hierarchical-representations/merged-IPA_CELEX.csv', index=False)
merged

In [6]:
#create an empty column for whether the number of syllables matches
merged['Equal # of Syllables'] = np.nan



Syllable comparison

In [None]:

syllables_mapping_master = {}
conflict_mapping_master = []
for row, word in enumerate(merged.iterrows()):
    Celex2IPA_syllables_mapping = {}
    IPA = word[1]['IPA']
    CELEX = word[1]['CELEX']
    # Split the IPA and CELEX strings into lists
    IPA_list = IPA.split(" - ")
    CELEX_list = CELEX.split(" - ")
    # If the number of syllables in the lists is equal, create a mapping
    if len(IPA_list) == len(CELEX_list):
        merged.at[row, 'Equal # of Syllables'] = True
        for i in range(len(IPA_list)):
            Celex2IPA_syllables_mapping[CELEX_list[i]] = IPA_list[i]
        # Concatenate the mappings for all words
        # If an entry already exists in the dictionary, check if the mapping is the same
        for key, value in Celex2IPA_syllables_mapping.items():
            if key in syllables_mapping_master:
                if syllables_mapping_master[key] != value:
                    print(f"Key {key} already exists in syllables_mapping_master with a different value.")
                    conflict_mapping_master.append((key, syllables_mapping_master[key], value))
            else:
                syllables_mapping_master[key] = value
    else:
        merged.at[row, 'Equal # of Syllables'] = False

In [None]:
#Check for convergence of syllable mapping conflicts (n = 37565)
#e.g if the same CELEX syllable maps to different IPA syllables
#eventually, pool: combine the frequency counts of all the IPA syllables that map to the same CELEX syllable
conflict_mapping_master # (CELEX, IPA1, IPA2)
#REFORMAT into a dictionary, where the first value in each tuple is the key, and the second and third values are the values. Merge all the entries which have the same key
conflict_mapping_dict = {}
for conflict in conflict_mapping_master:
    if conflict[0] in conflict_mapping_dict:
        conflict_mapping_dict[conflict[0]].append(conflict[1:])
    else:
        conflict_mapping_dict[conflict[0]] = [conflict[1:]]
#only keep unique values for each key
for key, value in conflict_mapping_dict.items():
    conflict_mapping_dict[key] = list(set(value))

#combine all the values for each key into a single list, then keep the unique elements
for key, value in conflict_mapping_dict.items():
    conflict_mapping_dict[key] = list(set([item for sublist in value for item in sublist]))
conflict_mapping_dict


In [None]:
num_keys = len(conflict_mapping_dict)
print(num_keys)

In [None]:
#Number of syllabification conflicts (n = 6292)
merged[merged['Equal # of Syllables'] == False]


2. Frequencies of IPA Syllables

In [None]:
subtlex

In [None]:
IPA_syllables = subtlex['Syllables']
all_ipa_syllables = set()
for index, value in IPA_syllables.items():
    if pd.isna(value):
        continue
    word_syllables = [syl for syl in value.split("-") if isinstance(syl,str)]
    all_ipa_syllables.update(word_syllables)
IPA_syllables = pd.DataFrame(all_ipa_syllables, columns=['Syllables'])
IPA_syllables

In [None]:
#quick check to see how many CELEX syllables there are
celex_syllables 

In [14]:
all_celex_syllables = set()
for index, row in celex_syllables.iterrows():
    value = row['CELEX']  # Replace 'Syllables' with the actual column name if different
    if pd.isna(value):
        continue
    word_syllables = [syl for syl in value.split("-") if isinstance(syl, str)]
    all_celex_syllables.update(word_syllables)

CELEX_syllables_df = pd.DataFrame(list(all_celex_syllables), columns=['Syllables'])
CELEX_syllables_df


Unnamed: 0,Syllables
0,mAGd
1,r@nd
2,k@n
3,bEkt
4,l}z
...,...
8746,tlOn
8747,snuk
8748,j|
8749,rINz


37526 > 8751. Interesting. 

Calculating Frequencies.

In [15]:
IPA_syllables['Cumulative FREQcount'] = None
IPA_syllables

Unnamed: 0,Syllables,Cumulative FREQcount
0,leːw,
1,ɵf,
2,oːh,
3,ɵkst,
4,kərt,
...,...,...
37521,mɑntl,
37522,tɑhn,
37523,fɑbr,
37524,ʃɪf,


In [None]:
for index, row in IPA_syllables.iterrows():
    syllable = row['Syllables']
    contains_syllable = subtlex_IPA['Syllables'].str.contains(syllable, na=False)
    cumulative_FREQcount = subtlex_IPA[contains_syllable]['FREQcount'].sum()
    IPA_syllables.at[index,'Cumulative FREQcount'] = cumulative_FREQcount

  contains_syllable = subtlex_IPA['Syllables'].str.contains(syllable, na=False)


In [None]:
# Save IPA_syllables to a CSV file
IPA_syllables.to_csv('IPA_individual_syllable_frequencies.csv', index=False)

3. Calculate minimum syllable frequencies for all MOUS study words

In [13]:
import pandas as pd
IPA_syllables  = pd.read_csv('IPA_individual_syllable_frequencies.csv')
mous_ipa = pd.read_csv('MOUS_IPA_transcriptions.csv')

In [14]:
for index, row in mous_ipa.iterrows():
    word = row['Word']
    transcription = row['Syllables']
    print(f'{word} in IPA is {transcription}')
    
    # Split transcription into syllables
    transcription_syllables = transcription.split(" - ")
    
    longest_matches = {}
    for syllable in transcription_syllables:
        # Find rows in IPA_syllables where 'Syllables' is exactly the current syllable
        matches = IPA_syllables[IPA_syllables['Syllables'] == syllable]
        
        # Find the longest match (though in this case, it will be the same as the syllable)
        if not matches.empty:
            longest_match = matches.iloc[0]['Syllables']
            longest_matches[syllable] = longest_match
    
    for syllable in transcription_syllables:
        match = longest_matches.get(syllable, "")
        print(f'Syllable: {syllable}, Longest match: {match}')

toen in IPA is tun
Syllable: tun, Longest match: tun
die in IPA is di
Syllable: di, Longest match: di
de in IPA is də
Syllable: də, Longest match: də
barkeeper in IPA is bɑr - keː - pər
Syllable: bɑr, Longest match: bɑr
Syllable: keː, Longest match: keː
Syllable: pər, Longest match: pər
irritante in IPA is ɪɾ - ri - tɑn - tə
Syllable: ɪɾ, Longest match: ɪɾ
Syllable: ri, Longest match: ri
Syllable: tɑn, Longest match: tɑn
Syllable: tə, Longest match: tə
bediende in IPA is bə - din - də
Syllable: bə, Longest match: bə
Syllable: din, Longest match: din
Syllable: də, Longest match: də
wegliep in IPA is ʋɛ - ɣlip
Syllable: ʋɛ, Longest match: ʋɛ
Syllable: ɣlip, Longest match: ɣlip
manke in IPA is mɑŋ - kə
Syllable: mɑŋ, Longest match: mɑŋ
Syllable: kə, Longest match: kə
gingen in IPA is ɣɪŋ - ən
Syllable: ɣɪŋ, Longest match: ɣɪŋ
Syllable: ən, Longest match: ən
klant in IPA is klɑnt
Syllable: klɑnt, Longest match: klɑnt
dronkaard in IPA is drɔŋ - kaːrt
Syllable: drɔŋ, Longest match: drɔŋ
Syll

In [16]:
# Iterate over each row in mous_ipa
for index, row in mous_ipa.iterrows():
    word = row['Word']
    transcription = row['Syllables'].strip()
    print(f"{word} in IPA is {transcription}")
    
    # Split transcription into syllables and strip whitespace
    transcription_syllables = [syllable.strip() for syllable in transcription.split(" - ")]
    
    freq_counts = []  # List to store Cumulative FREQcount values
    for syllable in transcription_syllables:
        # Find the row in IPA_syllables where 'Syllables' equals the current syllable
        match = IPA_syllables[IPA_syllables['Syllables'].str.strip() == syllable]
        
        if not match.empty:
            # Get the 'Cumulative FREQcount' value
            freq_count = match.iloc[0]['CumulativeFREQcount']
            freq_counts.append(freq_count)
        else:
            # If no match is found, append 0 or None
            freq_counts.append(0)
            print(f"No match found for syllable: '{syllable}'")
    
    # Print the word and the array of Cumulative FREQcount values
    print(f"Word: '{word}', Cumulative FREQcounts: {freq_counts}")

toen in IPA is tun
Word: 'toen', Cumulative FREQcounts: [177]
die in IPA is di
Word: 'die', Cumulative FREQcounts: [441857]
de in IPA is də
Word: 'de', Cumulative FREQcounts: [195887]
barkeeper in IPA is bɑr - keː - pər
Word: 'barkeeper', Cumulative FREQcounts: [5468, 6764, 5498]
irritante in IPA is ɪɾ - ri - tɑn - tə
Word: 'irritante', Cumulative FREQcounts: [1105, 108273, 14939, 173653]
bediende in IPA is bə - din - də
Word: 'bediende', Cumulative FREQcounts: [20990, 20416, 195887]
wegliep in IPA is ʋɛ - ɣlip
Word: 'wegliep', Cumulative FREQcounts: [39901, 132]
manke in IPA is mɑŋ - kə
Word: 'manke', Cumulative FREQcounts: [77, 671009]
gingen in IPA is ɣɪŋ - ən
Word: 'gingen', Cumulative FREQcounts: [21253, 2449]
klant in IPA is klɑnt
Word: 'klant', Cumulative FREQcounts: [1855]
dronkaard in IPA is drɔŋ - kaːrt
Word: 'dronkaard', Cumulative FREQcounts: [6279, 197]
open in IPA is oː - pən
Word: 'open', Cumulative FREQcounts: [23008, 179814]
deuren in IPA is døː - rən
Word: 'deuren', C

In [17]:
# Initialize lists to store min, max, and mean frequency counts
min_freq_counts = []
max_freq_counts = []
mean_freq_counts = []

# Iterate over each row in mous_ipa
for index, row in mous_ipa.iterrows():
    word = row['Word']
    transcription = row['Syllables'].strip()
    print(f"{word} in IPA is {transcription}")
    
    # Split transcription into syllables and strip whitespace
    transcription_syllables = [syllable.strip() for syllable in transcription.split(" - ")]
    
    freq_counts = []  # List to store Cumulative FREQcount values
    for syllable in transcription_syllables:
        # Find the row in IPA_syllables where 'Syllables' equals the current syllable
        match = IPA_syllables[IPA_syllables['Syllables'].str.strip() == syllable]
        
        if not match.empty:
            # Get the 'CumulativeFREQcount' value
            freq_count = match.iloc[0]['CumulativeFREQcount']
            freq_counts.append(freq_count)
        else:
            # If no match is found, append 0
            freq_counts.append(0)
            print(f"No match found for syllable: '{syllable}'")
    
    # Calculate min, max, and mean of freq_counts
    if freq_counts:
        min_freq = min(freq_counts)
        max_freq = max(freq_counts)
        mean_freq = sum(freq_counts) / len(freq_counts)
    else:
        min_freq = max_freq = mean_freq = 0
    
    # Append the results to the lists
    min_freq_counts.append(min_freq)
    max_freq_counts.append(max_freq)
    mean_freq_counts.append(mean_freq)
    
    # Print the word and the array of Cumulative FREQcount values
    print(f"Word: '{word}', Cumulative FREQcounts: {freq_counts}")

# Add the min, max, and mean frequency counts as new columns to mous_ipa
mous_ipa['Min_Freq_Count'] = min_freq_counts
mous_ipa['Max_Freq_Count'] = max_freq_counts
mous_ipa['Mean_Freq_Count'] = mean_freq_counts

toen in IPA is tun
Word: 'toen', Cumulative FREQcounts: [177]
die in IPA is di
Word: 'die', Cumulative FREQcounts: [441857]
de in IPA is də
Word: 'de', Cumulative FREQcounts: [195887]
barkeeper in IPA is bɑr - keː - pər
Word: 'barkeeper', Cumulative FREQcounts: [5468, 6764, 5498]
irritante in IPA is ɪɾ - ri - tɑn - tə
Word: 'irritante', Cumulative FREQcounts: [1105, 108273, 14939, 173653]
bediende in IPA is bə - din - də
Word: 'bediende', Cumulative FREQcounts: [20990, 20416, 195887]
wegliep in IPA is ʋɛ - ɣlip
Word: 'wegliep', Cumulative FREQcounts: [39901, 132]
manke in IPA is mɑŋ - kə
Word: 'manke', Cumulative FREQcounts: [77, 671009]
gingen in IPA is ɣɪŋ - ən
Word: 'gingen', Cumulative FREQcounts: [21253, 2449]
klant in IPA is klɑnt
Word: 'klant', Cumulative FREQcounts: [1855]
dronkaard in IPA is drɔŋ - kaːrt
Word: 'dronkaard', Cumulative FREQcounts: [6279, 197]
open in IPA is oː - pən
Word: 'open', Cumulative FREQcounts: [23008, 179814]
deuren in IPA is døː - rən
Word: 'deuren', C

In [19]:
mous_ipa.to_csv('MOUS_IPA_SyllableFrequencies.csv')


In [24]:
mous_ipa

Unnamed: 0,Word,IPA,Syllables,Min_Freq_Count,Max_Freq_Count,Mean_Freq_Count
0,toen,tˈun,tun,177,177,177.000000
1,die,dˈi,di,441857,441857,441857.000000
2,de,dˈə,də,195887,195887,195887.000000
3,barkeeper,bˈɑrkeːpər,bɑr - keː - pər,5468,6764,5910.000000
4,irritante,ˌɪɾritˈɑntə,ɪɾ - ri - tɑn - tə,1105,173653,74492.500000
...,...,...,...,...,...,...
1937,betalen,bətˈaːlən,bə - taː - lən,7556,29153,19233.000000
1938,intens,ˈɪntəns,ɪn - təns,575,655575,328075.000000
1939,jongeren,jˈɔŋərən,jɔŋ - ə - rən,932,2276250,858348.666667
1940,plezier,pleːzˈir,pleː - zir,6109,14759,10434.000000


In [None]:
#Generate regressor files for every subject.

In [33]:
from pathlib import Path
source = Path('/media/neel/MOUS/MOUS/MOUS/SynologyDrive/source')
for subject in source.iterdir():
    if subject.name.startswith('sub-A'):
        events = subject / 'func' / f'{subject.name}_transcription.csv'
        events_df = pd.read_csv(events)
        events_df = events_df.rename(columns={'Transcription': 'Word'})
        combined_events = pd.merge(events_df, mous_ipa, on='Word')
        combined_events = combined_events.sort_values(by='AlignOnset', ascending=True)
        combined_events.to_csv(str(events.parent / f'{subject.name}_IPA_syllable_frequency.csv'))

In [32]:
events.parent

PosixPath('/media/neel/MOUS/MOUS/MOUS/SynologyDrive/source/sub-A2002/func')