In [4]:
import numpy as np
import pandas as pd
from preprocessing_utils import bt_merge_mappings, apply_bt_gender_merge_mappings, apply_bt_merge_mappings, apply_bt_speaker_merge_mappings, drop_diverse_gender

# Files temporarily uploaded to run time

In [5]:
file_path = "/content/bt_redone.xlsx"
boundary_tones = pd.read_excel(file_path)
boundary_tones_cleaned = boundary_tones[~boundary_tones['1_meta_speaker-id'].isin(["'NULL'"])]
boundary_tones_cleaned.head(5)

FileNotFoundError: [Errno 2] No such file or directory: '/content/bt_redone.xlsx'

# Discard labels set

In [4]:
# Define the set of labels to discard
boundary_tones_to_discard = [ "%H", "!H-", "L-%", "L-L", "-?%?", "L_", "L-(H)%", "(L)-%", "(L)-", "LHH%", "L-H*",
    "L*+H", "(L)-(L)%", "L*", "!H-L", "HL-%", "^H*", "H_", "H-HL", "H-H", "-%", "!H*",
    "L-L*", "L-L&"]

boundary_tones_cleaned['1_anno_default_ns:bt'] = boundary_tones_cleaned['1_anno_default_ns:bt'].replace(boundary_tones_to_discard, np.nan)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  boundary_tones_cleaned['1_anno_default_ns:bt'] = boundary_tones_cleaned['1_anno_default_ns:bt'].replace(boundary_tones_to_discard, np.nan)


In [5]:
# Define the replacements as a list of tuples
bt_replacements = [
    ("L-L%%", "L-L%"),
    ("H-L%%", "H-L%"),
    ("L-H%%", "L-H%"),
    ("L-L5", "L-L%"),
    ("H-H5", "H-H%"),
    ("H-H%%", "H-H%")
]


In [6]:
for to_replace, value in bt_replacements:
    boundary_tones_cleaned = boundary_tones_cleaned.replace(to_replace=to_replace, value=value)

In [7]:
bt_cleaned = boundary_tones_cleaned

total_bt_counts = bt_cleaned['1_anno_default_ns:bt'].value_counts()
merged_bt_counts = apply_bt_merge_mappings(total_bt_counts, bt_merge_mappings)
merged_bt_counts = merged_bt_counts.sort_values(ascending=False).reset_index(drop=False)
merged_bt_counts.columns = ['1_anno_default_ns:bt', 'count']
merged_bt_counts

Unnamed: 0,1_anno_default_ns:bt,count
0,L-L%,4388
1,H-L%,1677
2,H-H%,979
3,L-H%,734
4,L-,339
5,H-,234
6,!H-L%,0
7,H-^H%,0
8,^H-L%,0
9,L-^H%,0


# Number of male and female speakers

In [8]:
number_of_bt = bt_cleaned['1_anno_default_ns:bt'].count()
print(f"Total number of Boundary Tones: {number_of_bt}")
number_of_male_speakers = bt_cleaned[bt_cleaned['1_meta_speaker-gender'] == 'male']['1_meta_speaker-id'].nunique()
number_of_female_speakers = bt_cleaned[bt_cleaned['1_meta_speaker-gender'] == 'female']['1_meta_speaker-id'].nunique()
number_of_diverse_speakers = bt_cleaned[bt_cleaned['1_meta_speaker-gender'] == 'diverse']['1_meta_speaker-id'].nunique()

print(f"Number of male speakers: {number_of_male_speakers}")
print(f"Number of female speakers: {number_of_female_speakers}")
print(f"Number of diverse speakers: {number_of_diverse_speakers}")
bt_cleaned['1_meta_speaker-gender'].unique()


Total number of Boundary Tones: 8351
Number of male speakers: 32
Number of female speakers: 53
Number of diverse speakers: 1


array(['male', 'female', 'diverse'], dtype=object)

# After dropping diverse speakers

In [9]:
bt_cleaned = drop_diverse_gender(bt_cleaned)
number_of_bt = bt_cleaned['1_anno_default_ns:bt'].count()
print(f"Total number of Boundary Tones: {number_of_bt}")
number_of_male_speakers = bt_cleaned[bt_cleaned['1_meta_speaker-gender'] == 'male']['1_meta_speaker-id'].nunique()
number_of_female_speakers = bt_cleaned[bt_cleaned['1_meta_speaker-gender'] == 'female']['1_meta_speaker-id'].nunique()
number_of_diverse_speakers = bt_cleaned[bt_cleaned['1_meta_speaker-gender'] == 'diverse']['1_meta_speaker-id'].nunique()

print(f"Number of male speakers: {number_of_male_speakers}")
print(f"Number of female speakers: {number_of_female_speakers}")
print(f"Number of diverse speakers: {number_of_diverse_speakers}")


Total number of Boundary Tones: 8292
Number of male speakers: 32
Number of female speakers: 53
Number of diverse speakers: 0


In [10]:
total_bt_counts = bt_cleaned['1_anno_default_ns:bt'].value_counts()
merged_bt_counts = apply_bt_merge_mappings(total_bt_counts, bt_merge_mappings)
merged_bt_counts = merged_bt_counts.sort_values(ascending=False).reset_index(drop=False)
merged_bt_counts.columns = ['1_anno_default_ns:bt', 'count']
merged_bt_counts

Unnamed: 0,1_anno_default_ns:bt,count
0,L-L%,4346
1,H-L%,1672
2,H-H%,975
3,L-H%,727
4,L-,339
5,H-,233
6,!H-L%,0
7,H-^H%,0
8,^H-L%,0
9,L-^H%,0


# BT and Speaker Group

In [11]:
total_number_of_speakers = bt_cleaned['1_meta_speaker-id'].nunique()
print(f"Total number of speakers: {total_number_of_speakers}")
number_of_bilinguals = bt_cleaned[bt_cleaned['1_meta_speaker-bilingual'] == 'yes']['1_meta_speaker-id'].nunique()
print(f"Number of bilingual speakers: {number_of_bilinguals}")
number_of_monolinguals = bt_cleaned[bt_cleaned['1_meta_speaker-bilingual'] == 'no']['1_meta_speaker-id'].nunique()
print(f"Number of monolingual speakers: {number_of_monolinguals}")


Total number of speakers: 85
Number of bilingual speakers: 62
Number of monolingual speakers: 23


In [12]:
bilingual_bt_count = bt_cleaned[bt_cleaned['1_meta_speaker-bilingual'] == 'yes']['1_anno_default_ns:bt'].value_counts()
monolingual_bt_count = bt_cleaned[bt_cleaned['1_meta_speaker-bilingual'] == 'no']['1_anno_default_ns:bt'].value_counts()

# Create dataframes for bilingual and monolingual counts
bilingual_df = pd.DataFrame(bilingual_bt_count).reset_index()
bilingual_df.columns = ['Boundary Tone', 'Bilingual Count']

monolingual_df = pd.DataFrame(monolingual_bt_count).reset_index()
monolingual_df.columns = ['Boundary Tone', 'Monolingual Count']

speaker_group_bt = pd.merge(bilingual_df, monolingual_df, on='Boundary Tone', how='outer').fillna(0)

# Ensure count columns are integers
speaker_group_bt['Bilingual Count'] = speaker_group_bt['Bilingual Count'].astype(int)
speaker_group_bt['Monolingual Count'] = speaker_group_bt['Monolingual Count'].astype(int)

# Display the combined dataframe
speaker_group_bt.sort_values(by=['Bilingual Count', 'Monolingual Count'], ascending=False).reset_index(drop=True)

Unnamed: 0,Boundary Tone,Bilingual Count,Monolingual Count
0,L-L%,3256,1090
1,H-L%,1170,404
2,H-H%,742,196
3,L-H%,548,167
4,L-,243,96
5,H-,176,57
6,!H-L%,57,28
7,H-^H%,16,14
8,^H-L%,12,1
9,L-^H%,11,1


# BT and Gender

In [13]:
# Calculate pitch accent counts for male and female groups
male_bt_count = bt_cleaned[bt_cleaned['1_meta_speaker-gender'] == 'male']['1_anno_default_ns:bt'].value_counts()
female_bt_count = bt_cleaned[bt_cleaned['1_meta_speaker-gender'] == 'female']['1_anno_default_ns:bt'].value_counts()

# Create DataFrames
male_df = pd.DataFrame(male_bt_count).reset_index()
male_df.columns = ['Boundary Tone', 'Male Count']

female_df = pd.DataFrame(female_bt_count).reset_index()
female_df.columns = ['Boundary Tone', 'Female Count']

# Merge DataFrames
gender_group_bt = pd.merge(male_df, female_df, on='Boundary Tone', how='outer').fillna(0)

# Convert counts to integers
gender_group_bt['Male Count'] = gender_group_bt['Male Count'].astype(int)
gender_group_bt['Female Count'] = gender_group_bt['Female Count'].astype(int)

gender_group_bt
merged_gender_group_bt = apply_bt_gender_merge_mappings(gender_group_bt, bt_merge_mappings)
merged_gender_group_bt = merged_gender_group_bt.sort_values(by=['Male Count', 'Female Count'], ascending=False).reset_index(drop=True)
merged_gender_group_bt

Unnamed: 0,Boundary Tone,Male Count,Female Count
0,L-L%,1930,2416
1,H-L%,685,987
2,H-H%,338,637
3,L-H%,206,521
4,L-,136,203
5,H-,71,162
6,!H-H%,0,0
7,!H-L%,0,0
8,H-^H%,0,0
9,L-^H%,0,0


Number of boundary tones across speakers + formality

In [3]:
boundary_tones_cleaned = bt_cleaned
high_boundary_tones = ['H-H%', 'L-H%', 'H-', 'H-^H%', 'L-^H%', '!H-H%', '^H-H%']
low_boundary_tones = ['L-L%', 'H-L%', 'L-', '!H-L%', '^H-L%']


def count_boundary_tones(group, bt_list):
    return group[group['1_anno_default_ns:bt'].isin(bt_list)]['1_anno_default_ns:bt'].count()

data = []

for speaker_type, bilingual_value in [("Majority English", "yes"), ("Monolingual English", "no")]:
    for formality_level in ["formal", "informal"]:
        subset = boundary_tones_cleaned[
            (boundary_tones_cleaned['1_meta_speaker-bilingual'] == bilingual_value) &
            (boundary_tones_cleaned['1_meta_setting'] == formality_level)
        ]

        high_bt_count = count_boundary_tones(subset, high_boundary_tones)
        low_bt_count = count_boundary_tones(subset, low_boundary_tones)

        total_bt_count = high_bt_count + low_bt_count
        high_bt_percentage = (high_bt_count / total_bt_count * 100) if total_bt_count > 0 else 0
        low_bt_percentage = (low_bt_count / total_bt_count * 100) if total_bt_count > 0 else 0

        data.append({
            "Speaker Group": speaker_type,
            "Formality": formality_level,
            "High BT Count": high_bt_count,
            "Low BT Count": low_bt_count,
            "High BT Percentage": f"{high_bt_percentage:.1f}%",
            "Low BT Percentage": f"{low_bt_percentage:.1f}%"
        })

bt_counts_df = pd.DataFrame(data)
bt_counts_df

NameError: name 'bt_cleaned' is not defined