# Function Word Extraction from CHILDES Transcripts
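**Important note:** run this notebook from the directory that contains the transcripts. The script searches the current working directory and all of its subfolders, and writes its CSV output next to the transcripts it finds.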

In [12]:
import os
import csv
from collections import Counter

## Function Word Categories

In [13]:
# Part-of-speech codes are listed in the MOR manual:
# https://talkbank.org/manuals/MOR.html#_Toc65933285
# Note: this tagset only works after the MOR program in CLAN has been run on
# the transcripts.
# A token's tag is compared against this set by exact string match, so a tag
# that is not spelled out here (for example a different 'det:' subtype) is not
# counted as a function word.
function_word_categories = {
    'aux',
    'comp',
    'conj',
    'coord',
    'cop',
    'det:art',
    'inf',
    'mod',
    'part',
    'prep',
    'qn',
}

## Compute function word proportions for each transcript and write one CSV file per folder (i.e. per child, assuming each child's transcripts are kept in their own folder)

### 50 Utterances

In [15]:
# Function-word proportions for the 50-utterance samples ('.ipcore.cex' files).
# The helpers are defined inside this cell so that it can be run on its own.


def extract_child_mor_tiers(lines):
    """Return the %mor tiers that belong to *CHI utterances.

    Args:
        lines: The lines of one CHAT transcript, in file order.

    Returns:
        A list with one string per child %mor tier. The leading '%mor:' label
        and its tab are removed, and tab-indented continuation lines are
        appended to the tier they continue.
    """
    tiers = []
    awaiting_mor = False  # a *CHI utterance is open and its %mor tier not yet seen
    continuing = False    # the previous line was part of a collected %mor tier
    for line in lines:
        if line.startswith('\t'):
            # A line starting with a tab continues the previous tier; keep it
            # only when that tier is a child %mor tier being collected.
            if continuing:
                tiers[-1] += ' ' + line
            continue
        continuing = False
        if line.startswith('*'):
            # Every main line starts a new utterance. Resetting the flag for
            # other speakers prevents their %mor tier from being attributed to
            # the child when a *CHI utterance has no %mor tier of its own.
            awaiting_mor = line.startswith('*CHI:')
        elif awaiting_mor and line.startswith('%mor:\t'):
            tiers.append(line[6:])  # drop the 6-character '%mor:' + tab prefix
            awaiting_mor = False
            continuing = True
    return tiers


def summarize_function_words(mor_tiers, categories):
    """Count function-word tags on the given %mor tiers.

    Args:
        mor_tiers: Strings as returned by ``extract_child_mor_tiers``.
        categories: Collection of part-of-speech tags treated as function words.

    Returns:
        A ``(most_common, proportion)`` tuple: ``most_common`` is a list of
        ``(tag, count)`` pairs ordered by descending count, and ``proportion``
        is the number of function-word tokens divided by the number of all
        tokens (0 when there are no tokens).
    """
    counts = Counter()
    total_tokens = 0
    for tier in mor_tiers:
        # '~' joins clitics to their host; splitting on it makes each clitic
        # its own token. '|' is kept because it separates the tag from the word.
        tokens = tier.replace('~', ' ').split()
        # NOTE(review): every whitespace-separated item counts toward the total,
        # including a bare terminator such as '.'; confirm that is intended.
        total_tokens += len(tokens)
        for token in tokens:
            tag, bar, _ = token.partition('|')
            # Only tokens of the form tag|word with a listed tag are counted.
            if bar and tag in categories:
                counts[tag] += 1
    function_word_tokens = sum(counts.values())
    proportion = function_word_tokens / total_tokens if total_tokens > 0 else 0  # prevents division by zero
    return counts.most_common(), proportion


# Get the current working directory
current_directory = os.getcwd()

# Loop through all folders in the current working directory
for root, dirs, files in os.walk(current_directory):
    # Sorted so that the CSV rows come out in the same order on every run
    transcripts = sorted(name for name in files if name.endswith('.ipcore.cex'))
    if not transcripts:
        # Do not leave header-only CSV files in folders without transcripts
        continue

    # Create a new CSV file for each folder to store results
    folder_name = os.path.basename(root)  # Get the name of the current folder
    folder_csv_file = os.path.join(root, f'{folder_name}_function50.csv')
    with open(folder_csv_file, mode='w', newline='') as csvfile:
        csv_writer = csv.writer(csvfile)
        csv_writer.writerow(['File', 'Most Common Function Words', 'Function Word Proportion'])

        for filename in transcripts:
            # Open the file and split it into lines
            file_path = os.path.join(root, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                lines = file.readlines()

            most_common_function_words, function_word_proportion = summarize_function_words(
                extract_child_mor_tiers(lines), function_word_categories
            )

            # Write the results to the CSV file for the current folder
            csv_writer.writerow([filename, most_common_function_words, function_word_proportion])

### 100 Utterances

In [16]:
# Function-word proportions for the 100-utterance samples ('.ipcore-100.cex' files).
# The helpers are defined inside this cell so that it can be run on its own.


def extract_child_mor_tiers(lines):
    """Return the %mor tiers that belong to *CHI utterances.

    Args:
        lines: The lines of one CHAT transcript, in file order.

    Returns:
        A list with one string per child %mor tier. The leading '%mor:' label
        and its tab are removed, and tab-indented continuation lines are
        appended to the tier they continue.
    """
    tiers = []
    awaiting_mor = False  # a *CHI utterance is open and its %mor tier not yet seen
    continuing = False    # the previous line was part of a collected %mor tier
    for line in lines:
        if line.startswith('\t'):
            # A line starting with a tab continues the previous tier; keep it
            # only when that tier is a child %mor tier being collected.
            if continuing:
                tiers[-1] += ' ' + line
            continue
        continuing = False
        if line.startswith('*'):
            # Every main line starts a new utterance. Resetting the flag for
            # other speakers prevents their %mor tier from being attributed to
            # the child when a *CHI utterance has no %mor tier of its own.
            awaiting_mor = line.startswith('*CHI:')
        elif awaiting_mor and line.startswith('%mor:\t'):
            tiers.append(line[6:])  # drop the 6-character '%mor:' + tab prefix
            awaiting_mor = False
            continuing = True
    return tiers


def summarize_function_words(mor_tiers, categories):
    """Count function-word tags on the given %mor tiers.

    Args:
        mor_tiers: Strings as returned by ``extract_child_mor_tiers``.
        categories: Collection of part-of-speech tags treated as function words.

    Returns:
        A ``(most_common, proportion)`` tuple: ``most_common`` is a list of
        ``(tag, count)`` pairs ordered by descending count, and ``proportion``
        is the number of function-word tokens divided by the number of all
        tokens (0 when there are no tokens).
    """
    counts = Counter()
    total_tokens = 0
    for tier in mor_tiers:
        # '~' joins clitics to their host; splitting on it makes each clitic
        # its own token. '|' is kept because it separates the tag from the word.
        tokens = tier.replace('~', ' ').split()
        # NOTE(review): every whitespace-separated item counts toward the total,
        # including a bare terminator such as '.'; confirm that is intended.
        total_tokens += len(tokens)
        for token in tokens:
            tag, bar, _ = token.partition('|')
            # Only tokens of the form tag|word with a listed tag are counted.
            if bar and tag in categories:
                counts[tag] += 1
    function_word_tokens = sum(counts.values())
    proportion = function_word_tokens / total_tokens if total_tokens > 0 else 0  # prevents division by zero
    return counts.most_common(), proportion


# Get the current working directory
current_directory = os.getcwd()

# Loop through all folders in the current working directory
for root, dirs, files in os.walk(current_directory):
    # Only the 100-utterance samples are read here.
    # Sorted so that the CSV rows come out in the same order on every run
    transcripts = sorted(name for name in files if name.endswith('.ipcore-100.cex'))
    if not transcripts:
        # Do not leave header-only CSV files in folders without transcripts
        continue

    # Create a new CSV file for each folder to store results
    folder_name = os.path.basename(root)  # Get the name of the current folder
    folder_csv_file = os.path.join(root, f'{folder_name}_function100.csv')
    with open(folder_csv_file, mode='w', newline='') as csvfile:
        csv_writer = csv.writer(csvfile)
        csv_writer.writerow(['File', 'Most Common Function Words', 'Function Word Proportion'])

        for filename in transcripts:
            # Open the file and split it into lines
            file_path = os.path.join(root, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                lines = file.readlines()

            most_common_function_words, function_word_proportion = summarize_function_words(
                extract_child_mor_tiers(lines), function_word_categories
            )

            # Write the results to the CSV file for the current folder
            csv_writer.writerow([filename, most_common_function_words, function_word_proportion])

### All Utterances

In [None]:
# Function-word proportions over all utterances of each full transcript ('.cha' files).
# The helpers are defined inside this cell so that it can be run on its own.


def extract_child_mor_tiers(lines):
    """Return the %mor tiers that belong to *CHI utterances.

    Args:
        lines: The lines of one CHAT transcript, in file order.

    Returns:
        A list with one string per child %mor tier. The leading '%mor:' label
        and its tab are removed, and tab-indented continuation lines are
        appended to the tier they continue.
    """
    tiers = []
    awaiting_mor = False  # a *CHI utterance is open and its %mor tier not yet seen
    continuing = False    # the previous line was part of a collected %mor tier
    for line in lines:
        if line.startswith('\t'):
            # A line starting with a tab continues the previous tier; keep it
            # only when that tier is a child %mor tier being collected.
            if continuing:
                tiers[-1] += ' ' + line
            continue
        continuing = False
        if line.startswith('*'):
            # Every main line starts a new utterance. Resetting the flag for
            # other speakers prevents their %mor tier from being attributed to
            # the child when a *CHI utterance has no %mor tier of its own.
            awaiting_mor = line.startswith('*CHI:')
        elif awaiting_mor and line.startswith('%mor:\t'):
            tiers.append(line[6:])  # drop the 6-character '%mor:' + tab prefix
            awaiting_mor = False
            continuing = True
    return tiers


def summarize_function_words(mor_tiers, categories):
    """Count function-word tags on the given %mor tiers.

    Args:
        mor_tiers: Strings as returned by ``extract_child_mor_tiers``.
        categories: Collection of part-of-speech tags treated as function words.

    Returns:
        A ``(most_common, proportion)`` tuple: ``most_common`` is a list of
        ``(tag, count)`` pairs ordered by descending count, and ``proportion``
        is the number of function-word tokens divided by the number of all
        tokens (0 when there are no tokens).
    """
    counts = Counter()
    total_tokens = 0
    for tier in mor_tiers:
        # '~' joins clitics to their host; splitting on it makes each clitic
        # its own token. '|' is kept because it separates the tag from the word.
        tokens = tier.replace('~', ' ').split()
        # NOTE(review): every whitespace-separated item counts toward the total,
        # including a bare terminator such as '.'; confirm that is intended.
        total_tokens += len(tokens)
        for token in tokens:
            tag, bar, _ = token.partition('|')
            # Only tokens of the form tag|word with a listed tag are counted.
            if bar and tag in categories:
                counts[tag] += 1
    function_word_tokens = sum(counts.values())
    proportion = function_word_tokens / total_tokens if total_tokens > 0 else 0  # prevents division by zero
    return counts.most_common(), proportion


# Get the current working directory
current_directory = os.getcwd()

# Loop through all folders in the current working directory
for root, dirs, files in os.walk(current_directory):
    # Sorted so that the CSV rows come out in the same order on every run
    transcripts = sorted(name for name in files if name.endswith('.cha'))
    if not transcripts:
        # Do not leave header-only CSV files in folders without transcripts
        continue

    # Create a new CSV file for each folder to store results
    folder_name = os.path.basename(root)  # Get the name of the current folder
    folder_csv_file = os.path.join(root, f'{folder_name}_function.csv')
    with open(folder_csv_file, mode='w', newline='') as csvfile:
        csv_writer = csv.writer(csvfile)
        csv_writer.writerow(['File', 'Most Common Function Words', 'Function Word Proportion'])

        for filename in transcripts:
            # Open the file and split it into lines
            file_path = os.path.join(root, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                lines = file.readlines()

            most_common_function_words, function_word_proportion = summarize_function_words(
                extract_child_mor_tiers(lines), function_word_categories
            )

            # Write the results to the CSV file for the current folder
            csv_writer.writerow([filename, most_common_function_words, function_word_proportion])