# Final formatting notebook

This notebook contains all the code to go from the initially generated datasets (naturalistic sentences and controlled ones) to build the final files:
- In one folder: stim/audio, we have all the audio files, with a naming convention for each dataset:
    - Naturalistic: sentence_id.wav
    - Controlled: tense_theme_plurality_variation.wav
- In another folder: stim/sentences/clean, we have:
    - a csv file with all the sentences: final.csv, and two columns: one called audio_filename, to link it to the audio file, and another one called block_number_subX which links it to the block it belongs for a given subject.
    - the naturalistic.csv file which contains the inital 32K sentences.
    - the controlled sentences dataset in the controlled.csv file.
    - a notebook that was used to generate all the given files, to track how it was done and reproduce it if needed.

# Natural dataset



## Extract N-words vocab, tag them if incorrect and fix them, while filtering for negative sentences

I need 10 subjects x 2000 sentences: 20 000 sentences

Remove most negative sentences

In [1]:
from pathlib import Path
import pandas as pd

naturalistic_path = Path('./../stim/text/full.csv')
naturalistic_df = pd.read_csv(naturalistic_path)

# Remove the \n from the sentence text field
naturalistic_df['sentence'] = naturalistic_df['sentence'].str.replace('\n', '')

# Adjust ratio of negative sentences
negative_sentences = naturalistic_df[naturalistic_df['polarity'] == 'negative']
non_negative_sentences = naturalistic_df[naturalistic_df['polarity'] != 'negative']

# Calculate how many negative sentences to keep (10% of total)
total_desired = len(non_negative_sentences) // 4  # This will make negatives 10% of final
negative_to_keep = min(len(negative_sentences), total_desired)


# Randomly sample negative sentences
kept_negative = negative_sentences.sample(n=negative_to_keep)

# Combine with non-negative sentences
polarity_balanced_df = pd.concat([non_negative_sentences, kept_negative])


Filter the vocab so we have around 30K sentences

In [2]:
# Filter naturalistic dataset
# Get the 3000 most common words
words_count = {}
for sentence in polarity_balanced_df['sentence']:
    words = str(sentence).split()
    for word in words:
        words_count[word] = words_count.get(word, 0) + 1

most_common_words = sorted(words_count, key=words_count.get, reverse=True)[:12000]

# Filter sentences with common words and length limit
filtered_sentences = []
for sentence in polarity_balanced_df['sentence'][polarity_balanced_df['num_words'] <= 12]:
    words = set(str(sentence).split())
    if all(word in most_common_words for word in words):
        filtered_sentences.append(sentence)

# Create final filtered dataframe
final_df = polarity_balanced_df[polarity_balanced_df['sentence'].isin(filtered_sentences)]


In [3]:
final_df.polarity.value_counts()

polarity
affirmative    19072
negative        4232
Name: count, dtype: int64

In [38]:
final_df_tagged.validity_huge_issue.value_counts()

valid_share = final_df_tagged.validity_huge_issue.value_counts()[0] / len(final_df_tagged)
valid_share

  valid_share = final_df_tagged.validity_huge_issue.value_counts()[0] / len(final_df_tagged)


0.8471507037418469

In [4]:
final_df.num_words.value_counts()

num_words
10    5558
9     5338
11    3610
8     3472
7     2316
6     1270
12    1170
5      443
4      123
3        4
Name: count, dtype: int64

Check the validity of all the sentences

#### Functions

In [7]:
from concurrent.futures import ThreadPoolExecutor
import pandas as pd
from tqdm import tqdm

def process_dataframe_parallel(df, max_workers=50, check=""):
    # Create a copy of the dataframe with a temporary sequential index
    temp_df = df.reset_index(drop=False)
    
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        if check == "huge_issue":
            # Include both index and sentence in the future
            futures = {
                executor.submit(
                    check_sentence_huge_issue, 
                    row['sentence'],
                    temp_idx
                ): temp_idx 
                for temp_idx, row in temp_df.iterrows()
            }
        else:
            raise ValueError("Invalid check")
            
        results = []
        for future in tqdm(futures, total=len(futures), desc="Processing sentences"):
            try:
                result = future.result()
                temp_idx = futures[future]
                results.append((temp_idx, result))
            except Exception as e:
                temp_idx = futures[future]
                results.append((temp_idx, 'unsure'))
    
    # Sort by the temporary index
    results.sort(key=lambda x: x[0])
    
    # Create the new column
    temp_df[f'validity_{check}'] = [r[1] for r in results]
    
    # Restore the original index and return
    return temp_df.set_index(df.index)

def check_sentence_huge_issue(sentence, idx):
    prompt = f"""Is this sentence correct? 
Reply with either one word: 'correct' if the sentence is perfectly fine, otherwise, if there's any issue, output 'incorrect'.
Sentence: "{sentence}"
"""
    try:
        response = gpt4_response(prompt).strip().lower()
        return response
    except Exception as e:
        print(f"Error at index {idx}: {e}")
        return 'bugged'
    

OPENAI_API_KEY = "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"
from openai import OpenAI

client = OpenAI(api_key=OPENAI_API_KEY)


def gpt4_response(prompt, model="gpt-4o-mini"):
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": prompt,
            }
        ],
        model=model,
    )

    return chat_completion.choices[0].message.content

#### Check validty

In [22]:
final_df

Unnamed: 0,theme,type,structure,numer,tense,polarity,sentence,num_words
60,weather,interrogative,simple,plural,present,affirmative,Are the temperatures rising during the summer ...,8
61,weather,interrogative,simple,plural,present,affirmative,Do the clouds usually appear thicker in spring?,8
62,weather,interrogative,simple,plural,present,affirmative,Are winter storms becoming more frequent lately?,7
63,weather,interrogative,simple,plural,present,affirmative,Have the winds picked up speed this afternoon?,8
64,weather,interrogative,simple,plural,present,affirmative,Is it true that flowers only bloom in spring?,9
...,...,...,...,...,...,...,...,...
24949,transport,imperative,preposition,plural,future,negative,Will you not drive through the tunnel at night?,9
23194,sport,imperative,independent,singular,past,negative,The runner has not stopped because he hadn’t r...,11
35435,sport,imperative,simple,singular,future,negative,Don't ignore the rules outlined before the eve...,9
18124,sport,declarative,preposition,plural,future,negative,The athletes will not run among the trees duri...,10


#### Final run on all 

In [8]:
final_df_tagged = process_dataframe_parallel(final_df, max_workers=50, check="huge_issue")

Processing sentences: 100%|██████████| 23304/23304 [06:33<00:00, 59.18it/s] 


### Add the index in the file 

In [1]:
import pandas as pd
from pathlib import Path

base_path = Path("/home/co/git/neurospin-new-protocol/")
naturalistic_df = pd.read_csv(base_path / "stim/sentences/V2/processed/natural_two_passes_fixed.csv")

df = naturalistic_df

In [2]:
# Add a column which refers to the sentence id, basically the index encoded as XXXXX, and another column as the audio file name
df['sentence_id'] = df.index.map(lambda x: f"nat_{x:05d}")
df['audio_filename'] = df['sentence_id'].apply(lambda x: f"{x}.wav")

df.to_csv(base_path / "stim/sentences/V2/processed/natural_two_passes_fixed.csv", index=False)

# Controlled dataset

## Reformatting it correctly

In [31]:
controlled_raw_path = Path("/home/co/git/neurospin-new-protocol/stim/sentences/V2/processed/controlled_revised.csv")
controlled_raw_df = pd.read_csv(controlled_raw_path)
# Remove the nan values
controlled_raw_df = controlled_raw_df.dropna(subset=['sentence'])

# Remvoe some useless conditions for our experiment
controlled_raw_df.structure.unique()

# Check how many sentences we have if we filter out: the nested_* structures
filtered_controlled_df = controlled_raw_df[~controlled_raw_df.structure.str.contains("nested_object")]

# Add now a column that gives this info:df['sentence_id'] = df.index.map(lambda x: f"nat_{x:05d}")
# df['audio_filename'] = df['sentence_id'].apply(lambda x: f"{x}.wav")

filtered_controlled_df['sentence_id'] = filtered_controlled_df.index.map(lambda x: f"ctr_{x:05d}")
filtered_controlled_df['audio_filename'] = filtered_controlled_df['sentence_id'].apply(lambda x: f"{x}.wav")

filtered_controlled_df
# Save it to a csv file

filtered_controlled_df.to_csv(Path("/home/co/git/neurospin-new-protocol/stim/sentences/V2/processed/controlled_filtered.csv"), index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_controlled_df['sentence_id'] = filtered_controlled_df.index.map(lambda x: f"ctr_{x:05d}")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_controlled_df['audio_filename'] = filtered_controlled_df['sentence_id'].apply(lambda x: f"{x}.wav")


## Running the synonym replacement

### V5 - Using Jérémy's idea and a bit more chill (previous versions can be found in previous commits)

In [32]:
import pandas as pd
import json
import random
import os
from pathlib import Path
from openai import Client
from concurrent.futures import ThreadPoolExecutor
from functools import partial
import time
import backoff

OPENAI_API_KEY = "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"

MAX_RETRIES = 1
MAX_WORKERS = 50  # Adjust based on your API rate limits

client = Client(api_key=OPENAI_API_KEY)

@backoff.on_exception(backoff.expo, Exception, max_tries=MAX_RETRIES)
def gpt4_response_base(prompt, model):
    try:
        chat_completion = client.chat.completions.create(
            messages=[{"role": "user", "content": prompt}],
            model=model,
            timeout=30  # Add timeout to prevent hanging
        )
        return chat_completion.choices[0].message.content
    except Exception as e:
        print(f"Error in API call: {str(e)}")
        raise

def create_synonym_prompt(sentence):
    return f"""Analyze this sentence and provide single-word synonyms (0-3) WHEN POSSIBLE for content words (nouns, verbs, adjectives, adverbs).
Rules:
- Each synonym must be exactly ONE word
- Only provide synonyms that:
  * Have the EXACT same meaning in the given sentence context
  * Keep the same:
    - part of speech
    - grammatical form
    - tense (for verbs) 
    - and numerosity (singular/plural): never replace product with goods
  * Could replace the original word without changing meaning or naturalness
- Provide NO synonyms if:
  * There isn't a perfect meaning match
  * The replacement would sound unnatural
- Make sure that all the synonyms combination could work with each other. Stay simple.

Sentence: "{sentence}"

Return as JSON:
{{"original_word": ["synonym1", "synonym2"]}}

Example 1:
Input: "The experienced doctors treated patients carefully"
{{"experienced": ["skilled"], "doctors": ["physicians"], "treated": ["handled"], "carefully": ["cautiously"]}}

Example 2:
Input: "Teachers warmly greet students"
{{"teachers": ["professors"], "warmly": ["kindly"], "greet": ["welcome"], "students": ["pupils"]}}"""


def create_sentence_variation(sentence, synonyms_dict):
    words = sentence.split()
    new_words = []
    modified = False
    print(f"\nProcessing sentence: {sentence}")

    for word in words:
        print(f"Processing word: {word}")
        print(f"Synonyms dict: {synonyms_dict}")
        # Strip punctuation for lookup but keep track of it
        punct = ''
        if not word.isalnum():
            punct = word[-1] if not word[-1].isalnum() else ''
            word = word[:-1] if punct else word
        
        word_lower = word.lower()
        
        if word_lower in synonyms_dict and synonyms_dict[word_lower]:
            all_options = [word] + synonyms_dict[word_lower]
            replacement = random.choice(all_options)
            if word[0].isupper():
                replacement = replacement.capitalize()
            # Add back punctuation
            new_words.append(replacement + punct)
            if replacement != word:
                modified = True
            print(f"Replaced '{word}' with '{replacement}'")
        else:
            new_words.append(word + punct)
            print(f"Kept '{word}'")

    return " ".join(new_words), modified

def parse_llm_response(response):
    try:
        cleaned_response = response.strip()
        if cleaned_response.startswith('```'):
            cleaned_response = cleaned_response.split('```')[1]
        if cleaned_response.startswith('json'):
            cleaned_response = cleaned_response[4:]
        return json.loads(cleaned_response.strip())
    except json.JSONDecodeError:
        print(f"Error parsing response: {response}")
        return {}

def get_cache_key(sentence):
    # Using both hash and index/id if available
    return f"{hash(sentence)}"

def get_synonyms_for_sentence(sentence, cache_dir="synonym_cache"):
    Path(cache_dir).mkdir(exist_ok=True)
    cache_key = get_cache_key(sentence)
    cache_file = Path(cache_dir) / f"{cache_key}.json"

    if cache_file.exists():
        try:
            with open(cache_file, 'r') as f:
                return json.load(f)
        except (json.JSONDecodeError, FileNotFoundError):
            pass

    prompt = create_synonym_prompt(sentence)
    response = gpt4_response_base(prompt, "gpt-4o")
    synonyms_dict = parse_llm_response(response)

    with open(cache_file, 'w') as f:
        json.dump(synonyms_dict, f)

    return synonyms_dict

def process_sentence(sentence, idx):
    try:
        synonyms = get_synonyms_for_sentence(sentence)
        new_sentence, was_modified = create_sentence_variation(sentence, synonyms)
        return idx, new_sentence, was_modified
    except Exception as e:
        print(f"Error processing sentence {idx}: {str(e)}")
        return idx, sentence, False



def generate_variations(df, batch_size=10):
    new_df = df.copy()
    new_df['original_sentence'] = new_df['sentence']
    
    # Process sentences in batches using ThreadPoolExecutor
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        for i in range(0, len(df), batch_size):
            batch = df.iloc[i:i+batch_size]
            futures = []
            
            for idx, row in batch.iterrows():
                future = executor.submit(process_sentence, row['sentence'], idx)
                futures.append(future)
            
            # Process completed futures
            for future in futures:
                try:
                    idx, new_sentence, was_modified = future.result()
                    if was_modified:
                        new_df.at[idx, 'sentence'] = new_sentence
                except Exception as e:
                    print(f"Error processing future: {str(e)}")
                    continue
            
            # Add a small delay between batches to avoid rate limits
            time.sleep(0.1)
    
    return new_df



In [17]:
# Example usage
testing_df = controlled_df.loc[:50].copy()
new_df = generate_variations(testing_df, batch_size=50)


Processing sentence: The artists in their studios painted their canvases yesterday.
Processing word: The
Synonyms dict: {'artists': ['painters'], 'studios': ['workshops'], 'painted': ['colored', 'depicted'], 'canvases': ['paintings']}
Kept 'The'
Processing word: artists
Synonyms dict: {'artists': ['painters'], 'studios': ['workshops'], 'painted': ['colored', 'depicted'], 'canvases': ['paintings']}
Replaced 'artists' with 'artists'
Processing word: in
Synonyms dict: {'artists': ['painters'], 'studios': ['workshops'], 'painted': ['colored', 'depicted'], 'canvases': ['paintings']}
Kept 'in'
Processing word: their
Synonyms dict: {'artists': ['painters'], 'studios': ['workshops'], 'painted': ['colored', 'depicted'], 'canvases': ['paintings']}
Kept 'their'
Processing word: studios
Synonyms dict: {'artists': ['painters'], 'studios': ['workshops'], 'painted': ['colored', 'depicted'], 'canvases': ['paintings']}
Replaced 'studios' with 'workshops'
Processing word: painted
Synonyms dict: {'artis

In [18]:
new_df[['original_sentence', 'sentence']].head(50)

Unnamed: 0,original_sentence,sentence
0,The artists painted their canvases yesterday.,The artists painted their canvases yesterday.
1,The artists attentively painted their canvases...,The artists attentively decorated their canvas...
2,The talented artists painted their canvases ye...,The talented painters created their canvases y...
3,The talented artists attentively painted their...,The talented painters carefully crafted their ...
4,The artists in their studios painted their can...,The artists in their workshops colored their p...
5,The artists in their studios attentively paint...,The artists in their studios attentively paint...
6,The talented artists in their studios painted ...,The talented creators in their studios decorat...
7,The talented artists in their studios attentiv...,The skilled artists in their studios attentive...
8,The artists who trained hard painted their can...,The artists who trained hard decorated their c...
9,The artists who trained hard attentively paint...,The painters who trained diligently attentivel...


### Full run before re-organizing

In [None]:
# Generate variations
augmented_df = generate_variations(filtered_controlled_df, batch_size=50)


Processing sentence: The artists attentively painted their canvases yesterday.
Processing word: The
Synonyms dict: {'artists': ['painters'], 'attentively': ['carefully'], 'painted': ['decorated'], 'canvases': ['surfaces']}
Kept 'The'
Processing word: artists
Synonyms dict: {'artists': ['painters'], 'attentively': ['carefully'], 'painted': ['decorated'], 'canvases': ['surfaces']}
Replaced 'artists' with 'painters'
Processing word: attentively
Synonyms dict: {'artists': ['painters'], 'attentively': ['carefully'], 'painted': ['decorated'], 'canvases': ['surfaces']}
Replaced 'attentively' with 'carefully'
Processing word: painted
Synonyms dict: {'artists': ['painters'], 'attentively': ['carefully'], 'painted': ['decorated'], 'canvases': ['surfaces']}
Replaced 'painted' with 'painted'
Processing word: their
Synonyms dict: {'artists': ['painters'], 'attentively': ['carefully'], 'painted': ['decorated'], 'canvases': ['surfaces']}
Kept 'their'
Processing word: canvases
Synonyms dict: {'artist

In [None]:

# Add timestamp to filename for versioning
output_path = Path("/home/co/git/neurospin-new-protocol/stim/sentences/V2/processed/controlled_augmented.csv")

# Save to CSV
new_df.to_csv(output_path, index=False)

print(f"Saved variations to: {output_path}")

In [43]:
new_df[['original_sentence', 'sentence']].head(30)

Unnamed: 0,original_sentence,sentence
0,Customers buy the product.,Buyers acquire the product.
1,Customers eagerly buy the product.,Buyers eagerly acquire the product.
2,Happy customers buy the product.,Happy clients purchase the product.
3,Happy customers eagerly buy the product.,Pleased clients enthusiastically buy the product.
4,Customers on the site buy the product.,Buyers on the webpage acquire the product.
5,Customers on the site eagerly buy the product.,Customers on the site eagerly purchase the pro...
6,Happy customers on the site buy the product.,Pleased clients on the site acquire the product.
7,Happy customers on the site eagerly buy the pr...,Satisfied customers on the site enthusiastical...
8,Customers who visit daily buy the product.,Clients who frequent daily purchase the product.
9,Customers who visit daily eagerly buy the prod...,Clients who visit regularly eagerly buy the pr...


# Organizer

TODO:
- Make different runs for each subject, with different natural sentences for each subjects
- Keep the same controlled sentences for all subjects
- Keep the constraint about at least 3 natural sentences at every run start


In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
import random
from typing import Union, List
from openai import OpenAI

class DatasetPreparator:
    def __init__(self, naturalistic_path, controlled_path, output_dir='stim', openai_key=None):
        self.naturalistic_df = pd.read_csv(naturalistic_path)
        self.controlled_df = pd.read_csv(controlled_path)
        self.output_dir = Path(output_dir)
        self.audio_dir = self.output_dir / 'audio'
        self.openai_key = openai_key
        self.audio_dir.mkdir(parents=True, exist_ok=True)

    def generate_speech(self, row, voice: str = "alloy", model: str = "tts-1") -> Path:
        """Generate speech for a single sentence using its audio_filename"""
        if self.openai_key is None:
            raise ValueError("OpenAI API key is required for speech generation")
            
        client = OpenAI(api_key=self.openai_key)
        
        speech_file_path = self.audio_dir / row['audio_filename']
        
        if speech_file_path.exists():
            print(f"File exists: {speech_file_path}")
            return speech_file_path
            
        try:
            response = client.audio.speech.create(
                model=model,
                voice=voice,
                input=row['sentence']
            )
            response.stream_to_file(str(speech_file_path))
            print(f"Generated: {speech_file_path}")
            return speech_file_path
        except Exception as e:
            print(f"Error generating audio for: {row['sentence']}\nError: {str(e)}")
            return None
        
    def generate_all_speech(self, max_workers=5, batch_size=20, delay_between_batches=0):
        """
        Generate speech for all sentences in parallel.
        
        Args:
            max_workers (int): Maximum number of parallel workers
            batch_size (int): Number of requests to process in each batch
            delay_between_batches (int): Delay in seconds between batches to avoid rate limits
        """
        from concurrent.futures import ThreadPoolExecutor
        import time
        
        def process_batch(rows):
            results = []
            with ThreadPoolExecutor(max_workers=max_workers) as executor:
                futures = [executor.submit(self.generate_speech, row) for row in rows]
                for future in futures:
                    try:
                        result = future.result()
                        results.append(result)
                    except Exception as e:
                        print(f"Error in batch processing: {str(e)}")
                        results.append(None)
            return results

        # Split the dataframe into batches
        total_rows = len(self.final_df)
        batches = [self.final_df[i:i + batch_size] for i in range(0, total_rows, batch_size)]
        
        print(f"Processing {total_rows} sentences in {len(batches)} batches of {batch_size}")
        
        all_results = []
        for i, batch in enumerate(batches, 1):
            print(f"\nProcessing batch {i}/{len(batches)}")
            results = process_batch(batch.to_dict('records'))
            all_results.extend(results)
            
            if i < len(batches):  # Don't delay after the last batch
                print(f"Waiting {delay_between_batches} seconds before next batch...")
                time.sleep(delay_between_batches)
        
        # Count successes and failures
        successes = sum(1 for r in all_results if r is not None)
        failures = sum(1 for r in all_results if r is None)
        
        print(f"\nGeneration complete:")
        print(f"Successful generations: {successes}")
        print(f"Failed generations: {failures}")
        
        return all_results

    def prepare_controlled_data(self):
        """Prepare controlled dataset with proper columns and audio filename"""
        self.controlled_df['dataset'] = 'controlled'
        
        # Preserve the structure column
        if 'structure' not in self.controlled_df.columns:
            self.controlled_df['structure'] = None
            
        # Rename word_count to num_words if it exists
        if 'word_count' in self.controlled_df.columns:
            self.controlled_df.rename(columns={'word_count': 'num_words'}, inplace=True)
        
        # Create sentence_id for reference
        self.controlled_df['sentence_id'] = [f"ctrl_{i:05d}" for i in range(len(self.controlled_df))]
        
        # Create audio_filename using the specified format: tense_theme_plurality_variation.wav
        # self.controlled_df['audio_filename'] = self.controlled_df.apply(
        #     lambda row: f"{row['tense']}_{row['theme']}_{row['numerosity']}_{row['structure']}.wav", 
        #     axis=1
        # )
        # Create instead in the same way as the naturalistic dataset
        self.controlled_df['audio_filename'] = self.controlled_df['sentence_id'].apply(lambda x: f"{x}.wav")
        
        return self.controlled_df

    def prepare_naturalistic_data(self):
        """Prepare naturalistic dataset with sentence_id and audio filename"""
        self.naturalistic_df['dataset'] = 'naturalistic'
        self.naturalistic_df['numerosity'] = self.naturalistic_df['numer']
        self.naturalistic_df.drop('numer', axis=1, inplace=True)
        
        # Handle word count columns
        if 'word_count' in self.naturalistic_df.columns and 'num_words' not in self.naturalistic_df.columns:
            self.naturalistic_df.rename(columns={'word_count': 'num_words'}, inplace=True)
        
        self.naturalistic_df['sentence_id'] = [f"nat_{i:05d}" for i in range(len(self.naturalistic_df))]
        self.naturalistic_df['audio_filename'] = self.naturalistic_df['sentence_id'].apply(lambda x: f"{x}.wav")
        
        return self.naturalistic_df

    def create_final_dataset(self):
        """Merge datasets and adjust naturalistic sentences to make total divisible by 80"""
        naturalistic = self.prepare_naturalistic_data()
        controlled = self.prepare_controlled_data()
        
        # Ensure columns match before concatenation
        common_columns = set(naturalistic.columns) & set(controlled.columns)
        naturalistic = naturalistic[list(common_columns)]
        controlled = controlled[list(common_columns)]
        
        n_controlled = len(controlled)
        n_naturalistic = len(naturalistic)
        
        # Calculate the number of complete runs possible
        total_sentences = n_controlled + n_naturalistic
        n_complete_runs = total_sentences // 80
        
        if total_sentences % 80 != 0:
            n_complete_runs += 1
        
        total_needed = n_complete_runs * 80
        n_naturalistic_needed = total_needed - n_controlled
        
        if n_naturalistic_needed > n_naturalistic:
            n_complete_runs = (n_controlled + n_naturalistic) // 80
            total_needed = n_complete_runs * 80
            n_naturalistic_needed = total_needed - n_controlled
        
        print(f"Total runs possible: {n_complete_runs}")
        print(f"Total sentences needed: {total_needed}")
        print(f"Controlled sentences: {n_controlled}")
        print(f"Naturalistic sentences needed: {n_naturalistic_needed}")
        print(f"Naturalistic sentences available: {n_naturalistic}")
        
        # Randomly select the required number of naturalistic sentences
        naturalistic = naturalistic.sample(n=n_naturalistic_needed, random_state=42)
        
        final_df = pd.concat([naturalistic, controlled], ignore_index=True)
        return final_df

class RunOrganizer:
    def __init__(self, final_df, n_subjects=10, trials_per_run=80):
        self.final_df = final_df
        self.n_subjects = n_subjects
        self.trials_per_run = trials_per_run
        self.n_total_runs = len(final_df) // trials_per_run
        
    def create_runs(self):
        """Create runs for all subjects, where each subject gets all sentences"""
        all_runs = []
        
        # Calculate total number of runs per subject
        self.runs_per_subject = len(self.final_df) // self.trials_per_run
        
        # Create word count mapping
        self.word_counts = dict(zip(self.final_df['audio_filename'], self.final_df['num_words']))
        
        for subject in range(1, self.n_subjects + 1):
            # Get all files and shuffle them independently for each subject
            naturalistic_files = self.final_df[self.final_df['dataset'] == 'naturalistic']['audio_filename'].tolist()
            controlled_files = self.final_df[self.final_df['dataset'] == 'controlled']['audio_filename'].tolist()
            
            subject_runs = self._create_subject_runs(
                subject,
                naturalistic_files,
                controlled_files
            )
            all_runs.extend(subject_runs)
        
        # Convert to DataFrame
        runs_df = pd.DataFrame(all_runs)
        return runs_df
        
    def _create_subject_runs(self, subject, naturalistic_files, controlled_files):
        """Create all runs for a single subject using all available sentences"""
        subject_runs = []
        
        # Create pools of files that we'll draw from for each run
        remaining_naturalistic = naturalistic_files.copy()
        remaining_controlled = controlled_files.copy()
        
        for run in range(1, self.runs_per_subject + 1):
            run_sentences = []
            
            # Ensure we have enough naturalistic sentences for the start
            if len(remaining_naturalistic) < 3:
                remaining_naturalistic = naturalistic_files.copy()
            
            # Start with 3 random naturalistic sentences
            start_sentences = random.sample(remaining_naturalistic, 3)
            run_sentences.extend(start_sentences)
            for sent in start_sentences:
                remaining_naturalistic.remove(sent)
            
            # Pool for remaining sentences
            available_pool = (remaining_naturalistic + remaining_controlled).copy()
            if len(available_pool) < (self.trials_per_run - 3):
                # Replenish the pool if needed
                remaining_naturalistic = naturalistic_files.copy()
                remaining_controlled = controlled_files.copy()
                # Remove the sentences we just used at the start
                for sent in start_sentences:
                    if sent in remaining_naturalistic:
                        remaining_naturalistic.remove(sent)
                available_pool = (remaining_naturalistic + remaining_controlled).copy()
            
            # Fill the rest of the run
            remaining_needed = self.trials_per_run - 3
            
            # Select remaining sentences avoiding consecutive long sentences
            while len(run_sentences) < self.trials_per_run:
                candidates = [s for s in available_pool 
                            if not (len(run_sentences) > 0 and 
                                  self.word_counts.get(run_sentences[-1], 0) > 10 and 
                                  self.word_counts.get(s, 0) > 10)]
                
                if not candidates:
                    candidates = available_pool
                    
                selected = random.choice(candidates)
                run_sentences.append(selected)
                available_pool.remove(selected)
                
                # Update the remaining pools
                if selected in remaining_naturalistic:
                    remaining_naturalistic.remove(selected)
                elif selected in remaining_controlled:
                    remaining_controlled.remove(selected)
            
            # Create run data
            for trial, sentence in enumerate(run_sentences, 1):
                subject_runs.append({
                    'subject': subject,
                    'run': run,
                    'trial': trial,
                    'audio_filename': sentence,
                    'is_naturalistic': sentence.startswith('nat_'),
                    'num_words': self.word_counts.get(sentence, None)
                })
        
        return subject_runs

In [2]:

OPENAI_API_KEY = "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"
# Initialize preparator
preparator = DatasetPreparator(
    naturalistic_path='./../stim/text/1000_most_common.csv',
    controlled_path='./../stim/sentences/relatives/relatives_variations_augmented.csv',
    output_dir='./../stim',
    openai_key=OPENAI_API_KEY
)

# Create final dataset
final_df = preparator.create_final_dataset()
preparator.final_df = final_df  # Make sure final_df is accessible in the preparator

# Generate all audio files in parallel
results = preparator.generate_all_speech(
    max_workers=50,  # Number of parallel requests
    batch_size=200,  # Number of requests per batch
    delay_between_batches=0  # Delay between batches in seconds
)
# Create run organization
organizer = RunOrganizer(final_df)
runs_df = organizer.create_runs()
final_df.to_csv('./../stim/final_dataset.csv', index=False)
runs_df.to_csv('./../stim/runs_organization.csv', index=False)


Total runs possible: 41
Total sentences needed: 3280
Controlled sentences: 1200
Naturalistic sentences needed: 2080
Naturalistic sentences available: 2131
Processing 3280 sentences in 17 batches of 200

Processing batch 1/17
File exists: ../stim/audio/nat_00297.wav
File exists: ../stim/audio/nat_00282.wav
File exists: ../stim/audio/nat_01666.wav
File exists: ../stim/audio/nat_02066.wav
File exists: ../stim/audio/nat_00290.wav
File exists: ../stim/audio/nat_01713.wav
File exists: ../stim/audio/nat_00070.wav
File exists: ../stim/audio/nat_01745.wav
File exists: ../stim/audio/nat_02008.wav
File exists: ../stim/audio/nat_01708.wav
File exists: ../stim/audio/nat_01116.wav
File exists: ../stim/audio/nat_00650.wav
File exists: ../stim/audio/nat_00507.wav
File exists: ../stim/audio/nat_01721.wav
File exists: ../stim/audio/nat_00916.wav
File exists: ../stim/audio/nat_00450.wav
File exists: ../stim/audio/nat_01600.wav
File exists: ../stim/audio/nat_01932.wav
File exists: ../stim/audio/nat_00196.

In [None]:
# Compute how many total characters there are in the dataset
total_characters = sum(final_df['sentence'].apply(len))
total_characters