In [1]:
# Install the library for Google Cloud Storage
!pip install --upgrade gcsfs google-cloud-storage pandas

Collecting gcsfs
  Downloading gcsfs-2025.7.0-py2.py3-none-any.whl.metadata (2.1 kB)
Collecting google-cloud-storage
  Downloading google_cloud_storage-3.3.0-py3-none-any.whl.metadata (13 kB)
Collecting pandas
  Downloading pandas-2.3.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91.2 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting fsspec==2025.7.0 (from gcsfs)
  Downloading fsspec-2025.7.0-py3-none-any.whl.metadata (12 kB)
Collecting google-api-core<3.0.0,>=2.15.0 (from google-cloud-storage)
  Downloading google_api_core-2.25.1-py3-none-any.whl.metadata (3.0 kB)
Downloading gcsfs-2025.7.0-py2.py3-none-any.whl (36 kB)
Downloading fsspec-2025.7.0-py3-none-any.whl (199 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 199.6/199.6 kB 7.1 MB/s eta 0:00:00
Downloading google_cloud_storage-3.3.0-py3-none-any.whl (274 kB)


In [2]:
# Import the necessary modules
import json
from kaggle_secrets import UserSecretsClient
import gcsfs

# Fetch the credentials JSON kept in Kaggle Secrets under the label
# 'GCS_CREDENTIALS' and decode it into a plain dict.
user_secrets = UserSecretsClient()
gcp_credentials = user_secrets.get_secret("GCS_CREDENTIALS")
gcp_creds_dict = json.loads(gcp_credentials)

# Build the GCS filesystem handle, handing it the decoded credentials as token.
# NOTE(review): constructing the handle may not contact GCS, so the message
# below presumably only confirms the secret was readable valid JSON — confirm.
fs = gcsfs.GCSFileSystem(token=gcp_creds_dict)

print("✅ Authentication successful!", "GCS filesystem is ready to use.", sep="\n")


✅ Authentication successful!
GCS filesystem is ready to use.


In [5]:
# Bucket that stores the datasets used throughout this notebook
BUCKET_NAME = "ring-ami-dataset-storage"

# Connection smoke test: print every top-level entry of the bucket, one per line
print(f"Contents of gs://{BUCKET_NAME}/:")
file_list = fs.ls(f"gs://{BUCKET_NAME}")
for bucket_entry in file_list:
    print(bucket_entry)


Contents of gs://ring-ami-dataset-storage/:
ring-ami-dataset-storage/MiniLibriMix
ring-ami-dataset-storage/ami_public_manual_1.6.2
ring-ami-dataset-storage/array1-01
ring-ami-dataset-storage/array1-02
ring-ami-dataset-storage/array1-03
ring-ami-dataset-storage/array1-04
ring-ami-dataset-storage/array1-05
ring-ami-dataset-storage/array1-06
ring-ami-dataset-storage/array1-07
ring-ami-dataset-storage/array1-08
ring-ami-dataset-storage/headset


In [8]:
# Stage 1 — Build a canonical speaker map from AMI meetings.xml

# XML parsing for the AMI manifest (standard library)
import xml.etree.ElementTree as ET

# Notebook progress-bar helper; imported here but not used by this cell
from tqdm.notebook import tqdm


# Location of the AMI manifest that lists every meeting and its speakers
MEETINGS_XML_PATH = "ring-ami-dataset-storage/ami_public_manual_1.6.2/corpusResources/meetings.xml"

print(f"Attempting to parse master file: {MEETINGS_XML_PATH}")

# Result container: {meeting_id: {channel (int): speaker global_name (str)}}
channel_to_speaker_map = {}


# Any I/O or parsing failure is reported as a single message at the bottom
try:

    # Read the manifest through the shared filesystem handle, decoded as ISO-8859-1
    with fs.open(MEETINGS_XML_PATH, 'r', encoding='ISO-8859-1') as f:
        root = ET.parse(f).getroot()

        for meeting in root.findall('meeting'):
            meeting_id = meeting.get('observation')

            # Entries without an 'observation' attribute carry no usable ID
            if not meeting_id:
                continue

            # Register the meeting first so it appears even if it has no speakers
            speakers_by_channel = {}
            channel_to_speaker_map[meeting_id] = speakers_by_channel

            for speaker in meeting.findall('speaker'):
                channel = speaker.get('channel')
                speaker_name = speaker.get('global_name')

                # A mapping needs both the channel index and the speaker's name
                if not (channel and speaker_name):
                    continue

                # The manifest stores the channel as text; key the map by int
                speakers_by_channel[int(channel)] = speaker_name

    # Reached only when the manifest was read and parsed without errors
    print("\n Master speaker map built successfully!")
    print(f"Found mappings for {len(channel_to_speaker_map)} meetings.")
    print("\n--- Example mapping for meeting 'IS1000a': ---")
    print(channel_to_speaker_map.get('IS1000a'))

# Single failure path: show the exception text instead of a traceback
except Exception as e:
    print(f"\n An error occurred: {e}")

Attempting to parse master file: ring-ami-dataset-storage/ami_public_manual_1.6.2/corpusResources/meetings.xml

✅ Master speaker map built successfully!
Found mappings for 171 meetings.

--- Example mapping for meeting 'IS1000a': ---
{3: 'MIO016', 1: 'MIO082', 0: 'FIE081', 2: 'MIO050'}


In [9]:
# STAGE 2: Use the map to parse the segment files correctly

# (This assumes the 'channel_to_speaker_map' from Stage 1 is in memory)

# Standard library XML parser for segment-level annotations
import xml.etree.ElementTree as ET

# Notebook-friendly progress display for batch file iteration
from tqdm.notebook import tqdm

# DataFrame utilities for structuring and aggregating parsed annotations
import pandas as pd


# Path to the AMI directory that holds per-meeting segment XMLs
SEGMENTS_PATH = "ring-ami-dataset-storage/ami_public_manual_1.6.2/segments"

# Attributes a <segment> must expose to be usable
REQUIRED_SEGMENT_ATTRS = ('channel', 'transcriber_start', 'transcriber_end')

# Columns kept in each per-meeting record
SEGMENT_COLUMNS = ['speaker_id', 'begin_time', 'end_time']

# Enumerate all segment XML files available in the path
segment_files = fs.glob(f"{SEGMENTS_PATH}/*.xml")

# Accumulator for parsed results across all meetings
all_speaker_segments = []

# Iterate through every segment file with a progress bar
for file_path in tqdm(segment_files, desc="Parsing Segments with Map"):

    # The meeting identifier is the part of the filename before the first dot
    meeting_id = file_path.split('/')[-1].split('.')[0]

    # Skip files for which no channel→speaker mapping is available
    speakers_by_channel = channel_to_speaker_map.get(meeting_id)
    if speakers_by_channel is None:
        continue

    # Only the parse itself can raise ParseError, so only it is guarded
    try:
        with fs.open(file_path, 'r', encoding='ISO-8859-1') as f:
            root = ET.parse(f).getroot()
    except ET.ParseError:
        print(f"\nWarning: Could not parse XML file: {file_path}. Skipping.")
        continue

    for segment in root.findall('segment'):
        attrs = segment.attrib

        # Ignore segments lacking the channel or either time boundary
        if not all(key in attrs for key in REQUIRED_SEGMENT_ATTRS):
            continue

        # The channel index resolves to a speaker via the Stage 1 map
        speaker_id = speakers_by_channel.get(int(attrs['channel']))

        # Only materialize the record if a valid mapping exists
        if speaker_id:
            all_speaker_segments.append({
                "meeting_id": meeting_id,
                "speaker_id": speaker_id,
                "begin_time": float(attrs['transcriber_start']),
                "end_time": float(attrs['transcriber_end'])
            })


# --- Validation and aggregation ---
# Always (re)define the result so a failed run cannot leave it undefined or
# leave a stale value from a previous execution for the next cell to pick up.
annotations_by_meeting = {}

if not all_speaker_segments:

    # No usable annotations were found across the dataset
    print("\n❌ ERROR: No valid segment data found even with the speaker map.")

else:
    # Construct a DataFrame with all parsed speaker segments
    annotations_df = pd.DataFrame(all_speaker_segments)

    # Build {meeting_id: [record, ...]} by iterating the groups directly.
    # This replaces groupby(...).apply(...) on the whole frame, which operated
    # on the grouping column as well and emitted a pandas deprecation warning.
    annotations_by_meeting = {
        group_key: group[SEGMENT_COLUMNS].to_dict('records')
        for group_key, group in annotations_df.groupby('meeting_id')
    }

    print("\n Speaker timelines parsed successfully using the map!")
    print(f"Processed data for {len(annotations_by_meeting)} unique meetings.")

    # Display a sample of the output for a single meeting
    print("\n--- First 5 speaker segments for a sample meeting: ---")
    sample_meeting_key = next(iter(annotations_by_meeting))
    print(f"Sample Meeting ID: {sample_meeting_key}")
    print(annotations_by_meeting.get(sample_meeting_key)[:5])


Parsing Segments with Map:   0%|          | 0/687 [00:00<?, ?it/s]


✅ Speaker timelines parsed successfully using the map!
Processed data for 171 unique meetings.

--- First 5 speaker segments for a sample meeting: ---
Sample Meeting ID: EN2001a
[{'speaker_id': 'MEE068', 'begin_time': 5.496, 'end_time': 6.07}, {'speaker_id': 'MEE068', 'begin_time': 11.04, 'end_time': 15.632}, {'speaker_id': 'MEE068', 'begin_time': 18.883, 'end_time': 19.373}, {'speaker_id': 'MEE068', 'begin_time': 21.392, 'end_time': 25.857}, {'speaker_id': 'MEE068', 'begin_time': 28.033, 'end_time': 40.152}]


  annotations_by_meeting = annotations_df.groupby('meeting_id').apply(


In [13]:
# FINAL STAGE: Sliding Window Processing with Accurate File Paths

import numpy as np
import soundfile as sf
from tqdm.notebook import tqdm
import pickle

# --- Runtime Configuration ---
# Size of each sliding analysis window (in seconds)
WINDOW_SIZE = 3.0

# Hop interval between consecutive windows (in seconds)
HOP_SIZE = 1.5

# Hard limit on number of meetings to process (None = process all available)
MEETING_LIMIT = 5

# Audio sampling frequency in Hz that every loaded file is required to have;
# window boundaries are converted from seconds to samples with this value
SAMPLE_RATE = 16000

# Number of samples every stored clip must contain
EXPECTED_CLIP_LEN = int(WINDOW_SIZE * SAMPLE_RATE)

# Define GCS paths
HEADSET_AUDIO_PATH = "ring-ami-dataset-storage/headset"
ARRAY_AUDIO_PATH = "ring-ami-dataset-storage/array1-01"

# Output pickle file to store processed dataset
OUTPUT_PATH = "/kaggle/working/processed_ami_data.pkl"

print("Starting final processing with corrected file paths...")

# Limit meetings if MEETING_LIMIT is set
meeting_ids_to_process = list(annotations_by_meeting.keys())
if MEETING_LIMIT:
    meeting_ids_to_process = meeting_ids_to_process[:MEETING_LIMIT]

# Master accumulator for all sliding window segments across meetings
processed_data = []

# Iterate through every selected meeting with progress feedback
for meeting_id in tqdm(meeting_ids_to_process, desc="Processing Meetings"):

    # Retrieve segment-level annotations for this meeting
    annotations = annotations_by_meeting.get(meeting_id, [])
    if not annotations:
        continue

    # Sample rates reported by the files of this meeting, checked after loading
    observed_rates = set()

    try:
        # --- HEADSET AUDIO LOADING ---
        # 1. Load every headset channel of the meeting
        headset_files = fs.glob(f"{HEADSET_AUDIO_PATH}/{meeting_id}/audio/{meeting_id}.Headset-*.wav")
        if not headset_files:
            print(f"Warning: No headset audio files found for {meeting_id}. Skipping.")
            continue

        headset_tracks = []
        for file_path in headset_files:
            with fs.open(file_path, 'rb') as f:
                audio, track_rate = sf.read(f)
            observed_rates.add(track_rate)
            headset_tracks.append(audio)

        # Mix tracks by summing them into a buffer as long as the longest track;
        # shorter tracks simply contribute nothing past their own end.
        max_len = max(len(track) for track in headset_tracks)
        ihm_full_audio = np.zeros(max_len)
        for track in headset_tracks:
            ihm_full_audio[:len(track)] += track

        # 2. Load the single array microphone track
        array_file = fs.glob(f"{ARRAY_AUDIO_PATH}/{meeting_id}/audio/{meeting_id}.Array1-01.wav")[0]
        with fs.open(array_file, 'rb') as f:
            sdm_full_audio, array_rate = sf.read(f)
        observed_rates.add(array_rate)

    except (IndexError, FileNotFoundError):
        print(f"Warning: Could not find all required audio files for meeting {meeting_id}. Skipping.")
        continue

    # The slicing below assumes SAMPLE_RATE; a file at any other rate would be
    # cut at the wrong offsets without any error, so refuse to process it.
    if observed_rates != {SAMPLE_RATE}:
        print(
            f"Warning: Unexpected sample rate(s) {sorted(observed_rates)} for meeting "
            f"{meeting_id} (expected {SAMPLE_RATE}). Skipping."
        )
        continue

    # The latest annotated end time bounds the region that gets windowed
    meeting_duration = max(ann['end_time'] for ann in annotations)

    # --- Sliding Window ---
    for window_start in np.arange(0, meeting_duration - WINDOW_SIZE, HOP_SIZE):
        window_end = window_start + WINDOW_SIZE

        # Speakers having at least one segment that overlaps this window
        active_speakers = {
            ann['speaker_id']
            for ann in annotations
            if ann['begin_time'] < window_end and ann['end_time'] > window_start
        }
        num_speakers = len(active_speakers)

        # Three or more simultaneous speakers fall through to the default label
        label = "Group Discussion"
        if num_speakers == 0:
            label = "Alone / Quiet"
        elif num_speakers == 1:
            label = "Speech / Monologue"
        elif num_speakers == 2:
            label = "One-on-One Conversation"

        start_sample = int(window_start * SAMPLE_RATE)
        end_sample = int(window_end * SAMPLE_RATE)

        ihm_clip = ihm_full_audio[start_sample:end_sample]
        sdm_clip = sdm_full_audio[start_sample:end_sample]

        # Zero-pad clips that run past the end of the recording
        if len(ihm_clip) < EXPECTED_CLIP_LEN:
            ihm_clip = np.pad(ihm_clip, (0, EXPECTED_CLIP_LEN - len(ihm_clip)))
        if len(sdm_clip) < EXPECTED_CLIP_LEN:
            sdm_clip = np.pad(sdm_clip, (0, EXPECTED_CLIP_LEN - len(sdm_clip)))

        processed_data.append({
            "meeting_id": meeting_id, "label": label,
            "ihm_audio": ihm_clip, "sdm_audio": sdm_clip,
            "sample_rate": SAMPLE_RATE,
        })

# --- Save Final Data ---
print(f"\n Successfully generated {len(processed_data)} labeled audio clips.")
print(f"Saving processed data to: {OUTPUT_PATH}")

with open(OUTPUT_PATH, 'wb') as f:
    pickle.dump(processed_data, f)

print("\n--- PREPROCESSING COMPLETE! ---")
print("Your final dataset is ready. You can now proceed to the noise augmentation step.")


Starting final processing with corrected file paths...


Processing Meetings:   0%|          | 0/5 [00:00<?, ?it/s]


✅ Successfully generated 12026 labeled audio clips.
Saving processed data to: /kaggle/working/processed_ami_data.pkl

--- PREPROCESSING COMPLETE! ---
Your final dataset is ready. You can now proceed to the noise augmentation step.


In [None]:
# PHASE 2: Augment with Noise from the Correct Path

import numpy as np
import soundfile as sf
from tqdm.notebook import tqdm
import random
import pickle
from fractions import Fraction
from scipy.signal import resample_poly

print("Starting Phase 2: Noise Augmentation with corrected path...")

# --- Configuration ---
# --- THIS IS THE CORRECTED PATH ---
NOISE_PATH = "ring-ami-dataset-storage/MiniLibriMix/MiniLibriMix/train/mix_both"

SNR_LEVEL_DB = 10
INPUT_PATH = "/kaggle/working/processed_ami_data.pkl"
FINAL_OUTPUT_PATH = "/kaggle/working/final_labeled_dataset.pkl"

# Noise below this mean power is treated as silence and not mixed in
MIN_NOISE_POWER = 1e-6

# Fixed seed so the clip → noise-file pairing is identical on every run
RANDOM_SEED = 42
noise_rng = random.Random(RANDOM_SEED)


def _load_noise(path, target_sr):
    """Read one noise file as a mono signal at `target_sr` Hz.

    Multi-channel files are averaged down to one channel, and the signal is
    resampled whenever the file's own rate differs from `target_sr`.
    NOTE(review): MiniLibriMix is commonly distributed at 8 kHz while the
    speech clips are stored at 16 kHz — confirm against the actual files.
    """
    with fs.open(path, 'rb') as f:
        noise, noise_sr = sf.read(f)

    if noise.ndim > 1:
        noise = noise.mean(axis=1)

    if noise_sr != target_sr:
        ratio = Fraction(int(target_sr), int(noise_sr))
        noise = resample_poly(noise, ratio.numerator, ratio.denominator)

    return noise


def _mix_at_snr(clean, noise, snr_db):
    """Return `clean` plus `noise` scaled to the requested SNR (in dB).

    `noise` is tiled and/or trimmed to the length of `clean`. When the noise is
    empty or its mean power does not exceed MIN_NOISE_POWER, `clean` is returned
    unchanged.
    """
    if len(noise) == 0:
        return clean

    if len(noise) < len(clean):
        repeats = int(np.ceil(len(clean) / len(noise)))
        noise = np.tile(noise, repeats)
    noise = noise[:len(clean)]

    speech_power = np.mean(clean ** 2)
    noise_power = np.mean(noise ** 2)
    if not noise_power > MIN_NOISE_POWER:
        return clean

    scale = np.sqrt(speech_power / (10 ** (snr_db / 10) * noise_power))
    return clean + noise * scale


# --- Load the Data from Phase 1 ---
# pickle is acceptable here only because this file was written by the previous
# cell of this notebook; never unpickle a file from an untrusted source.
print(f"Loading data from {INPUT_PATH}...")
with open(INPUT_PATH, 'rb') as f:
    processed_data = pickle.load(f)

# --- Load Noise Files ---
print(f"Finding all noise files in: {NOISE_PATH}")
try:
    # We don't need a recursive search anymore as we have the full path
    noise_files = fs.glob(f"{NOISE_PATH}/*.wav")
    print(f"Found {len(noise_files)} noise files.")
except Exception as e:
    print(f"❌ Could not find noise files. Error: {e}")
    noise_files = []

# --- Augmentation Loop ---
noisy_samples = []
if noise_files:
    speech_clips = [data for data in processed_data if data['label'] == 'Speech / Monologue']

    for speech_sample in tqdm(speech_clips, desc="Augmenting with Noise"):
        random_noise_file = noise_rng.choice(noise_files)
        noise_audio = _load_noise(random_noise_file, speech_sample['sample_rate'])

        # Add the same noise recording to each microphone signal separately, so
        # 'sdm_audio' stays the distant-microphone clip instead of being
        # replaced by a copy of the noisy headset mix.
        noisy_samples.append({
            "meeting_id": speech_sample['meeting_id'], "label": "Noisy Environment",
            "ihm_audio": _mix_at_snr(speech_sample['ihm_audio'], noise_audio, SNR_LEVEL_DB),
            "sdm_audio": _mix_at_snr(speech_sample['sdm_audio'], noise_audio, SNR_LEVEL_DB),
            "sample_rate": speech_sample['sample_rate'],
        })

    print(f"\n✅ Generated {len(noisy_samples)} 'Noisy Environment' samples.")
    processed_data.extend(noisy_samples)
    print(f"Total processed samples now: {len(processed_data)}")
else:
    print("Skipping noise augmentation as no noise files were found.")

# --- Save the Final, Combined Dataset ---
print(f"\nSaving final combined dataset with all 4 scenarios to: {FINAL_OUTPUT_PATH}")
with open(FINAL_OUTPUT_PATH, 'wb') as f:
    pickle.dump(processed_data, f)

print("\n--- DATASET FULLY COMPLETE! ---")


Starting Phase 2: Noise Augmentation with corrected path...
Loading data from /kaggle/working/processed_ami_data.pkl...
Finding all noise files in: ring-ami-dataset-storage/MiniLibriMix/MiniLibriMix/train/mix_both
Found 800 noise files.


Augmenting with Noise:   0%|          | 0/5971 [00:00<?, ?it/s]


✅ Generated 5971 'Noisy Environment' samples.
Total processed samples now: 17997

Saving final combined dataset with all 4 scenarios to: /kaggle/working/final_labeled_dataset.pkl
