In [1]:
import subprocess

import pandas as pd

In [2]:
# Read the clip list from disk. skiprows=1 discards the first line of the
# file; header=1 then takes the second of the remaining lines as the column
# names, so the labels come from the file's third line and the data rows
# start right after it.
download_list = pd.read_csv(
    filepath_or_buffer='Data/ScreamingDataLink.csv',
    skiprows=1,
    header=1,
)

# Show the table that was just loaded
download_list

Unnamed: 0,# YTID,start_seconds,end_seconds,positive_labels,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10
0,---1_cCGK4M,0,10,"""/m/01g50p",/m/0284vy3,/m/06d_3,/m/07jdr,"/m/07rwm0c""",,,
1,-20uudT97E0,30,40,"""/m/03qc9zr","/m/09x0r""",,,,,,
2,-2yygHLdpXc,20,30,"""/m/03qc9zr""",,,,,,,
3,-3bGlOhRkAo,140,150,"""/m/03qc9zr""",,,,,,,
4,-4pUrlMafww,1,11,"""/m/03qc9zr","/m/09x0r""",,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
1117,zhAUkNhQ7uM,10,20,"""/m/03qc9zr","/m/07s2xch""",,,,,,
1118,zkWoni28n64,70,80,"""/m/03qc9zr","/m/09x0r""",,,,,,
1119,zo10roED5Qg,70,80,"""/m/03qc9zr",/m/04rlf,"/m/09x0r""",,,,,
1120,zodTMCJFKv8,110,120,"""/m/03qc9zr""",,,,,,,


In [3]:
# Print a per-column summary (non-null counts and dtypes) of the loaded table
download_list.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1122 entries, 0 to 1121
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   # YTID           1122 non-null   object
 1   start_seconds    1122 non-null   int64 
 2   end_seconds      1122 non-null   int64 
 3   positive_labels  1122 non-null   object
 4   Unnamed: 4       846 non-null    object
 5   Unnamed: 5       379 non-null    object
 6   Unnamed: 6       138 non-null    object
 7   Unnamed: 7       43 non-null     object
 8   Unnamed: 8       6 non-null      object
 9   Unnamed: 9       2 non-null      object
 10  Unnamed: 10      1 non-null      object
dtypes: int64(2), object(9)
memory usage: 96.6+ KB


In [4]:
# Discard every row whose '# YTID' is the literal string "#NAME?" and renumber
# the surviving rows from 0, all in one chained expression.
download_list = (
    download_list
    .loc[lambda frame: frame['# YTID'].ne('#NAME?')]
    .reset_index(drop=True)
)

# Show the filtered table
download_list

Unnamed: 0,# YTID,start_seconds,end_seconds,positive_labels,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10
0,---1_cCGK4M,0,10,"""/m/01g50p",/m/0284vy3,/m/06d_3,/m/07jdr,"/m/07rwm0c""",,,
1,-20uudT97E0,30,40,"""/m/03qc9zr","/m/09x0r""",,,,,,
2,-2yygHLdpXc,20,30,"""/m/03qc9zr""",,,,,,,
3,-3bGlOhRkAo,140,150,"""/m/03qc9zr""",,,,,,,
4,-4pUrlMafww,1,11,"""/m/03qc9zr","/m/09x0r""",,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
1099,zhAUkNhQ7uM,10,20,"""/m/03qc9zr","/m/07s2xch""",,,,,,
1100,zkWoni28n64,70,80,"""/m/03qc9zr","/m/09x0r""",,,,,,
1101,zo10roED5Qg,70,80,"""/m/03qc9zr",/m/04rlf,"/m/09x0r""",,,,,
1102,zodTMCJFKv8,110,120,"""/m/03qc9zr""",,,,,,,


In [5]:
# Function to trim and resample audio using FFmpeg
def trim_and_resample_audio(input_file, output_file, start_time, end_time, target_sample_rate):
    """Cut a time window out of an audio file and resample it with FFmpeg.

    FFmpeg is invoked directly with an argument list (no shell), so file
    names containing spaces, quotes or other shell metacharacters are passed
    through verbatim.

    Args:
        input_file: Path of the audio file to read.
        output_file: Path of the audio file to write.
        start_time: Start of the window, passed to FFmpeg's ``-ss`` option.
        end_time: End of the window, passed to FFmpeg's ``-to`` option.
        target_sample_rate: Output sample rate, passed to FFmpeg's ``-ar`` option.

    Returns:
        None.

    Raises:
        subprocess.CalledProcessError: If FFmpeg exits with a non-zero status.
        OSError: If the ``ffmpeg`` executable cannot be started
            (typically ``FileNotFoundError`` when it is not on PATH).
    """
    command = [
        'ffmpeg', '-loglevel', 'quiet',
        '-i', str(input_file),
        '-ss', str(start_time),
        '-to', str(end_time),
        '-ar', str(target_sample_rate),
        str(output_file),
    ]
    # check=True surfaces FFmpeg failures instead of silently ignoring the exit
    # status. stdin is detached so an interactive prompt from FFmpeg (e.g. when
    # the output file already exists) cannot block the notebook kernel.
    subprocess.run(command, check=True, stdin=subprocess.DEVNULL)

In [None]:
import os
import yt_dlp

# Create a directory to store downloaded audio files if it doesn't exist
output_dir = 'Data/Screaming'
os.makedirs(output_dir, exist_ok=True)

# Sample rate (Hz) requested from FFmpeg for every trimmed clip
TARGET_SAMPLE_RATE = 44100

# yt-dlp options: pick an audio format, convert it to WAV through the
# FFmpegExtractAudio post-processor and name the file after the video ID.
ydl_opts = {
    'format': 'bestaudio/best',
    'postprocessors': [{
        'key': 'FFmpegExtractAudio',
        'preferredcodec': 'wav',
    }],
    'outtmpl': os.path.join(output_dir, '%(id)s.%(ext)s'),
}

# IDs of clips that could not be downloaded or trimmed, reported at the end
failed_ids = []

# The context manager closes the downloader once the loop is done.
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
    clips = zip(
        download_list['# YTID'],
        download_list['start_seconds'],
        download_list['end_seconds'],
    )

    # Download the full audio track for each clip, then cut out its time window
    for yt_id, start_seconds, end_seconds in clips:
        url = f'https://www.youtube.com/watch?v={yt_id}'

        # Full-length WAV written by yt-dlp, and the trimmed clip derived from it
        downloaded_file = os.path.join(output_dir, f"{yt_id}.wav")
        trimmed_file = os.path.join(output_dir, f"{yt_id}_out.wav")

        # Makes the cell resumable: clips produced by an earlier run are not
        # downloaded again.
        if os.path.exists(trimmed_file):
            print(f"Skipping YTID {yt_id}: trimmed audio already exists")
            continue

        # Tracks which step is running so the error message names the real culprit
        stage = 'downloading audio'
        try:
            ydl.download([url])
            print(f"Downloaded audio for YTID {yt_id}")

            stage = 'trimming audio'
            trim_and_resample_audio(
                downloaded_file, trimmed_file, start_seconds, end_seconds, TARGET_SAMPLE_RATE
            )
            # Guard against a trim step that fails without raising: never delete
            # the download or report success unless the clip is really there.
            if not os.path.exists(trimmed_file):
                raise RuntimeError("FFmpeg did not produce an output file")

            stage = 'removing the untrimmed download'
            os.remove(downloaded_file)

            print(f"Trimmed audio for YTID {yt_id}")
        except Exception as e:
            print(f"Error while {stage} for YTID {yt_id}: {str(e)}")
            failed_ids.append(yt_id)
            # Drop a partially written clip so a later run retries it instead of
            # skipping it as already done.
            if stage == 'trimming audio' and os.path.exists(trimmed_file):
                os.remove(trimmed_file)

print("Download and trimming completed.")
if failed_ids:
    print(f"{len(failed_ids)} clip(s) failed: {failed_ids}")