# 0. Install and load libraries, define constants and create folders
The required libraries need only be installed once. They must be imported every time you restart the Python kernel.

In [None]:
pip install google-api-python-client isodate

In [1]:
import csv
import os

import isodate
from googleapiclient.discovery import build

In [2]:
# Your YouTube API key (provided by instructor).
# Preferably set it in the YOUTUBE_API_KEY environment variable so the key is
# never saved inside the notebook; otherwise replace the placeholder below.
API_KEY = os.environ.get("YOUTUBE_API_KEY") or "<YOUR_API_KEY>"

# Folder (relative to the notebook) in which all CSV files are stored
CSV_OUTPUT_FOLDER = "csv_output"

In [None]:
import os

# Make sure the output folder is in place before any CSV file is written
if os.path.exists(CSV_OUTPUT_FOLDER):
    print(f"Folder '{CSV_OUTPUT_FOLDER}' already exists.")
else:
    os.makedirs(CSV_OUTPUT_FOLDER)
    print(f"Folder '{CSV_OUTPUT_FOLDER}' created.")


# 1. Define a function to get all videos from a channel

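The function `get_channel_videos` takes a valid YouTube channel ID and returns the metadata of all videos uploaded to that channel as a list. The API key is read from the `API_KEY` constant defined above. Use `write_videos_to_csv` (section 3) to save the returned list to a CSV file.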

In [4]:
# This function retrieves all videos from a YouTube channel with the given ID
def get_channel_videos(channel_id):
    """Return metadata for every video in a channel's uploads playlist.

    Args:
        channel_id: ID of the YouTube channel, passed as ``id`` to the
            ``channels.list`` endpoint.

    Returns:
        A list of dicts, one per video, with the keys ``published_at``,
        ``channel_id``, ``yt_id``, ``url``, ``title``, ``description``,
        ``duration`` (in seconds), ``views``, ``language``, ``thumbnail`` and
        ``channel_title``. ``views``, ``language`` and ``thumbnail`` are set
        to "N/A" when the API response does not contain them.

    Raises:
        ValueError: If the API returns no channel for ``channel_id``.
    """
    youtube = build('youtube', 'v3', developerKey=API_KEY)

    # Fetch the playlist ID for the channel's uploaded videos
    response = youtube.channels().list(
        part="contentDetails",
        id=channel_id
    ).execute()
    channels = response.get('items')
    if not channels:
        # Fail with a readable message instead of a bare KeyError/IndexError
        raise ValueError(f"No YouTube channel found for ID '{channel_id}'.")
    uploads_playlist_id = channels[0]['contentDetails']['relatedPlaylists']['uploads']

    videos = []
    next_page_token = None

    while True:
        # Fetch one page (at most 50 entries) of the uploads playlist
        playlist_response = youtube.playlistItems().list(
            part="snippet",
            playlistId=uploads_playlist_id,
            maxResults=50,
            pageToken=next_page_token
        ).execute()

        video_ids = [
            item['snippet']['resourceId']['videoId']
            for item in playlist_response.get('items', [])
        ]

        # A page never holds more than 50 IDs, so one details request per page is enough
        if video_ids:
            video_details_response = youtube.videos().list(
                part="snippet,statistics,contentDetails",
                id=','.join(video_ids)
            ).execute()

            for video in video_details_response.get('items', []):
                snippet = video['snippet']

                # Prefer the "high" thumbnail but accept a smaller one if it is missing
                thumbnails = snippet.get('thumbnails', {})
                thumbnail = (
                    thumbnails.get('high')
                    or thumbnails.get('medium')
                    or thumbnails.get('default')
                    or {}
                )

                videos.append({
                    'published_at': snippet['publishedAt'],
                    'channel_id': snippet['channelId'],
                    'yt_id': video['id'],
                    "url": f"https://www.youtube.com/watch?v={video['id']}",
                    "title": snippet["title"],
                    "description": snippet["description"],
                    "duration": isodate.parse_duration(video['contentDetails']['duration']).total_seconds(),
                    # The view count can be absent from the statistics; do not crash on it
                    "views": video.get("statistics", {}).get("viewCount", "N/A"),
                    "language": snippet.get("defaultAudioLanguage", "N/A"),
                    "thumbnail": thumbnail.get("url", "N/A"),
                    "channel_title": snippet["channelTitle"],
                })

        # Continue with the next page until the API stops returning a token
        next_page_token = playlist_response.get('nextPageToken')
        if next_page_token is None:
            break

    return videos

# 2. Define a function to search for videos and sort by views

In [5]:
# This function performs a video search on YouTube using the API
def search_youtube_videos(query, num_results=50, relevance_language='de'):
    """Search YouTube for videos and return them sorted by view count.

    Args:
        query: Search term passed to the ``search.list`` endpoint.
        num_results: Maximum number of videos to return. Fewer are returned
            when the search yields fewer hits.
        relevance_language: Value sent as ``relevanceLanguage`` with the
            search request (defaults to ``'de'``).

    Returns:
        A list of dicts, ordered by descending view count, with the keys
        ``published_at``, ``channel_id``, ``yt_id``, ``url``, ``title``,
        ``description``, ``duration`` (in seconds), ``views``, ``language``,
        ``thumbnail`` and ``channel_title``. ``views``, ``language`` and
        ``thumbnail`` are set to "N/A" when the API response does not contain
        them; videos without a view count are sorted last.
    """
    youtube = build('youtube', 'v3', developerKey=API_KEY)

    video_ids = []
    next_page_token = None

    while len(video_ids) < num_results:
        # Ask only for as many hits as are still needed (the API allows at most 50 per page)
        remaining = num_results - len(video_ids)
        search_response = youtube.search().list(
            q=query,
            type='video',
            part='id,snippet',
            maxResults=min(50, remaining),
            relevanceLanguage=relevance_language,
            pageToken=next_page_token
        ).execute()

        page_ids = [item['id']['videoId'] for item in search_response.get('items', [])]
        video_ids.extend(page_ids)
        next_page_token = search_response.get('nextPageToken')

        # Stop when the search is exhausted (no further page or an empty page)
        if not page_ids or not next_page_token:
            break

    # Never return more videos than requested
    video_ids = video_ids[:num_results]

    # Fetch video details in batches of 50
    all_videos = []
    for i in range(0, len(video_ids), 50):
        videos_response = youtube.videos().list(
            id=','.join(video_ids[i:i+50]),
            part='id,snippet,statistics,contentDetails'
        ).execute()

        all_videos.extend(videos_response.get('items', []))

    # Sort videos by view count; a missing count is treated as 0 for ordering
    sorted_videos = sorted(
        all_videos,
        key=lambda video: int(video.get('statistics', {}).get('viewCount', 0)),
        reverse=True
    )

    # Build one result row per video
    result_list = []
    for item in sorted_videos:
        snippet = item["snippet"]

        # Prefer the "high" thumbnail but accept a smaller one if it is missing
        thumbnails = snippet.get("thumbnails", {})
        thumbnail = (
            thumbnails.get("high")
            or thumbnails.get("medium")
            or thumbnails.get("default")
            or {}
        )

        result_list.append({
            "published_at": snippet["publishedAt"],
            "channel_id": snippet["channelId"],
            "yt_id": item["id"],
            "url": f"https://www.youtube.com/watch?v={item['id']}",
            "title": snippet["title"],
            "description": snippet["description"],
            "duration": isodate.parse_duration(item['contentDetails']['duration']).total_seconds(),
            "views": item.get("statistics", {}).get("viewCount", "N/A"),
            "language": snippet.get("defaultAudioLanguage", "N/A"),
            "thumbnail": thumbnail.get("url", "N/A"),
            "channel_title": snippet["channelTitle"]
        })

    return result_list

# 3. Define a function to write videos to CSV file

In [6]:
# This function takes a list of videos and their metadata and saves the information to a CSV file
def write_videos_to_csv(videos, csv_file):
    """Write video metadata rows to a UTF-8 encoded CSV file with a header row.

    Args:
        videos: Iterable of dicts keyed by the column names listed in
            ``fields`` below. A missing key is written as an empty value;
            an unknown key makes ``csv.DictWriter`` raise ``ValueError``.
        csv_file: Path of the CSV file to write. An existing file is
            overwritten and a missing parent folder is created.
    """
    fields = ["published_at", "channel_id", "yt_id", "url", "title", "description", "duration", "views", "language", "thumbnail", "channel_title"]

    # Create the target folder if needed so the write does not depend on an earlier cell
    parent_folder = os.path.dirname(csv_file)
    if parent_folder:
        os.makedirs(parent_folder, exist_ok=True)

    # Write data to the CSV file
    with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
        # Quoting every value and doubling embedded quotes (the csv default) is
        # sufficient here. No escape character is set, so backslashes in titles
        # and descriptions are written unchanged and plain CSV readers
        # (csv.reader, pandas.read_csv) get the original text back.
        writer = csv.DictWriter(file, fieldnames=fields, quoting=csv.QUOTE_ALL)

        # Header row first, then one row per video
        writer.writeheader()
        writer.writerows(videos)

    print(f"Data successfully written to {csv_file}")
    

# 4. Run for a YouTube channel

In [None]:
# ID of the YouTube channel whose uploads are exported (here: the TED channel)
channel_id = "UCAuUUnT6oDeKwE6v1NGQxug"

# Collect the metadata of every video on that channel
videos = get_channel_videos(channel_id)

# File name used for this export inside the output folder
output_file = "videos_from_ted_channel.csv"

# Save the collected rows as CSV
write_videos_to_csv(videos, CSV_OUTPUT_FOLDER + "/" + output_file)

# 5. Run for a keyword search on YouTube

In [None]:
# Query the YouTube search through the API using the function defined above
videos = search_youtube_videos("Vegane Ernährung", 50)

# File name used for this export inside the output folder
output_file = "videos_from_search.csv"

# Save the search results as CSV
write_videos_to_csv(videos, CSV_OUTPUT_FOLDER + "/" + output_file)

# 6. Combine output of multiple CSV files into one

In [None]:
import glob
import os

import pandas as pd

output_file_name = "videos_combined.csv"

# Get a sorted list of all CSV files in the output directory. The combined file
# from an earlier run is skipped; otherwise every re-run of this cell would
# read its own previous output and duplicate all rows.
all_files = sorted(
    filename
    for filename in glob.glob(CSV_OUTPUT_FOLDER + "/*.csv")
    if os.path.basename(filename) != output_file_name
)

# Create an empty list to store DataFrames
dfs = []

# Loop through each file and read it into a DataFrame, then append to the list
for filename in all_files:
    print(filename)
    df = pd.read_csv(filename)
    dfs.append(df)

if dfs:
    # Concatenate all DataFrames in the list into a single DataFrame
    combined_df = pd.concat(dfs, ignore_index=True)

    # Write the combined DataFrame to a new CSV file
    combined_df.to_csv(f"{CSV_OUTPUT_FOLDER}/{output_file_name}", index=False)
else:
    # pd.concat raises on an empty list, so report the situation instead
    print(f"No CSV files found in '{CSV_OUTPUT_FOLDER}'; nothing to combine.")