In [1]:
!pip install chat-downloader yt-dlp -q

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
botocore 1.33.1 requires urllib3<2.1,>=1.25.4; python_version >= "3.10", but you have urllib3 2.1.0 which is incompatible.
google-auth 2.22.0 requires urllib3<2.0, but you have urllib3 2.1.0 which is incompatible.
kfp 2.0.1 requires google-cloud-storage<3,>=2.2.1, but you have google-cloud-storage 1.44.0 which is incompatible.
kfp 2.0.1 requires urllib3<2.0.0, but you have urllib3 2.1.0 which is incompatible.
tensorflowjs 4.14.0 requires packaging~=23.1, but you have packaging 21.3 which is incompatible.
ydata-profiling 4.5.1 requires numpy<1.24,>=1.16.0, but you have numpy 1.24.3 which is incompatible.[0m[31m
[0m

In [3]:
import os
import contextlib
from functools import wraps
import re
from itertools import count, groupby
from pathlib import Path
from IPython.display import Audio, display

import librosa
import librosa.display
import matplotlib.pyplot as plt
import pandas as pd
import plotly.graph_objects as go
from scipy.stats import shapiro
import soundfile as sf
import yt_dlp
import chat_downloader
import numpy as np

In [4]:
def download_youtube_audio(url):
    ydl_opts = {
        'format': 'bestaudio', # Using 'worst' directly doesn't seem to work as expected
        'format_sort': ['+size'], # Sorting in reverse order, hence getting the worst
        'outtmpl': '%(id)s.%(ext)s',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'wav',
        }]
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        error_code = ydl.download([url])
        video_info = ydl.extract_info(url, download=False)
    
    video_path_local = Path(f"{video_info['id']}.wav")
    
    return video_path_local

In [5]:
def find_longest_consecutive_group_original(arr):
    arr.sort()

    longest_group = []
    current_group = [arr[0]]

    for i in range(1, len(arr)):
        if arr[i] == current_group[-1] + 1:
            current_group.append(arr[i])
        else:
            if len(current_group) > len(longest_group):
                longest_group = current_group
            current_group = [arr[i]]

    if len(current_group) > len(longest_group):
        longest_group = current_group

    smallest_item = min(longest_group)

    return longest_group, smallest_item

def find_longest_consecutive_group_pythonic(arr):
    groups = [list(g) for k, g in groupby(sorted(arr), key=lambda x, c=count(): x-next(c))]
    longest_group = max(groups, key=len, default=[])
    smallest_item = min(longest_group, default=None)
    return longest_group, smallest_item

def find_lowest_volume_positions(file_path, timestamps_seconds, window_size=60, num_positions=10):
    # Load the audio file
    y, sr = librosa.load(file_path)

    # Calculate the RMS (root mean square) of the audio signal
    rms_values = librosa.feature.rms(y=y, hop_length=sr)[0]

    # Convert timestamps to sample indices
    timestamp_indices = [int(timestamp) for timestamp in timestamps_seconds]

    # Define the time window in samples
    window_size_samples = int(window_size)

    result_positions = []

    for timestamp_index in timestamp_indices:
        # Extract RMS values within the specified time window
        rms_window = rms_values[max(0, timestamp_index - window_size_samples):min(len(rms_values), timestamp_index + window_size_samples)]

        # Check if the window is empty
        if not rms_window.size:
            result_positions.append(None)
            continue

        # Find the top N positions with the lowest volume within the window
        lowest_volume_positions = np.argsort(rms_window)[:num_positions]

        longest_group, smallest_item = find_longest_consecutive_group_pythonic(lowest_volume_positions)

        # Convert positions to timestamps
        most_potent_position = timestamp_index - window_size_samples + smallest_item
        result_positions.append(most_potent_position)

    return result_positions

In [6]:
def extract_and_save_10_seconds_pieces(file_path, positions, output_folder, duration=10):
    if not os.path.exists(file_path):
        print(f"Error: File not found - {file_path}")
        return

    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    y, sr = librosa.load(file_path)

    for i, position in enumerate(positions):
        if position is not None:
            start_time = max(0, position)
            end_time = min(len(y) / sr, position + duration)

            # Extract the 10-second piece
            piece = y[int(start_time * sr):int(end_time * sr)]

            # Save the piece into a WAV file
            output_path = os.path.join(output_folder, f"piece_{i + 1}.wav")
            sf.write(output_path, piece, sr)

            # Display an audio player for the saved piece
            display(Audio(data=piece, rate=sr))

In [7]:
def suppress_print(func):
    @wraps(func)
    def wrapper(*args, **kwargs):
        with open(os.devnull, 'w') as fnull:
            with contextlib.redirect_stdout(fnull):
                result = func(*args, **kwargs)
        return result
    return wrapper

In [8]:
@suppress_print
def download_chat(url, output_path):
    chat_downloader.run(url=url, output=output_path, options=['quiet'])

In [9]:
def process_chat_json(json_file_path):
    # Read the JSON file into a DataFrame
    df = pd.read_json(json_file_path)

    # Select relevant columns
    df = df[['message', 'time_text', 'timestamp']]

    # Convert 'timestamp' column to datetime format
    df['timestamp'] = pd.to_datetime(df['timestamp'])

    # Sort the DataFrame by 'timestamp'
    df = df.sort_values('timestamp')

    # Filter out rows where 'time_text' is greater than '0'
    df = df[df['time_text'] > '0'].reset_index(drop=True)

    # Calculate the time difference between consecutive rows
    df['time_difference'] = df['timestamp'].diff()
    df['time_difference_seconds'] = df['time_difference'].dt.total_seconds()
        
    return df

In [10]:
def calculate_upper_percentile(data, column_name, percentile):
    """
    Calculate the upper percentile of a column in a DataFrame.

    Parameters:
    - data: DataFrame containing the data.
    - column_name: Name of the column for which to calculate the upper percentile.
    - percentile: The desired percentile value.

    Returns:
    - upper_percentile_value: The value at the specified percentile.
    """
    
    upper_percentile_value = data[column_name].quantile(percentile)
    print(f"The value at the upper {percentile}%: {upper_percentile_value}")
    return upper_percentile_value

In [11]:
def check_normality(df, time_difference_column, alpha=0.05):
    """
    Check normality of a time difference column using Shapiro-Wilk test.

    Parameters:
    - df: DataFrame containing the data.
    - time_difference_column: Name of the column containing time differences as Timedelta.
    - alpha: Significance level for the Shapiro-Wilk test.

    Returns:
    - None: Prints the result of the normality test.
    """

    # Drop NaN values if any
    df = df.dropna(subset=[time_difference_column])

    # Perform Shapiro-Wilk test on the converted values
    statistic, p_value = shapiro(df['time_difference_seconds'])
    print(f"Shapiro-Wilk Test:\nStatistic: {statistic}\nP-value: {p_value}")

    # Check normality based on the p-value
    if p_value > alpha:
        print(f"The {time_difference_column} column follows a normal distribution.")
    else:
        print(f"The {time_difference_column} column does not follow a normal distribution.")


In [12]:
def create_kernel_density_plot(data, column_name):
    """
    Create a Kernel Density Plot.

    Parameters:
    - data: DataFrame containing the data.
    - column_name: Name of the column for which to create the Kernel Density Plot.

    Returns:
    - fig: Plotly Figure object.
    """
    fig = go.Figure()
    fig.add_trace(go.Histogram(x=data[column_name], nbinsx=30, histnorm='probability', opacity=0.7,
                               marker=dict(color='rgba(0, 123, 255, 0.7)')))
    fig.update_layout(title=f'Kernel Density Plot of {column_name}',
                      xaxis=dict(title=f'{column_name} (seconds)'),
                      yaxis=dict(title='Probability Density'))
    return fig

def create_box_plot(data, column_name):
    """
    Create a Box Plot.

    Parameters:
    - data: DataFrame containing the data.
    - column_name: Name of the column for which to create the Box Plot.

    Returns:
    - fig: Plotly Figure object.
    """
    fig = go.Figure()
    fig.add_trace(go.Box(y=data[column_name], boxpoints='all', jitter=0.3, pointpos=-1.8,
                         marker=dict(color='rgba(255, 0, 0, 0.7)'), line=dict(color='rgba(0, 0, 0, 0.9)')))
    fig.update_layout(title=f'Boxplot of {column_name}',
                      xaxis=dict(title=''),
                      yaxis=dict(title=f'{column_name} (seconds)'))
    return fig

In [13]:
def find_emojis(dataframe):
    dataframe['emojis'] = dataframe['message'].str.findall(r':([^:]+):')
    return dataframe

def contains_excluded_emojis(emojis, excluded_names=None):
    default_excluded_names = ['clapping_hands', 'washhands']
    
    # If excluded_names is provided, merge it with the default list and remove duplicates
    excluded_names = list(set(excluded_names + default_excluded_names)) if excluded_names else default_excluded_names

    # Check if any excluded emoji name is present in the emojis list
    return any(name in emojis for name in excluded_names)


def filter_emojis(dataframe, emoji_names_to_exclude=None):
    def _has_repeated_emojis(emojis):
        # Check if any emojis are repeated in the list
        return any(emojis.count(e) > 1 for e in set(emojis))

    def _exceeds_non_emoji_length(row):
        # Extract non-emoji part of the message
        non_emoji_part = re.sub(r':[^:]+:', '', row['message'])
        # Check if the length of emojis exceeds the length of the non-emoji part
        return len(row['emojis']) > len(non_emoji_part)

    # If emoji_names_to_exclude is not provided, set it to an empty list
    emoji_names_to_exclude = emoji_names_to_exclude or []

    filtered_dataframe = dataframe[
        (~dataframe['emojis'].apply(contains_excluded_emojis, excluded_names=emoji_names_to_exclude)) &
        (dataframe['emojis'].apply(len) > 0) &  # Filter out rows with empty emojis
        (dataframe['emojis'].apply(_has_repeated_emojis)) &  # Filter out rows with no repeated emojis
        (dataframe.apply(_exceeds_non_emoji_length, axis=1))  # Filter out rows where emoji length <= non-emoji length
    ]

    return filtered_dataframe.sort_values('timestamp').reset_index(drop=True)


def group_and_aggregate(dataframe):
    grouped_df = dataframe.groupby('time_text').agg(
        {'emojis': lambda x: sum(map(len, x)),
         'timestamp': 'first',
         'message': 'last'}
    )
    result = grouped_df[grouped_df['emojis'] > 1]
    return result.sort_values('timestamp').reset_index(drop=False)


def parse_time_and_diff(dataframe):
    dataframe['total_seconds'] = dataframe['time_text'].apply(
        lambda x: sum(int(value) * 60**(i) for i, value in enumerate(
            reversed(x.split(':'))
        ))
    )

    dataframe['time_diff'] = dataframe['total_seconds'].diff()

    return dataframe


def create_time_groups(dataframe, upper_fence):
    dataframe['group'] = (dataframe['time_diff'] > upper_fence).cumsum()
    return dataframe


def filter_time_groups(dataframe, threshold_seconds=120):
    consecutive_groups = dataframe.groupby('group')
    filtered_groups = consecutive_groups.filter(
        lambda x: (x['total_seconds'].max() - x['total_seconds'].min()) > threshold_seconds
    )
    return filtered_groups

In [14]:
def calculate_emoji_percentage(df, emoji_column='emojis'):
    """
    Calculate the percentage of each emoji in a DataFrame.

    Parameters:
    - df (pandas.DataFrame): DataFrame containing the emoji column.
    - emoji_column (str): Name of the column containing emojis. Default is 'emojis'.
    - percent_threshold (float): Minimum percentage for an emoji to be included in the result. Default is 1.

    Returns:
    - pandas.DataFrame: DataFrame with emoji count and percentage.
    """
    # Check if emoji_column exists in the DataFrame
    if emoji_column not in df.columns:
        raise ValueError(f"The DataFrame must have a '{emoji_column}' column.")

    df = df[~df['emojis'].apply(contains_excluded_emojis)]
        
    # Flatten the list of emojis
    all_emojis = [emoji for emojis_list in df[emoji_column] for emoji in emojis_list]
    
    # Create a Series from the list of emojis
    emoji_series = pd.Series(all_emojis)
    
    # Get the frequency of each emoji
    emoji_frequency = emoji_series.value_counts()
    
    # Calculate the total count of all emojis
    total_count = emoji_frequency.sum()
    
    # Calculate the percentage for each emoji
    emoji_percentage = (emoji_frequency / total_count) * 100
    
    # Combine emoji frequency and percentage into a DataFrame
    result_df = pd.DataFrame({'count': emoji_frequency, 'percentage': emoji_percentage})
 
    return result_df

In [15]:
def save_group_info_to_csv(result, output_file='group_info.csv'):
    # Count the number of groups
    num_groups = len(result.groupby('group'))

    # Create a list to store the information for each group
    group_info_list = []

    # Iterate over the filtered groups and append information to the list
    for i, (group_name, group_df) in enumerate(result.groupby('group')):
        first_row = group_df.iloc[0]  # Get the first row of the group
        group_info = {
            "Group Number": i + 1,
            "Time Text": first_row['time_text'],
            "total_seconds": first_row['total_seconds']
        }
        group_info_list.append(group_info)

    # Convert the list of dictionaries to a DataFrame
    group_info_df = pd.DataFrame(group_info_list)

    # Save the DataFrame to a CSV file
    #group_info_df.to_csv(output_file, index=False)
    csv_string = group_info_df.to_csv(index=False)

    print(f"{num_groups} groups were found.")
    print("===")
    print(group_info_df)
    
    return csv_string

In [16]:
def save_group_info_to_csv(result, output_file='group_info.csv'):
    # Group by 'group' and aggregate information for each group
    group_info_df = result.groupby('group').agg(
        Group_Number=pd.NamedAgg(column='group', aggfunc='count'),
        Time_Text=pd.NamedAgg(column='time_text', aggfunc='first'),
        Total_Seconds=pd.NamedAgg(column='total_seconds', aggfunc='first')
    ).reset_index()

    # Save the DataFrame to a CSV file
    group_info_df.to_csv(output_file, index=False)

    num_groups = len(group_info_df)

    print(f"{num_groups} groups were found.")
    print("===")
    print(group_info_df)
    print("===")
    print(f"Group information has been saved to '{output_file}'.")
    print("===")
    
    return group_info_df

In [17]:
def chat_analyzer(youtube_url, json_file_path, upper_percentile, 
                  drop_threshold, duration_threshold):
    
    download_chat(youtube_url, json_file_path)

    # Process chat data
    df = process_chat_json(json_file_path)

    # Calculate upper percentile
    upper_fence = calculate_upper_percentile(df, 'time_difference_seconds', upper_percentile)
    emoji_percents = calculate_emoji_percentage(find_emojis(df))
    drop_emojis = emoji_percents[emoji_percents['percentage'] < drop_threshold].index.tolist()

    # Define emoji names to exclude
    default_drop_emojis = ['clapping_hands', 'washhands', '_ないすぱ']
    emoji_names_to_exclude = list(set(default_drop_emojis + drop_emojis))

    # Perform the data processing pipeline
    result = (
        df.pipe(find_emojis)
          .pipe(filter_emojis, emoji_names_to_exclude)
          .pipe(group_and_aggregate)
          .pipe(parse_time_and_diff)
          .pipe(create_time_groups, upper_fence)
          .pipe(filter_time_groups, threshold_seconds=duration_threshold)
    )

    # Get CSV data as a string
    csv_data = save_group_info_to_csv(result)

    return csv_data

In [18]:
def seconds_to_hh_mm_ss(total_seconds):
    hours = total_seconds // 3600
    minutes = (total_seconds % 3600) // 60
    seconds = total_seconds % 60
    return f"{hours:02}:{minutes:02}:{seconds:02}"

def convert_seconds_to_dataframe(seconds_list):
    data = {'Total Seconds': seconds_list}
    df = pd.DataFrame(data)

    df['Hours'] = df['Total Seconds'] // 3600
    df['Minutes'] = (df['Total Seconds'] % 3600) // 60
    df['Seconds'] = df['Total Seconds'] % 60

    df['Time Format'] = df['Total Seconds'].apply(seconds_to_hh_mm_ss)

    return df[['Total Seconds', 'Time Format']]

In [19]:
def process_youtube_video(youtube_url, csv_data, output_folder='.'):
    # Download audio from YouTube
    downloaded_audio_path = download_youtube_audio(youtube_url)

    # Extract chat-based timestamps from CSV data
    chat_based_ts = csv_data['Total_Seconds'].to_list()

    # Adjust timestamps based on audio volume
    ts_adjusted_with_audio = find_lowest_volume_positions(downloaded_audio_path, chat_based_ts)

    # Extract and save 10-second pieces
    extract_and_save_10_seconds_pieces(downloaded_audio_path, ts_adjusted_with_audio, output_folder)
    
    return convert_seconds_to_dataframe(ts_adjusted_with_audio)

In [20]:
youtube_url = 'https://www.youtube.com/live/s7kh4GEb2sg'
json_file_path = './chat.json'

#https://www.youtube.com/live/s7kh4GEb2sg amane namida
#https://www.youtube.com/watch?v=WyHVJ1-ifdQ kukuri

In [24]:
csv_data = chat_analyzer(youtube_url, json_file_path, 
                         upper_percentile=0.999, drop_threshold=1, 
                         duration_threshold=120)

[INFO] Site: youtube.com
[INFO] Retrieving chat for "【朝活フルート&雑談】初見さん大歓迎🌟メリークリスマス🎄寝起きだけどフルート生演奏♪クリスマスイブはフルートで癒されよ💕【#flute #天音なみだ / #Vtuber #shorts 】".
[INFO] Finished retrieving chat messages.


The value at the upper 0.999%: 82.9124795920026
12 groups were found.
===
    group  Group_Number Time_Text  Total_Seconds
0       0            20      7:14            434
1       1            23     15:44            944
2       2            24     27:36           1656
3       3            23     39:47           2387
4       4            33     49:31           2971
5       5            35   1:06:10           3970
6       7            36   1:30:51           5451
7       9            22   1:48:00           6480
8      10            23   2:03:46           7426
9      11            18   2:17:38           8258
10     12            29   2:26:25           8785
11     13            34   2:38:08           9488
===
Group information has been saved to 'group_info.csv'.
===


In [25]:
adjust_timestamp_table = process_youtube_video(youtube_url, csv_data)

[youtube] Extracting URL: https://www.youtube.com/live/s7kh4GEb2sg
[youtube] s7kh4GEb2sg: Downloading webpage
[youtube] s7kh4GEb2sg: Downloading ios player API JSON
[youtube] s7kh4GEb2sg: Downloading android player API JSON
[youtube] s7kh4GEb2sg: Downloading player 4fd50162
[youtube] s7kh4GEb2sg: Downloading m3u8 information
[info] s7kh4GEb2sg: Downloading 1 format(s): 139
[download] Destination: s7kh4GEb2sg.m4a
[download] 100% of   60.23MiB in 00:00:17 at 3.35MiB/s     
[FixupM4a] Correcting container of "s7kh4GEb2sg.m4a"
[ExtractAudio] Destination: s7kh4GEb2sg.wav
Deleting original file s7kh4GEb2sg.m4a (pass -k to keep)
[youtube] Extracting URL: https://www.youtube.com/live/s7kh4GEb2sg
[youtube] s7kh4GEb2sg: Downloading webpage
[youtube] s7kh4GEb2sg: Downloading ios player API JSON
[youtube] s7kh4GEb2sg: Downloading android player API JSON
[youtube] s7kh4GEb2sg: Downloading m3u8 information


In [26]:
adjust_timestamp_table

Unnamed: 0,Total Seconds,Time Format
0,407,00:06:47
1,918,00:15:18
2,1642,00:27:22
3,2364,00:39:24
4,2935,00:48:55
5,3958,01:05:58
6,5433,01:30:33
7,6451,01:47:31
8,7404,02:03:24
9,8223,02:17:03


[0:04:50]  ・  スタート

[0:09:04]  1.  雪の華  //  中島美嘉

[0:16:05]  2.  ORION  //  中島美嘉

[0:22:33]  3.  ベテルギウス  //  優里

[0:30:01]  4.  カブトムシ  //  aiko

[0:36:15]  5.  三国駅  //  aiko

[0:42:18]  6.  クリスマス・イブ  //  山下達郎

[0:47:08]  7.  恋人がサンタクロース  //  荒井由実（松任谷由実）

[0:54:09]  8.  クリスマスソング  //  back number

[1:00:55]  9.  メリクリ  //  BoA

[1:07:13] 10.  Everything  //  MISIA


0:06:54  All I Want For Christmas Is You / マライア・キャリー

0:15:22  ハナミズキ / 一青窈

0:27:28  Snow halation / μ's

0:39:32  いのちの名前 / 木村弓

0:49:00  すてきなホリデイ / 竹内まりや

1:06:04  私は最強 / Ado

1:19:41  ホール・ニュー・ワールド / Alan Menken 

1:30:38  愛をこめて花束を / Superfly

1:47:35  旅立ちの日に / 高橋浩美

2:03:30  クリスマスソング / back number

2:17:06  光るなら / Goose house

2:26:14  君の知らない物語 / supercell

2:38:23  情熱大陸 / 葉加瀬太郎 