In [47]:
from private import DEVELOPER_KEY
from googleapiclient.discovery import build
from pprint import pprint
from emotion_text_classifier import analyze_sentiment
import pandas as pd
import csv
from langdetect import detect
from pyspark.sql import SparkSession, functions, types, udf

In [48]:
# Requirements:
# pip install tf-keras

In [49]:
api_service_name = "youtube"
api_version = "v3"

> Build YouTube API ETL pipeline

In [50]:
youtube = build(api_service_name, api_version, developerKey=DEVELOPER_KEY)

In [51]:
# Get video_id from all_audios_emotion.csv
video_ids = pd.read_csv('all_audios_emotion.csv')['video_id'].tolist()

In [52]:
request = youtube.videos().list(
    part = ['statistics', 'snippet'],
    id = video_ids
)    
response = request.execute()

In [53]:
contents = []
for item in response['items']:
    statisitcs = item['statistics']
    statisitcs.pop('favoriteCount', None)
    # tags to string
    tags = item['snippet'].get('tags', [])
    tags = ', '.join(tags)
    statisitcs.update({'tags': tags})
    contents.append(statisitcs)

In [54]:
# Check if the comment is in English
def is_english(s):
    try:
        return detect(s) == 'en'
    except:
        return False

def parse_sentiment(sentiment_list):
    sentiment_sums = {}
    sentiment_counts = {}

    for sentiment_str in sentiment_list:
        sentiments = sentiment_str.split("\n")
        for sentiment in sentiments:
            sentiment_name, sentiment_value = sentiment.split(" : ")
            sentiment_value = float(sentiment_value.strip("%"))
            sentiment_sums[sentiment_name] = sentiment_sums.get(sentiment_name, 0) + sentiment_value
            sentiment_counts[sentiment_name] = sentiment_counts.get(sentiment_name, 0) + 1

    sentiment_averages = {sentiment: sentiment_sums[sentiment] / sentiment_counts[sentiment] for sentiment in sentiment_sums}
    return sentiment_averages

In [55]:
# Get youtube video comments
average_sentiments = []
for video_id in video_ids:
    request = youtube.commentThreads().list(
        part = 'snippet',
        videoId = video_id,
        maxResults = 50
    )
    response = request.execute()
    comments_temp = []
    for item in response['items']:
        # Navigate through the nested dictionaries to get the comment text
        comment = item['snippet']['topLevelComment']['snippet']
        # Check if the comment is english and length is less than 100
        if is_english(comment['textDisplay']) and len(comment['textDisplay']) < 500:
            comments_temp.append(comment['textDisplay'])
    avg_sentiment = parse_sentiment([analyze_sentiment(comment) for comment in comments_temp])
    average_sentiments.append(avg_sentiment)

# Print the 5 comments
print(average_sentiments[:5])
print(len(average_sentiments))

[{' anger': 24.14157894736842, ' neutral': 34.918000000000006, ' sadness': 8.57375, ' joy': 60.6912, ' surprise': 6.186, ' disgust': 3.0999999999999996, ' fear': 4.725}, {' anger': 16.611428571428572, ' surprise': 21.21619047619047, ' disgust': 8.8125, ' neutral': 48.231351351351364, ' joy': 43.95172413793104, ' sadness': 15.193999999999999, ' fear': 6.4825}, {' anger': 15.55136363636364, ' surprise': 13.644375, ' joy': 32.865, ' neutral': 54.125, ' sadness': 33.400666666666666, ' fear': 9.487499999999999, ' disgust': 8.778}, {' joy': 53.646999999999984, ' neutral': 34.44842105263158, ' anger': 6.503478260869565, ' surprise': 17.156000000000002, ' fear': 40.66, ' sadness': 30.954615384615384, ' disgust': 49.07500000000001}, {' neutral': 40.12250000000001, ' disgust': 7.099999999999999, ' sadness': 2.068888888888889, ' joy': 64.51037037037035, ' anger': 12.133529411764707, ' surprise': 20.912222222222223, ' fear': 1.5859999999999999}]
10


In [56]:
# Convert joy to happiness
for sentiment in average_sentiments:
    sentiment[' happiness'] = sentiment.pop(' joy')

> The sentiment analysis model is trained on BERT, which typically has a maximum input length of 512 tokens. So I filtered out the comments that are longer than 500 tokens.

In [57]:
# Create a dictionary with the average sentiment values
average_sentiments_dict = {sentiment: [sentiment_dict[sentiment] for sentiment_dict in average_sentiments] for sentiment in average_sentiments[0]}

In [58]:
# Define a header
header = ['video_id', 'viewCount', 'likeCount', 'commentCount', 'tags'] + ["comment_" + key.strip() for key in list(average_sentiments[0].keys())]

# Write the data to a CSV file
with open('youtube_contents_data.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(header)

    # Iterate over each content item and its index
    for i, content in enumerate(contents):
        # Prepare the tags string
        # Fetch the corresponding sentiment values by index
        sentiment_values = [average_sentiments_dict[sentiment][i] for sentiment in average_sentiments[0].keys()]
        # Write the row to the CSV
        writer.writerow([
            video_ids[i],
            content['viewCount'], content['likeCount'], content['commentCount'], content['tags']
        ] + sentiment_values)