In [13]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

data = pd.read_csv('/kaggle/input/youtube-video-and-channel-analytics/YouTubeDataset_withChannelElapsed.csv')

df_agg = data.groupby('videoId', as_index=False).agg({
    'subscriberCount': 'mean',           
    'likes/subscriber': 'mean', 
    'dislikes/subscriber': 'mean',
    'comments/subscriber': 'mean',
    'views/subscribers': 'mean'

})
df_clean = df_agg[
    (df_agg['subscriberCount'] > 0) &
    (df_agg['likes/subscriber'] >= 0) &
    (df_agg['dislikes/subscriber'] >= 0) &
    (df_agg['comments/subscriber'] >= 0) &
    (df_agg['views/subscribers'] >= 0)
].copy()


# Reconstruct total likes, dislikes, comments, views per video from ratios

df_agg['calculated_likes'] = df_clean['likes/subscriber'] * df_clean['subscriberCount']
df_agg['calculated_dislikes'] = df_clean['dislikes/subscriber'] * df_clean['subscriberCount']
df_agg['calculated_comments'] = df_clean['comments/subscriber'] * df_clean['subscriberCount']
df_agg['calculated_views'] = df_clean['views/subscribers'] * df_clean['subscriberCount']

# 1. Calculate the average number of likes per video

avg_likes_per_video = df_agg['calculated_likes'].mean()
avg_likes_per_video_rounded = round(avg_likes_per_video)  # nearest whole number
print("Question 1: ", avg_likes_per_video_rounded)
# 2. Compute the engagement rate for each video, where engagement rate is defined as the sum of likes, dislikes, and comments divided 
# by the total number of views for that video. Finally, find the average engagement rate across all videos in the dataset.
df_agg['engagement_rate'] = (
    (df_agg['calculated_likes'] + df_agg['calculated_dislikes'] + df_agg['calculated_comments']) /
    df_agg['calculated_views'].replace(0, np.nan)
)
avg_engagement_rate = df_agg['engagement_rate'].mean(skipna=True)
avg_engagement_rate_4dp = round(avg_engagement_rate, 4)
print("Question 2:", avg_engagement_rate_4dp)


# 3. Each video contains 500 bytes of metadata and a 200 KB thumbnail. Calculate the combined storage size for a single video in megabytes (MB). (round to nearest 4 decimal points)
total_bytes = 500 + (200 * 1024)
combined_mega = total_bytes / (1024 * 1024)
result = round(combined_mega, 4)
print("Question 3:", result)

# 4. Each time someone engages with a video, the video’s metadata (likes , dislikes, comments, and thumbnail) is accessed 
# and potentially transmitted. The total data transfer refers to the sum of all data accessed for all engagements. Using the
# total number of engagements and combined storage size of a single video (You calculated in Question 3), 
# estimate the total data transfer required for all engagements with videos in the dataset. 
total_engagements = df_agg['calculated_likes'].sum() + df_agg['calculated_dislikes'].sum() + df_agg['calculated_comments'].sum()
total_data_transfer = total_engagements * result
print("Question 4:", round(total_data_transfer))


# 5. Estimate the number of views per minute, assuming the dataset spans 5 years with views uniformly distributed over time.
time = 5 * 365 * 24 * 60
total_views = df_agg['calculated_views'].sum()
num_views_min = total_views / time
print("Question 5:", round(num_views_min))

# 6. Estimate the amount of data accessed per minute to display video details to viewers.
data_total = num_views_min * combined_mega
print("Question 6:", round(data_total))




/kaggle/input/youtube-video-and-channel-analytics/YouTubeDataset_withChannelElapsed.csv
Question 1:  305
Question 2: 0.0087
Question 3: 0.1958
Question 4: 37359682
Question 5: 11884
Question 6: 2327
