# Import packages

In [None]:
from datetime import datetime as dt

import plotly.express as px
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import seaborn as sns
import matplotlib.pyplot as plt
import os

# Load data 

In [None]:
# Input dataset.
# NOTE(review): the leading '/' makes this an absolute path ('/..' is still the
# filesystem root on POSIX, so it resolves to /sample_data/...). If a path
# relative to the notebook was intended, it should start with '../' — confirm.
PATH = '/../sample_data/entube_final.parquet'

# Output directory prefix for the profiling JSON exports written further down
# (`DIR_PATH + '<name>.json'`). It was referenced but never defined, so those
# cells raised NameError on a fresh kernel. Defaults to the folder that holds
# the input file; the trailing separator is required because the file name is
# appended by plain string concatenation. Adjust if exports belong elsewhere.
DIR_PATH = os.path.join(os.path.dirname(PATH), '')

In [None]:
# Read the parquet file at PATH into the working DataFrame used throughout the notebook.
df = pd.read_parquet(PATH)

# Data Understanding

## sample 

In [None]:
# Preview the first five rows to check column names and value formats.
df.head(5)

Unnamed: 0,channel_id,channel_name,channel_category,channel_started,channel_rank,channel_subscribers,id,title,title_length,categories,...,comment_count,dislike_count,like_per_view,comment_per_view,dislike_per_view,engagement_rate_1,engagement_rate_2,q_score,label_1,label_2
0,UC0jDoh3tVXCaqJ6oTve8ebA,FAP TV,Comedy,2014,2,12800000,nehrVdADdH0,[FAP TV ] Thông Báo Tuyển Diễn Viên Nam Film L...,14,Film & Animation,...,827,105,0.020731,0.002456,0.000312,0.023187,0.023499,0.970364,2,2
1,UC0jDoh3tVXCaqJ6oTve8ebA,FAP TV,Comedy,2014,2,12800000,K66wOEaBwK4,Phía Sau Một Cô Gái - Soobin Hoàng Sơn | MV Fa...,14,Comedy,...,1594,664,0.010124,0.000622,0.000259,0.010746,0.011005,0.95007,1,2
2,UC0jDoh3tVXCaqJ6oTve8ebA,FAP TV,Comedy,2014,2,12800000,D00vn3X7oI8,FAPtv Cơm Nguội: Tập 94 - Dấu Ấn Học Đường Phần 2,12,Entertainment,...,2214,3089,0.006007,0.000234,0.000326,0.006241,0.006568,0.89692,1,2
3,UC0jDoh3tVXCaqJ6oTve8ebA,FAP TV,Comedy,2014,2,12800000,G22G1k3G-kM,FAPtv Cơm Nguội: Tập 100 - Hành Trình Vui Vẻ,10,Entertainment,...,1752,2202,0.007931,0.000228,0.000287,0.008159,0.008446,0.930209,1,2
4,UC0jDoh3tVXCaqJ6oTve8ebA,FAP TV,Comedy,2014,2,12800000,G5EG7ymPErw,FAPtv Cơm Nguội: Tập 95 - Dấu Ấn Học Đường Phầ...,12,Entertainment,...,2417,2208,0.006485,0.000303,0.000277,0.006789,0.007066,0.918029,1,2


## size 

In [None]:
# Number of rows in the dataset.
len(df)

23738

## missing rate per column

In [None]:
# Fraction of null values in each column (mean of the boolean null mask).
df.isnull().mean()

channel_id             0.000000
channel_name           0.000000
channel_category       0.000000
channel_started        0.000000
channel_rank           0.000000
channel_subscribers    0.000000
id                     0.000000
title                  0.000000
title_length           0.000000
categories             0.000000
description            0.017609
tags                   0.000000
num_tags               0.000000
upload_date            0.000000
delta_upload_date      0.000000
duration               0.000000
view_count             0.000000
like_count             0.000000
comment_count          0.000000
dislike_count          0.000000
like_per_view          0.000000
comment_per_view       0.000000
dislike_per_view       0.000000
engagement_rate_1      0.000000
engagement_rate_2      0.000000
q_score                0.000000
label_1                0.000000
label_2                0.000000
dtype: float64

## duplicated rate

In [None]:
# Reload the dataset from PATH before the duplicate check.
# NOTE(review): no cell between the first load and this one modifies `df`,
# so this reload looks redundant — confirm before removing.
df = pd.read_parquet(PATH)

In [None]:
# Share of rows left after dropping repeated `id` values; 1.0 means every id is unique.
len(df.drop_duplicates(['id']))/len(df)

1.0

In [None]:
# Row count again, for reference next to the uniqueness ratio.
len(df)

23738

## check filter 

### likes greater than dislikes

In [None]:
# Fraction of videos with strictly more likes than dislikes.
len(df[df['like_count'] > df['dislike_count']]) / len(df)

1.0

### at least one dislike

In [None]:
# Fraction of videos with at least one dislike (dislike_count > 0).
len(df[df['dislike_count'] > 0]) / len(df)

1.0

### duration

In [None]:
# Fraction of videos whose duration is at least 1.
# NOTE(review): the unit of `duration` is not shown here (presumably seconds) — confirm.
len(df[df['duration'] >= 1]) / len(df)

1.0

# EDA 

## label 

In [None]:
# Render floats with thousands separators and six decimals in the tables below.
pd.set_option('display.float_format', '{:,.6f}'.format)

# Quantiles passed to describe(): 1%, then 5% through 95% in 5-point steps, then 99%.
percentiles = [0.01, *(0.05 * step for step in range(1, 20)), 0.99]

### label 1 

In [None]:
def get_distribution_from_label(df, label: str, value: int, columns: list):
    """Summarise `columns` for the rows of `df` whose `label` column equals `value`.

    Returns the `describe()` table of the selected rows. The quantile rows come
    from the notebook-level `percentiles` list, which must be defined before
    this function is called.
    """
    matches = df[label] == value
    selected = df.loc[matches, columns]
    return selected.describe(percentiles=percentiles)

#### Not Engage

In [None]:
# Arguments shared by the label_1 summaries that follow.
label = 'label_1'
value = 0 # Not Engage
columns = ['label_1', 'engagement_rate_1']  # columns to summarise

In [None]:
# Distribution of `columns` for rows where `label` equals `value` (set in the cell above).
get_distribution_from_label(df, label, value, columns)

Unnamed: 0,label_1,engagement_rate_1
count,8178.0,8178.0
mean,0.0,0.00289
std,0.0,0.001121
min,0.0,2e-06
1%,0.0,0.000497
5%,0.0,0.001073
10%,0.0,0.001365
15%,0.0,0.001609
20%,0.0,0.001846
25%,0.0,0.002042


#### Neutral

In [None]:
# Same summary for rows where `label` equals 1; `label` and `columns` must still hold
# the values assigned earlier in this section.
get_distribution_from_label(df, label, 1, columns)

Unnamed: 0,label_1,engagement_rate_1
count,7107.0,7107.0
mean,1.0,0.007792
std,0.0,0.001888
min,1.0,0.004937
1%,1.0,0.004989
5%,1.0,0.005179
10%,1.0,0.005411
15%,1.0,0.005654
20%,1.0,0.0059
25%,1.0,0.006146


#### Engage

In [None]:
# Same summary for rows where `label` equals 2.
get_distribution_from_label(df, label, 2, columns)

Unnamed: 0,label_1,engagement_rate_1
count,8453.0,8453.0
mean,2.0,0.028089
std,0.0,0.040511
min,2.0,0.011633
1%,2.0,0.011761
5%,2.0,0.012273
10%,2.0,0.012939
15%,2.0,0.013685
20%,2.0,0.014426
25%,2.0,0.01527


### label 2

In [None]:
# Arguments shared by the label_2 summaries that follow (rebinds label/value/columns).
label = 'label_2'
value = 0 # Not Engage
columns = ['label_2', 'q_score']  # columns to summarise

#### Not Engage

In [None]:
# Distribution of `columns` for rows where `label` equals `value` (set in the cell above).
get_distribution_from_label(df, label, value, columns)

Unnamed: 0,label_2,q_score
count,8365.0,8365.0
mean,0.0,0.57258
std,0.0,0.166587
min,0.0,0.013158
1%,0.0,0.094802
5%,0.0,0.184915
10%,0.0,0.333333
15%,0.0,0.409231
20%,0.0,0.4583
25%,0.0,0.5


#### Neutral 

In [None]:
# Same summary for rows where `label` equals 1.
get_distribution_from_label(df, label, 1, columns)

Unnamed: 0,label_2,q_score
count,6949.0,6949.0
mean,1.0,0.837657
std,0.0,0.036972
min,1.0,0.764745
1%,1.0,0.767857
5%,1.0,0.777715
10%,1.0,0.782609
15%,1.0,0.791127
20%,1.0,0.8
25%,1.0,0.806452


#### Engage

In [None]:
# Same summary for rows where `label` equals 2.
get_distribution_from_label(df, label, 2, columns)

Unnamed: 0,label_2,q_score
count,8424.0,8424.0
mean,2.0,0.937025
std,0.0,0.023348
min,2.0,0.894777
1%,2.0,0.896371
5%,2.0,0.900183
10%,2.0,0.905411
15%,2.0,0.910105
20%,2.0,0.9146
25%,2.0,0.918473


## channel 

In [None]:
# Channel-level profile: one row per channel_id with its number of distinct
# videos and the channel attributes (first non-null value within each channel).
channel_df = (
    df
    .groupby(['channel_id'])
    .agg(
        # Built-in 'nunique' replaces a per-group Python lambda, and named
        # aggregation sets the output names directly instead of relying on a
        # positional `columns = [...]` overwrite, which silently mislabels
        # columns if the aggregation order ever changes.
        num_videos=('id', 'nunique'),
        channel_name=('channel_name', 'first'),
        channel_category=('channel_category', 'first'),
        channel_started=('channel_started', 'first'),
        channel_rank=('channel_rank', 'first'),
        channel_subscribers=('channel_subscribers', 'first'),
    )
    .reset_index()  # channel_id becomes the first regular column
)

In [None]:
# Summary statistics of the numeric channel columns at the notebook-level `percentiles`.
channel_df.describe(percentiles=percentiles)

In [None]:
# Extended channel profile: number of distinct videos, channel attributes
# (first non-null value within each channel) and the per-channel median of
# every video-level metric. Rebinds `channel_df`.
channel_df = (
    df
    .groupby(['channel_id'])
    .agg(
        # Built-in 'nunique' replaces a per-group Python lambda, and named
        # aggregation sets the output names directly instead of relying on a
        # 21-entry positional `columns = [...]` overwrite, which silently
        # mislabels columns if the aggregation order ever changes.
        num_videos=('id', 'nunique'),
        channel_name=('channel_name', 'first'),
        channel_category=('channel_category', 'first'),
        channel_started=('channel_started', 'first'),
        channel_rank=('channel_rank', 'first'),
        channel_subscribers=('channel_subscribers', 'first'),
        # One `median_<metric>` output column per video-level metric, in this order.
        **{
            f'median_{metric}': (metric, 'median')
            for metric in [
                'title_length',
                'num_tags',
                'delta_upload_date',
                'duration',
                'view_count',
                'like_count',
                'comment_count',
                'dislike_count',
                'like_per_view',
                'comment_per_view',
                'dislike_per_view',
                'engagement_rate_1',
                'engagement_rate_2',
                'q_score',
            ]
        }
    )
    .reset_index()  # channel_id becomes the first regular column
)

In [None]:
# Display the channel profile table.
channel_df

In [None]:
# Persist the channel profile as JSON. DIR_PATH is concatenated as-is with the
# file name, so it must end with a path separator.
channel_df.to_json(DIR_PATH + 'channel_profiling.json')

### by channel_cate 

In [None]:
# One row per channel category, summarising the channels that belong to it.
channel_cate_df = channel_df.groupby(['channel_category']).agg(
    dict(
        channel_name='count',          # channels with a non-null name
        channel_started='min',         # earliest channel start in the category
        channel_rank='median',
        channel_subscribers='median',
        num_videos='median',
    )
)

In [None]:
# Move channel_category from the index back into a regular column.
# Not idempotent: re-running this cell on its own adds an extra 'index' column,
# which then breaks the fixed-length column rename in the next cell.
channel_cate_df = channel_cate_df.reset_index()

In [None]:
# Positional rename: the order must match the group key followed by the
# aggregation order used when channel_cate_df was built.
channel_cate_df.columns = [
    'channel_category', 'num_channels', 'min_channel_started', 
    'median_channel_rank', 'median_channel_subscribers', 'median_num_videos'
]

In [None]:
# Transposed view: one column per category, one row per metric.
channel_cate_df.T

In [None]:
# Persist the channel-level category profile as JSON. DIR_PATH is concatenated
# as-is with the file name, so it must end with a path separator.
channel_cate_df.to_json(DIR_PATH + 'channel_cate_1_profiling.json')

## video 

### by channel_cate 

In [None]:
# Video-level profile per channel category: number of videos plus the median of
# each video metric. Rebinds `channel_cate_df`, replacing the channel-level table.
channel_cate_df = df.groupby(['channel_category']).agg(
    {
        'id': 'count',
        **{
            metric: 'median'
            for metric in (
                'title_length',
                'num_tags',
                'delta_upload_date',
                'duration',
                'view_count',
                'like_count',
                'comment_count',
                'dislike_count',
                'like_per_view',
                'comment_per_view',
                'dislike_per_view',
            )
        },
    }
)

In [None]:
# Move channel_category from the index back into a regular column.
# Not idempotent: re-running this cell on its own adds an extra 'index' column,
# which then breaks the fixed-length column rename in the next cell.
channel_cate_df = channel_cate_df.reset_index()

In [None]:
# Positional rename: the order must match the group key followed by the
# aggregation order used when channel_cate_df was built.
channel_cate_df.columns = [
    'channel_category', 'num_videos',
    'median_title_length', 'median_num_tags',
    'median_delta_upload_date', 'median_duration',
    'median_view_count', 'median_like_count', 
    'median_comment_count', 'median_dislike_count',
    'median_like_per_view', 'median_comment_per_view',
    'median_dislike_per_view', 
]

In [None]:
# Transposed view: one column per category, one row per metric.
channel_cate_df.T

In [None]:
# Same table in its original orientation (one row per category).
channel_cate_df

In [None]:
# Persist the video-level category profile as JSON. DIR_PATH is concatenated
# as-is with the file name, so it must end with a path separator.
channel_cate_df.to_json(DIR_PATH + 'channel_cate_2_profiling.json')

## by time series 

### number of videos 

In [None]:
# Add the upload year as a new column on `df` (mutates the shared frame in place).
# The `.dt` accessor requires `upload_date` to be a datetime-like column.
df['upload_date_by_year'] = df['upload_date'].dt.year

In [None]:
# Preview only the first rows to confirm the new upload_date_by_year column;
# displaying the bare frame would dump the whole table into the notebook output.
df.head()

In [None]:
# Per (channel category, upload year): number of videos and the median of each
# engagement metric. Output names are set directly by named aggregation, in the
# same order as before.
channel_cate_by_time_series_df = (
    df
    .groupby(['channel_category', 'upload_date_by_year'])
    .agg(
        num_videos=('id', 'count'),
        median_engagement_rate_1=('engagement_rate_1', 'median'),
        median_engagement_rate_2=('engagement_rate_2', 'median'),
        median_q_score=('q_score', 'median'),
    )
    .reset_index()  # both group keys become regular leading columns
)

In [None]:
# Persist the per-category, per-year profile as JSON. DIR_PATH is concatenated
# as-is with the file name, so it must end with a path separator.
channel_cate_by_time_series_df.to_json(DIR_PATH + 'channel_cate_time_series_profiling.json')

In [None]:
# One line per channel category: number of videos uploaded in each year.
fig = px.line(
    channel_cate_by_time_series_df,
    x='upload_date_by_year',
    y='num_videos',
    color='channel_category',
)
# Axis label typo ("updoad") corrected; the three layout settings are applied in one call.
fig.update_layout(
    title="Number of video by channel category and time series",
    xaxis_title="Year of upload date",
    yaxis_title="Number of videos",
)
fig.show()

### engagement_rate_1

In [None]:
# One line per channel category: median engagement rate 1 per upload year.
fig = px.line(
    channel_cate_by_time_series_df,
    x='upload_date_by_year',
    y='median_engagement_rate_1',
    color='channel_category',
)
# Axis label typo ("updoad") corrected; the three layout settings are applied in one call.
fig.update_layout(
    title="Median engagement rate 1 by channel category and time series",
    xaxis_title="Year of upload date",
    yaxis_title="Median engagement rate 1",
)
fig.show()

### engagement_rate_2

In [None]:
# One line per channel category: median engagement rate 2 per upload year.
fig = px.line(
    channel_cate_by_time_series_df,
    x='upload_date_by_year',
    y='median_engagement_rate_2',
    color='channel_category',
)
# Axis label typo ("updoad") corrected; the three layout settings are applied in one call.
fig.update_layout(
    title="Median engagement rate 2 by channel category and time series",
    xaxis_title="Year of upload date",
    yaxis_title="Median engagement rate 2",
)
fig.show()

### q_score 

In [None]:
# One line per channel category: median Q score per upload year.
fig = px.line(
    channel_cate_by_time_series_df,
    x='upload_date_by_year',
    y='median_q_score',
    color='channel_category',
)
# Axis label typo ("updoad") corrected; the three layout settings are applied in one call.
fig.update_layout(
    title="Median Q score by channel category and time series",
    xaxis_title="Year of upload date",
    yaxis_title="Median Q score",
)
# Pin the y axis to [0, 1] rather than letting plotly auto-scale it.
fig.update_yaxes(range=[0, 1])
fig.show()

## correlation 

### by video 

In [None]:
def plot_corr(df, columns, label: str = ' by video'):
    """Draw an annotated heatmap of the pairwise correlations between `columns` of `df`.

    Correlations come from `DataFrame.corr()` with its default method, and the
    colour scale is fixed to [-1, 1]. `label` is appended to both axis labels
    and to the title. The figure is shown immediately; nothing is returned.
    """
    fig, ax = plt.subplots(figsize=(10, 10))
    # Draw on the axes created above explicitly rather than via the implicit current axes.
    sns.heatmap(
        df[columns].corr(),
        annot=True,
        fmt=".2f",
        linewidths=.5,
        square=True,
        cmap="Blues",
        vmin=-1,
        vmax=1,
        ax=ax,
    )
    ax.set_xlabel('Engagement Metrics' + label, size=15)
    ax.set_ylabel('Engagement Metrics' + label, size=15)
    ax.set_title('Pearson correlation coefficient of Engagement Metrics' + label, size=15)
    plt.show()

In [None]:
# Video-level columns for the first correlation heatmap: channel size, the
# upload-date delta and the raw interaction counts.
columns = [
    'channel_subscribers',
    'delta_upload_date',
    'view_count',
    'like_count',
    'comment_count',
    'dislike_count'
]

In [None]:
# Heatmap for the columns selected above, with the default ' by video' label.
plot_corr(df, columns)

In [None]:
# Video-level columns for the second correlation heatmap: channel size, the
# upload-date delta and the derived engagement scores.
columns = [
    'channel_subscribers',
    'delta_upload_date',
    'engagement_rate_1',
    'engagement_rate_2',
    'q_score'
]

In [None]:
# Heatmap for the columns selected above, with the default ' by video' label.
plot_corr(df, columns)

### by channel 

In [None]:
# Channel-level table for the correlation plots: channel attributes (first
# non-null value within each channel) and per-channel medians of the video
# metrics. Rebinds `channel_df`, replacing the profile built earlier.
channel_df = (
    df
    .groupby(['channel_id'])
    .agg(
        channel_subscribers=('channel_subscribers', 'first'),
        channel_category=('channel_category', 'first'),
        channel_name=('channel_name', 'first'),
        median_delta_upload_date=('delta_upload_date', 'median'),
        median_view_count=('view_count', 'median'),
        median_like_count=('like_count', 'median'),
        median_comment_count=('comment_count', 'median'),
        median_dislike_count=('dislike_count', 'median'),
        median_engagement_rate_1=('engagement_rate_1', 'median'),
        median_engagement_rate_2=('engagement_rate_2', 'median'),
        median_q_score=('q_score', 'median'),
    )
    .reset_index()  # channel_id becomes the first regular column
)

In [None]:
# Channel-level columns for the first correlation heatmap: channel size, the
# median upload-date delta and the median interaction counts.
columns = [
    'channel_subscribers',
    'median_delta_upload_date',
    'median_view_count',
    'median_like_count',
    'median_comment_count',
    'median_dislike_count'
]

In [None]:
# Heatmap for the channel-level columns selected above.
plot_corr(channel_df, columns, label=' by channel')

In [None]:
# Channel-level columns for the second correlation heatmap: channel size, the
# median upload-date delta and the median engagement scores.
columns = [
    'channel_subscribers',
    'median_delta_upload_date',
    'median_engagement_rate_1',
    'median_engagement_rate_2',
    'median_q_score'
]

In [None]:
# Heatmap for the channel-level columns selected above.
plot_corr(channel_df, columns, label=' by channel')

# Train and test split 

In [None]:
# Temporal split on upload year: videos before 2020 train, videos from 2020
# validate, and videos after 2020 test.
upload_year = df['upload_date'].dt.year  # computed once instead of once per split
train_df = df[upload_year < 2020]
val_df = df[upload_year == 2020]
test_df = df[upload_year > 2020]

# The three masks are mutually exclusive, but a missing upload_date compares
# False in all of them and would drop the row silently — fail loudly instead.
assert len(train_df) + len(val_df) + len(test_df) == len(df), \
    "train/val/test do not partition df (missing upload_date?)"

# Identifier and label columns selected in the size checks below.
columns = ['id', 'label_1', 'label_2']

In [None]:
# Number of training rows (selecting columns does not change the row count).
len(train_df[columns])

In [None]:
# Number of validation rows.
len(val_df[columns])

In [None]:
# Number of test rows.
len(test_df[columns])