# Import package 

In [1]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
from datetime import datetime as dt

# Load data 

In [2]:
df = pd.read_csv("../data/metadata.csv")

In [3]:
df.head()

Unnamed: 0,video_id,like_count,dislike_count,view_count,comment_count,likes_rate,comments_rate,q_score,num_unique_users_liked_comments,title,channel_id,level_channel,category_channel,label,day_updoad,month_upload,year_upload
0,pRNKwV9iNYs,171179,5012,10694315,8720,16.006542,0.81548,0.943102,692.0,FAPtv Cơm Nguội: Tập 199 - Tình Bạn Và Tình Yêu,UC0jDoh3tVXCaqJ6oTve8ebA,High subscribers,Comedy,High engagement,22,7,2019
1,dVvslK05bIA,177725,6481,14351392,7464,12.383816,0.520159,0.929628,690.0,FAPtv Cơm Nguội: Tập 206 - Chuyện Tình Chàng Q...,UC0jDoh3tVXCaqJ6oTve8ebA,High subscribers,Comedy,High engagement,29,9,2019
2,EzUtfm8xAeE,188976,8590,18321883,6271,10.314224,0.342323,0.913037,670.0,FAPtv Cơm Nguội: Tập 210 - Tình Yêu Không Cần ...,UC0jDoh3tVXCaqJ6oTve8ebA,High subscribers,Comedy,High engagement,24,11,2019
3,mYlV1ZjKYVM,305022,17089,32070609,11591,9.510951,0.361421,0.893891,668.0,FAPtv Cơm Nguội: Tập 205 - Hắc Bạch Công Tử,UC0jDoh3tVXCaqJ6oTve8ebA,High subscribers,Comedy,High engagement,13,9,2019
4,S5lH3PAMZn0,202053,7824,16713944,7402,12.088888,0.442864,0.925438,635.0,FAPtv Cơm Nguội: Tập 196 - Bạc Phận,UC0jDoh3tVXCaqJ6oTve8ebA,High subscribers,Comedy,High engagement,20,5,2019


In [4]:
df['level_channel'].value_counts()

Medium subscribers    2092
High subscribers      1788
Low subscribers       1081
Name: level_channel, dtype: int64

In [5]:
len(set(df['channel_id']))

20

# EDA 

In [6]:
assert len(df) == len(set(df['video_id']))

In [24]:
def get_percentage(x, y, title, xaxis_title, yaxis_title):
    
    # Use textposition='auto' for direct text
    fig = go.Figure(data=[go.Bar(
            x=x, y=y,
            text=y,
            textposition='auto',
        )])
    
    fig.update_layout(autosize=False, height=500, width=500)
    fig.update_layout(title=title)
    fig.update_layout(yaxis_title=yaxis_title)
    fig.update_layout(xaxis_title=xaxis_title)
    fig.show()

In [48]:
def get_distribution(df, x, y, color, title, xaxis_title, yaxis_title):
    
    fig = px.box(df, x=x, y=y, color=color)
    fig.update_layout(title=title)
    fig.update_layout(yaxis_title=yaxis_title)
    fig.update_layout(xaxis_title=xaxis_title)
    fig.update_layout(autosize=False, height=400, width=800)
    fig.show()

## Channel categories

In [49]:
df['category_channel'].value_counts(normalize=True)

Travel and events    0.572062
Comedy               0.427938
Name: category_channel, dtype: float64

In [50]:
get_percentage(['Travel and Events', 'Comedy'], [0.57, 0.43],
               "Percentage of each category in dataset",
              "Category of Youtube channels", "Percent")

## Channel levels

In [51]:
df['level_channel'].value_counts(normalize=True)

Medium subscribers    0.421689
High subscribers      0.360411
Low subscribers       0.217900
Name: level_channel, dtype: float64

In [52]:
get_percentage(['High subscriptions', 'Medium subscriptions', 'Low subscriptions'], [36, 42, 22],
               "Percentage of each level of subscriber count in dataset",
              "Level of subscriber count", "Percent")

## Channel levels and chnnel categories

In [53]:
df[df['category_channel'] == "Comedy"]["level_channel"].value_counts()/len(df)

High subscribers      0.194316
Medium subscribers    0.183632
Low subscribers       0.049990
Name: level_channel, dtype: float64

In [54]:
df[df['category_channel'] != "Comedy"]["level_channel"].value_counts()/len(df)

Medium subscribers    0.238057
Low subscribers       0.167910
High subscribers      0.166096
Name: level_channel, dtype: float64

In [55]:
labels = ["High subscriptions","Medium subscriptions",  "Low subscriptions",]

fig = go.Figure(data=[
    go.Bar(name='Comedy', x=labels, y=[19, 18,  5], text=[19, 18,  5],
            textposition='auto',),
    go.Bar(name='Travel and events', x=labels, y=[24, 17, 17], text=[24, 17, 17],
            textposition='auto',),
])
# Change the bar mode
fig.update_layout(barmode='group')
fig.update_layout(title="Percentage of each category in each level of subscriber count")
fig.update_layout(yaxis_title="Percent")
fig.update_layout(xaxis_title="Level of subscriber count")
fig.update_layout(autosize=False, height=500, width=800)
fig.show()

## Number of views

In [56]:
len(df[df["view_count"] < 10000000])/len(df)

0.9832695021165088

In [57]:
get_distribution(df[df["view_count"] < 10000000], "level_channel", "view_count", 
                 "category_channel", "Number of views in each level of subscriber count",
                 "Level of subscriber count", "Number of views of video")

## Number of likes 

In [58]:
len(df[df["like_count"] < 100000])/len(df)

0.9814553517436001

In [59]:
get_distribution(df[df["like_count"] < 100000], "level_channel", "like_count", 
                 "category_channel", "Number of likes in each level of subscriber count",
                 "Level of subscriber count", "Number of likes of video")

## Number of dislikes 

In [60]:
len(df[df['dislike_count'] < 5000])/len(df)

0.9854867970167305

In [61]:
get_distribution(df[df["dislike_count"] < 5000], "level_channel", "dislike_count", 
                 "category_channel", "Number of dislikes in each level of subscriber count",
                 "Level of subscriber count", "Number of dislikes of video")

## Number of comments

In [62]:
len(df[df["comment_count"] < 5000]) / len(df)

0.9824632130618827

In [63]:
get_distribution(df[df["dislike_count"] < 5000], "level_channel", "dislike_count", 
                 "category_channel", "Number of comments in each level of subscriber count",
                 "Level of subscriber count", "Number of comments of video")

# Analytic 

## Like rate 

In [64]:
len(df[df['likes_rate'] < 50])/len(df)

0.9983874218907478

In [65]:
get_distribution(df[df["likes_rate"] < 50], "level_channel", "likes_rate", 
                 "category_channel", "Likes rate of videos in each level of subscriber count",
                 "Level of subscriber count", "Like rate of video")

## Comment rate 

In [66]:
len(df[df['comments_rate'] < 20])/len(df)

0.9959685547268696

In [67]:
get_distribution(df[df["comments_rate"] < 20], "level_channel", "comments_rate", 
                 "category_channel", "Comment rate of videos in each level of subscriber count",
                 "Level of subscriber count", "Comment rate of video")

## Q score 

In [68]:
len(df[df["q_score"] > 0]) / len(df)

1.0

In [69]:
get_distribution(df[df["q_score"] < 20], "level_channel", "q_score", 
                 "category_channel", "Q score of videos in each level of subscriber count",
                 "Level of subscriber count", "Q score of video")

## Number unique users in liked comment 

In [70]:
get_distribution(df, "level_channel", "num_unique_users_liked_comments", 
                 "category_channel", "Number unique users in liked comment of videos in each level of subscriber count",
                 "Level of subscriber count", "Number unique users in liked comment of video")

## Label for videos

In [73]:
fig = px.scatter(df, x="q_score", y="num_unique_users_liked_comments", color="level_channel")
fig.update_layout(title="Q score and number of users in liked comments per video")
fig.update_layout(yaxis_title="Number unique users in liked comment of video")
fig.update_layout(xaxis_title="Q score of video")
fig.update_layout(autosize=False, height=400, width=800)
fig.show()

## Top channel 

In [74]:
channel_list = ['UC0jDoh3tVXCaqJ6oTve8ebA', 'UCZE88kYvCKUKjM-G0uc8Duw']

In [75]:
def get_upload_date_by_month(row):
    return dt(row['year_upload'], row['month_upload'], 1)

In [76]:
df['upload_date_by_month'] = df.apply(get_upload_date_by_month, axis=1)

In [77]:
channel_df = df[df['channel_id'].isin(channel_list)]

In [78]:
def get_channel_name(channel_id):
    channel_id_dict = {
           'UC0jDoh3tVXCaqJ6oTve8ebA': 'FAP TV',
           'UCZE88kYvCKUKjM-G0uc8Duw': 'Khoai Lang Thang'
    }
    
    return channel_id_dict[channel_id]

In [79]:
channel_df['channel'] = channel_df['channel_id'].apply(get_channel_name)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [80]:
channel_by_month = (channel_df.groupby(['channel', 'upload_date_by_month'])
                    .agg({'q_score': 'mean', 'num_unique_users_liked_comments': 'mean'})
)
channel_by_month.columns = ['avg_q_score', 'avg_num_unique_users_liked_comments']
channel_by_month = channel_by_month.reset_index()

In [81]:
channel_by_month_comedy = channel_by_month[channel_by_month['channel'] == 'FAP TV']

In [82]:
channel_by_month_travel = channel_by_month[channel_by_month['channel'] != 'FAP TV']

In [83]:
import plotly.graph_objects as go
fig = go.Figure()
fig.add_trace(go.Scatter(x=channel_by_month_comedy['upload_date_by_month'],
                    y=channel_by_month_comedy['avg_q_score'],
                    mode='lines+markers',
                    name='FAP TV'))
fig.add_trace(go.Scatter(x=channel_by_month_travel['upload_date_by_month'], 
                    y=channel_by_month_travel['avg_q_score'],
                    mode='lines+markers',
                    name='Khoai Lang Thang'))
fig.update_layout(yaxis_range=[0, 1])
fig.update_layout(yaxis_title="Average q score in month")
fig.update_layout(xaxis_title="Upload date by month")
fig.update_layout(title="Q score by month of top channels")
fig.show()

In [84]:
import plotly.graph_objects as go
fig = go.Figure()
fig.add_trace(go.Scatter(x=channel_by_month_comedy['upload_date_by_month'],
                    y=channel_by_month_comedy['avg_num_unique_users_liked_comments'],
                    mode='lines+markers',
                    name='FAP TV'))
fig.add_trace(go.Scatter(x=channel_by_month_travel['upload_date_by_month'], 
                    y=channel_by_month_travel['avg_num_unique_users_liked_comments'],
                    mode='lines+markers',
                    name='Khoai Lang Thang'))
fig.update_layout(yaxis_title="Average num unique users in liked comments in month")
fig.update_layout(xaxis_title="Upload date by month")
fig.update_layout(title="Num unique users in liked comments by month of top channels")
fig.show()

# Experiment 

## Video titles, audios, video thumbnails and video frames

In [85]:
title_train_df = pd.read_csv("../data/train.csv")

In [86]:
title_test_df = pd.read_csv("../data/test.csv")

In [87]:
len(title_train_df) + len(title_test_df)

4961

In [88]:
title_df = pd.concat([title_train_df, title_test_df])

In [89]:
title_train_df['label'].value_counts(normalize=True)

Low engagement       0.458921
Medium engagement    0.387601
High engagement      0.153478
Name: label, dtype: float64

In [90]:
title_train_df.head()

Unnamed: 0,video_id,label
0,Z4BTKeXXQoA,High engagement
1,dN3s5POftlg,Low engagement
2,1joeJMr2Mxs,Low engagement
3,Jee74jecmAM,Medium engagement
4,HZO7DbjPLWw,Medium engagement


In [91]:
get_percentage(['High engagement', 'Medium engagement', 'Low engagement'], [15, 39, 46],
               "Titles labeled by level of engagement",
              "Label", "Percent")

In [92]:
get_percentage(['High engagement', 'Medium engagement', 'Low engagement'], [15, 39, 46],
               "Audio labeled by level of engagement",
              "Label", "Percent")

In [93]:
get_percentage(['High engagement', 'Medium engagement', 'Low engagement'], [15, 39, 46],
               "Thumbnails labeled by level of engagement",
              "Label", "Percent")

In [94]:
get_percentage(['High engagement', 'Medium engagement', 'Low engagement'], [15, 39, 46],
               "Frames labeled by level of engagement",
              "Label", "Percent")

## Video transcriptions

In [95]:
transcript_train_df = pd.read_csv("../data/train_transcipt.csv")

In [96]:
transcript_test_df = pd.read_csv("../data/test_transcipt.csv")

In [97]:
len(transcript_train_df) + len(transcript_test_df)

1506

In [98]:
transcript_df = pd.concat([transcript_train_df, transcript_test_df])

In [99]:
len(transcript_test_df) / len(transcript_df)

0.20385126162018594

In [100]:
transcript_train_df.head()

Unnamed: 0,video_id,label
0,3cKHS4Q1clU,Medium engagement
1,Y9ohp3XLY84,Low engagement
2,neE8buwDcio,Medium engagement
3,JPBr1LIoVzY,Low engagement
4,dHN5NZg6j3w,Medium engagement


In [101]:
transcript_df['label'].value_counts(normalize=True)

Medium engagement    0.467463
Low engagement       0.367862
High engagement      0.164675
Name: label, dtype: float64

In [102]:
get_percentage(['High engagement', 'Medium engagement', 'Low engagement'], [16, 47, 37],
               "Transcripts labeled by level of engagement",
              "Label", "Percent")