In [1]:
import warnings
warnings.simplefilter('ignore')

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from jupyterthemes import jtplot
jtplot.style()
%matplotlib inline

# Load Dataset

In [2]:
# Load the view-count table from the current working directory. 'date' is parsed
# as datetime because later cells subtract dates and read the day of week from it.
count = pd.read_csv('video_count.csv', parse_dates=['date'])
count.head()

FileNotFoundError: [Errno 2] File video_count.csv does not exist: 'video_count.csv'

In [None]:
# Column dtypes and non-null counts of the view-count table
count.info()

In [None]:
# Load the per-video feature table; 'video_upload_date' is parsed as datetime
# so that it can later be subtracted from the play date.
feature = pd.read_csv('video_features.csv',parse_dates=['video_upload_date'])
feature.head()

In [None]:
# Column dtypes and non-null counts of the feature table
feature.info()

In [None]:
# Left-join the video features onto every count row (key: video_id),
# then drop the redundant 'video_' prefix from the feature column names.
data = (
    count
    .merge(feature, on='video_id', how='left')
    .rename(columns={
        'video_length': 'length',
        'video_language': 'language',
        'video_upload_date': 'upload_date',
        'video_quality': 'quality',
    })
)
data.head()

# Data Processing and Analysis

In [None]:
# extract time interval information: whole days between the play date and the upload date
# (.dt.days is the vectorized form of applying `lambda x: x.days` to every row)
data['interval'] = (data['date'] - data['upload_date']).dt.days

# extract video play day of week (pandas convention: Monday=0 ... Sunday=6)
data['dayofweek'] = data['date'].dt.dayofweek

data.head()

In [None]:
# visualization of count
warnings.filterwarnings('ignore')
# Shared histogram styling; the later distribution plots reuse `hist_kws`.
hist_kws = {'histtype': 'bar', 'edgecolor': 'black', 'alpha': 0.2}

fig, ax = plt.subplots(figsize=(8, 4))
sns.distplot(data['count'], ax=ax, hist_kws=hist_kws)
# Title added so the figure stands alone, like the count/bar plots below
ax.set_title('Distribution of count', fontsize=16)
plt.tight_layout()
plt.show()

In [None]:
# visualization of length
fig, ax = plt.subplots(figsize=(8, 5))
sns.distplot(data['length'], ax=ax, hist_kws=hist_kws)
# Title added so the figure stands alone, like the count/bar plots below
ax.set_title('Distribution of length', fontsize=16)
plt.tight_layout()
plt.show()

In [None]:
# visualization of count vs. length (joint scatter plot with marginal distributions)
# `height` is the current name of the figure-size keyword; seaborn renamed `size` in 0.9
sns.jointplot(x='length', y='count', data=data, kind='scatter', height=6)
plt.tight_layout()
plt.show()

In [None]:
# language: number of rows per language (left) and count per language (right;
# seaborn's barplot aggregates with the mean by default)
fig, (ax_freq, ax_mean) = plt.subplots(1, 2, figsize=(18, 6))

sns.countplot(data=data, x='language', ax=ax_freq)
sns.barplot(data=data, x='language', y='count', ax=ax_mean)

ax_freq.set_title('Count Plot of language', fontsize=16)
ax_mean.set_title('Count vs. language', fontsize=16)

plt.tight_layout()
plt.show()

In [None]:
# quality: number of rows per quality level (left) and count per quality level
# (right; seaborn's barplot aggregates with the mean by default)
fig, (ax_freq, ax_mean) = plt.subplots(1, 2, figsize=(18, 6))

sns.countplot(data=data, x='quality', ax=ax_freq)
sns.barplot(data=data, x='quality', y='count', ax=ax_mean)

ax_freq.set_title('Count Plot of quality', fontsize=16)
ax_mean.set_title('Count vs. quality', fontsize=16)

plt.tight_layout()
plt.show()

In [None]:
# interval (days since upload): number of rows per interval (left) and count
# per interval (right; seaborn's barplot aggregates with the mean by default)
fig, (ax_freq, ax_mean) = plt.subplots(1, 2, figsize=(18, 6))

sns.countplot(data=data, x='interval', ax=ax_freq)
sns.barplot(data=data, x='interval', y='count', ax=ax_mean)

ax_freq.set_title('Count Plot of interval', fontsize=16)
ax_mean.set_title('Count vs. interval', fontsize=16)

plt.tight_layout()
plt.show()

In [None]:
# dayofweek: number of rows per weekday (left) and count per weekday
# (right; seaborn's barplot aggregates with the mean by default)
fig, (ax_freq, ax_mean) = plt.subplots(1, 2, figsize=(18, 6))

sns.countplot(data=data, x='dayofweek', ax=ax_freq)
sns.barplot(data=data, x='dayofweek', y='count', ax=ax_mean)

ax_freq.set_title('Count Plot of dayofweek', fontsize=16)
ax_mean.set_title('Count vs. dayofweek', fontsize=16)

plt.tight_layout()
plt.show()

# Questions

In [None]:
# Reminder of the merged table (counts + features + derived columns) used by the questions
data.head()

### Question 1

In [None]:
def parse_data(df):
    """Summarise one video's counts and their period-over-period change rate.

    The rows of ``df`` are ordered by ``date``; the change rate between two
    consecutive rows is ``(current - previous) / previous``.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of a single group; must provide the columns ``date`` and ``count``.

    Returns
    -------
    pandas.Series
        Fourteen values: mean, std, min, 25%, 50%, 75% and max of the counts
        (labels prefixed ``ct_``) followed by the same seven statistics of the
        change rate (labels prefixed ``cg_``).
    """
    stat_names = ['mean', 'std', 'min', '25%', '50%', '75%', 'max']

    ordered = df.sort_values(by='date')['count'].values
    previous, current = ordered[:-1], ordered[1:]
    change_rate = (current - previous) / previous

    labels, values = [], []
    for prefix, sample in (('ct', ordered), ('cg', change_rate)):
        described = pd.Series(sample).describe()
        for stat in stat_names:
            labels.append('{}_{}'.format(prefix, stat))
            values.append(described[stat])

    return pd.Series(values, index=labels)

# One row of statistics per video_id: parse_data is applied to each video's rows
data_stats = data.groupby('video_id').apply(parse_data)
data_stats.head()

In [None]:
# Popularity: distribution of the per-video mean count
fig, ax = plt.subplots(figsize=(8, 4))
sns.distplot(data_stats['ct_mean'], ax=ax, hist_kws=hist_kws)
# Title added so the figure stands alone when the notebook is skimmed
ax.set_title('Distribution of ct_mean', fontsize=16)
plt.tight_layout()
plt.show()

1. For `Stable and Popular` videos, based on the figure above, we can set a threshold manually: for example, with a `ct_mean` cutoff of $1,500,000$, the videos whose mean count exceeds the cutoff are defined as `Stable and Popular` videos.

In [None]:
# Keep the videos whose mean count exceeds the cutoff of 1,500,000
popular_video = data_stats.loc[data_stats['ct_mean'] > 1500000]
popular_video.head()

2. For `Hot` videos, we can use the video's daily change rate.

In [None]:
# Change rate: distribution of the per-video mean change rate
fig, ax = plt.subplots(figsize=(8, 4))
sns.distplot(data_stats['cg_mean'], ax=ax, hist_kws=hist_kws)
# Title added so the figure stands alone when the notebook is skimmed
ax.set_title('Distribution of cg_mean', fontsize=16)
plt.tight_layout()
plt.show()

From the figure above, it is clear that a cutoff threshold on the change rate separates out a group of videos whose mean daily change rate is above $20\%$; these can be defined as `Hot` videos.

In [None]:
# Keep the videos whose mean change rate exceeds the cutoff of 0.2 (20%)
hot_video = data_stats.loc[data_stats['cg_mean'] > 0.2]
hot_video.head()

3. For `Everything else`, the remaining videos are selected.

### Question 2

In [None]:
# Flag every row whose video is in the `Hot` group. hot_video is indexed by video_id
# (it is a slice of the per-video statistics), so a single membership test gives the
# boolean column directly.
data['hot'] = data['video_id'].isin(hot_video.index)
data.head()

In [None]:
# visualization of length, split by the `hot` flag
fig, ax = plt.subplots(figsize=(8, 5))
# Select with the boolean column itself (and its negation) instead of comparing to True/False
sns.distplot(data.loc[data['hot'], 'length'], label='Hot', ax=ax, hist_kws=hist_kws)
sns.distplot(data.loc[~data['hot'], 'length'], label='Not Hot', ax=ax, hist_kws=hist_kws)
ax.set_title('Distribution of length by hot', fontsize=16)
ax.legend(fontsize=12)
plt.tight_layout()
plt.show()

In [None]:
# visualization of language
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(18, 6))
# Share of each language within the hot / not-hot groups (rows: hot, columns: language)
language_share = data.groupby('hot')['language'].value_counts(normalize=True).unstack()
language_share.plot(kind='bar', ax=ax[0])
ax[0].set_title('Share of language by hot', fontsize=16)

sns.barplot(x='language', y='count', data=data, hue='hot', ax=ax[1])
ax[1].set_title('Count vs. language', fontsize=16)
plt.tight_layout()
plt.show()

In [None]:
# visualization of quality
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(18, 6))
# Share of each quality level within the hot / not-hot groups (rows: hot, columns: quality)
quality_share = data.groupby('hot')['quality'].value_counts(normalize=True).unstack()
quality_share.plot(kind='bar', ax=ax[0])
ax[0].set_title('Share of quality by hot', fontsize=16)

sns.barplot(x='quality', y='count', hue='hot', data=data, ax=ax[1])
ax[1].set_title('Count vs. quality', fontsize=16)
plt.tight_layout()
plt.show()

In [None]:
# interval (days since upload), split by the `hot` flag: number of rows (left)
# and count per interval (right; seaborn's barplot aggregates with the mean by default)
fig, (ax_freq, ax_mean) = plt.subplots(1, 2, figsize=(18, 6))

sns.countplot(data=data, x='interval', hue='hot', ax=ax_freq)
sns.barplot(data=data, x='interval', y='count', hue='hot', ax=ax_mean)

ax_freq.set_title('Count Plot of interval', fontsize=16)
ax_mean.set_title('Count vs. interval', fontsize=16)

plt.tight_layout()
plt.show()

In [None]:
# visualization of dayofweek
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(18, 6))
# Share of each weekday within the hot / not-hot groups (rows: hot, columns: dayofweek)
dayofweek_share = data.groupby('hot')['dayofweek'].value_counts(normalize=True).unstack()
dayofweek_share.plot(kind='bar', ax=ax[0])
ax[0].set_title('Share of dayofweek by hot', fontsize=16)

sns.barplot(x='dayofweek', y='count', hue='hot', data=data, ax=ax[1])
ax[1].set_title('Count vs. dayofweek', fontsize=16)
plt.tight_layout()
plt.show()

### Question 3

[Reference](https://github.com/stasi009/TakeHomeDataChallenges/blob/master/18.OnlineVideo/online_videos.ipynb)

* we can build a model to predict whether the video will be Hot (trending up) or not. The videos which are predicted to be Hot will be shown on the Home page and attract users to click.
* study the reason why videos uploaded on Sunday/Monday are much more likely to be Hot than videos uploaded on Thursday.
    - if, as I guess, the reason is that uploading on Sunday/Monday gives the video a longer "offline propagation time"
    - then the problem is why "online propagation" doesn't work well?
    - Is it because our website doesn't have a "Share..." button?
    - or is it because we didn't include the most popular social network in our "Share with" list?
* study the reason why English videos are more likely to be Hot.
    - is it because of a translation issue?
    - or maybe we didn't include local social network (such as weibo in China) in our 'Share with' list.