In [1]:
import json
import urllib
import urllib.parse
import urllib.request

import isodate
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Load the raw trending-video export and discard the thumbnail URL column.
raw_data = pd.read_csv('US_youtube_trending_data.csv')
data_no_thumb = raw_data.drop(columns=['thumbnail_link'])
# Work on a 1000-row random sample; the fixed seed keeps it reproducible.
data = data_no_thumb.sample(n=1000, random_state=123)

In [2]:
# Preview the first five rows of the sample.
data.head(n=5)

Unnamed: 0,video_id,title,publishedAt,channelId,channelTitle,categoryId,trending_date,tags,view_count,likes,dislikes,comment_count,comments_disabled,ratings_disabled,description
15230,05HuTGeF5AA,Khabib Nurmagomedov Announces Retirement | UFC...,2020-10-24T21:27:37Z,UCvgfXK4nTYKudb0rFR6noLA,UFC - Ultimate Fighting Championship,17,2020-10-27T00:00:00Z,khabib|retires|nurmagomedov|retirement|annouce...,17992021,461029,10048,50333,False,False,After defeating Justin Gaethje and improving t...
9930,SXrOuIhoslA,TRYING MORE TIKTOK FOOD HACKS - Part 2,2020-09-26T15:33:12Z,UCjwmbv6NE4mOh8Z8VhPUx1Q,Rosanna Pansino,26,2020-09-30T00:00:00Z,tiktok|tik tok|hacks|tricks|recipe|food hacks|...,710333,36136,619,4093,False,False,Which recipe was your favorite???SUBSCRIBE ► h...
47588,hzwTq8ZZeyM,Among Us But Impostor Can Hide Bodies,2021-04-04T17:00:14Z,UCMDtoPUno_f-puMpHL3Uuqg,STA Studios,20,2021-04-10T00:00:00Z,[None],1647002,49652,1676,2179,False,False,► SUBSCRIBE to the Channel! - https://www.yout...
32050,Z6dwgWQz-Ck,Try Not To Laugh Challenge #62 - Prop Tart!,2021-01-19T18:00:08Z,UCYJPby9DRCteedh5tfxVbrw,Smosh Pit,22,2021-01-23T00:00:00Z,smosh|smosh pit|smosh games|funny|comedy|Try N...,812308,51599,503,2235,False,False,Hold onto your wigs because the Try Not To Lau...
18385,Fy7FsMBcUk8,'최초 공개' ♬ AYA - 마마무(MAMAMOO) | MAMAMOO COMEBAC...,2020-11-03T13:02:21Z,UCbD8EppRX3ZwJSou-TVo90A,Mnet K-POP,24,2020-11-12T00:00:00Z,AYA|마마무|MAMAMOO|마마무컴백쇼|MAMAMOOCOMEBACKSHOW|MON...,3662591,248601,2797,10062,False,False,'최초 공개' ♬ AYA - 마마무(MAMAMOO) | MAMAMOO COMEBAC...


In [3]:
# Discard rows whose view count is exactly zero.
has_views = data['view_count'] != 0
data = data[has_views]

In [4]:
def get_num_tags(tags):
    '''Return how many tags a pipe-delimited YouTube tag string contains.

    The literal placeholder '[None]' yields 0; any other string yields one
    more than the number of '|' separators it contains.
    '''
    if tags == '[None]':
        return 0
    return tags.count('|') + 1

In [5]:
# Count the tags of every video with get_num_tags.
data['num_tags'] = data['tags'].map(get_num_tags)

In [6]:
# Title punctuation / capitalisation counts. Series.str.count takes a regular
# expression, so the patterns are raw strings: the backslash escapes belong to
# the regex and must not be read as (invalid) Python string escapes.
data['num_caps'] = data['title'].str.count(r'[A-Z]')
data['num_exc'] = data['title'].str.count('!')
data['num_qm'] = data['title'].str.count(r'\?')
data['num_period'] = data['title'].str.count(r'\.')
data['num_dollar'] = data['title'].str.count(r'\$')

# Number of tags: separators + 1, or 0 for the '[None]' placeholder.
# The pipe has to be escaped; a bare '|' is a regex alternation of two empty
# patterns, which matches at every position and returns len(tags) + 1.
data['num_tags'] = (data['tags'].str.count(r'\|') + 1).where(data['tags'] != '[None]', 0)

# Character lengths of the text fields.
data['title_length'] = data['title'].str.len()
data['desc_length'] = data['description'].str.len()
data['channel_length'] = data['channelTitle'].str.len()

In [7]:
# Parse both timestamp columns into pandas datetimes.
for date_col in ('publishedAt', 'trending_date'):
    data[date_col] = pd.to_datetime(data[date_col])

In [8]:
# Re-express the publish timestamps in the US/Pacific time zone.
published_pacific = data['publishedAt'].dt.tz_convert('US/Pacific')
data['publishedAt'] = published_pacific

In [9]:
# Calendar features derived from the publish timestamp.
published = data['publishedAt'].dt
data['weekday_published'] = published.weekday  # Monday is 0, Sunday is 6
data['day_published'] = published.day
data['hour_published'] = published.hour

# Difference between the trending calendar date and the publish calendar date.
trending_day = data['trending_date'].dt.date
published_day = published.date
data['trending_age'] = trending_day - published_day

In [10]:
def get_trending_age(time_change):
    '''Return the `.days` component of a timedelta-like value.'''
    whole_days = time_change.days
    return whole_days

In [11]:
# Reduce each date difference to a day count: the video's age when it went trending.
data['trending_age'] = data['trending_age'].map(get_trending_age)

In [12]:
def get_file_contents(filename):
    """Read a text file and return its contents without surrounding whitespace.

    The file is assumed to hold a single line containing the API key. When
    the file does not exist, a message is printed and None is returned.
    """
    try:
        with open(filename, 'r') as handle:
            contents = handle.read()
    except FileNotFoundError:
        print("'%s' file not found" % filename)
        return None
    return contents.strip()
        
# Load the API key from the local key file.
key = get_file_contents(filename='apikey.txt')

In [13]:
def get_video_length(video_id, api_key = key):
    '''Look up a video's duration through the YouTube Data API.

    Parameters
    ----------
    video_id : str
        YouTube video id.
    api_key : str
        API key; defaults to the module-level `key`.

    Returns
    -------
    str or None
        The `contentDetails.duration` value of the first returned item, or
        None when the response has no items (the video no longer exists) or
        the item carries no duration.
    '''
    # Only contentDetails is read below, so that is the only part requested.
    # urlencode escapes the parameter values instead of raw concatenation.
    query = urllib.parse.urlencode({
        'part': 'contentDetails',
        'id': video_id,
        'key': api_key,
    })
    searchUrl = "https://www.googleapis.com/youtube/v3/videos?" + query
    # The context manager closes the HTTP response once it has been read.
    with urllib.request.urlopen(searchUrl) as response:
        dat = json.loads(response.read().decode('utf-8'))
    # Tolerate a missing 'items' key the same way get_channel_info does.
    items = dat.get('items') or []
    if not items:
        return None
    return items[0].get('contentDetails', {}).get('duration')

In [14]:
def get_channel_info(channel_id, api_key = key):
    '''Look up a channel's subscriber and video counts through the YouTube Data API.

    Parameters
    ----------
    channel_id : str
        YouTube channel id.
    api_key : str
        API key; defaults to the module-level `key`.

    Returns
    -------
    list
        `[subscriberCount, videoCount]` as found in the response's
        `statistics` object. `subscriberCount` is None when the channel hides
        its subscriber count. `[None, None]` is returned when the response
        has no items (the channel no longer exists).
    '''
    # Only the statistics part is read below, so that is the only part requested.
    query = urllib.parse.urlencode({
        'part': 'statistics',
        'id': channel_id,
        'key': api_key,
    })
    searchUrl = "https://www.googleapis.com/youtube/v3/channels?" + query
    # The context manager closes the HTTP response once it has been read.
    with urllib.request.urlopen(searchUrl) as response:
        dat = json.loads(response.read().decode('utf-8'))
    items = dat.get('items') or []
    if not items:
        return [None, None]
    statistics = items[0]['statistics']
    if statistics.get('hiddenSubscriberCount'):
        subscriberCount = None
    else:
        subscriberCount = statistics.get('subscriberCount')
    videoCount = statistics.get('videoCount')
    return [subscriberCount, videoCount]

In [15]:
# Fetch the duration string of every video (one API request per row).
data['video_length'] = data['video_id'].map(get_video_length)

In [16]:
# One API request per row; each result is a [subscriberCount, videoCount] list.
channel_pairs = data['channelId'].apply(get_channel_info)
channel_info = pd.DataFrame(channel_pairs)
# Split the two-element lists into two columns, keeping the original row index.
channel_stats = pd.DataFrame(channel_info['channelId'].tolist(), index = channel_info.index)
data[['subscriberCount','videoCount']] = channel_stats

In [17]:
def convert_duration(duration):
    '''Parse a duration string with isodate and return its length in seconds.'''
    return isodate.parse_duration(duration).total_seconds()

In [18]:
# Remove rows with no video length: the lookup returned nothing, i.e. the video no longer exists.
data = data.dropna(axis=0, subset=['video_length'])

In [19]:
# Turn each duration string into a number of seconds.
data['video_length'] = data['video_length'].map(convert_duration)

In [20]:
# Parse the category lookup file into a Python object.
with open('US_category_id.json', 'r') as category_file:
    US_category_id = json.load(category_file)

In [21]:
def get_category(n):
    '''Return the category title for a YouTube categoryId.

    The entries of `US_category_id['items']` are matched on their `id` field
    rather than on list position: category ids are not contiguous, so using
    the id as a list index returns the wrong title or raises IndexError.

    Parameters
    ----------
    n : int or str
        The categoryId to look up; compared as a string.

    Returns
    -------
    str or None
        The matching item's `snippet.title`, or None when no item has that id.
    '''
    wanted = str(n)
    for item in US_category_id['items']:
        if str(item.get('id')) == wanted:
            return item['snippet']['title']
    return None

In [22]:
# Attach the category title for each row's categoryId.
data['category'] = data['categoryId'].map(get_category)

In [23]:
# Write the enriched sample to disk; the row index is written as the first column.
data.to_csv('youtube_data.csv', index=True)