In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json
import urllib
import isodate
import numpy as np
import json

# Load the raw trending export, discard the free-text description and the
# thumbnail link columns, then keep a reproducible 1000-row random sample.
raw_data = pd.read_csv('US_youtube_trending_data.csv', index_col=False)
data_no_desc = raw_data.drop(columns=['description', 'thumbnail_link'])
data = data_no_desc.sample(n=1000, random_state=123)

In [2]:
# Drop rows that have no views. The explicit .copy() makes the filtered frame
# independent of its parent, so the column assignments in later cells do not
# trigger pandas' SettingWithCopyWarning.
data = data[data['view_count'] != 0].copy()

In [3]:
def get_num_tags(tags):
    '''Return the number of tags in a '|'-separated YouTube tag string.

    Parameters
    ----------
    tags : str
        Tag string with individual tags separated by '|'. The literal
        '[None]' marks a video without tags.

    Returns
    -------
    int
        0 for '[None]', for an empty/blank string, and for non-string input
        (e.g. a missing value such as None or NaN); otherwise the number of
        '|' separators plus one.
    '''
    # Missing values arrive as None/NaN rather than str; treat them as "no tags"
    # instead of failing on .count().
    if not isinstance(tags, str):
        return 0
    tags = tags.strip()
    if not tags or tags == '[None]':
        return 0
    # n separators delimit n + 1 tags (a single tag has no separator).
    return tags.count('|') + 1

In [4]:
# Derive the per-row tag count from the raw tag string.
data['num_tags'] = data['tags'].map(get_num_tags)

In [5]:
# Title-derived features. Series.str.count treats its argument as a regular
# expression, so regex metacharacters are escaped, and the patterns are raw
# strings so Python does not see invalid escape sequences.
data['num_caps'] = data['title'].str.count(r'[A-Z]')
data['num_exc'] = data['title'].str.count('!')
data['num_qm'] = data['title'].str.count(r'\?')
data['num_period'] = data['title'].str.count(r'\.')
data['num_dollar'] = data['title'].str.count(r'\$')
# 'num_tags' is deliberately NOT recomputed here: it is already set from
# get_num_tags in an earlier cell. The previous str.count('|') used an
# unescaped '|' (regex alternation of two empty patterns), which counts empty
# matches rather than tags and overwrote the correct values.
data['title_length'] = data['title'].str.len()

In [6]:
# Parse both timestamp columns into pandas datetimes (publish time first,
# then trending date).
for timestamp_col in ('publishedAt', 'trending_date'):
    data[timestamp_col] = pd.to_datetime(data[timestamp_col])

In [7]:
# Calendar features taken from the publish timestamp.
published_at = data['publishedAt']
data['weekday_published'] = published_at.dt.weekday  # Monday is 0, Sunday is 6
data['day_published'] = published_at.dt.day
data['hour_published'] = published_at.dt.hour

# Difference between the trending calendar date and the publish calendar date;
# left as a time delta here and reduced to a day count in a later cell.
trending_on = data['trending_date']
data['trending_age'] = trending_on.dt.date - published_at.dt.date

In [8]:
def get_trending_age(time_change):
    '''Return the whole-day component (`.days`) of a timedelta-like value.'''
    whole_days = time_change.days
    return whole_days

In [9]:
# Age of the video (in days) on the date it went trending.
data['trending_age'] = data['trending_age'].map(get_trending_age)

In [10]:
def get_file_contents(filename):
    """Read a text file and return its contents without surrounding whitespace.

    The file is expected to hold a single line (the API key). When the file
    does not exist, a message is printed and None is returned.
    """
    try:
        with open(filename, 'r') as handle:
            contents = handle.read()
    except FileNotFoundError:
        print("'%s' file not found" % filename)
        return None
    return contents.strip()
        
# API key used as the default `api_key` for the YouTube requests below.
# NOTE: `key` is None when 'apikey.txt' is missing, because get_file_contents
# only prints a message in that case instead of raising.
key = get_file_contents('apikey.txt') 

In [11]:
def get_video_length(video_id, api_key = key):
    '''Fetch a video's duration from the YouTube Data API.

    Parameters
    ----------
    video_id : str
        Id of the video to look up.
    api_key : str
        API key sent with the request; defaults to the notebook-level `key`.

    Returns
    -------
    str or None
        The `contentDetails.duration` value exactly as returned by the API
        (later cells parse it with isodate.parse_duration), or None when the
        response has no items, i.e. the video does not exist anymore.

    Network and HTTP errors raised by urlopen propagate to the caller.
    '''
    # Imported here because a bare `import urllib` does not load the
    # `request` / `parse` submodules on its own.
    import urllib.parse
    import urllib.request

    # urlencode escapes the parameter values instead of pasting them raw into
    # the URL; parameter order matches the original request.
    query = urllib.parse.urlencode(
        {'id': video_id, 'key': api_key, 'part': 'contentDetails'}
    )
    search_url = "https://www.googleapis.com/youtube/v3/videos?" + query

    # The context manager closes the HTTP response even if decoding fails.
    with urllib.request.urlopen(search_url) as response:
        payload = json.loads(response.read().decode('utf-8'))

    items = payload.get('items')
    if not items:  # if video doesn't exist anymore, items will be empty
        return None
    return items[0]['contentDetails']['duration']

In [None]:
# Each lookup is an HTTP request, so query the API once per distinct video id
# and reuse the answer for any rows that share that id.
unique_video_ids = data['video_id'].unique()
video_lengths = {video_id: get_video_length(video_id) for video_id in unique_video_ids}
data['video_length'] = data['video_id'].map(video_lengths.get)

In [None]:
# Remove the rows whose video length is missing: no length was returned for
# them, which means the video no longer exists.
data = data.dropna(axis=0, subset=['video_length'])

In [None]:
def convert_duration(duration):
    '''Parse a duration string with isodate and return its length in seconds.'''
    parsed = isodate.parse_duration(duration)
    seconds = parsed.total_seconds()
    return seconds

In [None]:
# Replace the raw duration strings with their length in seconds.
data['video_length'] = data['video_length'].map(convert_duration)

In [None]:
# Load the category lookup table. JSON is conventionally UTF-8 encoded, so the
# encoding is pinned instead of relying on the platform's locale default.
with open('US_category_id.json', 'r', encoding='utf-8') as myfile:
    cat = myfile.read()
US_category_id = json.loads(cat)

In [None]:
def get_category(n):
    '''Return the category title for a YouTube categoryId.

    The lookup matches `n` against each item's 'id' field instead of using it
    as a list position: category ids are not contiguous, so positional
    indexing returns the wrong title (or raises IndexError for large ids).

    Parameters
    ----------
    n : int or str
        The categoryId to look up; compared as a string of its integer value.

    Returns
    -------
    str or None
        The matching item's snippet title, or None when `n` is not a valid
        integer id or no item carries that id.
    '''
    try:
        wanted_id = str(int(n))
    except (TypeError, ValueError):
        # Missing or non-numeric ids cannot match any category.
        return None
    for item in US_category_id['items']:
        if str(item.get('id')) == wanted_id:
            return item['snippet']['title']
    return None

In [None]:
# Translate each numeric categoryId into its category title.
data['category'] = data['categoryId'].map(get_category)

In [None]:
# Persist the engineered dataset for the downstream analysis.
data.to_csv(path_or_buf='youtube_data.csv')