#### Program to search YouTube using its API
(by Valentin Todorov - 11/28/2017)
<br>
This program was developed to find all music videos by Suzanita & Kaskata on YouTube that contain a copyrighted version of the song "Allahumma Ya Subuhun". Using predefined search terms, it collects the links to the matching videos so that they can be sent to YouTube's copyright violations team with a request to remove the content.

In [None]:
# Import the libraries that will be used in the program
import pandas as pd
import numpy as np
import requests
import json
import time
import re
from datetime import datetime

# Allocate values to variables
api_key = ""         # Paste YouTube's API key. TO DO: Create an environment setup file and add it to .gitignore. Use configparser to setup environment
search_url_prefix = "https://www.googleapis.com/youtube/v3/search?part=snippet&maxResults="
video_url_prefix = "https://www.youtube.com/watch?v="
channel_url_prefix = "https://www.youtube.com/channel/"
api_url_prefix = "https://www.googleapis.com/youtube/v3/videos?part=snippet%2CcontentDetails%2Cstatistics&id="
    
# Set the parameters for the number of search pages to return, the number of records per search page and the location of the final Excel file
search_pages = 10
search_results_per_page = 50
final_output_location = "/Users/valentin/Documents/VideoSearch"

# Search terms: suzanita+lucifer; suzanita+kaskata; lucifer+buddha; луцифер+буда; сузанита+луцифер; сузанита+каската; biz+amulet;
# Define the parameters
# For search terms in Bulgarian only -->>  (1) In "search_term" write the search in Bulgarian, and (2) in "search_term_save" write the search in English
search_term = ["suzanita+lucifer", "suzanita+kaskata", "lucifer+buddha", "biz+amulet", "сузанита+луцифер", "сузанита+каската", "луцифер+буда"]
search_term_save = ["suzanita+lucifer", "suzanita+kaskata", "lucifer+buddha", "biz+amulet", "suzanita+lucifer_bg", "suzanita+kaskata_bg", "lucifer+buda_bg"]


# Initialize empty lists for the data I'll be collecting - videos' links, titles, uploader, uploader channel, date uploaded on and search term used
video_link = []
video_title = []
video_user = []
video_user_channel = []
video_publish_date = []
search_term_used = []

for s in range(len(search_term)):
    search_pages = 10
    
    # Create an empty token value for the first page - Each search page returned has a token
    # To go to the next page with results we need to add the token value to the end of the url
    # The token value will be updated from the JSON file which is returned in the API call
    next_page_token = ""

    print ("\nBeginning the search for: " + search_term_save[s])
    time.sleep(1)

    # Get the total number of pages with results and loop through the first n pages
    api_response = requests.get(url = (search_url_prefix + str(search_results_per_page) + "&q=" + search_term[s] + "&key=" + api_key))
    total_pages = int(json.loads(api_response.text)["pageInfo"]["totalResults"] / search_results_per_page)

    if search_pages > total_pages:
        search_pages = total_pages
        
    for page_results in range(1, (search_pages + 1)):

        # API call and JSON response
        search_url = (search_url_prefix + str(search_results_per_page) + "&q=" + search_term[s] + "&key=" + api_key + "&pageToken=" + next_page_token)
        api_response = requests.get(url = search_url)
        json_data = json.loads(api_response.text)

        if page_results == 1:
            print ("The program will collect the information from the first " + str(search_pages * search_results_per_page) + " videos returned in the search")
            time.sleep(1)
            print ("The total number of videos found on YouTube for the search term " + search_term[s] + " are: " + str(json_data["pageInfo"]["totalResults"]))

        # Update the token for the next page with search results
        next_page_token = str(json_data["nextPageToken"])

        print ("Extracting content from page: " + str(page_results))
        print ("The next page token is: " + str(next_page_token))
        print search_url

        # Loop through all the search results returned in the API
        for videos in range(0, len(json_data["items"])):

            # Get the link to video
            video_link.append(video_url_prefix + json_data["items"][videos]["id"].values()[1])

            # Get title of video
            video_title.append(json_data["items"][videos]["snippet"]["title"])

            # Get the user name/channel name from which a video was uploaded
            video_user.append(json_data["items"][videos]["snippet"]["channelTitle"])

            # Get link to the channel
            video_user_channel.append(channel_url_prefix + json_data["items"][videos]["snippet"]["channelId"])

            # Date of video upload. The extract date is a string, from which the data is extracted
            # Example from here: https://stackoverflow.com/questions/37192942/extract-date-from-string-in-python
            string_date = str(json_data["items"][videos]["snippet"]["publishedAt"])
            match = re.search(r"\d{4}-\d{2}-\d{2}", string_date)
            date = datetime.strptime(match.group(), "%Y-%m-%d").date()
            video_publish_date.append(date)

            # Add the search term that was used
            search_term_used.append(search_term_save[s])


In [None]:
## Create a dataframe with the video links, titles, username and number of views
final_combined_df = pd.DataFrame(list(zip(video_link, video_title, video_user, video_user_channel, video_publish_date, search_term_used)),
                                 columns = ["video_link", "video_title", "video_user", "video_user_channel", "video_publish_date", "search_term_used"])

# Dedup the dataframe by video_link
# YouTube returns the same video in the search results when using different search terms)
final_combined_df2 = final_combined_df.drop_duplicates(subset = ["video_link"], keep = "first")

# Remove rows where len(video_link) > 43. Playlists and channels have a url length greater than 43
# These can also be removed through the API -> (json_data["items"][tt]["id"].values()[0] == "youtube#video")
final_combined_df3 = final_combined_df2[final_combined_df2["video_link"].map(len) < 44]

# Remove videos that have already been marked as non-cases during a manual search
reviewed_videos = pd.read_excel(final_output_location + "/reviewed_videos_important.xlsx")
final_combined_df3 = pd.merge(final_combined_df3, reviewed_videos[["video_link", "keep"]], on = "video_link", how = "left")
final_combined_df3 = final_combined_df3[final_combined_df3["keep"] != "d"]
final_combined_df3 = final_combined_df3.drop("keep", 1)

# Check the sizes of the dataframes
final_combined_df.shape, final_combined_df2.shape, final_combined_df3.shape

In [None]:
## Get the duration for each video and append it to the Pandas dataframe

# Remove playlists from the list ->> keep only elements in the list with length < 44
video_link = list(set(video_link))
video_link = [x for x in video_link if len(x) < 44]

# YouTube only allows API requests for up to 50 videos at a time. I created buckets with 40 videos in each
video_link3 = np.array_split(video_link, len(video_link)/40)

# Some more checking
len(list(set(video_link))), len(video_link3)

In [None]:
## Run through the list with videos, extract the duration and output the information
video_link = []
video_duration_str = []

for i in range(0, len(video_link3)):
    videos_list_search = "%2C+".join([x.split("?v=")[1] for x in video_link3[i]])
    
    search_url = api_url_prefix + videos_list_search + "&key=" + api_key
    api_response = requests.get(url = search_url)
    json_data = json.loads(api_response.text)
    
    video_index = len(json_data["items"])
    video_link.append([video_url_prefix + json_data["items"][x]["id"] for x in range(0, video_index)])
    video_duration_str.append([json_data["items"][x]["contentDetails"]["duration"].split("PT")[1] for x in range(0, video_index)])

video_link = [x for sublist in video_link for x in sublist]
video_duration_str = [x for sublist in video_duration_str for x in sublist]

In [None]:
## Merge the video durations with the list of videos found in the search
video_duration_df = pd.DataFrame(list(zip(video_link, video_duration_str)),
                                 columns = ["video_link", "video_duration_str"])

final_combined_df4 = pd.merge(final_combined_df3, video_duration_df, on = "video_link", how = "left")

# Remove videos published prior to 2017-06-01
final_combined_df4["video_publish_date"] = pd.to_datetime(final_combined_df4["video_publish_date"])
mask = (final_combined_df4["video_publish_date"] >= "2017-06-01")
final_combined_df4 = final_combined_df4.loc[mask]

# Remove videos longer than 59 minutes. Need to make this to remove videos longer than 15 minutes (there are ~ 100 that are 15 min and longer)
final_combined_df4 = final_combined_df4[final_combined_df4.video_duration_str.str.contains("H") == False]
#final_combined_df4 = final_combined_df4["B"].str.split("M", expand = True)[0]    # Remove videos > 15 minutes

## Write to an Excel file on my computer
print ("\nWrite file to Excel on Google Drive\n The location is: "  + final_output_location + "/videos_links_final_" + time.strftime("%Y%m%d") + ".xlsx")

writer_excel = pd.ExcelWriter(final_output_location + "/videos_links_final_" + time.strftime("%Y%m%d") + ".xlsx")
final_combined_df4.to_excel(writer_excel, 'Sheet1')
writer_excel.save()

print (final_combined_df4.shape)
print ("\nComplete!\nFile successfully written to Google Drive")