### Program to search YouTube using its API
###### (by Valentin Todorov - 08/28/2017)
This program was developed to scrub all videos from Suzanita & Kaskata in which "Allahumma Ya Subuhun" can be heard in the background.


In [1]:
# Import the libraries that will be used in the program
import json
import os
import re
import time
from datetime import datetime

import pandas as pd
import requests


## Search terms: suzanita+lucifer; suzanita+kaskata; луцифер+буда; сузанита+луцифер; amulet+biz;


# Define the parameters
# Words in a search term are joined with "+", because the term is pasted into the request URL as-is.
# For search terms in Bulgarian only -->>  (1) In "search_term" write the search in Bulgarian, and (2) in "search_term_save" write the search in English
# "search_term_save" is the value stored in the output and used in the name of the Excel file.
search_term = "suzanita+lucifer"
search_term_save = "suzanita+lucifer" # + "_inBulgarian"     #-->> uncomment only when using Bulgarian search terms <<--#

# API key for the YouTube Data API.
# The key is read from the YOUTUBE_API_KEY environment variable so it does not have to live in the source.
# NOTE: the literal below is kept only as a backward-compatible fallback. It is a credential stored in
# source - rotate it and remove the fallback once the environment variable is set everywhere.
api_key = os.environ.get("YOUTUBE_API_KEY", "AIzaSyDHXROE2Qt0NglgCmTnO8Y8cMikF5mD-m4")

# Set the parameters for the number of search pages to return, the number of records per search page and the location of the final Excel file
search_pages = 2
search_results_per_page = 50
final_output_location = "C:/Users/bre49823/Google Drive/VideoSearch"


# Initialize empty lists for the data I'll be collecting - videos' links, titles, uploader, channel, and date uploaded on
# The lists are filled in lockstep (one entry per search result), so they always have the same length
video_link = []
video_title = []
video_user = []
video_user_channel = []
video_publish_date = []
search_term_used = []


# Create an empty token value for the first page - Each search page returned has a token
# To go to the next page with results we need to add the token value to the end of the url
# The token value will be updated from the JSON file which is returned in the API call
next_page_token = ""

print ("Beginning the search...")
time.sleep(1)

# Loop through the first n pages of search results.
# Each page is fetched from the YouTube search endpoint; one entry per result is appended to each of
# video_link, video_title, video_user, video_user_channel and video_publish_date, keeping them aligned.
for page_results in range(1, (search_pages + 1)):

    # API call and JSON response
    search_url = ("https://www.googleapis.com/youtube/v3/search?part=snippet&maxResults=" + str(search_results_per_page) + "&q=" + search_term + "&key=" + api_key + "&pageToken=" + next_page_token)
    # A timeout keeps the script from hanging forever on a stalled connection
    api_response = requests.get(url = search_url, timeout = 30)
    # Stop on HTTP errors here, instead of failing later with a KeyError on the error payload
    api_response.raise_for_status()
    json_data = json.loads(api_response.text)

    if page_results == 1:
        print ("The program will collect the information from the first " + str(search_pages * search_results_per_page) + " videos returned in the search")
        time.sleep(1)
        print ("The total number of videos found on YouTube for the search term " + search_term + " are: " + str(json_data['pageInfo']['totalResults']))

    # Update the token for the next page with search results
    # A response without 'nextPageToken' is treated as the last page, so the token falls back to ""
    next_page_token = str(json_data.get('nextPageToken', ""))

    print ("Extracting content from page: " + str(page_results))
    print ("The next page token is: " + str(next_page_token))

    # Loop through all the search results returned in the API (including the first one)
    for video in json_data.get('items', []):
        snippet = video['snippet']
        id_info = video['id']

        # Look the id up by key instead of by position in the dict.
        # Search results can also be playlists or channels; their ids are kept (as before) so that
        # those rows can be filtered out later by the length of the link.
        resource_id = id_info.get('videoId') or id_info.get('playlistId') or id_info.get('channelId')
        if not resource_id:
            # Nothing to link to - skip the result entirely so the lists stay aligned
            continue

        # Get the link to video
        video_link.append("https://www.youtube.com/watch?v=" + resource_id)

        # Get title of video
        video_title.append(snippet['title'])

        # Get the user name/channel name from which a video was uploaded
        video_user.append(snippet['channelTitle'])

        # Get link to the channel
        video_user_channel.append("https://www.youtube.com/channel/" + snippet['channelId'])

        # Date of video upload. The extract date is a string, from which the date is extracted
        # Example from here: https://stackoverflow.com/questions/37192942/extract-date-from-string-in-python
        string_date = str(snippet['publishedAt'])
        match = re.search(r'\d{4}-\d{2}-\d{2}', string_date)
        # Store None when no date is found, rather than crashing and leaving the lists misaligned
        publish_date = datetime.strptime(match.group(), '%Y-%m-%d').date() if match else None
        video_publish_date.append(publish_date)

    # No further pages: stop, otherwise the next request (empty token) would fetch the first page again
    if not next_page_token:
        break


# Tag every collected video with the search term that produced it (one entry per video link)
search_term_used = [search_term_save for _ in video_link]

## Combine the collected lists into one dataframe: link, title, uploader, channel link, upload date, search term
print ("\nStack the information about videos in a Pandas dataframe")
time.sleep(1)

# Column names, in the same order as the lists that are zipped together below
column_names = ['video_link', 'video_title', 'video_user', 'video_user_channel', 'video_publish_date', 'search_term_used']

# One tuple per video; zip stops at the shortest list
video_rows = list(zip(video_link, video_title, video_user, video_user_channel, video_publish_date, search_term_used))

final_combined_df = pd.DataFrame(video_rows, columns = column_names)

# Preview of the first rows (only displayed when this is the last expression of a notebook cell)
final_combined_df.head(10)


############################################
#######  Add functionalities
############################################

# Dedup the dataframe by video_link
# (YouTube returns the same video in the search results when using different search terms)



# Remove rows where len(video_link) > 43. Playlists and channels have a url length greater than 43
# These can also be removed through the API -> (json_data['items'][tt]['id'].values()[0] == "youtube#video")



# Create a flag which videos to keep - use the logic I created in Excel
# The videos with the flag are more likely to include Allahumma Ya Subuhun in the background




## Write to an Excel file on Google Drive
# The file name contains the search term, so each search term gets its own workbook
output_file = final_output_location + "/videos_links_searchterm_" + search_term_save + ".xlsx"
print ("\nWrite file to Excel on Google Drive\n The location is: "  + output_file)

# Use the writer as a context manager: it saves and closes the workbook on exit.
# (ExcelWriter.save() no longer exists in pandas 2.x, and sheet_name should be passed by keyword.)
with pd.ExcelWriter(output_file) as writer_excel:
    final_combined_df.to_excel(writer_excel, sheet_name = 'Sheet1')

print ("\nComplete!\nFile successfully written to Excel for search term: " + search_term)


Beginning the search...
The program will collect the information from the first 100 videos returned in the search
The total number of videos found on YouTube for the search term suzanita+lucifer are: 1530
Extracting content from page: 1
The next page token is: CDIQAA
Extracting content from page: 2
The next page token is: CGQQAA

Stack the information about videos in a Pandas dataframe

Write file to Excel on Google Drive
 The location is: C:/Users/bre49823/Desktop/Misc/videos_links_searchterm_suzanita+lucifer.xlsx

Complete!
File successfully written to Excel for search term: suzanita+lucifer
