In [None]:
import argparse
import csv
import http.client
import os
import re
import sys
import time
import urllib
import urllib.error
import urllib.parse
import urllib.request
from datetime import datetime

#import win_unicode_console
import pandas as pd
from bs4 import BeautifulSoup

#win_unicode_console.enable()

In [2]:
def return_float(string):
    """Convert ``string`` to a float, returning -1.0 when it cannot be converted.

    Args:
        string: Value to convert; normally a piece of text cut out of a page.

    Returns:
        ``float(string)`` on success, otherwise the sentinel ``-1.0``.
    """
    try:
        return float(string)
    except (TypeError, ValueError):
        # TypeError covers non-text inputs such as None, which previously
        # escaped as an exception instead of yielding the sentinel.
        return -1.0

In [3]:
def get_recommendations(video, search_results, gl, language, recent, loopok, alltime, top_rated, name=""):
    """Run a YouTube search using ``video`` as the search term.

    The previous body was a line-for-line copy of ``search_for_term`` that
    referenced an undefined ``search_term`` variable and called
    ``get_results`` without its ``name`` argument, so it could never run.
    It now delegates to ``search_for_term`` so both entry points share a
    single implementation.

    Args:
        video: Text used as the search query.
        search_results: Maximum number of results wanted (``search_for_term``
            caps this at 20).
        gl: Value for the ``gl`` URL parameter; skipped when falsy.
        language: Value for the ``Accept-Language`` header; skipped when falsy.
        recent: Passed through unchanged.
        loopok: Passed through unchanged.
        alltime: Selects the ``CAMSAhAB`` search filter when truthy.
        top_rated: Selects the ``CAE%253D`` search filter when ``alltime`` is
            falsy and this is truthy.
        name: Run name forwarded to ``search_for_term``; optional so existing
            eight-argument calls keep working.

    Returns:
        Whatever ``search_for_term`` returns (a list of video ids).
    """
    return search_for_term(video, search_results, gl, language, recent, loopok, alltime, top_rated, name)

In [4]:
# Text markers used to locate values inside the watch-page markup.
_CHANNEL_MARKER = '"url":"/channel/'
_LIKE_MARKER = '{"iconType":"LIKE"},"defaultText":{"accessibility":{"accessibilityData":{"label":"'
_DISLIKE_MARKER = '{"iconType":"DISLIKE"},"defaultText":{"accessibility":{"accessibilityData":{"label":"'
_VIEWS_MARKER = '"viewCount":"'
_WIDTH_MARKER = 'jpg","width":'
_HEIGHT_MARKER = '"height":'
_RATING_MARKER = 'averageRating":'
_RECO_MARKER = '"commandMetadata":{"webCommandMetadata":{"url":"/watch?v='

# The retry counter starts at 1 and the crawl gives up once it reaches this value.
_GIVE_UP_AT = 10


def _text_between(text, marker, terminator):
    """Return the text following ``marker`` up to ``terminator`` ('' if no marker)."""
    _, found, tail = text.partition(marker)
    if not found:
        return ''
    return tail.split(terminator, 1)[0]


def _to_int(text):
    """Return ``int(text)`` when ``text`` consists of decimal digits, else -1."""
    return int(text) if text.isdecimal() else -1


def _parse_duration(content):
    """Convert a 'PT#H#M#S' duration string to seconds (-1 if a part is not an integer)."""
    remainder = content.replace('PT', '')
    seconds = 0
    for unit, factor in (('H', 3600), ('M', 60), ('S', 1)):
        if unit in remainder:
            amount, remainder = remainder.split(unit, 1)
            try:
                seconds += int(amount) * factor
            except ValueError:
                return -1
    return seconds


def _load_watch_page(video):
    """Download and parse the watch page of ``video``; return None when the download fails."""
    url = "https://www.youtube.com/watch?v=" + video
    try:
        # Read the whole body inside the ``with`` so the connection is always
        # closed and read errors are handled here rather than during parsing.
        with urllib.request.urlopen(url) as response:
            html = response.read()
    except (OSError, http.client.HTTPException):
        # URLError/HTTPError are OSError subclasses; HTTPException covers
        # truncated responses raised while reading the body.
        return None
    return BeautifulSoup(html, "lxml")


def _extract_channel(soup):
    """Return the channel id from the first <script> containing the channel marker ('' if none)."""
    for tag in soup.findAll('script'):
        text = tag.string
        if text and _CHANNEL_MARKER in text:
            return _text_between(text, _CHANNEL_MARKER, '"')
    return ''


def _extract_height(page_text):
    """Return the integer after '"height":' that follows the width marker (-1 if absent)."""
    _, found, after_width = page_text.partition(_WIDTH_MARKER)
    if not found:
        return -1
    # Skip the width value itself: everything up to the first comma.
    after_comma = after_width.partition(',')[2]
    return _to_int(_text_between(after_comma, _HEIGHT_MARKER, '}'))


def _extract_recommendations(soup, video):
    """Return the unique 11-character video ids found after the recommendation marker."""
    recos = []
    for tag in soup.findAll('script'):
        remaining = tag.string
        if not remaining:
            continue
        while _RECO_MARKER in remaining:
            candidate, _, remaining = remaining.split(_RECO_MARKER, 1)[1].partition('"')
            if len(candidate) == 11 and candidate != video and candidate not in recos:
                recos.append(candidate)
    return recos


def build_tree_from_video(keyword, video, parent_video, search_results, branching, current_depth, depth, name):
    """Scrape one video page, append a CSV row for it and recurse into its recommendations.

    The watch page is downloaded and parsed; metadata is cut out of the markup
    with fixed text markers. While ``current_depth < depth`` the page must
    yield at least one recommendation to count as processed; otherwise the
    page is fetched again with an increasing pause. After the retry counter
    reaches 10 the video is abandoned: a message is printed, no row is
    written and no recursion happens.

    Compared with the previous version, retries are bounded (the old loop
    never exited after "Giving-up"), download failures share that bound (the
    old fetch loop retried forever), and ``current_depth`` is no longer
    incremented on a failed attempt, so the depth written to the CSV is always
    the real one.

    Args:
        keyword: Search keyword this crawl belongs to (first CSV column).
        video: Id of the video to scrape.
        parent_video: Id of the video that recommended this one ('' for roots).
        search_results: Unused here; passed unchanged to recursive calls.
        branching: Maximum number of recommendations followed per video.
        current_depth: Depth of ``video`` in the tree.
        depth: Depth at which recommendations are no longer collected.
        name: Run name; rows are appended to ``results/<name>.csv``.

    Returns:
        None.

    Raises:
        OSError: If ``results/<name>.csv`` cannot be opened for appending.
    """
    start = datetime.now()
    spacer = " " * current_depth * 2
    processed = 1

    while True:
        soup = _load_watch_page(video)

        if soup is not None:
            fetch_time = time.strftime('%Y-%m-%d %H:%M:%S')

            # Duration in seconds; the last matching <meta> wins.
            duration = -1
            for meta in soup.findAll('meta', {'itemprop': 'duration'}):
                content = meta.get('content')
                if content is not None:
                    duration = _parse_duration(content)

            # Publication date as published in the <meta> tag.
            pubdate = ""
            for meta in soup.findAll('meta', {'itemprop': 'datePublished'}):
                content = meta.get('content')
                if content is not None:
                    pubdate = content

            channel = _extract_channel(soup)

            # The last <title> element wins.
            title = ''
            for title_tag in soup.findAll('title'):
                title = title_tag.text.strip()

            page_text = str(soup)
            likes = _to_int(_text_between(page_text, _LIKE_MARKER, " "))
            dislikes = _to_int(_text_between(page_text, _DISLIKE_MARKER, " "))
            views = _to_int(_text_between(page_text, _VIEWS_MARKER, '"'))
            width = _to_int(_text_between(page_text, _WIDTH_MARKER, ","))
            height = _extract_height(page_text)
            rating = return_float(_text_between(page_text, _RATING_MARKER, ","))

            # Recommendations are only collected above the maximum depth.
            recos = []
            recos_ok = True
            if current_depth < depth:
                recos = _extract_recommendations(soup, video)
                if recos:
                    recos = recos[0:branching]
                else:
                    # A page without recommendations is treated as a bad
                    # response and fetched again.
                    recos_ok = False

            if recos_ok:
                elapsed = (datetime.now() - start).total_seconds()
                # newline='' is what the csv module expects for its files;
                # UTF-8 keeps non-ASCII titles writable on every platform.
                with open("results/" + name + '.csv', mode='a', newline='', encoding='utf-8') as out_file:
                    result_writer = csv.writer(out_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
                    result_writer.writerow([keyword, video, parent_video, current_depth, title, width, height,
                                            duration, pubdate, channel, likes, dislikes, views, rating,
                                            len(recos), fetch_time, ','.join(recos), elapsed])

                for reco in recos:
                    build_tree_from_video(keyword, reco, video, search_results, branching,
                                          current_depth + 1, depth, name)
                return

        processed += 1
        if processed >= _GIVE_UP_AT:
            print("Giving-up, retried 10 times")
            date = time.strftime('%Y-%m-%d_%H:%M:%S')
            # The file name is only reported; writing the failing page to disk is disabled.
            fail_filename = "fail/" + name + "_fail_" + video + "_" + str(processed) + "_" + parent_video + "_" + str(current_depth) + '_' + date + ".html"
            print(spacer + "Processing NOT succesful; Retry no.: " + str(processed) + " Fail file name: " + fail_filename)
            return

        # Wait a little longer before each retry to avoid receiving the same bad page again.
        time.sleep(processed * 1)

    # Sample record kept from the original notebook for reference:
    #"views": "408841",
    #"likes": 6449,
    #"dislikes": 262,
    #"recommendations": ["KBoXtp8GIp8", "OBlOJFoOTmw", "sZDtyUAK0x8", "4bnO_agdHGo", "pBIOkvC6oXI", "aJ8lukZYrPI", "2mSURwkDk28", "7oxlCKMlpZw", "7UaIEaVqolU", "rD7lARdorUA", "e2P2GoVJS28", "H3iQbrzf-4g", "uOBF9qPto4c", "tTdAQAT80aM", "gWVHses2GCY", "lHos5r2cvPI", "dt5heT2_XaE", "DJuPwtnQHFY", "TAOretPBFaM", "8r6yV78XIzQ", "6d8IpcCBHtM", "40dnEuHWySU", "nlPpV5h2LhI", "99zXTK0aKoU"],
    #"title": "Trump\u2019s COVID-19 Vaccine Outburst, Prince William Defends Royal Family | The Tonight Show - YouTube",
    #"depth": 1, "id": "aJ8lukZYrPI",
    #"channel": "UC8-Th83bH_thdKZDJCrn88g",
    #"pubdate": "2021-03-11",
    #"duration": 305,
    #"key": [],
    #"nb_recommendations": 2,
    #"mult": 1.0


In [5]:
def get_results(soup, search_results, name=""):
    """Collect video ids from the <script> elements of a parsed search page.

    Every occurrence of ``watch?v=`` inside a script's text contributes the
    characters that follow it, up to the next double quote. Each id is
    reported once, in order of first appearance, so the first
    ``search_results`` entries are distinct videos rather than repeats of the
    same one.

    Args:
        soup: Parsed page exposing ``findAll('script')``.
        search_results: Maximum number of ids to return.
        name: Run name. Unused now that the debugging page dump is disabled;
            kept (and made optional) for interface compatibility.

    Returns:
        A list with at most ``search_results`` unique video ids; empty when
        the page contains none.
    """
    videos = []
    seen = set()

    # YouTube keeps the links inside JavaScript, so only <script> text is scanned.
    for tag in soup.findAll('script'):
        remaining = tag.string
        if not remaining:
            # Empty script, or one without a single text child.
            continue
        remaining = str(remaining)
        while "watch?v=" in remaining:
            # partition() tolerates a match with no closing quote, which
            # previously raised IndexError.
            video, _, remaining = remaining.split("watch?v=", 1)[1].partition('"')
            if video and video not in seen:
                seen.add(video)
                videos.append(video)

    return videos[0:search_results]

In [6]:
def search_for_term(search_term, search_results, gl, language, recent, loopok, alltime, top_rated, name):
    """Query the YouTube results page for ``search_term`` and return video ids.

    Args:
        search_term: Text to search for.
        search_results: Number of ids wanted; values above 20 are reduced to 20.
        gl: Value for the ``gl`` URL parameter; skipped when falsy.
        language: Value for the ``Accept-Language`` header; skipped when falsy.
        recent: Unused.
        loopok: Unused.
        alltime: When truthy, use the ``CAMSAhAB`` filter.
        top_rated: When ``alltime`` is falsy and this is truthy, use the
            ``CAE%253D`` filter; otherwise ``EgIQAQ%253D%253D`` is used.
        name: Run name forwarded to ``get_results``.

    Returns:
        The list produced by ``get_results`` for the downloaded page.

    Raises:
        urllib.error.URLError: If the results page cannot be downloaded.
    """
    # Escaping search terms for youtube
    escaped_search_terms = urllib.parse.quote(search_term.encode('utf-8'))

    # We only want search results that are videos, filtered by view count.
    # This is achieved by using the youtube URI parameter: sp=CAMSAhAB
    if alltime:
        sp_filter = "CAMSAhAB"
    elif top_rated:
        sp_filter = "CAE%253D"
    else:
        sp_filter = "EgIQAQ%253D%253D"

    url = "https://www.youtube.com/results?sp=" + sp_filter + "&q=" + escaped_search_terms
    if gl:
        url = url + '&gl=' + gl

    print('Searching URL: ' + url)

    headers = {}
    if language:
        headers["Accept-Language"] = language
    url_request = urllib.request.Request(url, headers=headers)

    # Parse while the response is open, then let ``with`` close the connection.
    with urllib.request.urlopen(url_request) as response:
        soup = BeautifulSoup(response, "lxml")

    if search_results > 20:
        print('Only 20 results can be returned')
        search_results = 20

    return get_results(soup, search_results, name)

In [7]:
def build_tree_from_search(query, search_results, branching, depth, gl, language, recent, loopok, alltime, top_rated):
    """Split ``query`` on commas and crawl the recommendation tree of each keyword.

    For every non-empty keyword a CSV file with a header row is created under
    ``results/``, the keyword is searched with ``search_for_term`` and each
    returned video is crawled with ``build_tree_from_video`` starting at
    depth 0.

    Args:
        query: One or more keywords separated by commas.
        search_results: Number of search results to start from per keyword.
        branching: Maximum number of recommendations followed per video.
        depth: Depth at which recommendations are no longer collected.
        gl: Forwarded to ``search_for_term``.
        language: Forwarded to ``search_for_term``.
        recent: Forwarded to ``search_for_term``.
        loopok: Forwarded to ``search_for_term``.
        alltime: Forwarded to ``search_for_term``.
        top_rated: Forwarded to ``search_for_term``.

    Returns:
        None.
    """
    date = time.strftime('%Y-%m-%d_%H:%M:%S')
    print('Start: ' + date)

    # The crawl writes into results/; create it so a fresh checkout works.
    os.makedirs('results', exist_ok=True)

    for raw_keyword in query.split(','):
        # "a, b" should search for "b", not " b"; "a,,b" has no middle keyword.
        keyword = raw_keyword.strip()
        if not keyword:
            continue

        file_name = keyword.replace(' ', '') + "_" + str(search_results) + "_" + str(branching) + "_" + str(depth) + '-' + date

        print('Running, will save the resulting to: ' + file_name)

        # newline='' is what the csv module expects for files it writes to.
        with open('results/' + file_name + '.csv', mode='w', newline='', encoding='utf-8') as out_file:
            result_writer = csv.writer(out_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
            # Column names are kept exactly as before (including 'viewes') so
            # previously written files and their readers stay compatible.
            result_writer.writerow(['keyword', 'video', 'parent_video', 'current_depth', 'title', 'width', 'height', 'duration', 'pubdate', 'channel', 'likes', 'dislikes', 'viewes', 'rating', 'reco_count', 'fetch_time', 'reco_list', 'elapsed'])

        top_search_results = search_for_term(keyword, search_results, gl, language, recent, loopok, alltime, top_rated, file_name)
        print("*************************Search resutls for: " + keyword)
        print(top_search_results)

        for video in top_search_results:
            build_tree_from_video(keyword, video, "", search_results, branching, 0, depth, file_name)

In [8]:
#build_tree_from_search("Joe Biden,Bill Clinton,Hillary Clinton", 5, 4, 3, "US", "en", False, False, False, False)
# Crawl a single keyword: 5 search results, 5 recommendations per video, 5 levels deep.
build_tree_from_search(
    query="trump was the best president ever",
    search_results=5,
    branching=5,
    depth=5,
    gl="US",
    language="en",
    recent=False,
    loopok=False,
    alltime=False,
    top_rated=False,
)


Start: 2021-06-12_05:00:14
Running, will save the resulting to: trumpwasthebestpresidentever_5_5_5-2021-06-12_05:00:14
Searching URL: https://www.youtube.com/results?sp=EgIQAQ%253D%253D&q=trump%20was%20the%20best%20president%20ever&gl=US
*************************Search resutls for: trump was the best president ever
[]
