In [2]:
import argparse
import csv
import os
import re
import sys
import time
import urllib
import urllib.error
import urllib.parse
import urllib.request
from datetime import datetime

import pandas as pd
from bs4 import BeautifulSoup
#import win_unicode_console

#win_unicode_console.enable()

In [3]:
def return_float(string):
    """Convert ``string`` to a float, falling back to a sentinel on failure.

    Args:
        string: Value to convert; normally a text fragment scraped from a page.

    Returns:
        ``float(string)`` when the conversion succeeds, otherwise ``-1.0``.
    """
    try:
        return float(string)
    except (TypeError, ValueError):
        # ValueError: text that is not a number (e.g. "" or "abc").
        # TypeError: input that float() cannot accept at all (e.g. None).
        return -1.0

In [4]:
def get_recommendations(video, search_results, gl, language, recent, loopok, alltime, top_rated, name=""):
    """Run a YouTube search for ``video`` and return the matching video ids.

    The previous body was a copy of ``search_for_term`` that could not run: it
    read an undefined ``search_term`` variable and called ``get_results``
    without its required ``name`` argument. It now delegates to
    ``search_for_term`` so the two cannot drift apart again.

    NOTE(review): ``video`` is used as the search text because it is the only
    candidate among the parameters -- confirm this is the intended meaning.

    Args:
        video: Text to search for.
        search_results: Maximum number of ids to return.
        gl: Value for the ``gl`` URL parameter; skipped when falsy.
        language: Value for the ``Accept-Language`` header; skipped when falsy.
        recent: Forwarded unchanged to ``search_for_term``.
        loopok: Forwarded unchanged to ``search_for_term``.
        alltime: Forwarded unchanged; selects the search filter.
        top_rated: Forwarded unchanged; selects the search filter.
        name: Run name forwarded to ``search_for_term``. Defaults to ``""`` so
            existing positional call sites keep working.

    Returns:
        Whatever ``search_for_term`` returns for the same arguments.
    """
    return search_for_term(video, search_results, gl, language, recent, loopok, alltime, top_rated, name)

In [5]:
def _fetch_watch_page(url):
    """Download ``url`` and return it parsed with the lxml parser."""
    # Keep retrying, one second apart, until the request goes through.
    # NOTE(review): there is no retry cap, so a permanent HTTP error loops forever.
    while True:
        try:
            response = urllib.request.urlopen(url)
            break
        except urllib.error.URLError:
            time.sleep(1)

    # Placeholder document used when the response cannot be parsed at all.
    soup = BeautifulSoup('''
            <html> 
                <h2> Heading 1 </h2> 
                <h1> Heading 2 </h1> 
            </html> 
            ''', "lxml")
    try:
        for _ in range(10):
            try:
                soup = BeautifulSoup(response, "lxml")
                break
            except Exception:
                # Exception rather than a bare except, so that interrupting
                # the kernel (KeyboardInterrupt) is not swallowed here.
                time.sleep(1)
    finally:
        # The response is not needed once parsing has finished or been abandoned.
        response.close()
    return soup


def _scrape_count(page_text, icon_type):
    """Return the integer that follows the ``icon_type`` button label, or -1."""
    marker = "{\"iconType\":\"" + icon_type + "\"},\"defaultText\":{\"accessibility\":{\"accessibilityData\":{\"label\":\""
    token = page_text.split(marker, 1)[-1].split(" ", 1)[0]
    # isdecimal() only accepts what int() can parse; isnumeric() also accepts
    # characters such as superscripts, which would make int() raise.
    return int(token) if token.isdecimal() else -1


def _collect_recommendations(soup, video):
    """Return the unique 11-character video ids linked from the page's scripts.

    Ids are returned in order of first appearance and never include ``video``.
    """
    link_pattern = re.escape("\"commandMetadata\":{\"webCommandMetadata\":{\"url\":\"/watch?v=") + r'([^"]*)"'
    recos = []
    for script_tag in soup.find_all('script'):
        script_text = script_tag.string
        if script_text is None:
            # Empty <script> tags, or tags with several children, have no text.
            continue
        # One linear scan per script; links without a closing quote are skipped.
        for reco in re.findall(link_pattern, script_text):
            if len(reco) == 11 and reco != video and reco not in recos:
                recos.append(reco)
    return recos


def build_tree_from_video(keyword, video, parent_video, search_results, branching, current_depth, depth, name, trace, parent_trace):
    """Scrape one watch page, append a CSV row for it, then recurse into its recommendations.

    Args:
        keyword: Search keyword this tree belongs to; written to the CSV row.
        video: Id of the video to scrape.
        parent_video: Id of the video that recommended this one ("" at the root).
        search_results: Not used here; passed on to recursive calls.
        branching: Maximum number of recommendations followed per video.
        current_depth: Depth of this video in the tree.
        depth: Depth at which recommendations stop being collected.
        name: Run name; rows are appended to ``results/<name>.csv``.
        trace: Numeric path identifier of this node.
        parent_trace: Numeric path identifier of the parent node.

    Returns:
        None. Results are written to the CSV file.
    """
    processed = 1
    start = datetime.now()
    url = "https://www.youtube.com/watch?v=" + video

    # The page is re-fetched until it yields recommendations (or none are needed).
    while processed > 0:
        soup = _fetch_watch_page(url)

        # Fetch time
        fetch_time = time.strftime('%Y-%m-%d %H:%M:%S')

        # Publication date; a tag without a "content" attribute is ignored.
        pubdate = ""
        for datefield in soup.find_all('meta', {'itemprop': 'datePublished'}):
            pubdate = datefield.get('content', pubdate)

        page_text = str(soup)
        likes = _scrape_count(page_text, "LIKE")
        dislikes = _scrape_count(page_text, "DISLIKE")
        rating = return_float(page_text.split("averageRating\":", 1)[-1].split(",", 1)[0])

        recos = []
        recos_ok = True
        if current_depth < depth:
            found = _collect_recommendations(soup, video)
            if found:
                recos = found[0:branching]
            else:
                recos_ok = False

        if recos_ok:
            elapsed = (datetime.now() - start).total_seconds()
            # newline='' is what the csv module expects; without it each row
            # gains an extra carriage return on Windows.
            with open("results/" + name + '.csv', mode='a', newline='') as out_file:
                result_writer = csv.writer(out_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
                result_writer.writerow([keyword, video, parent_video, current_depth, pubdate, likes, dislikes, rating, len(recos), fetch_time, ','.join(recos), elapsed, trace, parent_trace])

            processed = 0
            current_depth += 1
            for trace_seed, reco in enumerate(recos, start=1):
                # NOTE(review): the exponent goes negative once current_depth
                # exceeds 5, which turns the trace into a float.
                childTrace = trace + ((10 ** (4 - current_depth + 1)) * trace_seed)
                build_tree_from_video(keyword, reco, video, search_results, branching, current_depth, depth, name, childTrace, trace)
        else:
            processed += 1
            # Back off a little longer after every failed attempt.
            time.sleep(processed * 1)
            if processed == 5:
                # NOTE(review): despite the message, the loop does not stop
                # here and the count printed does not match `processed`.
                print("Giving-up, retried 10 times")
                date = time.strftime('%Y-%m-%d_%H:%M:%S')
                fail_filename = "fail/" + name + "_fail_" + video + "_" + str(processed) + "_" + parent_video + "_" + str(current_depth) + '_' + date + ".html"
                spacer = " " * current_depth * 2
                print(spacer + "Processing NOT succesful; Retry no.: " + str(processed) + " Fail file name: " + fail_filename)
            # NOTE(review): every failed attempt raises current_depth. Once it
            # reaches `depth` the video is written as a leaf with that inflated
            # depth, and that is the only way this loop ends for a page that
            # never yields recommendations. Kept as-is because changing it
            # alters the recorded data -- confirm the intended behaviour.
            current_depth += 1


In [6]:
def get_results(soup, search_results, name):
    """Extract video ids from the ``<script>`` tags of a search results page.

    An id is whatever text sits between ``watch?v=`` and the next double
    quote. Occurrences are returned in document order and are not
    de-duplicated. An occurrence with no closing quote is skipped; previously
    it raised ``IndexError``.

    Args:
        soup: Parsed page exposing ``find_all('script')``; each returned tag
            must expose ``.string``.
        search_results: Maximum number of ids to return.
        name: Run name. Currently unused: the debug dump it used to label is
            disabled.

    Returns:
        A list of at most ``search_results`` id strings, possibly empty.
    """
    # YouTube keeps the result links inside JavaScript, so only the <script>
    # tags are inspected.
    videos = []
    for script_tag in soup.find_all('script'):
        script_text = script_tag.string
        if script_text is None:
            # Tag without a single text child: nothing to scan.
            continue
        # One linear pass per script instead of re-splitting the remainder.
        videos.extend(re.findall(r'watch\?v=([^"]*)"', str(script_text)))

    return videos[0:search_results]

In [7]:
def search_for_term(search_term, search_results, gl, language, recent, loopok, alltime, top_rated, name):
    """Search YouTube for ``search_term`` and return the ids of the videos found.

    Args:
        search_term: Text to search for.
        search_results: Maximum number of ids to return; capped at 20.
        gl: Value for the ``gl`` URL parameter; skipped when falsy.
        language: Value for the ``Accept-Language`` header; skipped when falsy.
        recent: Not used by this function.
        loopok: Not used by this function.
        alltime: When truthy, selects the ``CAMSAhAB`` filter.
        top_rated: When truthy (and ``alltime`` is not), selects ``CAE%253D``.
        name: Run name, passed on to ``get_results``.

    Returns:
        The list of video ids produced by ``get_results``.
    """
    # Escaping search terms for youtube
    escaped_search_terms = urllib.parse.quote(search_term.encode('utf-8'))

    # The `sp` URL parameter restricts and orders the results; CAMSAhAB keeps
    # only videos, ordered by view count.
    # NOTE(review): the meaning of the other two values is not documented
    # here -- confirm against YouTube's current behaviour.
    if alltime:
        sp_filter = "CAMSAhAB"
    elif top_rated:
        sp_filter = "CAE%253D"
    else:
        sp_filter = "EgIQAQ%253D%253D"

    url = "https://www.youtube.com/results?sp=" + sp_filter + "&q=" + escaped_search_terms
    if gl:
        url = url + '&gl=' + gl

    print ('Searching URL: ' + url)

    headers = {}
    if language:
        headers["Accept-Language"] = language
    url_request = urllib.request.Request(url, headers=headers)
    # Parse inside the `with` block so the HTTP response is always closed.
    with urllib.request.urlopen(url_request) as html:
        soup = BeautifulSoup(html, "lxml")

    if search_results > 20:
        print('Only 20 results can be returned')
        search_results = 20

    return get_results(soup, search_results, name)

In [8]:
def build_tree_from_search(query, search_results, branching, depth, gl, language, recent, loopok, alltime, top_rated):
    """Scrape a recommendation tree for every comma-separated keyword in ``query``.

    For each keyword this creates ``results/<file_name>.csv`` with a header
    row, searches YouTube for the keyword, and calls ``build_tree_from_video``
    on every search result.

    Args:
        query: Keywords separated by commas.
        search_results: Number of search results used as tree roots.
        branching: Maximum number of recommendations followed per video.
        depth: Depth at which recommendations stop being collected.
        gl: Forwarded to ``search_for_term``.
        language: Forwarded to ``search_for_term``.
        recent: Forwarded to ``search_for_term``.
        loopok: Forwarded to ``search_for_term``.
        alltime: Forwarded to ``search_for_term``.
        top_rated: Forwarded to ``search_for_term``.

    Returns:
        None. Results are written to the CSV files.
    """
    # NOTE(review): this timestamp becomes part of the file name, and ':' is
    # not a valid file-name character on Windows.
    date = time.strftime('%Y-%m-%d_%H:%M:%S')
    print('Start: ' + date)

    # Create the output directory up front so the run does not fail on the
    # first open() after the start banner has been printed.
    os.makedirs('results', exist_ok=True)

    for trace_seed, keyword in enumerate(query.split(','), start=1):
        # Each keyword gets its own trace prefix: 1000000, 2000000, ...
        trace = trace_seed * 1000000

        file_name = keyword.replace(' ', '') + "_" + str(search_results) + "_" + str(branching) + "_" + str(depth) + '-' + date

        print('Running, will save the resulting to: ' + file_name)

        # newline='' is what the csv module expects; without it the header row
        # gains an extra carriage return on Windows.
        with open('results/' + file_name + '.csv', mode='w', newline='') as out_file:
            result_writer = csv.writer(out_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
            result_writer.writerow(['keyword', 'video', 'parent_video', 'current_depth', 'pubdate', 'likes', 'dislikes', 'rating', 'reco_count', 'fetch_time', 'reco_list', 'elapsed', 'trace', 'parent_trace'])

        top_search_resutls = search_for_term(keyword, search_results, gl, language, recent, loopok, alltime, top_rated, file_name)
        print("*************************Search resutls for: " + keyword)
        print(top_search_resutls)

        for video in top_search_resutls:
            build_tree_from_video(keyword, video, "", search_results, branching, 0, depth, file_name, trace, 0)

In [9]:
#build_tree_from_search("Joe Biden,Hillary Clinton,Donald Trump,AOC,Lets go Brandon", 5, 5, 5, "US", "en-US", False, False, False, False)

In [10]:
#build_tree_from_search("Lets go Brandon,Kyle Rittenhouse,Joseph Rosenbaum,Anthony Huber,Ilhan Omar", 5, 5, 5, "US", "en-US", False, False, False, False)

In [11]:
#build_tree_from_search("Lindsey Graham,Mitch McConnell,Nancy Pelosi,Newt Gingrich,Barak Obama,Ted Cruz", 5, 5, 5, "US", "en-US", False, False, False, False)

In [12]:
#build_tree_from_search("Climate Change,Cultural Appropriation,Feminism,Free Speech,Gun Control", 5, 5, 5, "US", "en-US", False, False, False, False)

In [13]:
#build_tree_from_search("Joe Manchin,Immigration,Moderna,Pfizer,NRA,Mask,Planned Parenthood,Vaccine,Welfare State", 5, 5, 5, "US", "en-US", False, False, False, False)

In [14]:
#build_tree_from_search("Infrastructure Bill,Anthony Fauci,Kyrie Irving,Vaxxed,Omicron,Gavin Newson,Peter Thiel,trump was the best president ever,trump was the worst president ever,biden is the best president ever,biden is the worst president ever", 5, 5, 5, "US", "en-US", False, False, False, False)

In [15]:
#build_tree_from_search("trump was the best president ever", 5, 5, 5, "US", "en", False, False, False, False)
#build_tree_from_search("trump was the worst president ever", 5, 5, 5, "US", "en", False, False, False, False)
#build_tree_from_search("biden is the best president ever", 5, 5, 5, "US", "en", False, False, False, False)
#build_tree_from_search("biden is the worst president ever", 5, 5, 5, "US", "en", False, False, False, False)


In [None]:
# Scrape the recommendation trees for both keywords; keyword arguments make
# the meaning of each value visible at the call site.
build_tree_from_search(
    query="COVID,Coronavirus",
    search_results=5,
    branching=5,
    depth=5,
    gl="US",
    language="en-US",
    recent=False,
    loopok=False,
    alltime=False,
    top_rated=False,
)

Start: 2022-01-23_10:27:07
Running, will save the resulting to: COVID_5_5_5-2022-01-23_10:27:07
Searching URL: https://www.youtube.com/results?sp=EgIQAQ%253D%253D&q=COVID&gl=US
*************************Search resutls for: COVID
['mXVpH5SyHGM', 'LjwIXFV_1j4', 'pu7PtzUUcz0', 'mZlyPFP0ilM', 'SzSFRWNzuOU']
Giving-up, retried 10 times
        Processing NOT succesful; Retry no.: 5 Fail file name: fail/COVID_5_5_5-2022-01-23_10:27:07_fail_Kh2vWO58sj4_5_LjwIXFV_1j4_4_2022-01-23_16:24:29.html
