# TED web scraper

The purpose of this notebook is to create a web scraper that collects TED talks:
- slugs (url part) by topic
- info (talk and speaker details, including transcripts) by slugs





In [None]:
# Importing libraries

from bs4 import BeautifulSoup
import pandas as pd
import requests
import urllib.request
import json
import time
import os

---
### 1. Downloding slug(s) by topic
slug = url element to a talk

There are options available:<br>
A. **Version 1** - using api request<br> 
B. **Version 2** - parsing html using beautiful soup -> might not work everytime<br>

In [1]:
# User Inputs

topic='technology' # Select a topic from the list available here: https://www.ted.com/topics
filename=f"slug/TED_Talk_{topic}_URLs.txt"


In [5]:
# Version 1 - using api request

def api_scraping(topic, filename):
    urls = []
    page_number=0
    total_pages=0
    f = open(filename, 'w')
    
    while True:
        # The API endpoint to request
        endpoint = "https://zenith-prod-alt.ted.com/api/search"

        # The headers for the request
        headers = {
        "Accept": "*/*",
        "Content-Type": "application/json"
      }

      # The data for the request
        data = [{
              "indexName": "coyote_models_acme_videos_alias_21e1372f285984be956cd03b7ad3406e",
              "params": {
                  "attributeForDistinct": "objectID",
                  "distinct": 1,
                  "facetFilters": [
                      [
                          "tags:"+topic
                      ]
                  ],
                  "facets": [
                      "subtitle_languages",
                      "tags"
                  ],
                  "highlightPostTag": "__/ais-highlight__",
                  "highlightPreTag": "__ais-highlight__",
                  "hitsPerPage": 24,
                  "maxValuesPerFacet": 500,
                  "page": page_number,
                  "query": "",
                  "tagFilters": ""
              }
          },
          {
              "indexName": "coyote_models_acme_videos_alias_21e1372f285984be956cd03b7ad3406e",
              "params": {
                  "analytics": False,
                  "attributeForDistinct": "objectID",
                  "clickAnalytics": False,
                  "distinct": 1,
                  "facets": "tags",
                  "highlightPostTag": "__/ais-highlight__",
                  "highlightPreTag": "__ais-highlight__",
                  "hitsPerPage": 0,
                  "maxValuesPerFacet": 500,
                  "page": page_number,
                  "query": ""
              }
          }]



        # Send the POST request and get the response
        response = requests.post(endpoint, headers=headers, data=json.dumps(data))
        if page_number==0:
            total_pages=response.json()['results'][0]['nbPages']

        for slug in response.json()['results'][0]['hits']:
            urls.append(slug['slug'])
        print(f"{page_number} / {total_pages}")
        page_number+=1
        if page_number>total_pages:
            break
    f.write('\n'.join(urls))
    f.close()

    print(f"Done.{len(urls)} URLs for topic {topic} have been saved in {f}.")
    return

In [9]:
# Version 2 - using beautiful soup parsing

def html_scrapping(topic, filename):
    urls = []
    page_number=1
    f = open(filename, 'w')
    
    while True:
        res  =  requests.get("https://www.ted.com/talks?page="+str(page_number)+"&sort=newest&topics[]="+topic)

        soup = BeautifulSoup(res.text)
        e=soup.find('div', id='browse-results')

        string_end="Sorry. We couldn't find a talk quite like that." 

        if e is None:
            break
        elif e.find('div', class_='h3 m2') is not None:
            if e.find('div', class_='h3 m2').text==string_end:
                break
        else:
            for elem in e.find_all("h4", class_="h9 m5 f-w:700"):
                urls.append(elem.find('a')['href'])
            page_number+=1


    f.write('\n'.join(urls))
    f.close()

# try version 2 if this doesn't work


In [8]:

api_scraping(topic,filename)

0 / 57
1 / 57
2 / 57
3 / 57
4 / 57
5 / 57
6 / 57
7 / 57
8 / 57
9 / 57
10 / 57
11 / 57
12 / 57
13 / 57
14 / 57
15 / 57
16 / 57
17 / 57
18 / 57
19 / 57
20 / 57
21 / 57
22 / 57
23 / 57
24 / 57
25 / 57
26 / 57
27 / 57
28 / 57
29 / 57
30 / 57
31 / 57
32 / 57
33 / 57
34 / 57
35 / 57
36 / 57
37 / 57
38 / 57
39 / 57
40 / 57
41 / 57
42 / 57
43 / 57
44 / 57
45 / 57
46 / 57
47 / 57
48 / 57
49 / 57
50 / 57
51 / 57
52 / 57
53 / 57
54 / 57
55 / 57
56 / 57
57 / 57
Done.1346 URLs for topic technology have been saved in <_io.TextIOWrapper name='slug/TED_Talk_technology_URLs.txt' mode='w' encoding='cp1252'>.


---
### 2. Downloading content for each talk

In [10]:
#functions

def getBuildID():
    response=requests.get("https://ted.com")
    buildID=str(response.content).split("buildId\":\"")[1].split("\"")[0]
    return buildID

def buildDataURL(slug):
    daily_id=getBuildID()
    base=f"https://www.ted.com/_next/data/{daily_id}/talks/"
    mid=".json?slug="
    url=base+slug+mid+slug
    return url

def getSlugData(url):
    response=requests.get(url)
    try:
        return response.json()
    except ValueError:
        print(response)

def getTextFromSlug(talkData):
    text = ""
    for paragraph in talkData["pageProps"]["transcriptData"]["translation"]["paragraphs"]:
        for cue in paragraph["cues"]:
            text+=" "+cue["text"]
    return text

In [32]:
# define which slugs to be loaded

path_to_slug = 'slug/'
slug_files = [pos_slug for pos_slug in os.listdir(path_to_slug) if pos_slug.endswith('.txt')]

slugs_combined=[]

for sfile in slug_files:
    print(sfile)
    with open(os.path.join(path_to_slug, sfile)) as slug_file:
        loaded=slug_file.read().splitlines()
        print(f"{len(loaded)} slugs found")
        slugs_combined.append(loaded)

# combining all slugs and removing duplicates        
slugs = list(set([item for sublist in slugs_combined for item in sublist]))


TED_Talk_Ai_URLs.txt
144 slugs found
TED_Talk_computers_URLs.txt
213 slugs found
TED_Talk_Data_URLs.txt
207 slugs found
TED_Talk_technology_URLs.txt
1346 slugs found


In [33]:
print(len(slugs))

1360


In [18]:
# creating a function to download json data for each slug

def extract_json_by_slug(slugs):
    count=0
    max_count=10

    data_list=[]
    slugs_retry=[]

    for slug in slugs:
        count+=1
        slugURL = buildDataURL(slug)
        data_json = getSlugData(slugURL)

        if data_json is None: #checking if the respose is null
            slugs_retry.append(slug)

        else:
            json_object = json.dumps(data_json)
            with open(f"jsons/{topic}_{slug[:50]}.json", "w") as outfile:
                outfile.write(json_object)

        # adding throttling/wait time to stay within rate rate limiting parameters     
        if count>max_count:
            time.sleep(10)
            count=0
        else:
            time.sleep(3)
    return slugs_retry
        
if len(slugs_retry)!=0:
    extract_json_by_slug(slugs_retry)
        
    




1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144


In [6]:
# check for issues: null and missing transcripts
count=0
path_to_json = 'jsons/'
json_files = [pos_json for pos_json in os.listdir(path_to_json) if pos_json.endswith('.json')]
for jfile in json_files:
    with open(os.path.join(path_to_json, jfile)) as json_file:
        json_text = json.load(json_file)
        if json_text is None:
            print(jfile +"bigger issue")
        elif json_text["pageProps"]["transcriptData"]["translation"] is None:
            count+=1
            print(jfile)
         

print(count)
data_init=pd.json_normalize(data_json['pageProps']['videoData'])

adam_cutler.json
andrew_arru.json
ayanna_howa.json
briana_brow.jsonbigger issue
bruno_miche.json
computers_achin_bhowmik_interactive.json
computers_david_pogue_simplicity_se.jsonbigger issue
computers_emmanuel_schanzer_why_is_.json
computers_florian_pinel_how_ibm_s_w.json
computers_jerry_chow_the_future_of_.json
computers_jinha_lee_reach_into_the_.jsonbigger issue
computers_kai_fu_lee_and_chen_qiufa.json
computers_kathy_kleiman_the_pioneer.json
computers_rodrigo_bijou_governments.jsonbigger issue
computers_sadasivan_shankar_designi.json
computers_steve_brown_why_machines_.json
computers_steve_jobs_how_to_live_be.jsonbigger issue
dario_gil_t.json
drew_silver.json
gunjan_bhar.json
juliane_gal.json
julie_chang.json
mariana_lin.json
mother_lond.json
nivruti_rai.json
philipp_ger.json
robin_hause.json
vinith_misr.json
yann_lecun_.json
24
