Updated get_video_info() to work with new YouTube HTML formatting

mattpopovich · mattpopovich · commit 803a6090e1e6 · 2021-07-18T17:33:31.000-06:00
diff --git a/web-scraping/youtube-extractor/README.md b/web-scraping/youtube-extractor/README.md
@@ -8,22 +8,17 @@ To run this:
     **Output:**
     ```
     Title: Me at the zoo
-    Views: 106602383
-    Published at: 23/04/2005
+    Views: 172639597
+    Published at: 2005-04-23
     Video Duration: 0:18
     Video tags: me at the zoo, jawed karim, first youtube video
-    Likes: 3825489
-    Dislikes: 111818
+    Likes: 8188077
+    Dislikes: 191986
 
-    Description: The first video on YouTube. Maybe it's time to go back to the zoo?
-
-    NEW VIDEO LIVE! https://www.youtube.com/watch?v=dQw4w...
-
-
-    == Ok, new video as soon as 10M subscriberz! ==
+    Description: The first video on YouTube. While you wait for Part 2, listen to this great song: https://www.youtube.com/watch?v=zj82_v2R6ts
 
 
     Channel Name: jawed
     Channel URL: https://www.youtube.com/channel/UC4QobU6STFB0P71PMvOGN5A
-    Channel Subscribers: 1.03M
+    Channel Subscribers: 1.98M subscribers
     ```
diff --git a/web-scraping/youtube-extractor/extract_video_info.py b/web-scraping/youtube-extractor/extract_video_info.py
@@ -16,13 +16,13 @@ def get_video_info(url):
     # initialize the result
     result = {}
     # video title
-    result["title"] = soup.find("h1").text.strip()
+    result["title"] = soup.find("meta", itemprop="name")['content']
     # video views (an integer with the old selector; the meta tag's content is left as a string)
-    result["views"] = int(''.join([ c for c in soup.find("span", attrs={"class": "view-count"}).text if c.isdigit() ]))
+    result["views"] = soup.find("meta", itemprop="interactionCount")['content']
     # video description
-    result["description"] = soup.find("yt-formatted-string", {"class": "content"}).text
+    result["description"] = soup.find("meta", itemprop="description")['content']
     # date published
-    result["date_published"] = soup.find("div", {"id": "date"}).text[1:]
+    result["date_published"] = soup.find("meta", itemprop="datePublished")['content']
     # get the duration of the video
     result["duration"] = soup.find("span", {"class": "ytp-time-duration"}).text
     # get the video tags