In [110]:
import requests
from bs4 import BeautifulSoup
import re
from pymongo import MongoClient
from datetime import datetime
import os
from dotenv import load_dotenv
load_dotenv()

True

In [111]:
# Open the MongoDB connection described by the MONGO_URL environment variable.
url = os.getenv("MONGO_URL")
client = MongoClient(url)

# Handles for the "kuwtk" database and its "imbd_episodes" collection.
db = client["kuwtk"]
imbd_episodes = db["imbd_episodes"]

In [112]:
def get_html(url, timeout=30):
    """Download a page and return it parsed as a BeautifulSoup document.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        A ``BeautifulSoup`` tree built from the response body.

    Raises:
        requests.exceptions.HTTPError: If the response status is not 200.
            The offending response is available on the ``response``
            attribute of the exception.
        requests.exceptions.RequestException: If the request itself fails
            (connection error, timeout, ...).
    """
    # Without an explicit timeout requests waits indefinitely on a stalled
    # connection, which would freeze the whole scraping loop.
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        raise requests.exceptions.HTTPError(
            f"GET {url} returned HTTP {response.status_code}",
            response=response,
        )
    # Name the parser explicitly so the resulting tree does not depend on
    # which optional parsers happen to be installed (and to avoid the
    # "no parser was explicitly specified" warning).
    return BeautifulSoup(response.text, "html.parser")

# Kardashian API content
## IMDb Episodes
For each episode: title, imbd_rate, description, season, watch_platforms, air_date, imbd_id

In [113]:
def season_url(season):
    """Build the IMDb episode-list URL for one season of title tt1086761.

    Args:
        season: Season number (anything that formats cleanly into the URL).

    Returns:
        The URL string with ``season`` filled into the query string.
    """
    return f"https://www.imdb.com/title/tt1086761/episodes?season={season}"

In [114]:
def parseDate(date):
    """Parse an air-date string such as "14 Oct. 2007" or "14 May 2007".

    Two layouts are accepted: day, abbreviated month followed by a dot,
    year; and the same layout without the dot.

    Args:
        date: The date text, or ``None`` / an empty or blank string when no
            date is available.

    Returns:
        A ``datetime`` for a recognised date, or ``None`` when ``date`` is
        ``None``, empty, or only whitespace.

    Raises:
        ValueError: If the text is non-empty but matches neither layout.
    """
    if date is None:
        return None
    # strptime does not tolerate stray leading/trailing whitespace.
    text = date.strip()
    if text == "":
        return None
    for layout in ('%d %b. %Y', '%d %b %Y'):
        try:
            return datetime.strptime(text, layout)
        except ValueError:
            # Not this layout; try the next one.
            continue
    raise ValueError(f"unrecognised air date format: {date!r}")

In [115]:
def _parse_vote_count(text):
    """Return the integer made of the digits in text such as "(1,234)", or None if it has none."""
    digits = re.sub(r"\D", "", text)
    return int(digits) if digits else None


def getSeasonEpisodeData(html):
    """Extract one record per episode from a parsed season page.

    Args:
        html: Parsed page exposing ``select`` (as returned by ``get_html``);
            each episode is a direct ``div`` child of
            ``div.list.detail.eplist``.

    Returns:
        A list of dicts with the keys ``imbd_id``, ``image_url``, ``season``,
        ``episode``, ``raw_air_date``, ``air_date``, ``title``, ``imbd_rate``,
        ``imbd_rate_votes``, ``description`` and ``watch_links``. Optional
        elements that are missing from the page are stored as ``None``;
        ``watch_links`` is always an empty list.

    Raises:
        AttributeError: If an episode lacks the id, season/episode label or
            title element.
        ValueError: If the season/episode label or the rating cannot be
            converted to a number.
    """
    data = []
    for episode in html.select("div.list.detail.eplist > div"):
        imbd_id = episode.select_one("div.image > a > div").get("data-const")

        image_tag = episode.select_one("div.image > a > div > img")
        image_url = image_tag.get("src") if image_tag is not None else None

        # The label is split on the comma into an "S<n>" part and an
        # "Ep<m>" part; int() tolerates the surrounding whitespace.
        season_label, episode_label = episode.select_one(
            "div.image > a > div > div").get_text().split(",")
        season_num = int(season_label.replace("S", ""))
        season_episode_num = int(episode_label.replace("Ep", ""))

        air_date_tag = episode.select_one("div.info > div.airdate")
        raw_air_date = None
        air_date = None
        if air_date_tag is not None:
            raw_air_date = air_date_tag.get_text().replace("\n", "").strip()
            try:
                air_date = parseDate(raw_air_date)
            except ValueError:
                # The raw text is still stored, so one unparseable date
                # should not discard every episode of the season.
                air_date = None

        title = episode.select_one("div.info > strong > a").get("title")

        rating_tag = episode.select_one(
            "div.info span.ipl-rating-star__rating")
        imbd_rate = float(rating_tag.get_text()) if rating_tag is not None else None

        votes_tag = episode.select_one(
            "div.info span.ipl-rating-star__total-votes")
        imbd_rate_votes = None
        if votes_tag is not None:
            # Keep digits only: removing just the parentheses leaves
            # thousands separators behind ("1,234"), which int() rejects.
            imbd_rate_votes = _parse_vote_count(votes_tag.get_text())

        description_tag = episode.select_one(
            "div.info > div[itemprop='description']")
        description = None
        if description_tag is not None:
            description = description_tag.get_text().replace("\n", "")

        data.append(
            {
                "imbd_id": imbd_id,
                "image_url": image_url,
                "season": season_num,
                "episode": season_episode_num,
                "raw_air_date": raw_air_date,
                "air_date": air_date,
                "title": title,
                "imbd_rate": imbd_rate,
                "imbd_rate_votes": imbd_rate_votes,
                "description": description,
                # Nothing on this page is read for watch links; the key is
                # always written as an empty list.
                "watch_links": [],
            }
        )
    return data

In [124]:
# Scrape seasons 1 through 20 and upsert every episode into the collection,
# matching existing documents on "imbd_id" so a re-run updates rather than
# duplicates. Any exception raised while handling a season is collected in
# `errors` and the loop moves on to the next season.
errors = []
for s in range(1, 21):
    print(s)
    try:
        html = get_html(season_url(s))
        docs = getSeasonEpisodeData(html)
        # Plain loop: the writes are side effects, so there is no reason to
        # build and throw away a list of their results.
        for doc in docs:
            imbd_episodes.update_one(
                {"imbd_id": doc["imbd_id"]}, {"$set": doc}, upsert=True)
    except Exception as error:
        errors.append(error)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20


In [125]:
errors

[]

In [126]:
# Count the stored documents per "season" value, ordered by season ascending.
group_by_season = {"$group": {"_id": "$season", "totalEpisodes": {"$sum": 1}}}
sort_by_season = {"$sort": {"_id": 1}}
h = imbd_episodes.aggregate([group_by_season, sort_by_season])

In [127]:
# Materialise the aggregation result so the cell displays every row.
list(h)

[{'_id': 1, 'totalEpisodes': 8},
 {'_id': 2, 'totalEpisodes': 11},
 {'_id': 3, 'totalEpisodes': 12},
 {'_id': 4, 'totalEpisodes': 11},
 {'_id': 5, 'totalEpisodes': 12},
 {'_id': 6, 'totalEpisodes': 16},
 {'_id': 7, 'totalEpisodes': 19},
 {'_id': 8, 'totalEpisodes': 21},
 {'_id': 9, 'totalEpisodes': 20},
 {'_id': 10, 'totalEpisodes': 20},
 {'_id': 11, 'totalEpisodes': 13},
 {'_id': 12, 'totalEpisodes': 26},
 {'_id': 13, 'totalEpisodes': 14},
 {'_id': 14, 'totalEpisodes': 20},
 {'_id': 15, 'totalEpisodes': 16},
 {'_id': 16, 'totalEpisodes': 12},
 {'_id': 17, 'totalEpisodes': 12},
 {'_id': 18, 'totalEpisodes': 6},
 {'_id': 19, 'totalEpisodes': 9},
 {'_id': 20, 'totalEpisodes': 14}]