In [None]:
import requests
from bs4 import BeautifulSoup
import re
from pymongo import MongoClient
from datetime import datetime
import os
from dotenv import load_dotenv
# Load variables from a local .env file (if one exists) so that the
# os.getenv("MONGO_URL") lookup in the next cell can find them.
load_dotenv()

In [None]:
# The MongoDB connection string is read from the MONGO_URL environment
# variable; os.getenv returns None when that variable is not set.
# NOTE(review): confirm what MongoClient does when handed None — presumably it
# falls back to a default local server instead of failing; verify.
url = os.getenv("MONGO_URL")
client = MongoClient(url)
# The collections used below are all taken from the "kuwtk" database.
db = client.get_database("kuwtk")

In [None]:
def get_html(url, timeout=30):
    """Download ``url`` and return the page parsed with BeautifulSoup.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled connection would block the notebook
            indefinitely.

    Returns:
        BeautifulSoup: The parsed response body.

    Raises:
        Exception: If the status code is anything other than 200; the
            response object is passed as the exception argument.
        requests.RequestException: Propagated unchanged on connection
            problems or when the timeout expires.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code == 200:
        # Name the parser explicitly so the resulting tree does not depend on
        # which optional parser libraries happen to be installed.
        return BeautifulSoup(response.text, "html.parser")
    raise Exception(response)

# Kardashian API content

## IMDb Episodes
For each episode: imbd_id, image_url, season, episode, raw_air_date, air_date, title, imbd_rate, imbd_rate_votes, description, watch_links (currently always an empty list)

In [None]:
# Collection that receives the episode documents scraped from IMDb
# (upserted further down, keyed on "imbd_id").
imbd_episodes = db.imbd_episodes

In [None]:
def season_url(season):
    """Return the IMDb episode-list URL of title tt1086761 for ``season``."""
    episodes_page = f"https://www.imdb.com/title/tt1086761/episodes?season={season}"
    return episodes_page

In [None]:
def parseDate(date):
    """Parse an air-date string such as ``'14 Oct. 2007'`` or ``'5 May 2008'``.

    Two layouts are accepted: day, abbreviated month followed by a period,
    year; and the same without the period.

    Args:
        date: The text to parse. ``None`` and empty or whitespace-only
            strings are treated as "no date".

    Returns:
        datetime | None: The parsed date, or ``None`` when there is no date.

    Raises:
        ValueError: If the text is not blank and matches neither layout.
    """
    if date is None:
        return None
    text = date.strip()
    if not text:
        return None
    for layout in ('%d %b. %Y', '%d %b %Y'):
        try:
            return datetime.strptime(text, layout)
        except ValueError:
            # Not this layout; fall through to the next candidate.
            continue
    raise ValueError(f"unrecognised air date: {date!r}")

In [None]:
def _text_or_none(node, selector):
    """Return the text of the first ``selector`` match under ``node``, else None."""
    match = node.select_one(selector)
    return match.get_text() if match is not None else None


def getSeasonEpisodeData(html):
    """Extract one dictionary per episode from a parsed season page.

    Args:
        html: Parsed season page; it must provide ``select(css_selector)``
            and its elements ``select_one``, ``get`` and ``get_text``.

    Returns:
        list[dict]: One entry per element matched by
        ``div.list.detail.eplist > div`` with the keys ``imbd_id``,
        ``image_url``, ``season``, ``episode``, ``raw_air_date``,
        ``air_date``, ``title``, ``imbd_rate``, ``imbd_rate_votes``,
        ``description`` and ``watch_links``. Optional pieces that are absent
        from the page are stored as ``None``; ``watch_links`` is always an
        empty list.

    Raises:
        AttributeError: If an entry lacks the id marker, the "S<n>, Ep<n>"
            label or the title link.
        ValueError: If the label, rating or air date cannot be converted.
    """
    data = []
    for episode in html.select("div.list.detail.eplist > div"):
        imbd_id = episode.select_one("div.image > a > div").get("data-const")

        image = episode.select_one("div.image > a > div > img")
        image_url = image.get("src") if image is not None else None

        # The overlay label reads like "S1, Ep2".
        season_label, episode_label = episode.select_one(
            "div.image > a > div > div").get_text().split(",")
        season_num = int(season_label.replace("S", ""))
        season_episode_num = int(episode_label.replace("Ep", ""))

        raw_air_date = _text_or_none(episode, "div.info > div.airdate")
        air_date = None
        if raw_air_date is not None:
            raw_air_date = raw_air_date.replace("\n", "").strip()
            air_date = parseDate(raw_air_date)

        title = episode.select_one("div.info > strong > a").get("title")

        rating_text = _text_or_none(
            episode, "div.info span.ipl-rating-star__rating")
        imbd_rate = float(rating_text) if rating_text is not None else None

        votes_text = _text_or_none(
            episode, "div.info span.ipl-rating-star__total-votes")
        imbd_rate_votes = None
        if votes_text is not None:
            # Keep digits only: this removes the surrounding parentheses and
            # also thousands separators, so "(1,234)" becomes 1234 instead of
            # making int() raise.
            digits = re.sub(r"\D", "", votes_text)
            imbd_rate_votes = int(digits) if digits else None

        description = _text_or_none(
            episode, "div.info > div[itemprop='description']")
        if description is not None:
            # Drop line breaks and the indentation left around the text.
            description = description.replace("\n", "").strip()

        data.append(
            {
                "imbd_id": imbd_id,
                "image_url": image_url,
                "season": season_num,
                "episode": season_episode_num,
                "raw_air_date": raw_air_date,
                "air_date": air_date,
                "title": title,
                "imbd_rate": imbd_rate,
                "imbd_rate_votes": imbd_rate_votes,
                "description": description,
                # Nothing is scraped for this field yet.
                "watch_links": [],
            }
        )
    return data

In [None]:
# Scrape seasons 1 to 20 and upsert every episode keyed on its IMDb id.
# Any failure inside a season (download, parsing or database write) is stored
# in `errors` and the loop carries on with the next season.
errors = []
for season_number in range(1, 21):
    print(season_number)
    try:
        season_page = get_html(season_url(season_number))
        for episode_doc in getSeasonEpisodeData(season_page):
            imbd_episodes.update_one(
                {"imbd_id": episode_doc["imbd_id"]},
                {"$set": episode_doc},
                upsert=True,
            )
    except Exception as failure:
        errors.append(failure)

In [None]:
# Display the exceptions collected by the season-scraping loop.
errors

In [None]:
# Count the stored episodes per season and order the result by season.
count_per_season = {"$group": {"_id": "$season", "totalEpisodes": {"$sum": 1}}}
ascending_by_season = {"$sort": {"_id": 1}}
h = imbd_episodes.aggregate([count_per_season, ascending_by_season])

In [None]:
# Consume the aggregation result so the per-season counts are displayed.
list(h)

## Wikipedia Episodes
For each episode: title, episode, season, episode_overall, air_date, us_viewers, special_episode

In [None]:
# Wikipedia page that is scraped for the episode list in the cells below.
wiki_url = "https://en.wikipedia.org/wiki/List_of_Keeping_Up_with_the_Kardashians_episodes"

In [None]:
# Download the Wikipedia page and keep every table row carrying the "vevent"
# class. Presumably each such row describes one episode — verify against the
# page markup.
html = get_html(wiki_url)
wiki_ep = html.select("tr.vevent")

In [None]:
def _date_between_parens(cell_text):
    """Parse the first parenthesised chunk of ``cell_text`` as an ISO date."""
    bracketed = re.search(r"\((.*?)\)", cell_text).group()
    return datetime.fromisoformat(re.sub(r"([()])", "", bracketed))


def _viewers_figure(cell_text):
    """Convert a viewers cell to float after removing square-bracket markers."""
    unmarked = re.sub(r"\[[^)]*\]", "", cell_text)
    return float(unmarked.replace('″', ""))


# Turn every scraped row into a dictionary. Rows with exactly five data cells
# are flagged as special episodes: their header number becomes the episode
# number and the first cell is stored as the season. Rows with four cells or
# fewer take the episode number from the first cell and the header number as
# the overall number. Rows with more than five cells get no date or viewers.
episodes_wikipedia = []
for row in wiki_ep:
    header_number = int(row.select_one("tr > th:first-child").get_text())
    cells = [cell.get_text() for cell in row.select("tr > td")]
    first_cell, raw_title = cells[0], cells[1]
    cell_count = len(cells)

    if cell_count == 5:
        is_special = True
        episode_number, overall_number = header_number, None
        season_label = first_cell
        date_cell, viewers_cell = cells[3], cells[4]
    else:
        is_special = False
        episode_number, overall_number = first_cell, header_number
        season_label = None
        if cell_count <= 4:
            date_cell = cells[2]
            # Only four-cell rows carry a viewers column.
            viewers_cell = cells[3] if cell_count == 4 else None
        else:
            date_cell = viewers_cell = None

    air_date = None if date_cell is None else _date_between_parens(date_cell)
    us_viewers = None
    if viewers_cell is not None and viewers_cell != "N/A":
        us_viewers = _viewers_figure(viewers_cell)

    episodes_wikipedia.append({
        "title": raw_title.strip('"'),
        "episode": int(episode_number),
        "season": season_label,
        "episode_overall": overall_number,
        "air_date": air_date,
        "us_viewers": us_viewers,
        "special_episode": is_special
    })

In [None]:
# Collection that receives the episode documents parsed from Wikipedia.
wiki_episodes = db.wiki_episodes

In [None]:
# Rebuild the collection from scratch: drop it, then upsert every parsed
# episode keyed on its title. Failed writes are collected in `errors`.
errors = []
wiki_episodes.drop()
for doc in episodes_wikipedia:
    print(doc["episode_overall"])
    try:
        wiki_episodes.update_one(
            {"title": doc["title"]},
            {"$set": doc},
            upsert=True,
        )
    except Exception as failure:
        errors.append(failure)

In [None]:
# Display the exceptions collected while writing the Wikipedia episodes.
errors

## IMDb full credits per episode

In [None]:
def url_fullcredits(imbd_id):
    """Return the IMDb full-credits URL for the title identified by ``imbd_id``."""
    credits_page = f"https://www.imdb.com/title/{imbd_id}/fullcredits"
    return credits_page

In [None]:
# IMDb ids of all stored episodes; the projection fetches only that field.
id_only = {"imbd_id": 1, "_id": 0}
episodes_ids = [entry["imbd_id"] for entry in imbd_episodes.find({}, id_only)]

In [None]:
def _crew_rows(html, section):
    """List ``{"name", "credit"}`` entries from the table after ``h4[name=section]``."""
    entries = []
    for row in html.select(f"h4[name='{section}'] + table tr"):
        name_node = row.select_one("td.name > a")
        if name_node is None:
            # The name is not wrapped in a link: use the cell's own text.
            name_node = row.select_one("td.name")
        if name_node is None:
            # No name cell at all, so there is nothing to record. Previously
            # such a row raised AttributeError and aborted the whole page.
            continue
        credit_node = row.select_one("td.credit")
        credit = None
        if credit_node is not None:
            credit = credit_node.get_text().strip()
        entries.append({"name": name_node.get_text().strip(), "credit": credit})
    return entries


def get_document(html, id):
    """Collect the credits found on a parsed full-credits page.

    Args:
        html: Parsed page; it must provide ``select(css_selector)`` and its
            elements ``select_one`` and ``get_text``.
        id: Episode identifier, stored unchanged under ``"imbd_id"``.

    Returns:
        dict: ``imbd_id``; ``directed_by``, ``written_by`` and ``cast`` as
        lists of stripped names; and one list of ``{"name", "credit"}``
        dictionaries for each crew section (``produced_by``, ``music_by``,
        ``cinematographer``, ``edited_by``, ``makeup_by``,
        ``production_manager``, ``sound_department``, ``visual_effects``,
        ``camera_department``, ``casting_department``,
        ``editorial_department``, ``music_department``, ``miscellaneous``).
        Sections missing from the page yield empty lists; rows without a
        name cell are skipped.
    """
    def names_in(selector):
        return [link.get_text().strip() for link in html.select(selector)]

    # (value of the section heading's name attribute, key in the result)
    crew_sections = (
        ("producer", "produced_by"),
        ("composer", "music_by"),
        ("cinematographer", "cinematographer"),
        ("editor", "edited_by"),
        ("make_up_department", "makeup_by"),
        ("production_manager", "production_manager"),
        ("sound_department", "sound_department"),
        ("visual_effects", "visual_effects"),
        ("camera_department", "camera_department"),
        ("casting_department", "casting_department"),
        ("editorial_department", "editorial_department"),
        ("music_department", "music_department"),
        ("miscellaneous", "miscellaneous"),
    )

    document = {
        "imbd_id": id,
        "directed_by": names_in("h4[name='director'] + table a"),
        "written_by": names_in("h4[name='writer'] + table a"),
        "cast": names_in("h4[name='cast'] + table tr > td:nth-of-type(2) a"),
    }
    for section, key in crew_sections:
        document[key] = _crew_rows(html, section)
    return document

In [None]:
# Fetch the full-credits page of every stored episode and upsert the parsed
# credits keyed on the IMDb id.
imbd_episodes_fullcredits = db.imbd_episodes_fullcredits
mongo_errors = []   # exceptions raised by the database write
scrape_errors = []  # (episode id, exception) pairs from download or parsing
for episode_id in episodes_ids:  # not named `id`, which would shadow the builtin
    print(episode_id)
    try:
        html = get_html(url_fullcredits(episode_id))
        doc = get_document(html, episode_id)
    except Exception as error:
        # One unreachable or malformed page must not abort the whole crawl;
        # record it and continue, as the season-scraping loop does.
        scrape_errors.append((episode_id, error))
        continue
    try:
        imbd_episodes_fullcredits.update_one(
            {"imbd_id": doc["imbd_id"]}, {"$set": doc}, upsert=True)
    except Exception as error:
        mongo_errors.append(error)