In [2]:
import io
import re
import json
import sqlite3
import subprocess
import tempfile
from pprint import pprint

import numpy as np
import pandas as pd
import requests as req

import matplotlib.pyplot as plt
import seaborn as sns

import ftfy
import mistune
import textstat

import textacy
import textacy.ke

from pomegranate import GeneralMixtureModel, ExponentialDistribution, NormalDistribution
from scipy.stats import gmean, ks_2samp

from html2text import HTML2Text
from jinja2 import Template
from tqdm import tqdm

# Plot styling: seaborn "talk" context on the "darkgrid" style.
sns.set(style="darkgrid", context="talk")

# Let pandas display up to 9999 rows and 999 columns before truncating.
pd.set_option("display.max_rows", 9999)
pd.set_option("display.max_columns", 999)

In [3]:
# Load the episode table from a CSV in the user's Downloads folder
# (pandas expands the leading "~") and preview the first five rows.
df = pd.read_csv(filepath_or_buffer="~/Downloads/imdb_office.csv")
df.head(n=5)

Unnamed: 0,Trend,Season,Episode,Title,Year,Votes,Rating
0,50,1,1,Pilot,2005,4635,7.5
1,46,1,2,Diversity Day,2005,4497,8.3
2,43,1,3,Health Care,2005,3787,7.8
3,44,1,4,The Alliance,2005,3680,8.1
4,49,1,5,Basketball,2005,4021,8.4


In [4]:
# Summary statistics of the Rating column, reporting the 20th/50th/80th percentiles.
df["Rating"].describe(percentiles=[0.2, 0.5, 0.8])

count    188.000000
mean       8.245745
std        0.585231
min        6.600000
20%        7.800000
50%        8.200000
80%        8.700000
max        9.800000
Name: Rating, dtype: float64

In [5]:
# Episodes rated 8.7 or higher (8.7 is the 80% value reported by describe()),
# with colour gradients on the position columns and on the rating itself.
(
    df.loc[df["Rating"].ge(8.7)]
    .style
    .background_gradient(cmap="plasma", subset=["Season", "Episode", "Year"])
    .background_gradient(cmap="YlGn", subset=["Rating"])
)

Unnamed: 0,Trend,Season,Episode,Title,Year,Votes,Rating
6,52,2,1,The Dundies,2005,4050,8.7
15,36,2,10,Christmas Party,2005,3413,8.9
17,44,2,12,The Injury,2006,4026,9.1
26,40,2,21,Conflict Resolution,2006,3020,8.7
27,47,2,22,Casino Night,2006,4432,9.4
28,46,3,1,Gay Witch Hunt,2006,3825,9.0
35,31,3,8,The Merger,2006,2945,8.7
37,29,3,10,A Benihana Christmas,2006,3112,8.8
40,30,3,13,The Return,2007,3002,8.8
43,38,3,16,Business School,2007,3218,8.9


In [18]:
# Fetch the episode list for ratingraph.com show 22144 and load it into `ww`.
#
# The endpoint is queried with DataTables-style parameters. They are built as
# a dict (same keys, values and order as the original hand-pasted query
# string) and URL-encoded by requests, instead of living in one enormous
# literal that also ended in a stray space.
episode_columns = [
    "trend",
    "season",
    "episode",
    "name",
    "start",
    "total_votes",
    "average_rating",
]

params = {"draw": 2}
for position, column in enumerate(episode_columns):
    params.update(
        {
            f"columns[{position}][data]": column,
            f"columns[{position}][name]": "",
            f"columns[{position}][searchable]": "false",
            f"columns[{position}][orderable]": "true",
            f"columns[{position}][search][value]": "",
            f"columns[{position}][search][regex]": "false",
        }
    )
params.update(
    {
        # Sort by column 1 (season), then column 2 (episode), both ascending.
        "order[0][column]": 1,
        "order[0][dir]": "asc",
        "order[1][column]": 2,
        "order[1][dir]": "asc",
        # Request up to 250 rows starting from the first one.
        "start": 0,
        "length": 250,
        "search[value]": "",
        "search[regex]": "false",
        # Value copied verbatim from the original request URL.
        "_": 1588110434853,
    }
)

url = "https://www.ratingraph.com/show-episodes-list/22144/"

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors directly instead of as a confusing
# JSON decoding failure on an error page.
response = req.get(url, params=params, timeout=30)
response.raise_for_status()
data = response.json()

# The rows live under the "data" key. Ratings are cast to float, and vote
# counts have their thousands separators stripped before being cast to int.
ww = pd.DataFrame(data["data"]).assign(
    average_rating=lambda frame: frame["average_rating"].astype(float),
    total_votes=lambda frame: frame["total_votes"].str.replace(",", "").astype(int),
)
ww.head()

Unnamed: 0,rank,trend,season,episode,name,start,total_votes,average_rating
0,138,124,1,1,The Original,2016,19796,8.9
1,208,99,1,2,Chestnut,2016,14913,8.5
2,252,103,1,3,The Stray,2016,13598,8.3
3,219,103,1,4,Dissonance Theory,2016,13584,8.7
4,232,107,1,5,Contrapasso,2016,12943,8.7


In [24]:
# Summary statistics of average_rating, reporting the 20th/50th/80th percentiles.
ww["average_rating"].describe(percentiles=[0.2, 0.5, 0.8])

count    26.000000
mean      8.711538
std       0.454600
min       7.900000
20%       8.300000
50%       8.700000
80%       9.000000
max       9.700000
Name: average_rating, dtype: float64

In [23]:
# Episodes whose average rating is at or above the 75th percentile, with
# thousands separators on the vote counts and colour gradients on the
# position columns and on votes/rating.
(
    ww.loc[ww["average_rating"].ge(ww["average_rating"].quantile(0.75))]
    .style
    .format(formatter="{:,}", subset=["total_votes"])
    .background_gradient(cmap="plasma", subset=["season", "episode"])
    .background_gradient(cmap="YlGn", subset=["total_votes", "average_rating"])
)

Unnamed: 0,rank,trend,season,episode,name,start,total_votes,average_rating
0,138,124,1,1,The Original,2016,19796,8.9
5,216,105,1,6,The Adversary,2016,13110,8.9
6,132,126,1,7,Trompe L'Oeil,2016,17826,9.5
8,153,132,1,9,The Well-Tempered Clavier,2016,16255,9.4
9,61,194,1,10,The Bicameral Mind,2016,26264,9.7
13,244,114,2,4,The Riddle of the Sphinx,2018,11615,9.0
17,169,184,2,8,Kiksuya,2018,14862,9.2
23,627,705,3,4,The Mother of Exiles,2020,5603,9.2
25,1873,-,3,6,Decoherence,2020,3323,8.9


In [66]:
# ratingraph.com show ID to analyse. The cell previously also assigned 60660
# on the line before, but that value was overwritten immediately and never
# used, so only the effective ID is kept.
show_number = 4054

# The endpoint is queried with DataTables-style parameters. They are built as
# a dict (same keys, values and order as the original hand-pasted query
# string) and URL-encoded by requests, instead of living in one enormous
# literal that also ended in a stray space.
episode_columns = [
    "trend",
    "season",
    "episode",
    "name",
    "start",
    "total_votes",
    "average_rating",
]

params = {"draw": 2}
for position, column in enumerate(episode_columns):
    params.update(
        {
            f"columns[{position}][data]": column,
            f"columns[{position}][name]": "",
            f"columns[{position}][searchable]": "false",
            f"columns[{position}][orderable]": "true",
            f"columns[{position}][search][value]": "",
            f"columns[{position}][search][regex]": "false",
        }
    )
params.update(
    {
        # Sort by column 1 (season), then column 2 (episode), both ascending.
        "order[0][column]": 1,
        "order[0][dir]": "asc",
        "order[1][column]": 2,
        "order[1][dir]": "asc",
        # Request up to 250 rows starting from the first one.
        "start": 0,
        "length": 250,
        "search[value]": "",
        "search[regex]": "false",
        # Value copied verbatim from the original request URL.
        "_": 1588110434853,
    }
)

url = f"https://www.ratingraph.com/show-episodes-list/{show_number}/"

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors directly instead of as a confusing
# JSON decoding failure on an error page.
response = req.get(url, params=params, timeout=30)
response.raise_for_status()
data = response.json()

# The rows live under the "data" key. Ratings are cast to float, and vote
# counts have their thousands separators stripped before being cast to int.
show = pd.DataFrame(data["data"]).assign(
    average_rating=lambda frame: frame["average_rating"].astype(float),
    total_votes=lambda frame: frame["total_votes"].str.replace(",", "").astype(int),
)
show.head()

Unnamed: 0,rank,trend,season,episode,name,start,total_votes,average_rating
0,3652,17,1,1,"Good News, Bad News",1989,3224,7.6
1,5043,15,1,2,The Stakeout,1990,2541,7.7
2,6011,15,1,3,The Robbery,1990,2298,7.6
3,6442,11,1,4,Male Unbonding,1990,2294,7.4
4,6447,14,1,5,The Stock Tip,1990,2174,7.6


In [67]:
# Summary statistics of average_rating, reporting the 20th/50th/80th percentiles.
show["average_rating"].describe(percentiles=[0.2, 0.5, 0.8])

count    173.000000
mean       8.436994
std        0.424834
min        7.200000
20%        8.100000
50%        8.400000
80%        8.760000
max        9.600000
Name: average_rating, dtype: float64

In [69]:
# Episodes whose average rating is at or above the 80th percentile, with
# thousands separators on the vote counts and colour gradients on the
# position columns and on votes/rating.
(
    show.loc[show["average_rating"].ge(show["average_rating"].quantile(0.8))]
    .style
    .format(formatter="{:,}", subset=["total_votes"])
    .background_gradient(cmap="plasma", subset=["season", "episode"])
    .background_gradient(cmap="YlGn", subset=["total_votes", "average_rating"])
)

Unnamed: 0,rank,trend,season,episode,name,start,total_votes,average_rating
15,3417,9,2,11,The Chinese Restaurant,1991,2486,8.8
22,3064,13,3,6,The Parking Garage,1991,2585,8.9
24,4838,14,3,8,The Tape,1991,2002,8.8
29,3909,8,3,13,The Subway,1992,2300,8.8
33,3656,9,3,17,The Boyfriend,1992,2350,8.9
35,3020,12,3,19,The Limo,1992,2552,9.0
46,3494,7,4,7,The Bubble Boy,1992,2460,8.8
50,719,13,4,11,The Contest,1992,4626,9.6
51,3486,9,4,12,The Airport,1992,2356,9.0
56,1700,9,4,17,The Outing,1993,3129,9.4
