In [402]:
# -*- coding: utf-8 -*-
import bs4
import re
import requests
import pandas as pd
import omdb
import numpy as np
import pickle
import os
import time
from urllib import unquote
import feather

# let's be nice: cap every OMDb request with a default timeout of 5
# (presumably seconds -- confirm against the omdb package documentation)
omdb.set_default("timeout", 5)

In [412]:
from bokeh.plotting import figure
from bokeh.charts import (
    Bar,
    Line,
    show,
    output_file,
    output_notebook,
    )
from bokeh import plotting
from bokeh.models.sources import ColumnDataSource
# Configure Bokeh to render its output inside the notebook; must run before
# any show() call below.
output_notebook()

In [4]:
# On-disk cache of the assembled episode DataFrame.
feather_path = "data/xfiles.feather"

# Service endpoints. OMDBAPI_URL is a format template whose "{}" placeholder
# takes the value of the "i" query parameter.
OMDBAPI_URL = "http://www.omdbapi.com/?i={}"
TEST_SPARQL = "https://query.wikidata.org/"

# Reference notebook kept as a pointer for the reader.
helpful_url = "http://ramiro.org/notebook/us-presidents-causes-of-death/"

In [5]:
# SPARQL query sent to the Wikidata query service. It binds ?show to
# wd:Q2744, walks show -> season -> episode through wdt:P361, and returns for
# each episode its season ordinal, IMDb id (wdt:P345), episode id
# (wdt:P2364), English label and English Wikipedia article URL.
#
# Every prefix the query uses is declared explicitly, including rdfs: (needed
# by rdfs:label), so the query does not depend on prefixes the endpoint
# happens to predefine.
query = u"""PREFIX wikibase: <http://wikiba.se/ontology#>
PREFIX schema: <http://schema.org/>
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX p: <http://www.wikidata.org/prop/>
PREFIX pq: <http://www.wikidata.org/prop/qualifier/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

SELECT ?show ?episodeID ?showLabel ?season ?seasonNumber ?episode ?imdb ?episodeLabel ?url WHERE {
    BIND(wd:Q2744 as ?show) .
    ?season wdt:P361 ?show .
    ?episode wdt:P361 ?season .
    ?season p:P179 [
            pq:P1545 ?seasonNumber] .
    ?episode wdt:P345 ?imdb .
    ?episode wdt:P2364 ?episodeID.
    ?episode rdfs:label ?episodeLabel .
  
    ?url schema:about ?episode .
      ?url schema:inLanguage "en" .
      ?url schema:isPartOf <https://en.wikipedia.org/> .
  
    FILTER(langMatches(lang(?episodeLabel),"en"))
}
""".encode("utf8")

In [6]:
# Run the SPARQL query against the Wikidata endpoint and ask for JSON results.
sparql_url = 'https://query.wikidata.org/bigdata/namespace/wdq/sparql'
data = requests.get(sparql_url, params={'query': query, 'format': 'json'})
# Fail here, with the HTTP status in the error, instead of letting a later
# cell choke on data.json() when the endpoint returned an error page.
data.raise_for_status()

In [7]:
# extract SPARQL
def extract(binding):
    """Flatten one SPARQL result binding into a plain episode record.

    Parameters
    ----------
    binding : mapping
        One entry of ``results.bindings``; must provide the keys
        ``episodeID``, ``episodeLabel``, ``seasonNumber``, ``imdb`` and
        ``url``, each mapping to a dict with a ``"value"`` string.

    Returns
    -------
    dict
        Keys ``episodeID``, ``episode``, ``season``, ``imdb`` (the UTF-8
        encoded values) and ``wikipedia`` (the UTF-8 encoded URL passed
        through ``unquote``).

    Raises
    ------
    KeyError
        If ``binding`` lacks one of the expected keys.
    """

    def encoded(field):
        # Raw binding value as a UTF-8 byte string.
        return binding[field]["value"].encode("utf")

    def unquoted_uri(field):
        # Percent-escapes are decoded after encoding the URL to UTF-8.
        return unquote(binding[field]["value"].encode("utf-8"))

    record = {}
    record["episodeID"] = encoded("episodeID")
    record["episode"] = encoded("episodeLabel")
    record["season"] = encoded("seasonNumber")
    record["imdb"] = encoded("imdb")
    record["wikipedia"] = unquoted_uri("url")
    return record

In [8]:
response = data.json()

# One flat record per SPARQL result binding. The records are collected in
# `results` and the frame is built from that same list, so the cell no longer
# depends on a leftover `d` variable from an earlier kernel session.
results = [extract(binding) for binding in response["results"]["bindings"]]

df = pd.DataFrame(results)

In [211]:
def dump_episode(url, url_title):
    """Download ``url`` and cache the page text as ``data/<url_title>.pkl``.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    url_title : str
        File stem used for the pickle inside the ``data/`` directory.

    Returns
    -------
    The ``requests`` response object for ``url``.

    NOTE(review): the response status is not checked, so an error page would
    be cached like any other page -- confirm whether that is acceptable.
    """
    ep_page = requests.get(url)
    out_path = "data/%s.pkl" % url_title
    # Pickle streams are binary data; open the file in binary mode so the
    # bytes are written untouched on every platform and Python version.
    with open(out_path, "wb") as of:
        pickle.dump(ep_page.text, of)
    return ep_page

def get_episode_soup(url):
    """Return the parsed HTML of an episode page, downloading it only once.

    The last path segment of the unquoted ``url`` names the cache file
    ``data/<segment>.pkl``. When that file is missing the page is fetched and
    pickled through ``dump_episode``; the text is then always read back from
    the cache and parsed with ``html.parser``.

    Parameters
    ----------
    url : str
        Page URL, possibly percent-encoded.

    Returns
    -------
    bs4.BeautifulSoup
        Parsed document.
    """
    url = unquote(url)
    url_title = url.split("/")[-1]

    pickle_path = "data/{}.pkl".format(url_title)
    if not os.path.exists(pickle_path):
        # Parenthesised single-argument form prints the same text under both
        # the Python 2 print statement and the Python 3 print function.
        print("dumping episode {}".format(url_title))
        dump_episode(url, url_title)

    # NOTE: pickle.load can execute arbitrary code; this is only safe because
    # the files under data/ are caches written by dump_episode itself.
    # The context manager closes the handle, which the bare open() never did.
    with open(pickle_path, "rb") as cached_page:
        ep_text = pickle.load(cached_page)
    return bs4.BeautifulSoup(ep_text, "html.parser")
    
def get_content(ep_soup):
    """Return the first ``div`` with ``id="mw-content-text"`` in ``ep_soup``.

    Indexing the search result with ``[0]`` means a page without such a div
    raises instead of returning ``None``.
    """
    content_divs = ep_soup.find_all("div", {"id": "mw-content-text"})
    first_div = content_divs[0]
    return first_div

def is_monster_of_the_week(content):
    """Tell whether any paragraph of ``content`` mentions "Monster-of-the-Week".

    Parameters
    ----------
    content : object exposing ``find_all("p")``
        Each returned element must carry its text in a ``.text`` attribute.

    Returns
    -------
    bool
        ``True`` as soon as one paragraph contains the phrase (matched
        case-insensitively), ``False`` when none does.
    """
    monster_re = re.compile(r"Monster-of-the-Week", re.IGNORECASE)
    return any(
        monster_re.search(paragraph.text) is not None
        for paragraph in content.find_all("p")
    )

In [212]:
def _wikipedia_page_is_monster(wikipedia_url):
    # Fetch (or load from cache) the article, then scan its main content div.
    episode_soup = get_episode_soup(wikipedia_url)
    return is_monster_of_the_week(get_content(episode_soup))


# Flag each episode by whether its Wikipedia article mentions the phrase.
df["is_monster_of_the_week"] = df["wikipedia"].map(_wikipedia_page_is_monster)

In [10]:
def imdb_data(row):
    """Look up ``row["imdb"]`` through omdb and merge the response into ``row``.

    Every key of the response is written onto ``row``, so a response key that
    matches an existing entry overwrites it. The call then pauses for one
    second before returning, to throttle successive lookups.

    Returns the same ``row`` object, updated in place.
    """
    resp = omdb.imdbid(row["imdb"])
    for imdb_key, imdb_value in resp.items():
        row[imdb_key] = imdb_value
    # be nice: one request per second at most
    time.sleep(1)
    return row

In [26]:
# get imdb data one season at a time: one enriched frame per distinct season
seasons_df = [
    df[df.season == season_value].apply(imdb_data, axis=1)
    for season_value in set(df.season.values)
]

In [41]:
# Stitch the per-season frames back together; `df` and `df_concat` name the
# same object, so the integer cast below is visible through both.
df = df_concat = pd.concat(seasons_df)
df["season"] = df["season"].astype(int)

In [417]:
# type casting: the IMDb rating and vote columns must be floats for the
# arithmetic and aggregations further down
for numeric_column in ("imdb_rating", "imdb_votes"):
    df[numeric_column] = df[numeric_column].astype(float)

In [404]:
# Cache df on disk so we do not have to grab imdb data again.
# When the cache file exists it replaces the in-memory frame; otherwise the
# frame built above is written out. This removes the need to comment and
# uncomment the write by hand, and the cell no longer fails when the cache
# file has not been created yet. Delete the file to force a refresh.
if os.path.exists(feather_path):
    df = feather.read_dataframe(feather_path)
else:
    feather.write_dataframe(df, feather_path)

In [181]:
# Rating weighted by the number of votes behind it.
df["imdb_weight"] = df["imdb_rating"].mul(df["imdb_votes"])

**Season Aggregate**

In [None]:
# Per-season totals of votes and ratings, with NaN entries ignored by nansum.
season_totals = {
    "imdb_votes": np.nansum,
    "imdb_rating": np.nansum,
}
df_season_agg = df.groupby("season").agg(season_totals).reset_index()

In [263]:
# scale the summed ratings by 100 (presumably so they plot on a range
# comparable to the vote totals -- confirm). Re-running this cell scales again.
df_season_agg["imdb_rating"] = df_season_agg["imdb_rating"].mul(100)

In [461]:
from bokeh.models.tickers import SingleIntervalTicker
from bokeh.models.axes import LinearAxis

x = list(df_season_agg["season"])
y1 = df_season_agg["imdb_rating"]
y2 = df_season_agg["imdb_votes"]

# One entry per line drawn. The colours live in the data source next to the
# coordinates and labels: passing an iterable `color=[...]` together with
# `source=` is what triggered Bokeh's "user-defined data source AND iterable
# values" deprecation warning.
source = ColumnDataSource({
    'xs': [x, x],
    'ys': [y1, y2],
    'labels': ['imdb_rating', 'imdb_votes'],
    'colors': ['green', 'black'],
})

plot = figure(title= "X-Files Season Popularity", 
            x_axis_label= 'Season', 
            y_axis_label= 'Popularity',
            plot_height = 500,
            plot_width = 600,
)
plot.multi_line(
    xs="xs",
    ys="ys",
    legend="labels",
    color="colors",  # column name in `source`, not a literal colour
    source=source,
    line_width=2,
)
show(plot)

Supplying a user-defined data source AND iterable values to glyph methods is deprecated.

See https://github.com/bokeh/bokeh/issues/2056 for more information.

  warn(message)


**Monster of the Week output**

In [442]:
# Number of episodes for every (season, monster-of-the-week flag) pair.
episodes_by_season_and_flag = df.groupby(["season", "is_monster_of_the_week"])
episode_counts = episodes_by_season_and_flag.apply(len)
df_monster = episode_counts.reset_index()
df_monster.columns = ["season", "monster", "count"]

In [443]:
# Grouped bar chart: episode count per season, split by the monster flag.
monster_bar_options = dict(
    values="count",
    plot_height=500,
    plot_width=1000,
    group=["monster"],
    title="Monster-of-the-week episodes per Season",
    legend="top_right",
    color=["black", "green"],
    ylabel="episodes",
)
b = Bar(df_monster, "season", **monster_bar_options)
show(b)

In [444]:
# Episode count and mean IMDb rating for monster-of-the-week episodes versus
# the rest.
# Columns are renamed by name, not by position: the order in which a
# dict-based agg() emits its columns is not something to rely on, and a
# positional rename would silently swap "count" and "imdb_rating" whenever
# that order differs from the one assumed.
df_monster_total = (
    df.groupby(["is_monster_of_the_week"])
    .agg({
        "episodeID": len,
        "imdb_rating": np.mean,
    })
    .reset_index()
    .rename(columns={
        "is_monster_of_the_week": "monster_of_the_week",
        "episodeID": "count",
    })
)
# Fix the display order explicitly.
df_monster_total = df_monster_total[["monster_of_the_week", "imdb_rating", "count"]]
df_monster_total.head()

Unnamed: 0,monster_of_the_week,imdb_rating,count
0,False,8.510938,64
1,True,7.864539,142


In [446]:
from bokeh.charts.attributes import color, cat

# Two bar charts over the same frame, differing only in the plotted column,
# the title and the y-axis label. They are drawn in this order, and `b` ends
# up bound to the last one.
monster_total_charts = [
    ("count", "Total Monster-of-the-Week episodes", "episodes"),
    ("imdb_rating", "Average X-Files Monster-of-the-Week Rating", "imdb_rating"),
]
for chart_values, chart_title, chart_ylabel in monster_total_charts:
    b = Bar(
        df_monster_total,
        values=chart_values,
        label=cat(columns='monster_of_the_week', sort=False),
        plot_height=500,
        plot_width=1000,
        title=chart_title,
        ylabel=chart_ylabel,
        legend="bottom_right",
        color=color(
            columns="monster_of_the_week",
            palette=['black', 'green'],
            sort=False,
        ),
    )
    show(b)

**Writer Analysis**

In [401]:
# Peek at the raw writer credits before cleaning them.
df.loc[:, ["writer"]].head()

Unnamed: 0,writer
0,"Chris Carter (created by), Chris Ruppenthal"
1,"Chris Carter (created by), Howard Gordon, Alex..."
2,"Chris Carter (created by), Alex Gansa, Howard ..."
3,"Chris Carter (created by), Glen Morgan, James ..."
4,"Chris Carter (created by), Glen Morgan, James ..."


In [427]:
def clean_writer(x):
    """Split a comma-separated writer credit into unique, cleaned names.

    Parenthesised annotations such as `` (created by)`` are removed,
    surrounding whitespace is trimmed and duplicates are dropped, keeping the
    first-seen order.

    Parameters
    ----------
    x : str or missing value
        Credit string, e.g. ``"Chris Carter (created by), Chris Ruppenthal"``.
        ``None``, ``""`` and NaN are treated as "no writers".

    Returns
    -------
    list of str
        Always a list. It is empty when there are no writers, so callers can
        iterate the result without a special case (the previous empty-string
        return iterated the same way).
    """
    # `not x` catches None and ""; NaN is truthy but is the only value that
    # differs from itself. Without this check str(NaN) would yield the bogus
    # writer name "nan".
    if not x or x != x:
        return []

    annotation_re = r" \(.*\)"
    out = []
    for credit in str(x).split(","):
        name = re.sub(annotation_re, "", credit).strip()
        # Skip blanks left by stray commas as well as repeated names.
        if name and name not in out:
            out.append(name)
    return out

In [428]:
# Per-episode collection of cleaned writer names.
cleaned_writer_lists = df["writer"].map(clean_writer)
df["writer_cleaned"] = cleaned_writer_lists

In [430]:
cols = [
    "imdb_votes",
    "imdb_rating",
    "season",
    "episodeID",
]

# Explode the frame to one row per (episode, writer) pair.
# This is a plain loop, not a side-effecting lambda inside DataFrame.apply:
# older pandas releases document that apply may call the function twice on
# the first row, which would append that episode's rows twice.
rows = []
for _, episode in df.iterrows():
    episode_values = [episode[c] for c in cols]
    for writer_name in episode["writer_cleaned"]:
        rows.append(episode_values + [writer_name])
df_new = pd.DataFrame(rows, columns=cols + ["writer"])

In [431]:
df_new["imdb_votes"] = df_new["imdb_votes"].astype(float)

# Per writer: number of credited episodes, total votes and mean rating.
writer_aggregations = {
    "episodeID": len,
    "imdb_votes": np.sum,
    "imdb_rating": np.mean,
}
df_writer = df_new.groupby("writer").agg(writer_aggregations).reset_index()
df_writer = df_writer.rename(columns={"episodeID": "episodes"})

In [432]:
# Drop the placeholder "nan" writer, keeping every other row.
is_named_writer = df_writer["writer"] != "nan"
df_writer = df_writer[is_named_writer]

In [452]:
# One bar chart per metric: (column to plot, word describing it in the title).
writer_charts = [("episodes", "total"), ("imdb_rating", "average")]
for metric_column, metric_kind in writer_charts:
    chart_title = "The X-Files {} {} by writer".format(metric_kind, metric_column)
    b = Bar(
        df_writer,
        "writer",
        values=metric_column,
        title=chart_title,
        plot_height=700,
        plot_width=1200,
        legend=None,
        ylabel=metric_column,
        color="writer",
    )
    show(b)

In [453]:
# Rank writers by mean rating, best first.
# DataFrame.sort is deprecated (it emitted a warning here) and is removed in
# later pandas releases; sort_values is its replacement. Rebinding the name
# also avoids sorting in place a frame that was produced by a boolean filter.
df_writer = df_writer.sort_values("imdb_rating", ascending=False)
df_writer.head()

  if __name__ == '__main__':


Unnamed: 0,writer,imdb_rating,imdb_votes,episodes
9,Darin Morgan,8.9,19450.0,7
8,Daniel Arkin,8.6,3308.0,2
29,Mat Beck,8.5,2054.0,1
11,David Duchovny,8.4375,15160.0,8
43,Vince Gilligan,8.425,48820.0,28


In [455]:
# Per-episode metric: rating multiplied by the number of votes.
df["imdb_metric"] = df["imdb_rating"].mul(df["imdb_votes"])