# Initialization

This notebook was run on `Google Colab`, so this section is only needed there. Elsewhere, as long as the packages in `requirements.txt` are installed via `pip` or in a `conda` environment, the notebook should run smoothly.

In [None]:
#@title Mount and change directory
import os

# `google.colab` only exists inside a Colab runtime. Guard the import so that
# running every cell also works in a local pip/conda environment; in that case
# the working directory is left untouched (the relative `data/` and `docs/`
# paths used further down are then resolved against wherever the notebook was
# started).
try:
    from google.colab import drive
except ImportError:
    drive = None

if drive is not None:
    # Mount Google Drive and work from the project folder stored there.
    drive.mount('/content/drive')
    os.chdir('/content/drive/My Drive/playground/series-remake')

In [None]:
#@title Install [`imdbpy`](https://github.com/alberanid/imdbpy) and update `plotly`
%%bash 
# imdbpy is used below to query IMDb; kaleido is the backend plotly uses for
# static image export; plotly itself is upgraded to the latest release.
# NOTE(review): none of the versions are pinned, so results depend on whatever
# is current at install time.
pip install imdbpy
pip install -U kaleido
pip install -U plotly


# Obtain data

In [None]:
#@title Importing packages
import re
import numpy as np 
import pandas as pd

from utils import *

from tqdm.notebook import tqdm
from IPython.display import HTML as ipy_html
from functools import partial


## Process Wikipedia tables
The lists of remakes come from 2 Wikipedia tables (one for UK shows, another for shows from other foreign countries, with some overlap between the two):

- https://en.wikipedia.org/wiki/List_of_American_television_series_based_on_British_television_series
- https://en.wikipedia.org/wiki/List_of_American_television_shows_based_on_foreign_shows

Obtain the names of the shows, the original show's country and, if available, the Wikipedia URLs of the shows in the tables. Combine the 2 tables and drop the duplicates based on the Wikipedia URLs.

In [None]:
# Per-table settings, forwarded as keyword arguments to `process_wikitable`.
brit_wiki = {
    'wiki_url': 'https://en.wikipedia.org/wiki/List_of_American_television_series_based_on_British_television_series',
    'colrename_dict': {'UK original': 'original', 'US remake': 'remake'},
    'dropcol_list': ['Notes'],
    # This table has no country column, so both countries are added as constants.
    'addcol_dict': {'original_country': 'United Kingdom', 'remake_country': 'United States'},
}

forg_wiki = {
    'wiki_url': 'https://en.wikipedia.org/wiki/List_of_American_television_shows_based_on_foreign_shows',
    'colrename_dict': {
        'American show': 'remake',
        'Original show': 'original',
        'Original country': 'original_country',
    },
    'dropcol_list': ['Notes'],
    'addcol_dict': {'remake_country': 'United States'},
    'cols_only_proctxt': ['original_country'],
}

df_brit = process_wikitable(**brit_wiki)
df_forg = process_wikitable(**forg_wiki)

# Stack both tables, then keep a single row per (original, remake) Wikipedia
# URL pair, since the two source lists overlap.
df_remake = (
    pd.concat([df_brit, df_forg], axis=0, ignore_index=True)
    .drop_duplicates(['original_wikiurl', 'remake_wikiurl'])
    .reset_index(drop=True)
)


On the Wikipedia page of each show, the IMDb URLs are listed in the external links section. Obtain these IMDb URLs (there could be more than one per show), keep only the shows for which IMDb URLs were found, and remove duplicate entries.

In [None]:
# Register the `progress_apply` methods on pandas objects.
tqdm.pandas()

# Look up the IMDb URLs for the Wikipedia page of each original and each remake.
for prefix in ('original', 'remake'):
    df_remake[f'{prefix}_imdburl'] = df_remake.progress_apply(
        lambda row: imdburls_from_wiki(row[f'{prefix}_wikiurl']),
        axis=1,
    )

# Filter rows with the `empty_row` predicate on the two IMDb URL columns, then
# order the table by original show name.
df_remake = (
    filtout_rows(
        df_remake,
        partial(empty_row, fields=['original_imdburl', 'remake_imdburl']),
    )
    .sort_values(by='original_show')
    .reset_index(drop=True)
)

## Process IMDb URLs and obtain scores
For each IMDb URL, obtain the show's information using the [`imdbpy`](https://github.com/alberanid/imdbpy) package. 

In [None]:
# IMDb attributes requested from `get_imdbinfo` for every URL.
imdb_fields = [
    'title',
    'imdbID',
    'rating',
    'votes',
    'country',
    'genres',
    'kind',
    'localized title',
    'number of seasons',
]

for prefix in ('original', 'remake'):
    df_remake[f'{prefix}_imdbinfo'] = df_remake.progress_apply(
        lambda row: get_imdbinfo(row[f'{prefix}_imdburl'], imdb_fields),
        axis=1,
    )

Clean the IMDb information fields, as the external links obtained from the Wikipedia pages occasionally do not belong to the show in question (e.g. a remake's link appearing on the original's Wikipedia page). As a quick check, keep an IMDb entry only if the country from the Wikipedia table (for remakes this is always `United States`) appears in the `country` list obtained from IMDb. 

In [None]:
def clean_imdbinfo(info, country):
    """Keep only the IMDb entries whose country information contains `country`.

    Parameters
    ----------
    info : iterable
        Entries that can be indexed with the key ``'country'``; the value is
        either ``None`` or a container tested with ``country in ...``.
    country : object
        Country to look for in each entry's ``'country'`` value.

    Returns
    -------
    list
        The entries of `info`, in their original order, whose ``'country'``
        value is not ``None`` and contains `country`.
    """
    kept = []
    for entry in info:
        entry_countries = entry['country']
        # Without country information the entry cannot be confirmed: drop it.
        if entry_countries is None:
            continue
        if country in entry_countries:
            kept.append(entry)
    return kept

# Drop IMDb entries whose country does not match the one from the Wikipedia table.
for prefix in ('original', 'remake'):
    df_remake[f'{prefix}_imdbinfo'] = df_remake.apply(
        lambda row: clean_imdbinfo(
            row[f'{prefix}_imdbinfo'],
            row[f'{prefix}_country'],
        ),
        axis=1,
    )

# Filter rows with the `empty_row` predicate on the two cleaned info columns.
df_remake = filtout_rows(
    df_remake,
    partial(empty_row, fields=['original_imdbinfo', 'remake_imdbinfo']),
    reset_index=True,
)

# Persist the table so the summary section can start from disk.
df_remake.to_pickle('data/usremake-imdb-df.pkl')

# Render the whole table as HTML; as the last expression this is the cell output.
ipy_html(df_remake.to_html())


# Data summary

Summarise each original and remake show by averaging, over its IMDb entries, the score (i.e. rating), the number of votes (to represent a form of "popularity") and the number of seasons. 

In [None]:
# Reload the IMDb-annotated table from the pickle file written by
# `df_remake.to_pickle` earlier in this notebook, so this section can be run
# without repeating the Wikipedia/IMDb queries.
# NOTE: unpickling executes code from the file; only load pickles you created.
df_remake = pd.read_pickle('data/usremake-imdb-df.pkl')


In [None]:
def get_imdb_summary(d, k):
    """Average the field `k` over a collection of IMDb entries.

    Parameters
    ----------
    d : sequence
        Entries that can be indexed with `k`; must support ``len``.
    k : hashable
        Key of the value to average in every entry.

    Returns
    -------
    float
        Mean of the values after conversion to float, or ``np.nan`` when `d`
        has no entries.
    """
    if len(d) > 0:
        values = np.array([entry[k] for entry in d])
        return values.astype('float').mean()
    return np.nan

# IMDb field name -> suffix of the summary column derived from it.
imdb_infofields = {'rating': 'score', 'votes': 'nvotes', 'number of seasons': 'nseasons'}

# One averaged column per (show role, IMDb field), e.g. `original_imdb-score`.
for prefix in ('original', 'remake'):
    for info_k, info_v in imdb_infofields.items():
        df_remake[f'{prefix}_imdb-{info_v}'] = df_remake.apply(
            lambda row: get_imdb_summary(row[f'{prefix}_imdbinfo'], info_k),
            axis=1,
        )

def nan_row(row, fields):
    """Tell whether any of the given fields of `row` is NaN.

    Parameters
    ----------
    row : mapping-like
        Object indexed with each key in `fields`; values are passed to
        ``np.isnan``.
    fields : iterable
        Keys to check.

    Returns
    -------
    bool
        ``True`` if at least one checked value is NaN, ``False`` otherwise
        (including when `fields` is empty).
    """
    found_nan = False
    # Every field is checked (no early exit), matching a full scan of `fields`.
    for k in fields:
        if np.isnan(row[k]):
            found_nan = True
    return found_nan

# Columns holding an averaged IMDb score or vote count (number of seasons is
# deliberately not matched by this pattern).
imdb_field_regex = re.compile('.*imdb-(score|nvotes).*')
imdb_select_fields = list(filter(imdb_field_regex.search, df_remake.columns))

# Filter rows with the `nan_row` predicate on those score/vote columns.
df_remake = filtout_rows(
    df_remake,
    partial(nan_row, fields=imdb_select_fields),
    reset_index=True,
)

# The raw per-entry IMDb info columns are no longer needed once summarised.
rmfield_regexp = re.compile('.*imdbinfo$')
fields2remove = list(filter(rmfield_regexp.search, df_remake.columns))
df_remake.drop(columns=fields2remove, inplace=True)


In [None]:
# Display the summary table (bare last expression, so it is the cell output).
df_remake

Unnamed: 0,original_show,original_wikiurl,remake_show,remake_wikiurl,original_country,remake_country,original_imdburl,remake_imdburl,original_imdb-score,original_imdb-nvotes,original_imdb-nseasons,remake_imdb-score,remake_imdb-nvotes,remake_imdb-nseasons
0,Agony,https://en.wikipedia.org/wiki/Agony_(TV_series),The Lucie Arnaz Show,https://en.wikipedia.org/wiki/The_Lucie_Arnaz_...,United Kingdom,United States,{https://www.imdb.com/title/tt0078559/},{https://www.imdb.com/title/tt0088556/},7.2,117.0,3.0,5.7,25.0,1.0
1,Airline,https://en.wikipedia.org/wiki/Airline_(1998_TV...,Airline,https://en.wikipedia.org/wiki/Airline_(U.S._TV...,United Kingdom,United States,"{https://www.imdb.com/title/tt7254516/, https:...",{https://www.imdb.com/title/tt0395378/},7.8,93.0,6.0,7.3,200.0,3.0
2,As If,https://en.wikipedia.org/wiki/As_If_(UK_TV_ser...,As If,https://en.wikipedia.org/wiki/As_If_(U.S._TV_s...,United Kingdom,United States,{https://www.imdb.com/title/tt0275821/},{https://www.imdb.com/title/tt0313034/},8.2,820.0,4.0,8.1,84.0,1.0
3,BeTipul,https://en.wikipedia.org/wiki/BeTipul,In Treatment,https://en.wikipedia.org/wiki/In_Treatment_(U....,Israel,United States,{https://www.imdb.com/title/tt0466345/},{https://www.imdb.com/title/tt0835434/},8.3,301.0,2.0,8.2,16264.0,4.0
4,Being Human,https://en.wikipedia.org/wiki/Being_Human_(UK_...,Being Human,https://en.wikipedia.org/wiki/Being_Human_(Nor...,United Kingdom,United States,{https://www.imdb.com/title/tt1349938/},{https://www.imdb.com/title/tt1595680/},7.8,24284.0,5.0,7.5,24592.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111,Whose Line Is It Anyway?,https://en.wikipedia.org/wiki/Whose_Line_Is_It...,Whose Line Is It Anyway?,https://en.wikipedia.org/wiki/Whose_Line_Is_It...,United Kingdom,United States,{https://www.imdb.com/title/tt0094580/},{https://www.imdb.com/title/tt0163507/},8.2,5374.0,10.0,8.5,40794.0,8.0
112,Wild at Heart,https://en.wikipedia.org/wiki/Wild_at_Heart_(U...,Life Is Wild,https://en.wikipedia.org/wiki/Life_Is_Wild,United Kingdom,United States,{https://www.imdb.com/title/tt0757127/},{https://www.imdb.com/title/tt0938853/},7.4,1300.0,8.0,6.8,501.0,1.0
113,Wilfred (Australian TV series),https://en.wikipedia.org/wiki/Wilfred_(Austral...,Wilfred (U.S. TV series),https://en.wikipedia.org/wiki/Wilfred_(U.S._TV...,Australia,United States,{https://www.imdb.com/title/tt0899203/},{https://www.imdb.com/title/tt1703925/},7.9,2828.0,2.0,7.8,40552.0,4.0
114,"Yo soy Betty, la fea","https://en.wikipedia.org/wiki/Yo_soy_Betty,_la...",Ugly Betty,https://en.wikipedia.org/wiki/Ugly_Betty,Colombia,United States,{https://www.imdb.com/title/tt0233127/},{https://www.imdb.com/title/tt0805669/},8.2,2733.0,1.0,6.5,41229.0,4.0


In [None]:
# Save the summary without the positional index: writing the index adds an
# unnamed first column that `pd.read_csv` loads back as a spurious
# 'Unnamed: 0' data column.
df_remake.to_csv('data/remake-summary.csv', index=False)

# Data visualization

In [None]:
#@title Import `plotly`
import plotly.graph_objects as go

# Configure plotly for static image export: kaleido-written figures default to
# the SVG format.
# NOTE(review): `pio.kaleido.scope` may be deprecated in newer plotly/kaleido
# releases; confirm against the installed versions.
import plotly.io as pio
pio.kaleido.scope.default_format = "svg"

Load the data summary and select the shows to annotate with a label, namely those with either:
- a large remake-minus-original score difference,
- or a better-known original show, like `The Office` or `The X Factor`.

In [None]:
df_remake = pd.read_csv('data/remake-summary.csv')

# Rows whose remake scores much higher (> 1.2) or much lower (< -3.5) than the
# original.
IMDb_score_diff = df_remake['remake_imdb-score'].sub(df_remake['original_imdb-score'])
imdbscore_select = (IMDb_score_diff > 1.2) | (IMDb_score_diff < -3.5)

# Rows whose original show is one of a hand-picked set of popular shows.
select_shows = [
    'The Office',
    'House of Cards',
    'The X Factor',
    'Pop Idol',
    'Miranda',
    'Strictly Come Dancing',
]
rows_with_selshows = df_remake['original_show'].isin(select_shows).to_numpy()

# A row is annotated when it satisfies either criterion.
select_data_rows = df_remake.loc[imdbscore_select | rows_with_selshows].reset_index(drop=True)

In [None]:
# Display the summary table as reloaded from the CSV file.
df_remake

Unnamed: 0.1,Unnamed: 0,original_show,original_wikiurl,remake_show,remake_wikiurl,original_country,remake_country,original_imdburl,remake_imdburl,original_imdb-score,original_imdb-nvotes,original_imdb-nseasons,remake_imdb-score,remake_imdb-nvotes,remake_imdb-nseasons
0,0,Agony,https://en.wikipedia.org/wiki/Agony_(TV_series),The Lucie Arnaz Show,https://en.wikipedia.org/wiki/The_Lucie_Arnaz_...,United Kingdom,United States,{'https://www.imdb.com/title/tt0078559/'},{'https://www.imdb.com/title/tt0088556/'},7.2,117.0,3.0,5.7,25.0,1.0
1,1,Airline,https://en.wikipedia.org/wiki/Airline_(1998_TV...,Airline,https://en.wikipedia.org/wiki/Airline_(U.S._TV...,United Kingdom,United States,"{'https://www.imdb.com/title/tt7254516/', 'htt...",{'https://www.imdb.com/title/tt0395378/'},7.8,93.0,6.0,7.3,200.0,3.0
2,2,As If,https://en.wikipedia.org/wiki/As_If_(UK_TV_ser...,As If,https://en.wikipedia.org/wiki/As_If_(U.S._TV_s...,United Kingdom,United States,{'https://www.imdb.com/title/tt0275821/'},{'https://www.imdb.com/title/tt0313034/'},8.2,820.0,4.0,8.1,84.0,1.0
3,3,BeTipul,https://en.wikipedia.org/wiki/BeTipul,In Treatment,https://en.wikipedia.org/wiki/In_Treatment_(U....,Israel,United States,{'https://www.imdb.com/title/tt0466345/'},{'https://www.imdb.com/title/tt0835434/'},8.3,301.0,2.0,8.2,16264.0,4.0
4,4,Being Human,https://en.wikipedia.org/wiki/Being_Human_(UK_...,Being Human,https://en.wikipedia.org/wiki/Being_Human_(Nor...,United Kingdom,United States,{'https://www.imdb.com/title/tt1349938/'},{'https://www.imdb.com/title/tt1595680/'},7.8,24284.0,5.0,7.5,24592.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111,111,Whose Line Is It Anyway?,https://en.wikipedia.org/wiki/Whose_Line_Is_It...,Whose Line Is It Anyway?,https://en.wikipedia.org/wiki/Whose_Line_Is_It...,United Kingdom,United States,{'https://www.imdb.com/title/tt0094580/'},{'https://www.imdb.com/title/tt0163507/'},8.2,5374.0,10.0,8.5,40794.0,8.0
112,112,Wild at Heart,https://en.wikipedia.org/wiki/Wild_at_Heart_(U...,Life Is Wild,https://en.wikipedia.org/wiki/Life_Is_Wild,United Kingdom,United States,{'https://www.imdb.com/title/tt0757127/'},{'https://www.imdb.com/title/tt0938853/'},7.4,1300.0,8.0,6.8,501.0,1.0
113,113,Wilfred (Australian TV series),https://en.wikipedia.org/wiki/Wilfred_(Austral...,Wilfred (U.S. TV series),https://en.wikipedia.org/wiki/Wilfred_(U.S._TV...,Australia,United States,{'https://www.imdb.com/title/tt0899203/'},{'https://www.imdb.com/title/tt1703925/'},7.9,2828.0,2.0,7.8,40552.0,4.0
114,114,"Yo soy Betty, la fea","https://en.wikipedia.org/wiki/Yo_soy_Betty,_la...",Ugly Betty,https://en.wikipedia.org/wiki/Ugly_Betty,Colombia,United States,{'https://www.imdb.com/title/tt0233127/'},{'https://www.imdb.com/title/tt0805669/'},8.2,2733.0,1.0,6.5,41229.0,4.0


In [None]:
#@title Layout for `plotly` figure
# Styling shared by both axes: visible black axis line, no grid, outside ticks.
axis_config = {
    'showline': True,
    'showgrid': False,
    'showticklabels': True,
    'linecolor': 'rgb(0, 0, 0)',
    'linewidth': 2,
    'ticks': 'outside',
    'tickwidth': 2,
}

# Default font for the figure text.
font_config = {'family': "Fira Sans", 'size': 18, 'color': 'black'}

# Title placement (centred, anchored at the top) and size, expressed with
# plotly's underscore shorthand so it can be splatted into the layout.
title_config = {
    'title_x': 0.5,
    'title_y': 0.9,
    'title_xanchor': 'center',
    'title_yanchor': 'top',
    'title_font_size': 23,
}

general_layout = go.Layout(
    xaxis=axis_config,
    yaxis=axis_config,
    font=font_config,
    margin={'autoexpand': True, 'l': 100, 'r': 50, 't': 100, 'b': 120},
    showlegend=True,
    plot_bgcolor='white',
    autosize=True,
    **title_config,
)


In [None]:
def hovertemplate_func(row):
    """Build the HTML hover text describing one original/remake pair.

    Parameters
    ----------
    row : mapping-like
        Must provide ``'<prefix>_show'``, ``'<prefix>_imdb-score'``,
        ``'<prefix>_imdb-nvotes'`` and ``'<prefix>_imdb-nseasons'`` for both
        prefixes ``original`` and ``remake``, plus ``'original_country'``.

    Returns
    -------
    str
        Two blocks separated by ``<br>`` tags: the original show (bold name,
        country, score with two decimals, vote and season counts) followed by
        the remake (italic name, same figures).
    """
    original_part = '<b>%s</b> (%s): <b>%.2f</b><br>%d votes, %.0f seasons<br>' % (
        row['original_show'],
        row['original_country'],
        row['original_imdb-score'],
        row['original_imdb-nvotes'],
        row['original_imdb-nseasons'],
    )
    remake_part = '<i>%s</i> (remake): <b>%.2f</b><br>%d votes, %.0f seasons<br>' % (
        row['remake_show'],
        row['remake_imdb-score'],
        row['remake_imdb-nvotes'],
        row['remake_imdb-nseasons'],
    )
    return original_part + remake_part

In [None]:
fig = go.Figure(layout=general_layout)

# Dotted grey y = x reference line: a point on it has the same score for the
# original and for the remake.
fig.add_trace(
    go.Scatter(
        x=[0, 10],
        y=[0, 10],
        mode='lines',
        line={'width': 2, 'color': 'gray', 'dash': 'dot'},
        showlegend=False,
    )
)

# Scatter plot with the scores as coordinates. Marker colour encodes
# log(remake votes / original votes): positive values mean the remake received
# more votes, negative values mean the original did, and `cmid=0` centres the
# diverging colour scale on equal vote counts.
fig.add_trace(
    go.Scatter(
        x=df_remake['original_imdb-score'],
        y=df_remake['remake_imdb-score'],
        showlegend=False,
        mode='markers',
        marker=dict(
            size=20,
            color=np.log(df_remake['remake_imdb-nvotes'] / df_remake['original_imdb-nvotes']),
            colorscale='RdYlBu',
            cmid=0,
            showscale=True,
            opacity=0.5,
            line=dict(
                color='black',
                width=1.5
            ),
            colorbar=dict(
                # Nested title form; the flat `titleside`/`titlefont`
                # attributes are deprecated.
                title=dict(
                    text='log relative<br> vote count',
                    side='right',
                    font=dict(size=18),
                ),
                tickmode='array',
                tickvals=[-6, 6],
                # -6 is the "original has more votes" end of the scale and +6
                # the "remake has more votes" end, given the sign of the log
                # ratio above.
                ticktext=['original<br>more', 'remakes<br>more'],
                ticks='',
                lenmode="pixels",
                len=300,
                tickfont=dict(size=15),
            )
        ),
        # Pre-rendered HTML hover text, one string per row.
        customdata=df_remake.apply(hovertemplate_func, axis=1),
        hovertemplate='%{customdata}'
    )
)


# Label each selected show at its (original score, remake score) position,
# using the original show's name as the text.
for i, row in select_data_rows.iterrows():
    fig.add_annotation(
        x=row['original_imdb-score'],
        y=row['remake_imdb-score'],
        text=row['original_show'],
        arrowhead=2,
        opacity=0.8,
        font={'family': "Fira Code, monospace", 'size': 12, 'color': 'black'},
    )

# Data-source footnote, positioned in paper coordinates below the plot area.
fig.add_annotation(
    text='<br>'.join([
        '<b>Data sources:</b>',
        '[1] https://en.wikipedia.org/wiki/List_of_American_television_shows_based_on_foreign_shows',
        '[2] https://en.wikipedia.org/wiki/List_of_American_television_series_based_on_British_television_series',
        '[3] https://www.imdb.com/ using imdbpy',
    ]),
    xref='paper',
    yref='paper',
    x=-0.02,
    y=-0.14,
    align='left',
    opacity=0.8,
    font={'family': "Fira Code, monospace", 'size': 10, 'color': 'gray'},
    showarrow=False,
)

# One-line takeaway, rotated by -45 degrees and placed near the top-left corner.
fig.add_annotation(
    text='remakes generally score <br><em>worse</em> than the originals',
    xref='paper',
    yref='paper',
    x=0.03,
    y=0.95,
    align='left',
    textangle=-45,
    opacity=0.8,
    font={'family': "Fira Code Light", 'size': 20},
    showarrow=False,
)

# Plot title, axis labels, figure size and axis limits.
fig.update_layout(
    title_text='Are US <b>remakes</b> of foreign TV shows <br> better or worse than the original?',
    xaxis_title='Foreign original IMDb score',
    yaxis_title='US remake IMDb score',
    autosize=True,
    width=800,
    height=900,
    xaxis_range=[2.5, 9.5],
    yaxis_range=[2.5, 9.5],
)

# Equal aspect ratio: tie the x scale to the y scale one-to-one and let the
# y axis absorb the constraint by adjusting its domain.
fig.update_yaxes(constrain="domain")
fig.update_xaxes(scaleanchor="y", scaleratio=1)

# Export an interactive HTML page and a static SVG, then render inline.
fig.write_html('docs/imdb-remakes.html')
fig.write_image('docs/imdb-remakes.svg')

fig.show()