In [1]:
import numpy as np
import pandas as pd

In [2]:
YEAR = 1990
LANGUAGE = 'hi'
REGION = 'IN'

In [3]:
!pip install scikit-network==0.24.0

!rm *.tsv.gz
!wget -q https://datasets.imdbws.com/name.basics.tsv.gz
!wget -q https://datasets.imdbws.com/title.principals.tsv.gz
!wget -q https://datasets.imdbws.com/title.basics.tsv.gz
!wget -q https://datasets.imdbws.com/title.akas.tsv.gz
!ls -la

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scikit-network==0.24.0
  Downloading scikit_network-0.24.0-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (8.8 MB)
[K     |████████████████████████████████| 8.8 MB 154 kB/s 
Installing collected packages: scikit-network
Successfully installed scikit-network-0.24.0
rm: cannot remove '*.tsv.gz': No such file or directory
total 1093004
drwxr-xr-x 1 root root      4096 Dec 30 19:54 .
drwxr-xr-x 1 root root      4096 Dec 30 19:53 ..
drwxr-xr-x 4 root root      4096 Dec 20 20:18 .config
-rw-r--r-- 1 root root 238712260 Dec 30 13:15 name.basics.tsv.gz
drwxr-xr-x 1 root root      4096 Dec 20 20:19 sample_data
-rw-r--r-- 1 root root 292223822 Dec 30 13:15 title.akas.tsv.gz
-rw-r--r-- 1 root root 165782308 Dec 30 13:15 title.basics.tsv.gz
-rw-r--r-- 1 root root 422482830 Dec 30 13:15 title.principals.tsv.gz


In [4]:
from scipy.sparse import csr_matrix

#sbg
region = pd.read_csv('title.akas.tsv.gz', sep='\t', low_memory = True).set_index('titleId')['region']
region.value_counts().head(10)

  exec(code_obj, self.user_global_ns, self.user_ns)


DE    4101360
JP    4099620
FR    4099084
IN    4036161
ES    4022019
IT    4001343
PT    3935264
\N    1874152
US    1400090
GB     432944
Name: region, dtype: int64

In [5]:
# Load the titles
title = pd.read_csv('title.basics.tsv.gz', sep='\t', low_memory=True).set_index('tconst')[['titleType', 'primaryTitle', 'startYear']]
title['startYear'] = pd.to_numeric(title['startYear'], errors='coerce')
title = title[title['startYear'] >= YEAR]
title = title[title['titleType']=="movie"]
# title.head()

  exec(code_obj, self.user_global_ns, self.user_ns)


In [6]:
#sbg
title.startYear.unique()

array([2019., 2021., 2000., 1993., 2001., 1990., 2020., 1995., 2006.,
       2018., 1991., 2022., 1994., 1992., 2005., 2004., 1998., 2002.,
       1997., 2009., 1996., 2017., 1999., 2015., 2008., 2003., 2007.,
       2010., 2012., 2013., 2011., 2016., 2014., 2024., 2023., 2025.,
       2027., 2026., 2028.])

In [7]:
# Load the cast of each film
cast = pd.read_csv('title.principals.tsv.gz', sep='\t')[['tconst','nconst','category']]

# Only consider actors, not directors, composers, etc. Shrinks data to about 40%
cast = cast[cast.category.isin({'actor'})]
# cast.head()

# Only consider movies, not TV series, etc. Shrinks data to ~5%
movies = title[title['titleType'] == 'movie']
cast = cast[cast['tconst'].isin(movies.index)]
# This is what the network looks like
# cast.head()

In [8]:
# Restrict data to just a single region (e.g. IN, US, etc)
# This loads the region for each title
region = pd.read_csv('title.akas.tsv.gz',sep='\t', low_memory = True).set_index('titleId')[['region','language']]
region = region[(region.region == REGION)]
##region.head(5)
# region.value_counts().head(10)

region = region[region['language'] == LANGUAGE ]

reg=region.copy()

region=region['region']

# Load the name data along with birth year
name = pd.read_csv('name.basics.tsv.gz', sep='\t', na_values='\\N', dtype={'birthYear': float}).set_index('nconst')[['primaryName', 'birthYear']]

# name.head()

In [9]:
def get_pairs(lang=None, min_acted=25, min_pairings=1):
    '''
    Returns an adjacency matrix and actor mapping of actor pairs where:
    - Each actor has acted in at least min_acted films
    - The two actors have acted together in at least min_pairings films
    - And (optionally), belong to a region `lang` (IN, UN, etc)
    '''
    graph = cast
    if lang is not None:
        graph = graph[graph['tconst'].isin(region[region == lang].index)]
        # graph = graph[graph['tconst'].isin(region[region == lang].index)]
    name_freq = graph['nconst'].value_counts()
    top_names = name_freq[name_freq >= min_acted]
    top_actors = graph[graph['nconst'].isin(top_names.index)]

    p = top_actors.copy()
    p['title'] = p['tconst'].astype('category')
    p['name'] = p['nconst'].astype('category')

    row = p['title'].cat.codes.values
    col = p['name'].cat.codes.values
    data = np.ones(len(p), dtype='int')

    matrix = csr_matrix((data, (row, col)))
    square = matrix.T * matrix
    square.setdiag(0)
    square = square.tocoo()

    pairs = pd.DataFrame({
        'row': square.row,
        'col': square.col,
        'n': square.data
    })
    pairs = pairs[pairs.n >= min_pairings].reset_index(drop=True)
    return pairs, name.reindex(p['name'].cat.categories)

def lookup(pairs, cat):
    pairs = pd.concat([
        pairs,
        cat.iloc[pairs.row].reset_index(drop=True),
        cat.iloc[pairs.col].reset_index(drop=True),
    ], axis=1)
    pairs = pairs.drop(columns=['row', 'col'])
    pairs.columns = ['count', 'name1', 'year1', 'name2', 'year2']
    return pairs.sort_values('count', ascending=False)

In [10]:
pairs, cat = get_pairs(lang=REGION, min_acted=1, min_pairings=1)

ForKumu = lookup(pairs, cat)

ForKumu = ForKumu[['name1', 'name2', 'count']]
ForKumu = ForKumu.rename(columns={'name1':'From',
                                  'name2':'To',
                                  'count':'Strength'})

ForKumu.to_excel("pairs.xlsx", index = False)