In [5]:
!pip install numpy
!pip install pandas
import numpy as np
import pandas as pd

Collecting numpy
  Using cached numpy-1.24.2-cp311-cp311-win_amd64.whl (14.8 MB)
Installing collected packages: numpy
Successfully installed numpy-1.24.2



[notice] A new release of pip available: 22.3.1 -> 23.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip

[notice] A new release of pip available: 22.3.1 -> 23.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting pandas
  Downloading pandas-1.5.3-cp311-cp311-win_amd64.whl (10.3 MB)
     --------------------------------------- 10.3/10.3 MB 36.4 MB/s eta 0:00:00
Collecting pytz>=2020.1
  Downloading pytz-2022.7.1-py2.py3-none-any.whl (499 kB)
     ------------------------------------- 499.4/499.4 kB 32.6 MB/s eta 0:00:00
Installing collected packages: pytz, pandas
Successfully installed pandas-1.5.3 pytz-2022.7.1


In [11]:
# Load the titles
title = pd.read_csv('title.basics.tsv.gz', sep='\t', low_memory=False).set_index('tconst')[['titleType', 'primaryTitle', 'startYear']]
title.head()

Unnamed: 0_level_0,titleType,primaryTitle,startYear
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
tt0000001,short,Carmencita,1894
tt0000002,short,Le clown et ses chiens,1892
tt0000003,short,Pauvre Pierrot,1892
tt0000004,short,Un bon bock,1892
tt0000005,short,Blacksmith Scene,1893


In [12]:
title = title[title['startYear'].str.find('\\N')==-1]

In [13]:
title['startYear'] = title.startYear.values.astype('int')

In [15]:
title = title[title['startYear']>=2000]

In [16]:
# Load the cast of each film
cast = pd.read_csv('title.principals.tsv.gz', sep='\t', low_memory=False)[['tconst', 'nconst', 'category']]
# Only consider actors, not directors, composers, etc. Shrinks data to about 40%
cast = cast[cast.category.isin({'actor', 'actress'})]
cast.head()

Unnamed: 0,tconst,nconst,category
11,tt0000005,nm0443482,actor
12,tt0000005,nm0653042,actor
16,tt0000007,nm0179163,actor
17,tt0000007,nm0183947,actor
21,tt0000008,nm0653028,actor


In [17]:
# Only consider movies, not TV series, etc. Shrinks data to ~5%
movies = title[title['titleType'] == 'movie']
cast = cast[cast['tconst'].isin(movies.index)]
# This is what the network looks like
cast.head()

Unnamed: 0,tconst,nconst,category
80701,tt0011801,nm0459029,actor
80702,tt0011801,nm0681726,actor
80703,tt0011801,nm0692612,actress
80704,tt0011801,nm0726256,actor
80705,tt0011801,nm0776458,actor


In [24]:
# Restrict data to just a single region (e.g. IN, US, etc)
# This loads the region for each title
region = pd.read_csv('title.akas.tsv.gz', sep='\t', low_memory=False).set_index('titleId')['region']
region.head(10)
# new_region = region[region['language'] == 'hi']

titleId
tt0000001    UA
tt0000001    DE
tt0000001    HU
tt0000001    GR
tt0000001    RU
tt0000001    US
tt0000001    \N
tt0000001    JP
tt0000002    \N
tt0000002    FR
Name: region, dtype: object

In [19]:
# Load the name data along with birth year
name = pd.read_csv('name.basics.tsv.gz', sep='\t', na_values='\\N', dtype={'birthYear': float}).set_index('nconst')[['primaryName', 'birthYear']]

In [20]:
name.head()

Unnamed: 0_level_0,primaryName,birthYear
nconst,Unnamed: 1_level_1,Unnamed: 2_level_1
nm0000001,Fred Astaire,1899.0
nm0000002,Lauren Bacall,1924.0
nm0000003,Brigitte Bardot,1934.0
nm0000004,John Belushi,1949.0
nm0000005,Ingmar Bergman,1918.0


In [25]:
from scipy.sparse import csr_matrix

In [29]:
def get_pairs(lang=None, min_acted=1, min_pairings=1):
    '''
    Returns an adjacency matrix and actor mapping of actor pairs where:
    - Each actor has acted in at least min_acted films
    - The two actors have acted together in at least min_pairings films
    - And (optionally), belong to a region `lang` (IN, UN, etc)
    '''
    graph = cast
    if lang is not None:
        graph = graph[graph['tconst'].isin(region[region == lang].index)]
    name_freq = graph['nconst'].value_counts()
    top_names = name_freq[name_freq >= min_acted]
    top_actors = graph[graph['nconst'].isin(top_names.index)]

    p = top_actors.copy()
    p['title'] = p['tconst'].astype('category')
    p['name'] = p['nconst'].astype('category')

    row = p['title'].cat.codes.values
    col = p['name'].cat.codes.values
    data = np.ones(len(p), dtype='int')

    matrix = csr_matrix((data, (row, col)))
    square = matrix.T * matrix
    square.setdiag(0)
    square = square.tocoo()

    pairs = pd.DataFrame({
        'row': square.row,
        'col': square.col,
        'n': square.data
    })
    pairs = pairs[pairs.n >= min_pairings].reset_index(drop=True)
    return pairs, name.reindex(p['name'].cat.categories)

def lookup(pairs, cat):
    pairs = pd.concat([
        pairs,
        cat.iloc[pairs.row].reset_index(drop=True),
        cat.iloc[pairs.col].reset_index(drop=True),
    ], axis=1)
    pairs = pairs.drop(columns=['row', 'col'])
    pairs.columns = ['count', 'name1', 'year1', 'name2', 'year2']
    return pairs.sort_values('count', ascending=False)


In [31]:
pairs, cat = get_pairs(lang='IN', min_acted=1, min_pairings=1)

In [32]:
pairs, cat

(          row    col  n
 0       24366      0  1
 1        2501      0  1
 2         328      0  1
 3        8812      0  1
 4        4258      0  1
 ...       ...    ... ..
 516779  13205  68999  1
 516780  63762  68999  1
 516781  36583  68999  1
 516782  25622  68999  1
 516783  13594  68999  1
 
 [516784 rows x 3 columns],
                primaryName  birthYear
 nm0000002    Lauren Bacall     1924.0
 nm0000008    Marlon Brando     1924.0
 nm0000018     Kirk Douglas     1916.0
 nm0000032  Charlton Heston     1923.0
 nm0000047     Sophia Loren     1934.0
 ...                    ...        ...
 nm9991306   Stella Carlish        NaN
 nm9992850           Wahida        NaN
 nm9993103   Brianna Temple        NaN
 nm9993616  Ryan Mac Lennan        NaN
 nm9993693      Apsara Rani     1996.0
 
 [69000 rows x 2 columns])

In [33]:
ForKumu = lookup(pairs, cat)
ForKumu

Unnamed: 0,count,name1,year1,name2,year2
28869,47,Brahmanandam,1956.0,Mohammad Ali,1968.0
41435,47,Mohammad Ali,1968.0,Brahmanandam,1956.0
98909,36,Sapna Sappu,1980.0,Amit Pachori,1985.0
111173,36,Amit Pachori,1985.0,Sapna Sappu,1980.0
41448,35,Tanikella Bharani,1954.0,Brahmanandam,1956.0
...,...,...,...,...,...
179172,1,Ani Hovhannisyan,,Sylvie de Neef,
179171,1,Jonathan Dumontier,,Sylvie de Neef,
179170,1,Prakashchandra Roy,,Sylvie de Neef,
179169,1,Parambrata Chattopadhyay,1980.0,Sylvie de Neef,


In [34]:
ForKumu = ForKumu[['name1', 'name2', 'count']]
ForKumu = ForKumu.rename(columns={'name1':'From',
                                  'name2':'To',
                                  'count':'Strength'})
ForKumu

Unnamed: 0,From,To,Strength
28869,Brahmanandam,Mohammad Ali,47
41435,Mohammad Ali,Brahmanandam,47
98909,Sapna Sappu,Amit Pachori,36
111173,Amit Pachori,Sapna Sappu,36
41448,Tanikella Bharani,Brahmanandam,35
...,...,...,...
179172,Ani Hovhannisyan,Sylvie de Neef,1
179171,Jonathan Dumontier,Sylvie de Neef,1
179170,Prakashchandra Roy,Sylvie de Neef,1
179169,Parambrata Chattopadhyay,Sylvie de Neef,1


In [35]:
ForKumu.to_excel("pairs.xlsx", index = False)

ModuleNotFoundError: No module named 'openpyxl'