In [18]:
# Prepare data to create a co-appearance network of actors in movies that have an Indian ("IN") region entry and were released from 1990 onward (1990 included).

In [19]:
#importing libraries

import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix

In [20]:
# preprocessing title_akas
# Load only the title ID, region and language columns (all as strings)
# and index the table by title ID.

akas_columns = ["titleId", "region", "language"]
title_akas = pd.read_csv(
    r'/kaggle/input/kumu-bollywood-network/title.akas.tsv.gz',
    sep="\t",
    usecols=akas_columns,
    dtype={column: "str" for column in akas_columns},
    low_memory=True,
)
title_akas = title_akas.set_index('titleId')[["region", "language"]]

In [21]:
# Keep only the alternate-title rows whose region code is "IN".
in_region_mask = title_akas["region"].eq("IN")
title_akas = title_akas[in_region_mask]

In [22]:
# Preview the first rows left after the region filter.
title_akas.head()

Unnamed: 0_level_0,region,language
titleId,Unnamed: 1_level_1,Unnamed: 2_level_1
tt0000302,IN,hi
tt0000302,IN,en
tt0000417,IN,en
tt0002773,IN,en
tt0003311,IN,\N


In [23]:
# Show (row count, column count) of the region-filtered table.
title_akas.shape

(3808257, 2)

In [24]:
# preprocessing title_basics
# Parse only the four columns used downstream instead of loading the whole
# table and discarding the rest. Everything is read as a string, so the "\N"
# missing-value marker in startYear stays a literal string.

title_basics = pd.read_csv(
    r'/kaggle/input/kumu-bollywood-network/title.basics.tsv.gz',
    sep="\t",
    usecols=['tconst', 'titleType', 'primaryTitle', 'startYear'],
    dtype="str",
    low_memory=False,
)[['tconst', 'titleType', 'primaryTitle', 'startYear']]

In [25]:
# Keep titles of type 'movie' that start in 1990 or later and whose ID
# appears in the region-filtered title_akas index.
title_basics = title_basics[title_basics['titleType'] == 'movie']

# startYear holds strings. Compare years as numbers rather than
# lexicographically: non-numeric entries such as the "\N" placeholder become
# NaN and fail the comparison, so no separate placeholder filter is needed.
# The startYear column itself is left unchanged.
release_year = pd.to_numeric(title_basics['startYear'], errors='coerce')
title_basics = title_basics[release_year >= 1990]

title_basics = title_basics[title_basics['tconst'].isin(title_akas.index)]

In [26]:
# Index the filtered titles by their title ID (tconst).
title_basics = title_basics.set_index(keys='tconst', drop=True)

In [27]:
# Preview the first rows of the filtered, tconst-indexed titles.
title_basics.head()

Unnamed: 0_level_0,titleType,primaryTitle,startYear
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
tt0081721,movie,Vincent and Me,1990
tt0092024,movie,El sueño de Tánger,1991
tt0092507,movie,Abhimanyu,1991
tt0096775,movie,Agneekaal,1990
tt0096786,movie,Alienator,1990


In [28]:
# preprocessing cast members
# Parse only the three columns used downstream (title ID, person ID, credit
# category) instead of loading every column and discarding the rest.

title_principals = pd.read_csv(
    r'/kaggle/input/kumu-bollywood-network/title.principals.tsv.gz',
    sep="\t",
    usecols=["tconst", "nconst", "category"],
    dtype="str",
    low_memory=True,
)[["tconst", "nconst", "category"]]

In [29]:
# Keep acting credits only ...
acting_credit = title_principals["category"].isin({"actor", "actress"})
title_principals = title_principals[acting_credit]

# ... and only those attached to a title that survived the title_basics filters.
in_selected_title = title_principals["tconst"].isin(title_basics.index)
title_principals = title_principals[in_selected_title]

In [30]:
# Preview the first remaining actor/actress credits.
title_principals.head()

Unnamed: 0,tconst,nconst,category
694236,tt0081721,nm0001409,actor
694237,tt0081721,nm0678113,actress
694238,tt0081721,nm0286612,actor
694239,tt0081721,nm0459325,actor
785355,tt0092024,nm0601377,actor


In [31]:
# Independent copy of the credits, indexed by person ID (nconst).
cast_index = title_principals.copy()
cast_index = cast_index.set_index("nconst")

In [32]:
#preprocessing names
# Parse only the person ID and display-name columns instead of loading the
# whole table and discarding the rest.

names_basics = pd.read_csv(
    r'/kaggle/input/kumu-bollywood-network/name.basics.tsv.gz',
    sep="\t",
    usecols=["nconst", "primaryName"],
    dtype="str",
    low_memory=True,
)[["nconst", "primaryName"]]

In [33]:
# Keep only people who appear in the filtered cast, then index by person ID.
is_cast_member = names_basics["nconst"].isin(cast_index.index)
names_basics = names_basics[is_cast_member].set_index("nconst")

In [34]:
# Preview the first rows of the nconst-indexed name lookup.
names_basics.head()

Unnamed: 0_level_0,primaryName
nconst,Unnamed: 1_level_1
nm0000002,Lauren Bacall
nm0000008,Marlon Brando
nm0000018,Kirk Douglas
nm0000029,Margaux Hemingway
nm0000032,Charlton Heston


In [35]:
#creating network
# Build an actor-by-actor co-appearance table: one row per ordered pair of
# actors, with the number of selected movies in which both are credited.

# NOTE(review): `frequency` is not read by any cell visible here; it is kept
# in case a later cell depends on it.
frequency = title_principals.value_counts()

# Use one row per distinct (movie, actor) credit. csr_matrix sums repeated
# coordinates, so a repeated credit would otherwise store a 2 in the
# incidence matrix and inflate that actor's co-appearance counts.
graph = title_principals.drop_duplicates(subset=["tconst", "nconst"]).copy()
graph["title"] = graph["tconst"].astype("category")
graph["name"] = graph["nconst"].astype("category")

# Category codes double as matrix coordinates: rows are movies, columns are actors.
row = graph['title'].cat.codes.values
col = graph['name'].cat.codes.values
data = np.ones(len(graph), dtype='int')

matrix = csr_matrix((data, (row, col)))

# (actors x movies) @ (movies x actors): entry (i, j) is the number of movies
# shared by actors i and j. `@` is always matrix multiplication, whereas `*`
# depends on the sparse container type.
square = matrix.T @ matrix

# The diagonal holds each actor's own movie count. setdiag(0) only overwrites
# the stored values, so the zeros must be pruned explicitly; otherwise
# tocoo() would emit a zero-count self-pair for every actor.
square.setdiag(0)
square.eliminate_zeros()
square = square.tocoo()

# Names aligned with the actor category codes; actors missing from
# names_basics come back as NaN.
cat = names_basics.reindex(graph['name'].cat.categories)
actor_names = cat["primaryName"].to_numpy()

# The matrix is symmetric, so every pair is listed in both directions
# (A -> B and B -> A).
pairs = pd.DataFrame({
    'name1': actor_names[square.row],
    'name2': actor_names[square.col],
    'count': square.data,
})
pairs = pairs.sort_values('count', ascending=False)

In [36]:
# Rename the pair columns to the From / To / Strength headers used for the Kumu export.
kumu_column_names = {
    'name1': 'From',
    'name2': 'To',
    'count': 'Strength',
}
ForKumu = pairs.rename(columns=kumu_column_names)

In [37]:
# Preview the strongest connections (the table is sorted by count, descending).
ForKumu.head()

Unnamed: 0,From,To,Strength
39670,Brahmanandam,Mohammad Ali,83
58894,Mohammad Ali,Brahmanandam,83
53861,Brahmanandam,Tanikella Bharani,55
58909,Tanikella Bharani,Brahmanandam,55
28235,Brahmanandam,Kota Srinivasa Rao,46


In [38]:
# Take the first 10000 rows of the connection table as an independent copy.
sample = ForKumu.iloc[:10000].copy()

In [39]:
# Preview the first rows of the sample.
sample.head()

Unnamed: 0,From,To,Strength
39670,Brahmanandam,Mohammad Ali,83
58894,Mohammad Ali,Brahmanandam,83
53861,Brahmanandam,Tanikella Bharani,55
58909,Tanikella Bharani,Brahmanandam,55
28235,Brahmanandam,Kota Srinivasa Rao,46


In [40]:
# Count how many sampled connections list each name in the From column.
sample["From"].value_counts()

Brahmanandam         171
Kiran Kumar           67
Jagathi Sreekumar     66
Rajan P. Dev          66
Shakti Kapoor         66
                    ... 
John Waterhouse        1
Shagufta Ali           1
Vimal                  1
Satya Krishnan         1
Purabi Sharma          1
Name: From, Length: 1900, dtype: int64

In [41]:
# Write the sample to the working directory as CSV and as an Excel workbook,
# leaving out the DataFrame index in both files.
sample.to_csv(path_or_buf="pairs.csv", index=False)
sample.to_excel(excel_writer="pairs.xlsx", index=False)