# Import relevant libraries

In [1]:
import os
import glob
import json
import numpy as np
import pandas as pd

# Load data

In [2]:
# setup data path
base_path = os.path.join('..', 'datasets')
# glob.glob() returns matches in arbitrary, filesystem-dependent order.
# Later cells index this list by position, so sort it to make the order
# deterministic (episode files first, the full-interactions file last).
files_path = sorted(glob.glob(os.path.join(base_path, '*.json')))
files_path

['../datasets/starwars-episode-1-interactions.json',
 '../datasets/starwars-episode-2-interactions.json',
 '../datasets/starwars-episode-3-interactions.json',
 '../datasets/starwars-episode-4-interactions.json',
 '../datasets/starwars-episode-5-interactions.json',
 '../datasets/starwars-episode-6-interactions.json',
 '../datasets/starwars-episode-7-interactions.json',
 '../datasets/starwars-full-interactions.json']

In [3]:
# Sort locally: glob.glob() makes no ordering guarantee, and the code below maps
# position i to episode i+1 and treats the last path as the full-interactions file.
ordered_paths = sorted(files_path)
if not ordered_paths:
    # fail with an explicit message instead of an IndexError further down
    raise FileNotFoundError('No JSON files were found; check `files_path`.')

# load files by episode excluding the full-interactions file
for episode_number, episode_path in enumerate(ordered_paths[:-1], start=1):
    # JSON is UTF-8 by specification; do not depend on the platform default encoding
    with open(episode_path, encoding='utf-8') as json_file:
        var_name = '_'.join(['ep', str(episode_number)])
        # at cell (module) level this is the same namespace vars() returned,
        # spelled explicitly
        globals()[var_name] = json.load(json_file)
        print('A variable `{}` was created!'.format(var_name))

# load full interaction file to get all characters info
with open(ordered_paths[-1], encoding='utf-8') as json_file:
    base = json.load(json_file)
    print('A variable `base` was created!')

A variable `ep_1` was created!
A variable `ep_2` was created!
A variable `ep_3` was created!
A variable `ep_4` was created!
A variable `ep_5` was created!
A variable `ep_6` was created!
A variable `ep_7` was created!
A variable `base` was created!


# Create summary table

In [4]:
# start the summary from the complete character list: keep only the `name`
# column of the `nodes` records found in the full-interactions data
sum_table = pd.DataFrame(base['nodes']).loc[:, ['name']]
sum_table

Unnamed: 0,name
0,QUI-GON
1,NUTE GUNRAY
2,PK-4
3,TC-14
4,OBI-WAN
...,...
105,YOLO ZIFF
106,COLONEL DATOO
107,ELLO ASTY
108,JESS


In [5]:
# create a list of episodes
ep_list = ['ep_1', 'ep_2', 'ep_3', 'ep_4', 'ep_5', 'ep_6', 'ep_7']

# Flag, for every character in the summary table, whether it appears in each episode.
# A membership test is used instead of a left merge followed by fillna(0):
#   * assigning the column by name keeps this cell safe to re-run (merging a second
#     time would create suffixed `ep_N_x` / `ep_N_y` columns);
#   * it can never add rows, even if an episode lists the same name more than once.
for ep in ep_list:
    # names listed under `nodes` in this episode's dictionary
    episode_names = pd.DataFrame(globals()[ep]['nodes'])['name']
    # 1.0 where the character appears in the episode, 0.0 otherwise; kept as float
    # so the values match what the merge + fillna(0) approach produced
    appears_in_episode = sum_table['name'].isin(episode_names).astype(float)
    sum_table = sum_table.assign(**{ep: appears_in_episode})
    print('{} information was added to the summary table'.format(ep))

ep_1 information was added to the summary table
ep_2 information was added to the summary table
ep_3 information was added to the summary table
ep_4 information was added to the summary table
ep_5 information was added to the summary table
ep_6 information was added to the summary table
ep_7 information was added to the summary table


In [6]:
# show the summary table: one row per character, one 0/1 column per episode
sum_table

Unnamed: 0,name,ep_1,ep_2,ep_3,ep_4,ep_5,ep_6,ep_7
0,QUI-GON,1.0,0.0,1.0,0.0,0.0,0.0,0.0
1,NUTE GUNRAY,1.0,1.0,1.0,0.0,0.0,0.0,0.0
2,PK-4,1.0,1.0,0.0,0.0,0.0,0.0,0.0
3,TC-14,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,OBI-WAN,1.0,1.0,1.0,1.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...
105,YOLO ZIFF,0.0,0.0,0.0,0.0,0.0,0.0,1.0
106,COLONEL DATOO,0.0,0.0,0.0,0.0,0.0,0.0,1.0
107,ELLO ASTY,0.0,0.0,0.0,0.0,0.0,0.0,1.0
108,JESS,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [7]:
# build a lookup from each character name to its row of per-episode flags
keys = np.array(sum_table['name'])
# every column except `name`, as a 2-D array with one row per character
values = np.array(sum_table.drop(columns='name'))
sum_dict = dict(zip(keys, values))

In [8]:
# show the mapping from character name to its array of per-episode flags
sum_dict

{'QUI-GON': array([1., 0., 1., 0., 0., 0., 0.]),
 'NUTE GUNRAY': array([1., 1., 1., 0., 0., 0., 0.]),
 'PK-4': array([1., 1., 0., 0., 0., 0., 0.]),
 'TC-14': array([1., 0., 0., 0., 0., 0., 0.]),
 'OBI-WAN': array([1., 1., 1., 1., 1., 1., 0.]),
 'DOFINE': array([1., 0., 0., 0., 0., 0., 0.]),
 'RUNE': array([1., 0., 0., 0., 0., 0., 0.]),
 'TEY HOW': array([1., 0., 0., 0., 0., 0., 0.]),
 'EMPEROR': array([1., 1., 1., 0., 1., 1., 0.]),
 'CAPTAIN PANAKA': array([1., 0., 0., 0., 0., 0., 0.]),
 'SIO BIBBLE': array([1., 1., 0., 0., 0., 0., 0.]),
 'JAR JAR': array([1., 1., 1., 0., 0., 0., 0.]),
 'TARPALS': array([1., 0., 0., 0., 0., 0., 0.]),
 'BOSS NASS': array([1., 0., 0., 0., 0., 0., 0.]),
 'PADME': array([1., 1., 1., 0., 0., 0., 0.]),
 'RIC OLIE': array([1., 0., 0., 0., 0., 0., 0.]),
 'WATTO': array([1., 1., 0., 0., 0., 0., 0.]),
 'ANAKIN': array([1., 1., 1., 0., 0., 1., 0.]),
 'SEBULBA': array([1., 0., 0., 0., 0., 0., 0.]),
 'JIRA': array([1., 0., 0., 0., 0., 0., 0.]),
 'SHMI': array([1., 