# Setup:

You can ignore pretty much all of this.

In [1]:
"""
Preable for most code and jupyter notebooks
@author: tobinsouth
"""

import numpy as np, pandas as pd

import math, string, nltk, re, pickle, time

from nltk.tokenize import RegexpTokenizer

%config InlineBackend.figure_formats = ['svg']
%matplotlib inline
import matplotlib.pyplot as plt

from IPython.display import set_matplotlib_formats
set_matplotlib_formats('pdf', 'svg')
plt.rcParams['savefig.dpi'] = 150

plt.rcParams['figure.autolayout'] = False
plt.rcParams['figure.figsize'] = 10, 6
plt.rcParams['axes.labelsize'] = 18
plt.rcParams['axes.titlesize'] = 20
plt.rcParams['font.size'] = 16
plt.rcParams['lines.linewidth'] = 2.0
plt.rcParams['lines.markersize'] = 8
plt.rcParams['legend.fontsize'] = 14

plt.rcParams['text.usetex'] = True
plt.rcParams['font.family'] = "serif"

In [2]:
import requests
from bs4 import BeautifulSoup
from collections import Counter



# Downloading

## Getting a list of all marvel movie scripts

In [3]:
url_marvel = "https://transcripts.fandom.com/wiki/Category:Marvel_Transcripts"

In [4]:
response = requests.get(url_marvel)

In [5]:
soup = BeautifulSoup(response.text, "html.parser")

In [6]:
url_endings = []
for link in soup.select('.category-page__member-link'):
    ending = link.get('href')
    if "Category" not in ending:
        url_endings.append(ending)

## Scraping Infinity Saga Transcripts

In [3]:
normal_list = [ # Commented Code Requires Special Work
 '/wiki/Ant-Man',
#  '/wiki/Ant-Man_and_the_Wasp',
#  '/wiki/The_Avengers',
 '/wiki/Avengers:_Age_of_Ultron',
#  '/wiki/Avengers:_Endgame',
 '/wiki/Avengers:_Infinity_War',
#  '/wiki/Black_Panther',
 '/wiki/Captain_America:_Civil_War',
 '/wiki/Captain_America:_The_First_Avenger',
 '/wiki/Captain_America:_The_Winter_Soldier',
 '/wiki/Captain_Marvel_(2019)',
#  '/wiki/Doctor_Strange',
 '/wiki/Guardians_of_the_Galaxy',
#  '/wiki/Guardians_of_the_Galaxy_Vol._2',
 '/wiki/Spider-Man:_Homecoming',
#  '/wiki/Iron_Man',
 '/wiki/Iron_Man_2',
#  '/wiki/Iron_Man_3',
#  '/wiki/The_Incredible_Hulk',
#  '/wiki/Thor',
#  '/wiki/Thor:_Ragnarok',
 '/wiki/Thor:_The_Dark_World',]

gray_format = [
     '/wiki/Thor',
     '/wiki/Thor:_Ragnarok',
]

others = [
 '/wiki/The_Incredible_Hulk',
 '/wiki/Guardians_of_the_Galaxy_Vol._2',
 '/wiki/Iron_Man',
 '/wiki/Doctor_Strange',
 '/wiki/Black_Panther',
 '/wiki/Avengers:_Endgame',
]

# WHERE STUFF CHANGES::

# Using "Name: " (The old method)

This is the original method and still works for most of the movies. I'm made minor corrections later on but this hasn't changed.

In [4]:
def get_char_seq(url):
    """ 
    Function to scrape the speaker from transcipts on 
    https://transcripts.fandom.com/
    Transcript have to have lines starting with a chacrter name followed by a ':'.
    Does not work on some scripts.
    @param url: The exact url of the transcript needing scraping.
    """
    response = requests.get(url)

    soup = BeautifulSoup(response.text, "html.parser")

    body = soup.select('#mw-content-text')[0]
    lines = body.text.split('\n')

    script_characters = []
    for line in lines:
        line = line.replace(u'\xa0', u'') # Removes non-breaking spaces, which was causing issues
        clean_line = re.sub(r"\[(.*)\]", " ", line) # Remove [stuff] which is usual extra content
        clean_line = re.sub(r"\((.*)\)", " ", clean_line) # Remove (stuff)
        start_matches = re.findall( "^(.*?):", clean_line) 
        
        if len(start_matches) == 1: # This many deals with the case that there are no matches
            finding = start_matches[0]
            finding = finding.lower()
            finding = finding.lstrip().rstrip() # This is an alternative if you dont wnt to remove all spaces
#             finding = ''.join([i for i in finding if i.isalpha()])
            if len(finding) < 21 and len(finding)!=0:
                script_characters.append(finding)
            
    script_characters = script_characters[2:] # Remove 'Next Transcript' and 'Previous Transcript'
    
    return script_characters

Running function on the 'normal list'

In [5]:
counts = {} # Stores a list of tuples with (name, amount of lines)
seqs = {} # Stores the sequence of script allocations in order
for url in normal_list:
    long_url = "https://transcripts.fandom.com/"+url
    seq = get_char_seq(long_url)
    counts[url] = (Counter(seq).most_common())
    seqs[url[6:]] =  seq

# Now things get different.

# 'Gray Box' Scripts

Some scripts are in a different but standard format where all the text is surrounded by gray boxes. This is scraped for two Thor movies below.

## Thor 1

Url to demonstrate style:
https://transcripts.fandom.com/wiki/Thor

In [6]:
url = "https://transcripts.fandom.com/wiki/Thor"

In [7]:
response = requests.get(url)

soup = BeautifulSoup(response.text, "html.parser")

body = soup.select('pre')

# Get the first line of each gray box
first_lines = [gray_bit.text.split('\n')[0] for gray_bit in body]

# Keep only lines that have the word at the center (based on format)
centred_lines = [line  for line in first_lines if line[0:24] == "                        "] 

remove_brackets = [re.sub(r"\((.*)\)", " ", line).lstrip()  for line in centred_lines]

# Remove location lines that have '-' in them.
clean_lines = [line for line in remove_brackets if line != '' and '-' not in line]

# Clean up text
thor_seqence = [''.join([i for i in name if i.isalpha()]) for name in clean_lines]

seqs['Thor'] = thor_seqence

## Ragnarok

Url: https://transcripts.fandom.com/wiki/Thor:_Ragnarok

In [8]:
url = "https://transcripts.fandom.com/wiki/Thor:_Ragnarok"

response = requests.get(url)

soup = BeautifulSoup(response.text, "html.parser")

body = soup.select('pre')

# Get the first line of each gray box
first_lines = [gray_bit.text.split('\n')[0] for gray_bit in body]

# Find lines that have mostly spaces as there indicate the was something placed at the centre. 
attempted_center = [line for line in first_lines if line.count(' ') / len(line) > 0.50]

# Removing the lines that said 'DRAFT' as these looked like attributions but were not
no_drafts = [line for line in attempted_center if 'DRAFT' not in line and 'OMITTED' not in line]

# Remove brackets and long spaces on either side of text
clean_lines = [re.sub(r"\((.*)\)", " ", line).lstrip().rstrip() for line in no_drafts]

# Clean up text
ragnarok_seqence = [''.join([i for i in name if i.isalpha()]) for name in clean_lines]

seqs['Thor:_Ragnarok'] = ragnarok_seqence

# Getting Weirder

We have 6 movies left. Iron Man, Black Panther, Avengers, Doctor Strange, Ant-Man_and_the_Wasp, Guardians of the Galaxy Vol 2.

All of these have weird formatting on the fandom. For example they will start out as expected but break as you get further in. Have a look to understand.

https://transcripts.fandom.com//wiki/Guardians_of_the_Galaxy_Vol._2

https://transcripts.fandom.com//wiki/Doctor_Strange

https://transcripts.fandom.com//wiki/Ant-Man_and_the_Wasp


------
I've found actual script for some movies from:
    www.scriptslug.com/

Guardians of the Galaxy Vol 2, Black Panther and Avengers have downloaded script at '../Data/ActualScripts/'

----

## So to summarise the remaining issues

Iron Man has no good data:
    https://transcripts.fandom.com//wiki/Iron_Man
  
Doctor Strange has no good data:
    https://transcripts.fandom.com//wiki/Doctor_Strange
    
The Incredible Hulk has no good data at:
    https://transcripts.fandom.com/wiki/The_Incredible_Hulk
**BUT** I have managed to download an actual copy of the script but I'm still working on it.
 

Iron Man 3 is actually an incomplete script:
    https://transcripts.fandom.com//wiki/Iron_Man_3
    
Ant Man and the Wasp is incomplete.
    https://transcripts.fandom.com//wiki/Ant-Man_and_the_Wasp


# PDF Processing
-----
-----
Using the package `textract` to process pdf text
    
Non-trivial installation:
https://textract.readthedocs.io/en/latest/installation.html

In [9]:
import textract

## GUARDIANS OF THE GALAXY VOL. 2

In [11]:
text = textract.process("../Data/ActualScripts/guardians-of-the-galaxy-vol-2-2017.pdf")
text = text.decode("utf-8") 
remove_brackets = [re.sub(r"\((.*)\)", " ", line) for line in text.split('\n')]
best_guess = []
for i, line in enumerate(remove_brackets):
    if line.isupper():
        if not remove_brackets[i+1].isupper():
            best_guess.append(line)
no_other_char = [line.lstrip().rstrip() for line in best_guess if '-' not in line and '.' not in line and '!' not in line]
only_short = [line for line in no_other_char if len(line) < 20]
remove_end = only_short[2:-1]

In [12]:
seqs['Guardians_of_the_Galaxy_Vol._2'] = remove_end

# The Avengers


https://transcripts.fandom.com//wiki/The_Avengers

In [23]:
text = textract.process("../Data/ActualScripts/the-avengers-2012.pdf")
text = text.decode("utf-8") 
remove_brackets = [re.sub(r"\((.*)\)", " ", line) for line in text.split('\n')]
remove_empty_lines = [line for line in remove_brackets if line.rstrip() != '']

In [24]:
best_guess = []
for i, line in enumerate(remove_empty_lines):
    if line.isupper():
        if not remove_brackets[i+1].isupper():
            best_guess.append(line)

In [25]:
no_other_char = [line.lstrip().rstrip() for line in best_guess 
                 if '-' not in line and '.' not in line and 
                 '!' not in line and ':' not in line and
                ';' not in line]
only_short = [line for line in no_other_char if len(line) < 20]

In [26]:
final_lines = only_short

In [27]:
seqs['The_Avengers'] = final_lines

## Black Panther

https://transcripts.fandom.com//wiki/Black_Panther

In [77]:
text = textract.process("../Data/ActualScripts/black-panther-2018.pdf")
text = text.decode("utf-8") 

In [78]:
remove_brackets = [re.sub(r"\((.*)\)", " ", line) for line in text.split('\n')]
remove_empty_lines = [line for line in remove_brackets if line.rstrip() != '']

In [79]:
best_guess = []
for i, line in enumerate(remove_empty_lines):
    if line.isupper():
        if not remove_brackets[i+1].isupper():
            best_guess.append(line)
            
no_other_char = [line.lstrip().rstrip() for line in best_guess if '-' not in line and '.' not in line and '!' not in line]
only_short = [line for line in no_other_char if len(line) < 20]


Making a decision to only keep strings that have been used more than once. 

In [80]:
def simple_string(s):
    return ''.join([i for i in s if i.isalpha()])

In [81]:
counts_dict = Counter([simple_string(s) for s in only_short])

In [82]:
counts_dict.most_common()[::-1][0:25]

[('CHILDREN', 1),
 ('TAG', 1),
 ('FRENCHAMBASSADOR', 1),
 ('AFRENCHAMBASSADOR', 1),
 ('CUTTOTITLE', 1),
 ('SIXKIDS', 1),
 ('BROKEN', 1),
 ('GIOT', 1),
 ('AMASSIVE', 1),
 ('SHURIS', 1),
 ('SPEAR', 1),
 ('SHURINAKIA', 1),
 ('DORA', 1),
 ('ARMORED', 1),
 ('BORDERTRIBE', 1),
 ('OFTHEJABARIARMY', 1),
 ('NJOBUSJOURNAL', 1),
 ('INTHESTANDS', 1),
 ('BLADE', 1),
 ('ONSHURI', 1),
 ('INTHESEATINGAREA', 1),
 ('SHURISPROJECTION', 1),
 ('SEVERALAFRICAN', 1),
 ('ABANDONED', 1),
 ('AFTERNOON', 1)]

In [83]:
final_lines = [line for line in only_short if counts_dict[simple_string(line)]>1]

In [86]:
final_lines

['BLACKPANTHER',
 'SON',
 'FATHER',
 'SON',
 'FATHER',
 'SON',
 'FATHER',
 'FATHER',
 'FATHER',
 'FATHER',
 'FATHER',
 'SON',
 'FATHER',
 'SON',
 "N'JOBU",
 'JAMES',
 "N'JOBU",
 'JAMES',
 "N'JOBU",
 'JAMES',
 "N'JOBU",
 'DORAMILAJE 1',
 "N'JOBU",
 'DORAMILAJE 1',
 "N'JOBU",
 "YOUNG T'CHAKA",
 "N'JOBU",
 "YOUNGT ' CHAKA",
 "N'JOBU",
 "YOUNGT'CHAKA",
 "YOUNGT'CHAKA",
 "YOUNGT'CHAKA",
 "N'JOBU",
 "YOUNGT'CHAKA",
 "N' JOBU",
 "YOUNGT' CHAKA",
 'JAMES',
 "N'JOBU",
 "N'JOBU",
 'JAMES',
 "N'JOBU",
 "YOUNGT'CHAKA",
 "YOUNGT'CHAKA",
 'OKOYE',
 "T'CHALLA",
 'OKOYE',
 "T'CHALLA",
 'MILITANT LEADER',
 'MILITANT LEADER',
 'MILITANT LEADER',
 'NAKIA',
 "T'CHALLA",
 'OKOYE',
 'NAKIA',
 "T'CHALLA",
 "T'CHALLA",
 'NAKIA',
 'OKOYE',
 'OKOYE',
 "T'CHALLA",
 'NAKIA',
 'RAMONDA',
 'OKOYE',
 'AYO',
 'OKOYE',
 "T' CHALLA",
 "T'CHALLA",
 "T'CHALLA",
 "T' CHALLA",
 "T'CHALLA",
 'RAMONDA',
 "T'CHALLA",
 'RAMONDA',
 'MUSEUMDIRECTOR',
 'KILLMONGER',
 'MUSEUMDIRECTOR',
 'KILLMON',
 'GER',
 'KILLMONGER',
 'MUSEUMDI

In [84]:
seqs['Black_Panther'] = final_lines

# Fixing Little Things

In [39]:
print(seqs['Captain_America:_Civil_War'][-2:])
seqs['Captain_America:_Civil_War'] = seqs['Captain_America:_Civil_War'][:-2]

['↑ https', '↑ http']


In [40]:
print(seqs['Captain_America:_The_First_Avenger'][-2:])
seqs['Captain_America:_The_First_Avenger'] = seqs['Captain_America:_The_First_Avenger'][:-1]

['nick fury', '↑ https']


In [41]:
print(seqs['Captain_America:_The_Winter_Soldier'][-4:])
seqs['Captain_America:_The_Winter_Soldier'] = seqs['Captain_America:_The_Winter_Soldier'][:-4]

['list', 'list', 'list', 'list']


In [42]:
print(seqs['Thor:_The_Dark_World'][:1])
seqs['Thor:_The_Dark_World'] = seqs['Thor:_The_Dark_World'][1:]

['captain america']


Remove 'ger'

In [97]:
seqs['Black_Panther'] = [line for line in seqs['Black_Panther'] if ''.join([i for i in line.lower() if i.isalpha()]) != 'ger' ]

In [98]:
[(k,len(v)) for k,v in seqs.items()]

[('Ant-Man', 865),
 ('Avengers:_Age_of_Ultron', 975),
 ('Avengers:_Infinity_War', 991),
 ('Captain_America:_Civil_War', 982),
 ('Captain_America:_The_First_Avenger', 619),
 ('Captain_America:_The_Winter_Soldier', 822),
 ('Captain_Marvel_(2019)', 686),
 ('Guardians_of_the_Galaxy', 576),
 ('Spider-Man:_Homecoming', 1558),
 ('Iron_Man_2', 1006),
 ('Thor:_The_Dark_World', 732),
 ('Thor', 873),
 ('Thor:_Ragnarok', 970),
 ('Guardians_of_the_Galaxy_Vol._2', 956),
 ('The_Avengers', 830),
 ('Black_Panther', 728)]

# Using Matt's Alias'

In [99]:
matt_git_csv = 'https://raw.githubusercontent.com/mroughan/AlephZeroHeroesData/master/MarvelCinematicUniverse/alias_list.csv'
matts_alias = pd.read_csv(matt_git_csv)

In [100]:
name_to_alias = {}
alias_to_name = {}
for i, row in matts_alias.iterrows():
    alias_list = row.Aliases.split(',')
    name_to_alias[row.Character] = alias_list
    
    # More importantly, to map between my data and Matt's
    for alias in alias_list:
        steralised_alias = ''.join([i for i in alias.lower() if i.isalpha()])
        alias_to_name[steralised_alias] = row.Character

## Adding to Matts Database

In [101]:
alias_to_name['peter'] = 'Spider-Man'
alias_to_name['tony'] = 'Iron Man'
alias_to_name['carol'] = 'Captain Marvel'
alias_to_name['pepper'] = 'Pepper Potts'
alias_to_name['toomes'] = 'Vulture'
alias_to_name['rocketraccoon'] = 'Rocket' # Note the double 'c'
alias_to_name['justin'] = 'Justin Hammer'
alias_to_name['scott'] = 'Ant-Man'
alias_to_name['natasha'] = 'Black Widow'
alias_to_name['natalie'] = 'Black Widow'
alias_to_name['hank'] = 'Pym'
alias_to_name['steve'] = 'Captain America'

alias_to_name['karen'] = 'Karen' # Spidey suit AI
alias_to_name['suitlady'] = 'Karen'

alias_to_name['may'] = 'May Parker' # Aunt May
alias_to_name['mason'] = 'Phineas Mason' # Spider-man tinkerer
alias_to_name['ivan'] = 'Whiplash'
alias_to_name['maria'] = 'Maria Hill' # Could be 'Maria Stark' but less likely to have dialogue

alias_to_name['talos'] = 'Talos'
alias_to_name['sohlarr'] = 'Talos' # Soh-Larr for you database
alias_to_name['keller'] = 'Talos'  # Keller
alias_to_name['generaltalos'] = 'Talos'
alias_to_name['christine'] = 'Dr. Christine Palmer' # Either 'Christine Everhart' or 'Dr. Christine Palmer'
alias_to_name['stern'] = 'Senator Stern' # 'Dr. Samuel Sterns' or 'Senator Stern' by we don't have Hulk so..
alias_to_name['nakia'] = 'Nakia' # From black panther
alias_to_name['aaron'] = 'Aaron Davis' # Also 'Prowler' or 'The Prowler' for your database
alias_to_name['ianboothby'] = 'Ian Boothby' # Also The Intern, Mr. Selvig, from Thor: Dark World
alias_to_name['monica'] = 'Monica Rambeau' # Daughter in Captain Marvel
alias_to_name['brice'] = 'Shocker 1'
alias_to_name['michelle'] = 'Michelle Jones'
alias_to_name['banner'] = 'Hulk'
alias_to_name['corpsmandey'] = 'Rhomann Dey'
alias_to_name['eitri'] = 'Eitri'
alias_to_name['lawson'] = 'Mar-Vell' # Also Wendy Lawson
alias_to_name['secretaryross'] = 'Thaddeus Ross'
alias_to_name['darrencross'] = 'Darren Cross'
alias_to_name['liz'] = 'Liz Allan' # Brand new in your database, from villians daughter Spiderman 

# New
alias_to_name['strange'] = 'Doctor Strange'
alias_to_name['jane'] = 'Jane Foster'
alias_to_name['killmon'] = 'Killmonger'
alias_to_name['ger'] = ''
alias_to_name['onokoye'] = 'Okoye'
alias_to_name['onnakia']='Nakia'

## Using Alias

In [102]:
clean_seqs = {}
failed_names = []
for movie_name, seq in seqs.items():
    new_seq = [] # Make a new array to preserve data in memory
    for line in seq:
        line = ''.join([i for i in line.lower() if i.isalpha()])
        if line in alias_to_name:
            new_seq.append(alias_to_name[line])
        else:
            new_seq.append(line)
            failed_names.append(line) # Save everything that we don't catch to update Matt's list
    clean_seqs[movie_name] = new_seq

In [103]:
Counter(failed_names).most_common()

[('klaue', 44),
 ('barton', 32),
 ('richard', 24),
 ('ayesha', 22),
 ('ramonda', 20),
 ('shieldagent', 19),
 ('njobu', 19),
 ('director', 15),
 ('securityguard', 15),
 ('newsanchor', 14),
 ('furyscar', 14),
 ('frenchpirate', 13),
 ('shieldcomputer', 13),
 ('doris', 13),
 ('policeofficer', 13),
 ('man', 12),
 ('actorthor', 12),
 ('councilman', 12),
 ('dale', 11),
 ('vasilykarpov', 11),
 ('senatorbrandt', 11),
 ('minnerva', 11),
 ('novaprime', 11),
 ('charles', 11),
 ('actorloki', 11),
 ('dumdumdugan', 10),
 ('abe', 10),
 ('omitted', 10),
 ('councilmanrockwell', 9),
 ('doctor', 9),
 ('newsreporter', 9),
 ('denariansaal', 9),
 ('quinjet', 9),
 ('intercomvoice', 9),
 ('gabejones', 8),
 ('jimmorita', 8),
 ('bronchar', 8),
 ('skrullscientist', 8),
 ('admissionsnurse', 8),
 ('soothingvoice', 8),
 ('monstrousravager', 8),
 ('father', 8),
 ('griot', 8),
 ('towerkeeper', 7),
 ('fdoctor', 7),
 ('general', 7),
 ('comtech', 7),
 ('marshal', 7),
 ('mysteriousman', 7),
 ('museumdirector', 7),
 ('oakl

## Saving to CSVs

In [108]:
for name, seq in clean_seqs.items():
    series = pd.Series(seq)
    series.to_csv('../Data/MarvelScripts/'+name+'.csv')

## Making new counts

In [104]:
counts = {}
for name, seq in clean_seqs.items():
    counts[name] = (Counter(seq).most_common())

# Calculations 

## Running Shannon Entropy

In [106]:
shannon_entropys = {}
for name, count_list in counts.items():
    total = np.sum([person[1] for person in count_list])
    shannon_entropy = 0
    for person in count_list:
        p = person[1]/total
        shannon_entropy+= p*math.log(p,2)
        
    shannon_entropys[name] = -shannon_entropy

pd.Series(shannon_entropys)

Ant-Man                                3.401423
Avengers:_Age_of_Ultron                3.972530
Avengers:_Infinity_War                 4.673448
Captain_America:_Civil_War             4.162912
Captain_America:_The_First_Avenger     4.115598
Captain_America:_The_Winter_Soldier    3.915047
Captain_Marvel_(2019)                  3.512934
Guardians_of_the_Galaxy                3.654280
Spider-Man:_Homecoming                 3.893126
Iron_Man_2                             3.400770
Thor:_The_Dark_World                   3.815437
Thor                                   3.935075
Thor:_Ragnarok                         3.379268
Guardians_of_the_Galaxy_Vol._2         3.624661
The_Avengers                           4.065723
Black_Panther                          4.240313
dtype: float64

## Diversity Index

" [Diversity index](https://en.wikipedia.org/wiki/Diversity_index) is a quantitative measure that reflects how many different types (such as species) there are in a dataset (a community), and simultaneously takes into account how evenly the basic entities (such as individuals) are distributed among those types. "

$$
^{q} D=\frac{1}{M_{q-1}} = \left(\sum_{i=1}^{R} p_{i}^{q}\right)^{1 /(1-q)}
$$

with $M_{q-1}$ as weighted generalized mean with exponent q-1

$$
^{1} D=\frac{1}{\prod_{i=1}^{R} p_{i}^{p_{i}}}=\exp \left(-\sum_{i=1}^{R} p_{i} \ln \left(p_{i}\right)\right)
$$


"In other domains, this ($^{1} D$) statistic is also known as the perplexity."

In [107]:
diversities = {} # Using D1, we just take exp(shannon entropy)
for name, en in shannon_entropys.items():
    diversities[name]  = np.power(2,en)
    
pd.Series(diversities)

Ant-Man                                10.566478
Avengers:_Age_of_Ultron                15.698235
Avengers:_Infinity_War                 25.518089
Captain_America:_Civil_War             17.912712
Captain_America:_The_First_Avenger     17.334781
Captain_America:_The_Winter_Soldier    15.085048
Captain_Marvel_(2019)                  11.415594
Guardians_of_the_Galaxy                12.590645
Spider-Man:_Homecoming                 14.857569
Iron_Man_2                             10.561698
Thor:_The_Dark_World                   14.078649
Thor                                   15.295916
Thor:_Ragnarok                         10.405453
Guardians_of_the_Galaxy_Vol._2         12.334790
The_Avengers                           16.745749
Black_Panther                          18.899979
dtype: float64

## Entropy Rates

In [109]:
def hhat(list_of_string):
    """ Non parametric entropy rate estimator for a single random process. """
    word_set = set(list_of_string)

    ref = dict(zip(word_set, list(range(len(word_set)))))

    data = [ref[w] for w in list_of_string]

    N = len(data)
    Lambdas = 0
    pre_string = "  "  # Create a string containing the previous words
    for i in range(N):  
        count = 0
        string_in = True 
        while string_in and i+count<N: # Loop through each possible subsequence until break
            count+=1
            string_in = " " + " ".join([str(i) for i in data[i:i+count]]) + " " in pre_string  # Use 'in' to check
        Lambdas += count
        pre_string += "%s  " % data[i]  # Append to previous words varible
    return N*math.log(N,2)/Lambdas

In [110]:
entropy_rates = {}
for name, seq in seqs.items():
    entropy_rates[name] = hhat(seq)
    
pd.Series(entropy_rates)

Ant-Man                                4.987838
Avengers:_Age_of_Ultron                5.068600
Avengers:_Infinity_War                 5.118405
Captain_America:_Civil_War             5.102283
Captain_America:_The_First_Avenger     4.919006
Captain_America:_The_Winter_Soldier    5.031240
Captain_Marvel_(2019)                  4.874462
Guardians_of_the_Galaxy                4.745622
Spider-Man:_Homecoming                 5.453247
Iron_Man_2                             5.116910
Thor:_The_Dark_World                   4.905276
Thor                                   5.025968
Thor:_Ragnarok                         5.068028
Guardians_of_the_Galaxy_Vol._2         5.056212
The_Avengers                           5.014631
Black_Panther                          4.979622
dtype: float64

# Combining Results

Making a dataframe of my work

In [111]:
df = pd.DataFrame()
df['entropy_rate'] = pd.Series(entropy_rates)
df['diversity'] = pd.Series(diversities)
df['shannon_entropy'] = pd.Series(shannon_entropys)

Getting Matt's meta data

In [112]:
matt_imdb_meta = pd.read_csv(
    'https://raw.githubusercontent.com/mroughan/AlephZeroHeroesData/master/MarvelCinematicUniverse/imdb_data.csv')

### Aligning DataFrame Index

In [113]:
df['Transcript URL Tail'] = df.index

In [114]:
df['Index'] = [''.join([i for i in d.lower() if i.isalnum()]) for d in df.index]
df['Index']= ['captainmarvel' if 'captainmarvel' in i else i for i in df.Index ]

In [115]:
index_matt = [''.join([i for i in d.lower() if i.isalnum()]) for d in matt_imdb_meta.Title]
matt_imdb_meta['Index'] =  index_matt

### Merging Dataframes

In [116]:
df = pd.merge(df, matt_imdb_meta, on = 'Index')

In [117]:
df

Unnamed: 0,entropy_rate,diversity,shannon_entropy,Transcript URL Tail,Index,Title,ReleaseYear,IMDBid,Runtime,imdbRating,imdbVotes,Metascore,Released,BoxOffice
0,4.987838,10.566478,3.401423,Ant-Man,antman,Ant-Man,2015.0,tt0478970,117.0,7.3,492948.0,64.0,2015-07-17,138002223.0
1,5.0686,15.698235,3.97253,Avengers:_Age_of_Ultron,avengersageofultron,Avengers: Age of Ultron,2015.0,tt2395427,141.0,7.3,654822.0,66.0,2015-05-01,429113729.0
2,5.118405,25.518089,4.673448,Avengers:_Infinity_War,avengersinfinitywar,Avengers: Infinity War,2018.0,tt4154756,149.0,8.5,641255.0,68.0,2018-04-27,664987816.0
3,5.102283,17.912712,4.162912,Captain_America:_Civil_War,captainamericacivilwar,Captain America: Civil War,2016.0,tt3498820,147.0,7.8,574126.0,75.0,2016-05-06,408080554.0
4,4.919006,17.334781,4.115598,Captain_America:_The_First_Avenger,captainamericathefirstavenger,Captain America: The First Avenger,2011.0,tt0458339,124.0,6.9,662101.0,66.0,2011-07-22,176636816.0
5,5.03124,15.085048,3.915047,Captain_America:_The_Winter_Soldier,captainamericathewintersoldier,Captain America: The Winter Soldier,2014.0,tt1843866,136.0,7.8,655267.0,70.0,2014-04-04,228636083.0
6,4.874462,11.415594,3.512934,Captain_Marvel_(2019),captainmarvel,Captain Marvel,2019.0,tt4154664,123.0,7.1,258034.0,64.0,2019-03-08,
7,4.745622,12.590645,3.65428,Guardians_of_the_Galaxy,guardiansofthegalaxy,Guardians of the Galaxy,2014.0,tt2015381,121.0,8.1,936430.0,76.0,2014-08-01,270592504.0
8,5.453247,14.857569,3.893126,Spider-Man:_Homecoming,spidermanhomecoming,Spider-Man: Homecoming,2017.0,tt2250912,133.0,7.5,418900.0,73.0,2017-07-07,334166825.0
9,5.11691,10.561698,3.40077,Iron_Man_2,ironman2,Iron Man 2,2010.0,tt1228705,124.0,7.0,650168.0,57.0,2010-05-07,312057433.0


In [118]:
df.to_csv('../Data/Marvel_Script_Perplexity.csv')