In [42]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string


In [2]:
pd.set_option('display.max_rows', None)

# Step 1: Read and display the scri

In [43]:
characters_df = pd.read_csv('../Data/lotr/lotr_characters.csv')
scripts_df = pd.read_csv('../Data/lotr/lotr_scripts.csv')

                            

In [4]:
scripts_df.head(2)

Unnamed: 0.1,Unnamed: 0,char,dialog,movie
0,0,DEAGOL,"Oh Smeagol Ive got one! , Ive got a fish Smeag...",The Return of the King
1,1,SMEAGOL,"Pull it in! Go on, go on, go on, pull it in!",The Return of the King


In [5]:
scripts_df.drop('Unnamed: 0', inplace=True, axis=1)

In [6]:
scripts_df.describe()

Unnamed: 0,char,dialog,movie
count,2390,2389,2390
unique,118,2325,3
top,FRODO,DEATH!,The Two Towers
freq,225,6,1010


In [7]:
scripts_df.dtypes

char      object
dialog    object
movie     object
dtype: object

In [8]:
characters_df.dtypes

birth     object
death     object
gender    object
hair      object
height    object
name      object
race      object
realm     object
spouse    object
dtype: object

In [9]:
# Convert all types to string from object and remove punctuation from character names
scripts_df = scripts_df.astype(str)
characters_df = characters_df.astype(str)
scripts_df['review'].str.replace('[{}]'.format(string.punctuation), '')

In [10]:
scripts_df.columns.to_list()

['char', 'dialog', 'movie']

In [11]:
scripts_df.char = scripts_df.char.str.strip()
scripts_df.movie = scripts_df.movie.str.strip()

In [12]:
scripts_df.char.value_counts()

FRODO                       226
SAM                         217
GANDALF                     205
ARAGORN                     187
PIPPIN                      163
MERRY                       137
GOLLUM                      133
GIMLI                       116
THEODEN                     110
FARAMIR                      65
EOWYN                        56
LEGOLAS                      55
SMEAGOL                      49
TREEBEARD                    46
BILBO                        46
DENETHOR                     45
BOROMIR                      41
ARWEN                        39
EOMER                        36
SOLDIER                      36
SARUMAN                      33
ELROND                       29
GRIMA                        25
STRIDER                      25
ORC                          22
GAMLING                      15
GOTHMOG                      14
GALADRIEL VOICE OVER         12
SHAGRAT                       9
UGLUK                         9
KING OF THE DEAD              8
WITCH KI

*Based on the character value counts, it appears that Frodo and Sam have the most lines in all the movies*<br>

In [13]:
scripts_df.dialog.value_counts()

    DEATH!                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         6
Pippin!                                                                                                                                                                            

*Based on this scripts value counts, most of the lines are unique*

In [14]:
scripts_df.movie.value_counts()

The Two Towers                1010
The Return of the King         873
The Fellowship of the Ring     507
Name: movie, dtype: int64

*Based on the movie value counts, the second movie "The Two Towers" has the most dialogue.*

In [15]:
scripts_df.movie.unique()

array(['The Return of the King', 'The Two Towers',
       'The Fellowship of the Ring'], dtype=object)

In [16]:
fellowship_df = scripts_df[scripts_df.movie=='The Fellowship of the Ring']
two_towers_df = scripts_df[scripts_df.movie=='The Two Towers']
return_king_df = scripts_df[scripts_df.movie=='The Return of the King']

In [17]:
characters_df.columns

Index(['birth', 'death', 'gender', 'hair', 'height', 'name', 'race', 'realm',
       'spouse'],
      dtype='object')

In [18]:
# We note that STRIDER and ARAGORN are the same person
characters_df.name = characters_df.name.replace({'STRIDER':'ARAGORN'})

In [19]:
two_towers_df.groupby('char')['dialog'].count().reset_index(name='obs').sort_values(['obs'], ascending=False).head(10)

Unnamed: 0,char,obs
0,ARAGORN,99
40,SAM,88
14,FRODO,83
22,GOLLUM,78
48,THEODEN,64
21,GIMLI,58
37,PIPPIN,56
31,MERRY,55
49,TREEBEARD,43
12,FARAMIR,41


In [20]:
return_king_df.groupby('char').size().reset_index(name='obs').sort_values(['obs'], ascending=False).head(10)

Unnamed: 0,char,obs
24,GANDALF,94
48,SAM,91
17,FRODO,73
45,PIPPIN,69
1,ARAGORN,61
28,GOLLUM,52
59,THEODEN,46
39,MERRY,41
9,DENETHOR,35
27,GIMLI,34


*Results so far: In the first and third movies, Gandalf has the most lines of any character. He is noticably missing from the second movie because he fell into a pit with the Balrog. Now we will join our data with the characters dataframe to try to find additional insights*

In [21]:
characters_df.columns.to_list()

['birth',
 'death',
 'gender',
 'hair',
 'height',
 'name',
 'race',
 'realm',
 'spouse']

In [22]:
characters_df.name.unique()

array(['Adanel', 'Boromir', 'Lagduf', 'Tarcil', 'Fire-drake of Gondolin',
       'Ar-Adûnakhôr', 'Annael', 'Angrod', 'Angrim', 'Anárion',
       'Ar-Pharazôn', 'Ar-Sakalthôr', 'Ar-Gimilzôr', 'Angelimir',
       'Angelimar', 'Angbor', 'Nurwë', 'Linda (Baggins) Proudfoot',
       'Bodo Proudfoot', 'Penlod', 'Pengolodh', 'Tarannon Falastur',
       'Tar-Vanimeldë', 'Tar-Telperiën', 'Tar-Telemmaitë', 'Tar-Súrion',
       'Tar-Palantir', 'Tar-Míriel', 'Tar-Minastir', 'Tar-Elendil',
       'Tar-Calmacil', 'Tar-Atanamir', 'Tar-Meneldur', 'Tar-Ardamin',
       'Tar-Ciryatan', 'Tar-Anárion', 'Tar-Ancalimë', 'Tar-Ancalimon',
       'Tar-Amandil', 'Tar-Alcarin', 'Tanta (Hornblower) Baggins',
       'Morwen Steelsheen', 'Squint-eyed Southerner', 'Soronto',
       'Tar-Aldarion', 'Finwë', 'Finrod', 'Fingon', 'Fingolfin',
       'Finduilas of Dol Amroth', 'Finduilas', 'Findis', 'Findegil',
       'Finarfin', 'Fimbrethil', 'Ferumbras Took II', 'Bill Ferny',
       'Fengel', 'Fastred of Greenholm', 'F

*We note that there are first and last names here and that not all the names are capitalized. Given that the focus is on the top characters, removing the last names and capitalizing will allow us to uniquely join the two dataframes on the name key*

In [23]:
def process_name(name):
    first_name = str(name).split()[0].upper()
    if first_name == 'SAMWISE':
        first_name = 'SAM'
    elif first_name == 'SMEAGOL':
        first_name = 'GOLLUM'
    
    return first_name.upper()

In [24]:
characters_df.name = characters_df.name.apply(process_name)

In [25]:
characters_df.name

0               ADANEL
1              BOROMIR
2               LAGDUF
3               TARCIL
4           FIRE-DRAKE
5         AR-ADÛNAKHÔR
6               ANNAEL
7               ANGROD
8               ANGRIM
9              ANÁRION
10         AR-PHARAZÔN
11        AR-SAKALTHÔR
12         AR-GIMILZÔR
13           ANGELIMIR
14           ANGELIMAR
15              ANGBOR
16               NURWË
17               LINDA
18                BODO
19              PENLOD
20           PENGOLODH
21            TARANNON
22       TAR-VANIMELDË
23       TAR-TELPERIËN
24      TAR-TELEMMAITË
25          TAR-SÚRION
26        TAR-PALANTIR
27          TAR-MÍRIEL
28        TAR-MINASTIR
29         TAR-ELENDIL
30        TAR-CALMACIL
31        TAR-ATANAMIR
32        TAR-MENELDUR
33         TAR-ARDAMIN
34        TAR-CIRYATAN
35         TAR-ANÁRION
36        TAR-ANCALIMË
37       TAR-ANCALIMON
38         TAR-AMANDIL
39         TAR-ALCARIN
40               TANTA
41              MORWEN
42         SQUINT-EYED
43         

In [26]:
characters_df.rename(columns={'name':'char'}, inplace=True)

In [27]:
joined_df = scripts_df.merge(characters_df, how='left', on='char')

In [28]:
joined_df.head(100)

Unnamed: 0,char,dialog,movie,birth,death,gender,hair,height,race,realm,spouse
0,DEAGOL,"Oh Smeagol Ive got one! , Ive got a fish Smeag...",The Return of the King,,,,,,,,
1,SMEAGOL,"Pull it in! Go on, go on, go on, pull it in!",The Return of the King,,,,,,,,
2,DEAGOL,Arrghh!,The Return of the King,,,,,,,,
3,SMEAGOL,Deagol!,The Return of the King,,,,,,,,
4,SMEAGOL,Deagol!,The Return of the King,,,,,,,,
5,SMEAGOL,Deagol!,The Return of the King,,,,,,,,
6,SMEAGOL,Give us that! Deagol my love,The Return of the King,,,,,,,,
7,DEAGOL,Why?,The Return of the King,,,,,,,,
8,SMEAGOL,"Because' , it's my birthday and I wants it.",The Return of the King,,,,,,,,
9,SMEAGOL,My precious.,The Return of the King,,,,,,,,


In [29]:
#Extract only the necessary columns
joined_df = joined_df[['char', 'dialog', 'movie', 'race']]

In [30]:
joined_df.shape

(3401, 4)

In [31]:
joined_df.head(100)

Unnamed: 0,char,dialog,movie,race
0,DEAGOL,"Oh Smeagol Ive got one! , Ive got a fish Smeag...",The Return of the King,
1,SMEAGOL,"Pull it in! Go on, go on, go on, pull it in!",The Return of the King,
2,DEAGOL,Arrghh!,The Return of the King,
3,SMEAGOL,Deagol!,The Return of the King,
4,SMEAGOL,Deagol!,The Return of the King,
5,SMEAGOL,Deagol!,The Return of the King,
6,SMEAGOL,Give us that! Deagol my love,The Return of the King,
7,DEAGOL,Why?,The Return of the King,
8,SMEAGOL,"Because' , it's my birthday and I wants it.",The Return of the King,
9,SMEAGOL,My precious.,The Return of the King,


In [32]:
# Inspecting the Race column, we see that there is significant data cleaning that needs to be performed
joined_df.race.unique()

array([nan, 'Hobbits', 'Hobbit', 'nan', 'Dwarves', 'Maiar', 'Men',
       'Ents,Onodrim', 'Half-elven', 'Orc', 'Maiar,Balrogs', 'Elves',
       'Half-elven,Men', 'Uruk-hai,Orc', 'Black Uruk', 'Orcs'],
      dtype=object)

*We will perform the following modifications*
1. Uppercase every race
2. Keep only the basic races as the characters with the most lines have homogenous races
3. Replace any singular races with their plural form

In [33]:
joined_df.race = joined_df.race.str.upper()


In [34]:
joined_df.race = joined_df.race.replace({'HOBBIT': 'HOBBITS', 'ORC':'ORCS'})

In [35]:
joined_df.race.unique()

array([nan, 'HOBBITS', 'NAN', 'DWARVES', 'MAIAR', 'MEN', 'ENTS,ONODRIM',
       'HALF-ELVEN', 'ORCS', 'MAIAR,BALROGS', 'ELVES', 'HALF-ELVEN,MEN',
       'URUK-HAI,ORC', 'BLACK URUK'], dtype=object)

In [36]:
race_list = ['HOBBITS', 'NAN', 'DWARVES', 'MAIAR', 'MEN', 'ORCS', 'ELVES']
joined_df = joined_df[joined_df['race'].isin(race_list)]


In [37]:
joined_df.head(2)

Unnamed: 0,char,dialog,movie,race
16,FRODO,Gandalf?,The Return of the King,HOBBITS
17,FRODO,Gandalf?,The Return of the King,HOBBITS


In [38]:
joined_df.race.unique()

array(['HOBBITS', 'NAN', 'DWARVES', 'MAIAR', 'MEN', 'ORCS', 'ELVES'],
      dtype=object)

In [39]:
joined_df.head(100)

Unnamed: 0,char,dialog,movie,race
16,FRODO,Gandalf?,The Return of the King,HOBBITS
17,FRODO,Gandalf?,The Return of the King,HOBBITS
18,FRODO,Oooohhh!,The Return of the King,HOBBITS
19,FRODO,Oooohhh!,The Return of the King,HOBBITS
20,MERRY,Frodo!,The Return of the King,NAN
21,GIMLI,Aaaahh!,The Return of the King,NAN
22,GIMLI,Aaaahh!,The Return of the King,DWARVES
23,FRODO,Gimli!,The Return of the King,HOBBITS
24,FRODO,Gimli!,The Return of the King,HOBBITS
25,GOLLUM,My precious!,The Return of the King,HOBBITS


In [40]:
joined_df.char.unique() # Let's see what is left

array(['FRODO', 'MERRY', 'GIMLI', 'GOLLUM', 'SAM', 'GANDALF', 'ARAGORN',
       'PIPPIN', 'ROSIE', 'BILBO', 'SARUMAN', 'FARAMIR', 'GOTHMOG',
       'DENETHOR', 'LEGOLAS', 'GALADRIEL', 'GRIMBOLD', 'GAMLING',
       'DAMROD', 'HALDIR', 'BOROMIR', 'MORWEN', 'SNAGA', 'SANDYMAN',
       'MAN', 'BARLIMAN', 'MEN', 'SAURON'], dtype=object)

In [41]:
hobbits_list = ['FRODO', 'MERRY', 'GOLLUM', 'PIPPIN' ]