## HTML way

### Parse the text

In [97]:
from io import StringIO
from pdfminer.high_level import extract_text_to_fp
from pdfminer.layout import LAParams
import pandas as pd
import re
# Render the screenplay PDF to HTML in memory. The HTML output keeps every
# text box's absolute position in its inline style (e.g. "left:235px").
output_string = StringIO()
with open('Crimes_of_the_Future_PINK_REVISION_Script_06.21.21.pdf', 'rb') as pdf_file:
    extract_text_to_fp(
        pdf_file,
        output_string,
        laparams=LAParams(),
        output_type='html',
        codec=None,
    )
html = output_string.getvalue().strip()

In [2]:
# Preview only the beginning of the markup: the HTML for the whole script is
# very large and printing all of it buries the rest of the notebook.
print(html[:2000])

<html><head>
<meta http-equiv="Content-Type" content="text/html">
</head><body>
<span style="position:absolute; border: gray 1px solid; left:0px; top:50px; width:612px; height:792px;"></span>
<div style="position:absolute; top:50px;"><a name="1">Page 1</a></div>
<div style="position:absolute; border: textbox 1px solid; writing-mode:lr-tb; left:235px; top:265px; width:140px; height:12px;"><span style="font-family: CourierFinalDraft; font-size:12px">CRIMES OF THE FUTURE
<br></span></div><div style="position:absolute; border: textbox 1px solid; writing-mode:lr-tb; left:229px; top:289px; width:154px; height:36px;"><span style="font-family: CourierFinalDraft; font-size:12px">an original screenplay
<br>by
<br>David Cronenberg
<br></span></div><div style="position:absolute; border: textbox 1px solid; writing-mode:lr-tb; left:259px; top:349px; width:91px; height:24px;"><span style="font-family: CourierFinalDraft; font-size:12px">Pink Revision
<br>June 21, 2021
<br></span></div><div style="posi

### Remove title occurrences in the text

In [93]:
from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'html.parser')
text = soup.get_text()

# Gather the text of every <div> that comes before the "Page 2" marker,
# i.e. the contents of the title page, with line breaks turned into spaces.
txt_to_remove = []
for div in soup.find_all('div'):
    div_text = div.get_text()
    if div_text.strip() == 'Page 2':
        break
    txt_to_remove.append(div_text.replace('\n', ' ').strip())

txt_to_remove

['Page 1',
 'CRIMES OF THE FUTURE',
 'an original screenplay by David Cronenberg',
 'Pink Revision June 21, 2021',
 'Serendipity Point Films Inc.',
 'Previous Versions: White Draft - February 10, 2020']

### Get the text that is not dialogue

In [94]:
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'html.parser')

# Text boxes whose `left` offset falls in [DIALOG_LEFT_MIN, DIALOG_LEFT_MAX)
# are treated as dialogue and dropped; everything else is kept.
# NOTE(review): the pixel bounds look tuned by eye for this PDF's layout --
# re-check them before reusing the notebook on another script.
DIALOG_LEFT_MIN = 178
DIALOG_LEFT_MAX = 350


def _left_px(element):
    """Return the integer `left:` offset (in px) from the element's inline style."""
    return int(element['style'].split('left:')[1].split('px')[0])


# Only positioned <div> elements (those with a `left:` and a `px;` in their style)
elements = soup.select('div[style*="left:"][style*="px;"]')
filtered_elements = [
    e for e in elements
    if not (DIALOG_LEFT_MIN <= _left_px(e) < DIALOG_LEFT_MAX)
]

# Extract the text of the kept elements; print only a preview, since the
# full text of the script is long.
texts = [e.get_text() for e in filtered_elements]
print('\n'.join(texts)[:2000])

Previous Versions:
White Draft - February 10, 2020

FADE IN:

1

EXT. SEEDY MOTEL - DAY

1

A seedy motel at the edge of a small bay. 

A boy, BRECKEN, an ordinary-looking eight-year-old, sits in 
the shallow water which comes within yards of the motel. He's 
DIGGING around in the pebbly bay bottom with a soup spoon, 
looking for crayfish, leeches, anything interesting.

His mother, DJUNA DOTRICE, sits on a tattered lawn chair in 
front of their motel room, WATCHING her son with exaggerated 
intensity, nervously fingering an old cell phone. She is in 
her early thirties, attractive but strained, tense, 
exhausted.

*

Brecken hears her, but he doesn't answer, he just keeps 
digging. 

2

INT. SEEDY MOTEL ROOM. BEDROOM - NIGHT

Djuna sits on one of two beds, DISTRACTEDLY watching the 
cigarette-scarred TV. Her cell phone is on the bed beside 
her.

3

INT. SEEDY MOTEL ROOM. BATHROOM - NIGHT

2

3

In the tacky bathroom, Brecken is brushing his teeth, looking 
at himself in the mirror. H

In [95]:
# Merge the kept text blocks into a single string, then drop every '*'
# character (presumably revision marks from the PDF margin -- confirm).
non_dialog_text = '\n'.join(texts)
without_dialogs = non_dialog_text.replace('*', '')

# Stripping the title-page strings collected in `txt_to_remove` is currently disabled:
# for txt in txt_to_remove:
#     without_dialogs = without_dialogs.replace(txt,'')

In [96]:
# Preview only the beginning of the cleaned text; the full script is long and
# the first scenes are enough to check the layout the scene regex relies on.
print(without_dialogs[:2000])

Previous Versions:
White Draft - February 10, 2020

FADE IN:

1

EXT. SEEDY MOTEL - DAY

1

A seedy motel at the edge of a small bay. 

A boy, BRECKEN, an ordinary-looking eight-year-old, sits in 
the shallow water which comes within yards of the motel. He's 
DIGGING around in the pebbly bay bottom with a soup spoon, 
looking for crayfish, leeches, anything interesting.

His mother, DJUNA DOTRICE, sits on a tattered lawn chair in 
front of their motel room, WATCHING her son with exaggerated 
intensity, nervously fingering an old cell phone. She is in 
her early thirties, attractive but strained, tense, 
exhausted.



Brecken hears her, but he doesn't answer, he just keeps 
digging. 

2

INT. SEEDY MOTEL ROOM. BEDROOM - NIGHT

Djuna sits on one of two beds, DISTRACTEDLY watching the 
cigarette-scarred TV. Her cell phone is on the bed beside 
her.

3

INT. SEEDY MOTEL ROOM. BATHROOM - NIGHT

2

3

In the tacky bathroom, Brecken is brushing his teeth, looking 
at himself in the mirror. He

### Match the beginning of each scene and collect each scene's text in a list

In [59]:
# Scene-start marker: a line holding only the scene number (group 1), a blank
# line, then a heading that begins with "INT." or "EXT." followed by the run of
# characters up to the first lowercase letter or end of line (group 2).
# NOTE(review): the trailing (?<!CONTINUED) only rejects a match that *ends*
# exactly on "CONTINUED"; because the preceding [^a-z\n]* can backtrack, such a
# heading still matches with a span one character shorter -- confirm the intent.
pattern = r"\n(\d+)\n\n((?:INT\.|EXT\.)[^a-z\n]*(?<!CONTINUED))"

In [64]:
# Record the [start, end] character offsets of every scene-heading match.
matches = re.finditer(pattern, without_dialogs)
indices = [[m.start(), m.end()] for m in matches]

In [65]:
# Cut the text into scenes: each scene runs from its heading match to the start
# of the next one, and the last scene runs to the end of the text (end=None).
# Newlines are flattened to spaces with a plain str.replace; the regex
# r'\n|\n1' did exactly that, because alternation is tried left to right and
# its second branch could therefore never match.
scene_starts = [span[0] for span in indices]
scene_ends = scene_starts[1:] + [None]
chapters = [
    without_dialogs[start:end].replace('\n', ' ')
    for start, end in zip(scene_starts, scene_ends)
]

In [66]:
# Show only the first few scenes; displaying the whole list prints the full
# text of the script.
chapters[:3]

[" 1  EXT. SEEDY MOTEL - DAY  1  A seedy motel at the edge of a small bay.   A boy, BRECKEN, an ordinary-looking eight-year-old, sits in  the shallow water which comes within yards of the motel. He's  DIGGING around in the pebbly bay bottom with a soup spoon,  looking for crayfish, leeches, anything interesting.  His mother, DJUNA DOTRICE, sits on a tattered lawn chair in  front of their motel room, WATCHING her son with exaggerated  intensity, nervously fingering an old cell phone. She is in  her early thirties, attractive but strained, tense,  exhausted.  *  Brecken hears her, but he doesn't answer, he just keeps  digging.  ",
 ' 2  INT. SEEDY MOTEL ROOM. BEDROOM - NIGHT  Djuna sits on one of two beds, DISTRACTEDLY watching the  cigarette-scarred TV. Her cell phone is on the bed beside  her. ',
 " 3  INT. SEEDY MOTEL ROOM. BATHROOM - NIGHT  2  3  In the tacky bathroom, Brecken is brushing his teeth, looking  at himself in the mirror. He is thin, malnourished, pallid,  looks sad in hi

### Build a DataFrame for later analysis

In [98]:
# Build one row per scene: INT/EXT flag, location name, time of day and the
# word count of the scene text (the count includes the heading and scene
# numbers, since it is taken over the whole chapter string).
int_ext = []
name = []
time = []
words = []

# Scene heading: "INT." or "EXT.", a space, then a run without lowercase
# letters or digits, ending on a whitespace character.
heading_re = re.compile(r'(?:INT\.|EXT\.) [^a-z0-9]+\s')

for chapter in chapters:
    words.append(len(chapter.split()))

    heading = heading_re.search(chapter)
    if heading is None:
        # No recognisable heading: keep the row (and its word count) but leave
        # the descriptive fields empty instead of raising IndexError.
        int_ext.append(None)
        name.append(None)
        time.append(None)
        continue

    # "EXT. SEEDY MOTEL - DAY  " -> prefix "EXT", rest " SEEDY MOTEL - DAY  "
    prefix, _, rest = heading.group().partition('.')
    parts = rest.split('-')

    int_ext.append(prefix)
    # Strip the padding left around the '-' separator and after the heading,
    # so equal values compare equal (e.g. when grouping by Time).
    name.append(parts[0].strip())
    # A heading without a '-' has no time-of-day segment.
    time.append(parts[1].strip() if len(parts) > 1 else None)

df = pd.DataFrame({'Interior/Exterior':int_ext,
                    'Name':name,
                    'Time':time,
                    'Word count':words})

In [99]:
# Display the per-scene summary table (pandas abbreviates the middle rows).
df

Unnamed: 0,Interior/Exterior,Name,Time,Word count
0,EXT,SEEDY MOTEL,DAY,104
1,INT,SEEDY MOTEL ROOM. BEDROOM,NIGHT,29
2,INT,SEEDY MOTEL ROOM. BATHROOM,NIGHT,294
3,INT,SEEDY MOTEL ROOM. BEDROOM,LATER,146
4,INT,SEEDY MOTEL ROOM. BATHROOM,LATER,185
...,...,...,...,...
60,INT,VICTORIAN BANK,NIGHT,958
61,EXT,VICTORIAN BANK,NIGHT,193
62,INT,TENSER'S BEDROOM,NIGHT,190
63,EXT,SHIP GRAVEYARD,NIGHT,159
