In [1]:
import pandas as pd
import re

In [2]:
# Read the whole transcript into memory as a single string.
# The path uses a forward slash: "\F" inside a normal string literal is an
# invalid escape sequence (a warning on recent Python versions), and a
# backslash separator only resolves as a path on Windows.
with open("data/Friends_Transcript.txt", "r", encoding="utf-8") as f:
    script = f.read()

In [3]:
# Preview the first 50,000 characters of the raw transcript to inspect its layout.
print(script[:50000])

THE ONE WHERE MONICA GETS A NEW ROOMATE (THE PILOT-THE UNCUT VERSION)
Written by: Marta Kauffman & David Crane
[Scene: Central Perk, Chandler, Joey, Phoebe, and Monica are there.]
Monica: There's nothing to tell! He's just some guy I work with!
Joey: C'mon, you're going out with the guy! There's gotta be something wrong with him!
Chandler: All right Joey, be nice. So does he have a hump? A hump and a hairpiece?
Phoebe: Wait, does he eat chalk?
(They all stare, bemused.)
Phoebe: Just, 'cause, I don't want her to go through what I went through with Carl- oh!
Monica: Okay, everybody relax. This is not even a date. It's just two people going out to dinner and- not having sex.
Chandler: Sounds like a date to me.
[Time Lapse]
Chandler: Alright, so I'm back in high school, I'm standing in the middle of the cafeteria, and I realize I am totally naked.
All: Oh, yeah. Had that dream.
Chandler: Then I look down, and I realize there's a phone... there.
Joey: Instead of...?
Chandler: That's right.


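Thoughts:
- Episode titles, writing credits, and scene setups are all present in the script.
- Expression statements (stage directions) appear in parentheses, e.g. `(They all stare, bemused.)`.
- `All` appears as a speaker label, as if it were a character.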

#### Data Cleaning

##### Parse the Script & Tag Episodes

###### Identify Episode Boundaries

In [15]:
# Break the transcript into a list of lines, splitting on the newline character.
lines = script.split('\n')

In [23]:
# Collect every line that looks like an episode title: after trimming
# surrounding whitespace it starts with "the one" (case-insensitive).
# A comprehension is used so no helper name leaks into the notebook namespace;
# the previous module-level variable `clean_line` would overwrite the
# `clean_line()` function defined later whenever this cell was re-run.
episode_titles = [
    stripped
    for stripped in (raw_line.strip() for raw_line in lines)
    if stripped.lower().startswith("the one")
]

print("Found", len(episode_titles), "episode title lines.")

# NOTE(review): the listing leaves out the last collected title (same as
# before), so it prints one fewer entry than the count above — confirm intended.
for title_number, title in enumerate(episode_titles[:-1], start=1):
    print(">", title_number, title)

Found 193 episode title lines.
> 1 THE ONE WHERE MONICA GETS A NEW ROOMATE (THE PILOT-THE UNCUT VERSION)
> 2 THE ONE WITH THE SONOGRAM AT THE END
> 3 THE ONE WITH THE THUMB
> 4 THE ONE WITH GEORGE STEPHANOPOULOS
> 5 THE ONE WITH THE EAST GERMAN LAUNDRY DETERGENT
> 6 THE ONE WITH THE BUTT
> 7 THE ONE WITH THE BLACKOUT
> 8 THE ONE WHERE NANA DIES TWICE
> 9 THE ONE WHERE UNDERDOG GETS AWAY
> 10 THE ONE WITH THE MONKEY
> 11 THE ONE WITH MRS. BING
> 12 THE ONE WITH THE DOZEN LASAGNAS
> 13 THE ONE WITH THE BOOBIES
> 14 THE ONE WITH THE CANDY HEARTS
> 15 THE ONE WITH THE STONED GUY
> 16 THE ONE WITH TWO PARTS, PART 1
> 17 THE ONE WITH TWO PARTS, PART 2
> 18 THE ONE WITH ALL THE POKER
> 19 THE ONE WHERE THE MONKEY GETS AWAY
> 20 THE ONE WITH THE EVIL ORTHODONTIST
> 21 THE ONE WITH THE FAKE MONICA
> 22 THE ONE WITH THE ICK FACTOR
> 23 THE ONE WITH THE BIRTH
> 24 THE ONE WHERE RACHEL FINDS OUT
> 25 THE ONE WITH ROSS' NEW GIRLFRIEND
> 26 THE ONE WITH THE BREAST MILK
> 27 THE ONE WHERE MR. HECKLES

In [None]:
# Anomalies noticed in the list of episode titles:
# "The one after the Super Bowl" has two parts, but is present as one single episode in the script
# "The one with Ross's wedding" is present in 3 episodes?
# "The one in Vegas" should be two episodes, but is present as one
# "The one that could have been" should be two episodes
# "The one with the proposal" should be two episodes
# "The one with Chandler's wedding" should be two episodes
# "The one with the birthing video" is listed twice
# The transcript only runs until season 9, episode 2

In [24]:
def get_script_segment(lines, start_pattern, end_pattern=None,
                       max_lines=None, case_insensitive=True):
    """
    Extract the slice of 'lines' that begins at the first line containing
    'start_pattern' and stops at whichever of these comes first:
      - the first later line containing 'end_pattern' (that line is excluded),
      - 'max_lines' lines have been taken (the start line counts as the first),
      - the end of 'lines'.

    Matching is a substring test, not a regular-expression match.

    Parameters:
    -----------
    lines : list of str
        The list of script lines to search in.
    start_pattern : str
        Text that marks where extraction begins; the matching line is included.
    end_pattern : str or None
        If given (and non-empty), text that marks where extraction ends; the
        matching line is not included. Only lines after the start are checked.
    max_lines : int or None
        If given, the maximum number of lines returned, counted from and
        including the start line. Zero or negative values yield an empty list.
    case_insensitive : bool
        Whether both patterns are matched ignoring case.

    Returns:
    --------
    segment : list of str
        The extracted lines, or an empty list if 'start_pattern' is not found
        (an "[INFO]" message is printed in that case).
    """
    def normalise(text):
        # Lower-case only when case-insensitive matching was requested
        return text.lower() if case_insensitive else text

    # 1. Locate the first line containing the start pattern
    start_key = normalise(start_pattern)
    start_index = next(
        (idx for idx, line in enumerate(lines) if start_key in normalise(line)),
        None,
    )

    if start_index is None:
        # No start pattern found
        print(f"[INFO] Could not find the start pattern: '{start_pattern}'")
        return []

    # 2. Upper bound of the slice: end of input, tightened by max_lines.
    #    max_lines is clamped at zero so that a negative value cannot turn into
    #    a negative slice index, which would count from the END of 'lines'.
    slice_end = len(lines)
    if max_lines is not None:
        slice_end = min(slice_end, start_index + max(max_lines, 0))

    # 3. Look for the end pattern only inside the window that can be returned;
    #    a match beyond the cap could not shorten the result anyway.
    if end_pattern:
        end_key = normalise(end_pattern)
        for idx in range(start_index + 1, slice_end):
            if end_key in normalise(lines[idx]):
                slice_end = idx
                break

    # 4. Return the portion
    return lines[start_index:slice_end]


In [12]:
def parse_episodes(text):
    """
    Split a transcript into episodes.

    Each line is stripped of surrounding whitespace and then handled as follows:
      - a line starting with "THE ONE " (case-sensitive) opens a new episode
        and becomes its title;
      - a line equal to "End" closes the episode in progress;
      - any other line is added to the episode in progress, if there is one.

    An episode is stored only when it has collected at least one line.

    Returns a list of (title, lines) tuples in order of appearance.
    """
    episodes = []
    title = None
    body = []

    def flush():
        # Store the episode in progress, provided it has a title and content
        nonlocal body
        if title and body:
            episodes.append((title, body))
            body = []

    for raw in text.split('\n'):
        stripped = raw.strip()

        if stripped.startswith("THE ONE "):
            # New title: close off the previous episode first
            flush()
            title = stripped
        elif stripped == "End":
            # Explicit end marker: close the episode and stop collecting
            flush()
            title = None
        elif title:
            body.append(stripped)

    # Keep whatever was still being collected when the input ran out
    flush()
    return episodes

In [27]:
# Pull out the part of the transcript starting at the first line that contains
# "The One With Ross's Wedding" and print it with a running line index.
segment_1 = get_script_segment(
    lines=lines, 
    start_pattern="The One With Ross's Wedding", 
    max_lines=2000  # cap the segment at 2000 lines, counted from the matching line
)

for i, text_line in enumerate(segment_1):
    print(f"{i}:\t{text_line}")


0:	THE ONE WITH ROSS'S WEDDING PARTS I AND II
1:	Part I Written by: Michael Borkow Part II Teleplay by: Shana Goldberg-Meehan & Scott Silveri Part II Story by: Jill Condon & Amy Toomin Part I Transcribed by: Eric Aasen  Part II Transcribed by: Aaron D. Miller
2:	[Scene: Chandler and Joey's, Joey and Chandler are getting ready for the flight to London and Monica comes running in.]
3:	Monica: Guys, hurry up! The flight leaves in four hours! It could take time to get a taxi! There could be traffic! The plane could leave early! When we get to London, there could be a line at customs! Come on!! (She runs back to her apartment.)
4:	Chandler: Six-hour trip to London. Thats a lot of Monica.
5:	[Cut to the girls apartment, Monica is putting things into her purse as Phoebe and Rachel watch.]
6:	Monica: Passport, check! (As she puts away each item, she says check.) Camera, check! Travellers cheques, check! 
7:	Rachel: Who are you saying "check" too?
8:	Monica: Myself. Yknow for remembering to pac

In [13]:
# Parse the full transcript into a list of (episode title, episode lines) tuples.
episodes_list = parse_episodes(script)

In [14]:
# Number of episodes returned by parse_episodes().
len(episodes_list)

191

In [7]:
# Split the transcript on newlines, then report the total line count and
# show the first five lines as a quick sanity check.
lines = script.split("\n")
print(len(lines), "lines in total.")
print(lines[:5])

66541 lines in total.
['THE ONE WHERE MONICA GETS A NEW ROOMATE (THE PILOT-THE UNCUT VERSION)', 'Written by: Marta Kauffman & David Crane', '[Scene: Central Perk, Chandler, Joey, Phoebe, and Monica are there.]', "Monica: There's nothing to tell! He's just some guy I work with!", "Joey: C'mon, you're going out with the guy! There's gotta be something wrong with him!"]


In [None]:
def clean_line(line):
    # Lowercase
    line = line.lower()
    # Removing anything in brackets [like this] or parentheses (like this)