In [43]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException

chrome_options = Options()
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-dev-shm-usage')
driver = webdriver.Chrome()

import logging
import re
import sys
import urllib.request
import time

from bs4 import BeautifulSoup
from queue import Queue
from urllib import parse, request
from queue import PriorityQueue
from operator import itemgetter
import re

urls = ["https://www.dailyscript.com/movie.html", "https://www.dailyscript.com/movie_n-z.html"]

def parse_links(root, html):
    soup = BeautifulSoup(html, 'html.parser')
    for link in soup.find_all('a'):
        href = link.get('href')
        if href:
            text = link.string
            if not text:
                text = ''
            text = re.sub('\s+', ' ', text).strip()
            if "scripts" in link.get('href') and ".html" in link.get('href'):
                yield (parse.urljoin(root, link.get('href')))

def get_links(urls):
    l = []
    for url in urls:
        res = request.urlopen(url)
        l = l + list(parse_links(url, res.read()))
    return l

def get_titles(urls):
    t = []
    for url in urls:
        res = request.urlopen(url)
        soup = BeautifulSoup(res.read(), 'html.parser')
        for link in soup.find_all('a'):
            href = link.get('href')
            if href:
                text = link.string
                if not text:
                    text = ''
                text = re.sub('\s+', ' ', text).strip()
                if "scripts" in link.get('href') and ".html" in link.get('href'):
                    t.append((re.search(r'<a.*?>(.*?)</a>', str(link))).group(1))
    return t


def countOccurrence(a):
    k = {}
    not_in_list = ["I", "INT", "AND", "EXT", "ED", "HE", "ON", "OF", "THE", "CUT", "DR", "MRS", "IN", "TO", "MR", "CLOSE", "FADE", "LOC"]
    for j in a:
        pattern = r"[:.,]"
        j = re.sub(pattern, "", j)
        j = re.sub(r"[':]", "", j)
        j = re.sub(r"\s*\(.*\)", "", j)

        if j.isupper() and j not in not_in_list and len(j) > 1:
            if j in k:
                k[j] +=1
            else:
                k[j] = 1
    return dict(sorted(k.items(), key = itemgetter(1), reverse = True)[:5])

def get_html(url):
    res = request.urlopen(url)
    soup = str(BeautifulSoup(res, 'html.parser').text)
    splits = soup.split()
    counts = countOccurrence(splits)
    return counts, soup

def extract_dialogue(script, characters):
    dialogue = []
    match = False
    upper = False
    for line in script.split('\n'):
        match = False
        line = line.lstrip()
        for character in characters:
            if line.startswith(character):
                match = True
                upper = False
                speaker = character
        if match:
            spoken = line
            if speaker.isupper():
                if speaker.strip() != spoken.strip():
                    dialogue.append((speaker.strip(), spoken.strip()))
                else: 
                    dialogue.append((speaker.strip(), ""))
            # If the speaker isn't all uppercase, it's likely dialogue continuation or other text, so append it to the previous line
        elif len(dialogue) > 0:
            line_split = line.split()
            for l in line_split:
                if l.isupper():
                    upper = True
            if not upper:
                dialogue[-1] = (dialogue[-1][0], dialogue[-1][1] + ' ' + line.strip())
    return dialogue

def clean_text(dialogue_list, characters):
    returning_list = []
    for i in range(len(dialogue_list)):
        if len(dialogue_list[i][1]) == 0:
            continue
        if dialogue_list[i][0] not in characters:
            continue
        s = re.sub("[\(\[].*?[\)\]]", "", dialogue_list[i][1])
        s = " ".join(s.split())
        if len(s) != 0:
            returning_list.append((dialogue_list[i][0], s))
    return returning_list

def retrieve_in_string(cleaned_dialogue):
    s = ""
    for element in cleaned_dialogue:
        s += element[1] + "\n"
    return s

In [31]:
def generate_regex(movie_title):
    # Escape special characters in the movie title
    escaped_title = re.escape(movie_title)
    # Replace spaces with "\s+" to allow for variations in spacing
    regex = re.sub(r"\s+", r"\\s+", escaped_title)
    regex = regex.replace("\\\\", "\\")
    # Add word boundary anchors
    regex = r"\b" + regex + r"\b"
    return regex

In [32]:
def get_MBTI(url):
    driver = webdriver.Chrome()
    driver.get(url)
    personality = []
    WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.CLASS_NAME, 'profile-name')))
    try:
        MBTI = driver.find_elements(By.CLASS_NAME, 'vote-detail-letter')
        for letter in MBTI:
            personality.append((letter.text)[-1])
        personality.append(((driver.find_element(By.CLASS_NAME, 'profile-description')).find_element(By.CLASS_NAME, 'container')).text)
    except:
        driver.close()
        return ["", "", "", "", ""]
    driver.close()
    return personality

In [39]:
def scrape_personalities(names, url):
    personality = []
    driver = webdriver.Chrome()
    driver.get(url)
    WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.CLASS_NAME, 'community-title')))
    try:
        characters = driver.find_elements(By.CLASS_NAME, 'profile-card-link')
        for name in names:
            regex = generate_regex(name).lower()
            for character in characters:
                a = character.find_element(By.CLASS_NAME, 'info-name')
                print("found: " + a.text + "; target: " + name +", with regex " + regex)
                if (re.search(regex, a.text, re.IGNORECASE)):
                    print("Found " + a.text)
                    personality = get_MBTI(character.get_attribute('href'))
                    print("\n")
                    break
                print("\n")
    except:
        driver.close()
        return ["", "", "", "", ""]
    driver.close()
    return personality

In [40]:
script_list = get_links(urls)
movie_list = get_titles(urls)
count, soup = get_html(script_list[1])
for character in list(count.keys()):
    dialogue = extract_dialogue(soup, [character])
    cleaned_dialogue = clean_text(dialogue, [character])
    s = retrieve_in_string(cleaned_dialogue)

In [41]:
l = []
characters_in = []
for script in script_list[:20]:
    count, soup = get_html(script)
    i = 0
    for character in list(count.keys()):
        dialogue = extract_dialogue(soup, [character])
        cleaned_dialogue = clean_text(dialogue, [character])
        s = retrieve_in_string(cleaned_dialogue)
        if len(cleaned_dialogue) > 20:
            s = retrieve_in_string(cleaned_dialogue)
            title = script[script.rfind("/") + 1 :script.find(".html")]
            l.append([title, character, s])
            i = i + 1
    characters_in.append(i)

In [44]:
personality = []
l_2 = l;
curr = 0
for i, num_actors in enumerate(characters_in):
    url = "https://www.personality-database.com/search?keyword=" + (movie_list[i]).replace(' ', '%20') + "&type=subcategories"
    driver.get(url)
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, '//*[@id="root"]/div/section/main/div[1]')))
    try:
        option = driver.find_element(By.CLASS_NAME, 'subcategory-card')
        movie = (option.find_element(By.CLASS_NAME, 'name')).text
        regex = generate_regex(str(movie_list[i]))
        if (re.search(regex, movie)):
            movie_url = option.get_attribute('href')
            characters = [person[1] for person in l[curr:(curr + characters_in[i])]]
            l_2[i].extend(scrape_personalities(characters, movie_url))
        # In the case that there are results, but none are relevant movies:
        else:
            l_2[i].extend(["", "", "", "", ""])
    # In the case that there are no search results for given movie query:
    except:
        l_2[i].extend(["", "", "", "", ""])
        continue
    curr = curr + num_actors

found: Kat Stratford; target: KAT, with regex \bkat\b
Found Kat Stratford


found: Kat Stratford; target: PATRICK, with regex \bpatrick\b


found: Patrick Verona; target: PATRICK, with regex \bpatrick\b
Found Patrick Verona


found: Kat Stratford; target: BIANCA, with regex \bbianca\b


found: Patrick Verona; target: BIANCA, with regex \bbianca\b


found: Joey Donner; target: BIANCA, with regex \bbianca\b


found: Bianca Stratford; target: BIANCA, with regex \bbianca\b
Found Bianca Stratford


found: Kat Stratford; target: CAMERON, with regex \bcameron\b


found: Patrick Verona; target: CAMERON, with regex \bcameron\b


found: Joey Donner; target: CAMERON, with regex \bcameron\b


found: Bianca Stratford; target: CAMERON, with regex \bcameron\b


found: Cameron James; target: CAMERON, with regex \bcameron\b
Found Cameron James


found: Kat Stratford; target: MICHAEL, with regex \bmichael\b


found: Patrick Verona; target: MICHAEL, with regex \bmichael\b


found: Joey Donner; target: MI

found: Ashram Monk; target: KORBEN, with regex \bkorben\b


found: Spike; target: KORBEN, with regex \bkorben\b


found: Ace Ventura; target: CORNELIUS, with regex \bcornelius\b


found: Lt. Lois Einhorn / Ray Finkle; target: CORNELIUS, with regex \bcornelius\b


found: Melissa Robinson; target: CORNELIUS, with regex \bcornelius\b


found: Fulton Greenwall; target: CORNELIUS, with regex \bcornelius\b


found: Vincent Cadby; target: CORNELIUS, with regex \bcornelius\b


found: Burton Quinn; target: CORNELIUS, with regex \bcornelius\b


found: Mick Katie; target: CORNELIUS, with regex \bcornelius\b


found: Gahjii; target: CORNELIUS, with regex \bcornelius\b


found: Ouda; target: CORNELIUS, with regex \bcornelius\b


found: Wachati Chief; target: CORNELIUS, with regex \bcornelius\b


found: Wachati Princess; target: CORNELIUS, with regex \bcornelius\b


found: Ashram Monk; target: CORNELIUS, with regex \bcornelius\b


found: Spike; target: CORNELIUS, with regex \bcornelius\b


found: El



found: Jonesy; target: DINO, with regex \bdino\b


found: Carter J. Burke; target: DINO, with regex \bdino\b


found: Ash; target: DINO, with regex \bdino\b


found: Dennis Parker; target: DINO, with regex \bdino\b


found: Elizabeth Shaw; target: DINO, with regex \bdino\b


found: Corporal Dwayne Hicks; target: DINO, with regex \bdino\b


found: Thomas Kane; target: DINO, with regex \bdino\b


found: Lieutenant Scott Gorman; target: DINO, with regex \bdino\b


found: Samuel Brett; target: DINO, with regex \bdino\b


found: Private First Class Jenette Vasquez; target: DINO, with regex \bdino\b


found: Dr. Jonathan Clemens; target: DINO, with regex \bdino\b


found: Walter One; target: DINO, with regex \bdino\b


found: Facehugger; target: DINO, with regex \bdino\b


found: Peter Weyland; target: DINO, with regex \bdino\b


found: Arthur Dallas; target: DINO, with regex \bdino\b


found: Joan Lambert; target: DINO, with regex \bdino\b


found: Lance Bishop; target: DINO, with regex \

found: Ron Johner; target: DINO, with regex \bdino\b


found: Janek; target: DINO, with regex \bdino\b


found: Meredith Vickers; target: DINO, with regex \bdino\b


found: Rebecca "Newt" Jorden; target: DINO, with regex \bdino\b


found: Annalee Call; target: DINO, with regex \bdino\b


found: Dom Vriess; target: DINO, with regex \bdino\b


found: Walter Golic; target: DINO, with regex \bdino\b


found: Robert Morse; target: DINO, with regex \bdino\b


found: Chris Oram; target: DINO, with regex \bdino\b


found: Tennessee Faris; target: DINO, with regex \bdino\b


found: Amanda Ripley; target: DINO, with regex \bdino\b


found: Katherine Daniels; target: DINO, with regex \bdino\b


found: Maggie Faris; target: DINO, with regex \bdino\b


found: Dr. Mason Wren; target: DINO, with regex \bdino\b


found: Mark Drake; target: DINO, with regex \bdino\b


found: Dr Paul Church; target: DINO, with regex \bdino\b


found: Ellen Ripley; target: CORSO, with regex \bcorso\b


found: Alien / Xen

found: Dr Paul Church; target: LIANA, with regex \bliana\b




In [19]:
curr = 0
for i, num_actors in enumerate(characters_in):
    print("Characters in " + movie_list[i])
    for person in l[curr:(curr + characters_in[i])]:
        print(person[1])
    curr = curr + num_actors

Characters in 10 Things I Hate About You
KAT
PATRICK
BIANCA
CAMERON
MICHAEL
Characters in 12 Monkeys
COLE
RAILLY
JEFFREY
ANGLE
Characters in 13 Days
KENNY
BOBBY
Characters in 1492: Conquest of Paradise:
COLUMBUS
FERNANDO
SANCHEZ
UTAPAN
Characters in 15 Minutes
JORDY
EDDIE
EMIL
OLEG
NICOLETTE
Characters in 15 Minutes 
JORDY
EDDIE
EMIL
OLEG
NICOLETTE
Characters in 2001: A Space Odyssey
BOWMAN
POOLE
FLOYD
HAL
Characters in 3 Kings
ARCHIE
TROY
DOC
VIG
Characters in 3 Kings (Spoils of War)
EPPS
JAEGER
WES
Characters in 48 Hours
CATES
HAMMOND
ELAINE
GANZ
Characters in The 5th Element
KORBEN
CORNELIUS
ZORG
Characters in 8 Millimeter
WELLES
EDDIE
DINO
Characters in 8 MM
WELLES
EDDIE
DINO
Characters in The 9th Gate
CORSO
BALKAN
LIANA
Characters in The Abyss
BUD
LINDSEY
HIPPY
COFFEY
ONE
Characters in Ace Ventura: Pet Detective
ACE
MELISSA
EINHORN
WOODSTOCK
Characters in The Adventures of Ford Fairlane (Ford Fairlane )
FORD
ZUZU
JAZZ
GRENDEL
COLLEEN
Characters in Airforce One
MARSHALL
KORSHUNOV
R

In [67]:
print(l_2[2])

['10Things', 'BIANCA', 'BIANCA STRATFORD, a beautiful sophomore, stands facing the mirror, applying lipstick. Her less extraordinary, but\nDid you change your hair?\nYou might wanna think about it Leave the girls\' room and enter the hallway.\nNot really. The teacher shakes her head, but lets it go.\nNowhere... Hi, Daddy. She kisses him on the cheek\nIs there even a question that we want her to stay? Kat gives Bianca an evil look then smiles sweetly at\nNow don\'t get upset. Daddy, but there\'s\nWhat if she never starts dating?\nBut it\'s not fair -- she\'s a mutant, Daddy!\nBIANCA AND WALTER The sound of a fifteen-year-old in labor.\nBut she doesn\'t want to date.\nCan we make this quick? Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad. Again.\nNot the hacking and gagging and spitting part. Please.\nYou\'re asking me out. That\'s so cute. What\'s your name again?\nNo, no, it\'s my fault -- we didn\'t have a proper introduction ---\nT

In [None]:
import pandas as pd
df = pd.DataFrame(l)

In [None]:
for name in df[1]:
  print(name)

In [None]:
df.to_csv("scripts.csv")