In [15]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException

chrome_options = Options()
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-dev-shm-usage')
driver = webdriver.Chrome()

import logging
import re
import sys
import urllib.request
import time

from bs4 import BeautifulSoup
from queue import Queue
from urllib import parse, request
from queue import PriorityQueue
from operator import itemgetter
import re

urls = ["https://www.dailyscript.com/movie.html", "https://www.dailyscript.com/movie_n-z.html"]

def parse_links(root, html):
    soup = BeautifulSoup(html, 'html.parser')
    for link in soup.find_all('a'):
        href = link.get('href')
        if href:
            text = link.string
            if not text:
                text = ''
            text = re.sub('\s+', ' ', text).strip()
            if "scripts" in link.get('href') and ".html" in link.get('href'):
                yield (parse.urljoin(root, link.get('href')))

def get_links(urls):
    l = []
    for url in urls:
        res = request.urlopen(url)
        l = l + list(parse_links(url, res.read()))
    return l

def get_titles(urls):
    t = []
    for url in urls:
        res = request.urlopen(url)
        soup = BeautifulSoup(res.read(), 'html.parser')
        for link in soup.find_all('a'):
            href = link.get('href')
            if href:
                text = link.string
                if not text:
                    text = ''
                text = re.sub('\s+', ' ', text).strip()
                if "scripts" in link.get('href') and ".html" in link.get('href'):
                    t.append((re.search(r'<a.*?>(.*?)</a>', str(link))).group(1))
    return t


def countOccurrence(a):
    k = {}
    not_in_list = ["I", "INT", "AND", "EXT", "ED", "HE", "ON", "OF", "THE", "CUT", "DR", "MRS", "IN", "TO", "MR", "CLOSE", "FADE", "LOC"]
    for j in a:
        pattern = r"[:.,]"
        j = re.sub(pattern, "", j)
        j = re.sub(r"[':]", "", j)
        j = re.sub(r"\s*\(.*\)", "", j)

        if j.isupper() and j not in not_in_list and len(j) > 1:
            if j in k:
                k[j] +=1
            else:
                k[j] = 1
    return dict(sorted(k.items(), key = itemgetter(1), reverse = True)[:5])

def get_html(url):
    res = request.urlopen(url)
    soup = str(BeautifulSoup(res, 'html.parser').text)
    splits = soup.split()
    counts = countOccurrence(splits)
    return counts, soup

def extract_dialogue(script, characters):
    dialogue = []
    match = False
    upper = False
    for line in script.split('\n'):
        match = False
        line = line.lstrip()
        for character in characters:
            if line.startswith(character):
                match = True
                upper = False
                speaker = character
        if match:
            spoken = line
            if speaker.isupper():
                if speaker.strip() != spoken.strip():
                    dialogue.append((speaker.strip(), spoken.strip()))
                else: 
                    dialogue.append((speaker.strip(), ""))
            # If the speaker isn't all uppercase, it's likely dialogue continuation or other text, so append it to the previous line
        elif len(dialogue) > 0:
            line_split = line.split()
            for l in line_split:
                if l.isupper():
                    upper = True
            if not upper:
                dialogue[-1] = (dialogue[-1][0], dialogue[-1][1] + ' ' + line.strip())
    return dialogue

def clean_text(dialogue_list, characters):
    returning_list = []
    for i in range(len(dialogue_list)):
        if len(dialogue_list[i][1]) == 0:
            continue
        if dialogue_list[i][0] not in characters:
            continue
        s = re.sub("[\(\[].*?[\)\]]", "", dialogue_list[i][1])
        s = " ".join(s.split())
        if len(s) != 0:
            returning_list.append((dialogue_list[i][0], s))
    return returning_list

def retrieve_in_string(cleaned_dialogue):
    s = ""
    for element in cleaned_dialogue:
        s += element[1] + "\n"
    return s

In [16]:
def generate_regex(movie_title):
    escaped_movie = re.escape(movie_title) + "\s*"
    regex = escaped_movie.replace("\ ", "\s+") + "(\([0-9a-zA-Z\\s]*\)\s*)*"
    return regex

In [17]:
def get_MBTI(url):
    driver = webdriver.Chrome()
    driver.get(url)
    personality = []
    WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.CLASS_NAME, 'profile-name')))
    try:
        MBTI = driver.find_elements(By.CLASS_NAME, 'vote-detail-letter')
        for letter in MBTI:
            personality.append((letter.text)[-1])
        personality.append(((driver.find_element(By.CLASS_NAME, 'profile-description')).find_element(By.CLASS_NAME, 'container')).text)
    except:
        driver.quit()
        return personality
    driver.quit()
    return personality

In [18]:
def scrape_personalities(name, url):
    personality = ["", "", "", "", ""]
    driver = webdriver.Chrome()
    driver.get(url)
    WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.CLASS_NAME, 'community-title')))
    try:
        characters = driver.find_elements(By.CLASS_NAME, 'profile-card-link')
        regex = generate_regex(name)
        for character in characters:
            a = character.find_element(By.CLASS_NAME, 'info-name')
            if (re.search(regex, a.text, re.IGNORECASE)):
                print("Found " + a.text)
                personality = get_MBTI(character.get_attribute('href'))
                print(personality + "\n")
                break
    except:
        driver.quit()
        return personality
    driver.quit()
    return personality

In [19]:
script_list = get_links(urls)
movie_list = get_titles(urls)
count, soup = get_html(script_list[1])
for character in list(count.keys()):
    dialogue = extract_dialogue(soup, [character])
    cleaned_dialogue = clean_text(dialogue, [character])
    s = retrieve_in_string(cleaned_dialogue)

In [None]:
l = []
characters_in = []
for script in script_list:
    count, soup = get_html(script)
    i = 0
    for character in list(count.keys()):
        dialogue = extract_dialogue(soup, [character])
        cleaned_dialogue = clean_text(dialogue, [character])
        s = retrieve_in_string(cleaned_dialogue)
        if len(cleaned_dialogue) > 20:
            s = retrieve_in_string(cleaned_dialogue)
            title = script[script.rfind("/") + 1 :script.find(".html")]
            l.append([title, character, s])
            i = i + 1
    characters_in.append(i)

In [10]:
personality = []
curr = 0
for i, num_actors in enumerate(characters_in):
    url = "https://www.personality-database.com/search?keyword=" + (movie_list[i]).replace(' ', '%20') + "&type=subcategories"
    driver.get(url)
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, '//*[@id="root"]/div/section/main/div[1]')))
    try:
        option = driver.find_element(By.CLASS_NAME, 'subcategory-card')
        movie = (option.find_element(By.CLASS_NAME, 'name')).text
        regex = generate_regex(str(movie_list[i]))
        if (re.search(regex, movie)):
            movie_url = option.get_attribute('href')
            for i, person in enumerate(l[curr:(curr + characters_in[i])]):
                l[i + curr].extend(scrape_personalities(person[1], movie_url))
        # In the case that there are results, but none are relevant movies:
        else:
            for i, person in enumerate(l[curr:(curr + characters_in[i])]):
                l[i + curr].extend(["", "", "", "", ""])
    # In the case that there are no search results for given movie query:
    except:
        for i, person in enumerate(l[curr:(curr + characters_in[i])]):
                l[i + curr].extend(["", "", "", "", ""])
        curr = curr + num_actors
        continue
    curr = curr + num_actors
driver.quit()

Found Kat Stratford
Found Patrick Verona
Found Bianca Stratford
Found Cameron James
Found Michael Eckman
Found James Cole
Found Kathryn Railly
Found Jeffrey Goines


In [65]:
curr = 0
for i, num_actors in enumerate(characters_in):
    print("Characters in " + movie_list[i])
    for person in l[curr:(curr + characters_in[i])]:
        print(person[1])
    curr = curr + num_actors

Characters in 10 Things I Hate About You
KAT
PATRICK
BIANCA
CAMERON
MICHAEL
Characters in 12 Monkeys
COLE
RAILLY
JEFFREY
ANGLE
Characters in 13 Days
KENNY
BOBBY
Characters in 1492: Conquest of Paradise:
COLUMBUS
FERNANDO
SANCHEZ
UTAPAN
Characters in 15 Minutes
JORDY
EDDIE
EMIL
OLEG
NICOLETTE
Characters in 15 Minutes 
JORDY
EDDIE
EMIL
OLEG
NICOLETTE
Characters in 2001: A Space Odyssey
BOWMAN
POOLE
FLOYD
HAL
Characters in 3 Kings
ARCHIE
TROY
DOC
VIG
Characters in 3 Kings (Spoils of War)
EPPS
JAEGER
WES
Characters in 48 Hours
CATES
HAMMOND
ELAINE
GANZ
Characters in The 5th Element
KORBEN
CORNELIUS
ZORG
Characters in 8 Millimeter
WELLES
EDDIE
DINO
Characters in 8 MM
WELLES
EDDIE
DINO
Characters in The 9th Gate
CORSO
BALKAN
LIANA
Characters in The Abyss
BUD
LINDSEY
HIPPY
COFFEY
ONE
Characters in Ace Ventura: Pet Detective
ACE
MELISSA
EINHORN
WOODSTOCK
Characters in The Adventures of Ford Fairlane (Ford Fairlane )
FORD
ZUZU
JAZZ
GRENDEL
COLLEEN
Characters in Airforce One
MARSHALL
KORSHUNOV
R

In [9]:
print(l_2[7])

['twelve_monkeys', 'JEFFREY', 'How much you gonna pay me? Huh? I\'d be doing your job.\nOkay, Billings. Five thousand. That\'s enough. Five thousand dollars. I\'ll give him the Deluxe Mental Hospital Tour.\nKid around, kid around. It makes them feel good, we\'re all pals. We\'re prisoners, they\'re the guards, but it\'s all in good fun, you see?\nHere\'s the games. Games vegitize you. If you play the games, you\'re voluntarily taking a tranquilizer.\nWhat\'d they give you? Thorazine? How much? Learn your drugs -- know your doses.\nSo if you want to watch a particular program, say "All My Children" or something, you go to the Charge Nurse and tell her what day and time the show you want to see is on. But you have to tell her before the show is scheduled to be on. There was this one guy who was always requesting shows that had already played. He couldn\'t quite grasp the idea that the Charge Nurse couldn\'t just make it be yesterday for him, turn back time ha ha. What a fruitcake!!\nSeri

In [None]:
import pandas as pd
df = pd.DataFrame(l)

In [None]:
for name in df[1]:
  print(name)

In [None]:
df.to_csv("scripts.csv")