In [130]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd

In [144]:
# Define a function scraping soundtracks from imdb by movie ID

def get_imdb_soundtrack(imdb_id):
    url = f"https://www.imdb.com/title/{imdb_id}/soundtrack"
    headers = {"User-Agent": "Mozilla/5.0"}
    response = requests.get(url, headers=headers)

    if response.status_code != 200:
        print("Error loading page")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    lis = soup.find_all('li')

    tracks = []
    for li in lis:
        text = li.get_text(strip=True)
        # Only include lines that look like soundtrack data
        '''
        if any(kw in text for kw in ['Written', 'Performed', 'Sung', 'Recorded', 'Courtesy']):
            tracks.append(text)
        '''
        if 'by' in text :
            tracks.append(text)

    return tracks

In [145]:
soundtrack_list = get_imdb_soundtrack("tt0095853")
for i, line in enumerate(soundtrack_list):
    print(f"{i+1}. {line}")

1. Baby the TransWritten, Performed & Produced byJoe StrummerJoe Strummer performs courtesy of CBS Records United Kingdom Limited
2. Nothin' 'Bout Nothin'Written, Performed & Produced byJoe StrummerJoe Strummer performs courtesy of CBS Records United Kingdom Limited
3. Trash CityWritten, Performed & Produced byJoe StrummerJoe Strummer performs courtesy of CBS Records United Kingdom Limited
4. Nefertiti RockWritten, Performed & Produced byJoe StrummerJoe Strummer performs courtesy of CBS Records United Kingdom Limited
5. Cholo VestWritten, Performed & Produced byJoe StrummerJoe Strummer performs courtesy of CBS Records United Kingdom Limited
6. Something HappenedWritten & Performed byLou ReedProduced byLou Reed&Arthur Baker
7. H.M.S. Pinafore ExcerptsWritten byW.S. Gilbert&Arthur Sullivan
8. She's on My MindWritten by John HusseyPerformed & Produced byLittle AmericaLittle America performs courtesy of Geffen Records
9. Waiting on LoveWritten byKurt Neumann&Sammy Llanas(as Sam Llanas)Perf

In [146]:
# Read the curated movie list csv file
df_movies = pd.read_csv("Movie List.csv")
movie_ids = df_movies['IMDb ID'].to_list()
df_movies.head()

Unnamed: 0,Movie Title,Year,Vibe Keywords,IMDb ID
0,SLC Punk,1998,"rebellious nostalgia, punk decay, emotional chaos",tt0133189
1,SubUrbia (Linklater),1996,"aimlessness, Gen X burnout, parking lot dread",tt0120238
2,Suburbia (Penelope Spheeris),1983,"raw punk squat teens, emotional vacuum",tt0086589
3,Smithereens,1982,"NYC no wave desperation, scene-girl collapse",tt0084698
4,Ghost World,2001,"post-high school alienation, ironic distance",tt0162346


In [147]:
all_data = []

# You already have this:
# movie_ids = df_movies['IMDb ID'].to_list()

# Assume df_movies also has the movie titles
for _, row in df_movies.iterrows():
    title = row['Movie Title']  # or whatever column contains the title
    imdb_id = row['IMDb ID']
    
    print(f"Scraping {title} ({imdb_id})")
    raw_lines = get_imdb_soundtrack(imdb_id)

    for line in raw_lines:
        all_data.append({
            "movie": title,
            "imdb_id": imdb_id,
            "raw_track": line
        })

Scraping SLC Punk (tt0133189)
Scraping SubUrbia (Linklater) (tt0120238)
Scraping Suburbia (Penelope Spheeris) (tt0086589)
Scraping Smithereens (tt0084698)
Scraping Ghost World (tt0162346)
Scraping Out of the Blue (tt0081291)
Scraping Ladies and Gentlemen, The Fabulous Stains (tt0082639)
Scraping Times Square (tt0081635)
Scraping River’s Edge (tt0091860)
Scraping Permanent Record (tt0095853)
Scraping Gummo (tt0119237)
Scraping Kids (tt0113540)
Scraping The Doom Generation (tt0112887)
Scraping My Own Private Idaho (tt0102494)
Scraping We Are the Best! (tt2364975)
Scraping Mid90s (tt5613484)
Scraping Paranoid Park (Gus Van Sant) (tt0842929)
Scraping Wild Tigers I Have Known (tt0430768)
Scraping Slacker (tt0102943)
Scraping Nowhere (tt0119809)


In [148]:
df_soundtracks = pd.DataFrame(all_data)
df_soundtracks.to_csv("all_soundtracks_raw.csv", index=False)

In [136]:
# Define a function to parse and extract track title and artist

def parse_track_entry_clean(entry):
    # Extract title (everything before "Written by" or "Performed by")
    title_match = re.split(r"Written|Performed|Courtesy|Lyrics by", entry)
    title = title_match[0].strip()

    print(title)

    # Try to extract performer
    performer_match = re.search(
    r"(?:Performed by|Performed & Written by|Sung by|Written, Performed & Produced by)\s*([\w\s\.\-&'()]+)",
    entry
    )

    artist = None

    if performer_match:
        artist = performer_match.group(1)
        artist = re.split(r"from|Under license|Courtesy of|Published by|Written by|Played at", artist, flags=re.IGNORECASE)[0]
        artist = artist.replace("&", " & ").replace("  ", " ").strip()
    

    return {
        "track": title,
        "artist": artist if artist else None
    }

In [183]:
# Define a function to parse and extract track title and artist

def parse_track_entry_clean(entry):
    # Extract title (everything before "Written by" or "Performed by")
    title_match = re.split(r"Written|performed|Performed|Courtesy|Lyrics by|Words and Music by", entry)
    title = title_match[0].strip()

    print(title)

    # Try to extract performer
    performer_match = re.search(
    r"(?:Performed by|performed by|Performed & Written by|Sung by|Written, Performed & Produced by|Written and Recorded by|Recorded by|Performed & Composed by|Written & Performed by|Performed & Produced by)\s*([\w\s\.\-&'()]+)",
    entry
    )

    artist = None
    remixer = None

    
    if performer_match:
        artist = performer_match.group(1)
        artist = re.split(r"from|Under license|Courtesy of|Published by|Written|Played at|Composed by|Produced by|Approved by|\(c\|Copyright|Recording|Conducted by|Recycled by", artist, flags=re.IGNORECASE)[0]
        artist = artist.replace("&", " & ").replace("  ", " ").strip()

        if "Remixed by" in artist:
            artist, remixer = artist.split("Remixed by", 1)
            artist = artist.strip()
            remixer = remixer.strip()

    if "byPowder Blues Band" in entry:
        title = entry.replace("byPowder Blues Band", "").strip()
        artist = "Powder Blues Band"

    return {
        "track": title,
        "artist": artist if artist else None,
        "remixer": remixer if remixer else None
    }

In [184]:
parsed_rows = []

for _, row in df_soundtracks.iterrows():
    parsed = parse_track_entry_clean(row['raw_track'])
    parsed['movie'] = row['movie']
    parsed['imdb_id'] = row['imdb_id']
    parsed_rows.append(parsed)

df_parsed = pd.DataFrame(parsed_rows)

High Adventure
Sex & Violence
Too Hot
Mirror in the Bathroom
Champaign Breakfast
Little Ocean
1969
Pooh Bare
I Love Livin' in the City
No More Bullshit
Fear & Loathing
Train Wreck
Little Doll
Cretin Hop
I Never Promised You a Rose Garden
Gypsy Tango
One of These Days
She's The One
Gasoline Rain
Gangsters
Beat My Guest
Mexican Dance
Lazy River
Bluegrass Blues
Hot For Teacher
Urban Struggle
Through the Town
Istanbul Coffee House
Moonlight Sonata
Free to Live
Wreck We 'Em
We Will Fall
Look Back & Laugh
Rock & Roll
Requiem
Mother of Pearl
Dreaming
She Loved Like Diamonds
Amoeba
The Trees
Kiss Me Deadly
Kill the Poor
Town Without Pity
Peaceful Day
Scum
Rundown
NWO
Bulletproof Cupid
Unheard Music
Feather in Your Cap
Does Your Hometown Care
Bee-Bee's Song
Man Invisible
Psychic Hearts
Human Cannonball
SalvationMusic byBill Carter
Candle
Hot Day
Huge Giant Omen
I'm Not Like Everybody Else
Sunday
Cult
Richard Hung Himself
Wash Away
Darker My Love
The Legend Of Pat Brown
No God
Urban Struggle
The

In [185]:
df_parsed.head()

Unnamed: 0,track,artist,remixer,movie,imdb_id
0,High Adventure,,,SLC Punk,tt0133189
1,Sex & Violence,The Exploited(as Exploited),,SLC Punk,tt0133189
2,Too Hot,The Specials,,SLC Punk,tt0133189
3,Mirror in the Bathroom,Fifi,,SLC Punk,tt0133189
4,Champaign Breakfast,John Sbarra,,SLC Punk,tt0133189


In [186]:
df_parsed.tail()

Unnamed: 0,track,artist,remixer,movie,imdb_id
409,Freak Out,311,,Nowhere,tt0119809
410,Land Beyond The Sun,Flying Saucer Attack,,Nowhere,tt0119809
411,Generation Wrekkked (Danny Saber Remix),Chuck D,Danny Saber,Nowhere,tt0119809
412,Dose (The Critter Mix),Filter,Critter,Nowhere,tt0119809
413,Love Is Stronger Than Death,The The,,Nowhere,tt0119809


In [187]:
len(df_parsed.groupby('movie'))

20

In [188]:
df_parsed.groupby('movie').size()

movie
Ghost World                                  36
Gummo                                        21
Kids                                         22
Ladies and Gentlemen, The Fabulous Stains    13
Mid90s                                       25
My Own Private Idaho                         16
Nowhere                                      39
Out of the Blue                              10
Paranoid Park (Gus Van Sant)                 20
Permanent Record                             16
River’s Edge                                 11
SLC Punk                                     42
Slacker                                      24
Smithereens                                  11
SubUrbia (Linklater)                         20
Suburbia (Penelope Spheeris)                  6
The Doom Generation                          26
Times Square                                 20
We Are the Best!                             20
Wild Tigers I Have Known                     16
dtype: int64

In [189]:
df_parsed.to_csv('Soundtrack Parsed.csv', index=False, encoding='utf-8')