In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [30]:
import os
import glob
import pandas as pd

# =========================
# CONFIG
# =========================
DATA_DIR = "D:\\MDSA Case Study\\box office data"  # folder containing yearly CSVs
OUTPUT_FILE = "merged_movies_data_1937_2025_filtered.csv"

# Expected pattern examples:
# merged_movies_data_1937.csv
# merged_movies_data_YYYY(1937-2025).csv
# merged_movies_data_1940-anything.csv
# You can tighten this pattern if your filenames are consistent.
PATTERN = os.path.join(DATA_DIR, "*.csv")

START_YEAR = 1937
END_YEAR = 2025

# Columns to drop (using your provided names)
COLUMNS_TO_DROP = [
    "Votes",
    "mÃ©ta_score",       # keep as-is from your file
    "description",
    "Movie Link",
    "writers",
    "directors",
    "stars",
    "awards_content",
]

# =========================
# HELPERS
# =========================
def normalize_colname(c: str) -> str:
    """Normalize column names for safer matching."""
    return c.strip()

def file_year_in_range(filename: str, start: int, end: int) -> bool:
    """
    Return True if filename contains at least one year between start and end.
    """
    base = os.path.basename(filename)
    # quick scan for 4-digit years
    for y in range(start, end + 1):
        if str(y) in base:
            return True
    return False

def contains_english(value) -> bool:
    """
    True if Languages contains 'English' among one or more languages.
    Handles comma/pipe/slash separated strings.
    """
    if pd.isna(value):
        return False
    s = str(value).strip().lower()
    # direct fast check first
    if "english" not in s:
        return False

    # tokenize on common separators for cleaner matching
    for sep in [",", "|", "/", ";"]:
        s = s.replace(sep, ",")
    tokens = [t.strip() for t in s.split(",") if t.strip()]
    return any(tok == "english" for tok in tokens) or ("english" in s)

def is_us_origin(value) -> bool:
    """
    True if countries_origin includes US.
    Accepts values like: US, USA, United States, United States of America
    and multi-country strings.
    """
    if pd.isna(value):
        return False
    s = str(value).strip().lower()

    # Normalize separators
    for sep in ["|", "/", ";"]:
        s = s.replace(sep, ",")
    tokens = [t.strip() for t in s.split(",") if t.strip()]

    us_aliases = {
        "us",
        "u.s.",
        "usa",
        "u.s.a.",
        "united states",
        "united states of america",
    }

    # exact token match OR fallback substring checks for messy values
    if any(tok in us_aliases for tok in tokens):
        return True

    if "united states" in s or "usa" in s or s == "us":
        return True

    return False

# =========================
# LOAD FILES
# =========================
all_files = glob.glob(PATTERN)
candidate_files = [f for f in all_files if file_year_in_range(f, START_YEAR, END_YEAR)]

if not candidate_files:
    raise FileNotFoundError(
        f"No CSV files found in {DATA_DIR} containing years {START_YEAR}-{END_YEAR}."
    )

dfs = []
for f in sorted(candidate_files):
    try:
        df = pd.read_csv(f, low_memory=False)
        # Normalize column names
        df.columns = [normalize_colname(c) for c in df.columns]
        # Track source file (optional)
        df["__source_file"] = os.path.basename(f)
        dfs.append(df)
        print(f"Loaded: {f} | rows={len(df):,}")
    except Exception as e:
        print(f"Skipped {f} due to read error: {e}")

if not dfs:
    raise RuntimeError("No files could be loaded successfully.")

merged = pd.concat(dfs, ignore_index=True, sort=False)
print(f"\nMerged rows: {len(merged):,}")

# =========================
# DROP COLUMNS
# =========================
# Drop only columns that exist (safe)
existing_drop_cols = [c for c in COLUMNS_TO_DROP if c in merged.columns]
merged = merged.drop(columns=existing_drop_cols, errors="ignore")
print(f"Dropped columns: {existing_drop_cols}")

# =========================
# FILTER CONDITIONS
# =========================
required_cols = ["Languages", "countries_origin"]
missing_required = [c for c in required_cols if c not in merged.columns]
if missing_required:
    raise KeyError(
        f"Missing required columns for filtering: {missing_required}. "
        f"Available columns: {list(merged.columns)}"
    )

filtered = merged[
    merged["Languages"].apply(contains_english) &
    merged["countries_origin"].apply(is_us_origin)
].copy()

print(f"Filtered rows (English + US origin): {len(filtered):,}")

# Optional: remove duplicates
# filtered = filtered.drop_duplicates(subset=["Title", "Year"], keep="first")

# Save output
filtered.to_csv(OUTPUT_FILE, index=False, encoding="utf-8-sig")
print(f"Saved: {OUTPUT_FILE}")


Loaded: D:\MDSA Case Study\box office data\merged_movies_data_1937.csv | rows=600
Loaded: D:\MDSA Case Study\box office data\merged_movies_data_1938.csv | rows=600
Loaded: D:\MDSA Case Study\box office data\merged_movies_data_1939.csv | rows=600
Loaded: D:\MDSA Case Study\box office data\merged_movies_data_1940.csv | rows=600
Loaded: D:\MDSA Case Study\box office data\merged_movies_data_1941.csv | rows=600
Loaded: D:\MDSA Case Study\box office data\merged_movies_data_1942.csv | rows=600
Loaded: D:\MDSA Case Study\box office data\merged_movies_data_1943.csv | rows=600
Loaded: D:\MDSA Case Study\box office data\merged_movies_data_1944.csv | rows=600
Loaded: D:\MDSA Case Study\box office data\merged_movies_data_1945.csv | rows=600
Loaded: D:\MDSA Case Study\box office data\merged_movies_data_1946.csv | rows=600
Loaded: D:\MDSA Case Study\box office data\merged_movies_data_1947.csv | rows=600
Loaded: D:\MDSA Case Study\box office data\merged_movies_data_1948.csv | rows=600
Loaded: D:\MDSA 

In [31]:
df = pd.read_csv(OUTPUT_FILE)

In [32]:
df

Unnamed: 0,Title,Year,Duration,MPA,Rating,méta_score,budget,opening_weekend_Gross,grossWorldWWide,gross_US_Canada,release_date,countries_origin,filming_locations,production_company,genres,Languages,__source_file
0,1. Snow White and the Seven Dwarfs,1937,1h 23m,Approved,7.6,96.0,"$1,499,000 (estimated)","$6,017,914","$184,960,747","$184,925,486","February 4, 1938",['United States'],['Walt Disney Feature Animation - 500 S. Buena...,"['Walt Disney Animation Studios', 'Walt Disney...","['Fairy Tale', 'Hand-Drawn Animation', 'Advent...",['English'],merged_movies_data_1937.csv
1,3. Stage Door,1937,1h 32m,Approved,7.7,,"$952,000 (estimated)",,"$8,835",,"October 8, 1937",['United States'],"['RKO Studios - 780 N. Gower Street, Hollywood...",['RKO Radio Pictures'],"['Comedy', 'Drama', 'Romance']",['English'],merged_movies_data_1937.csv
2,4. Shall We Dance,1937,1h 49m,Approved,7.4,,"$991,000 (estimated)",,"$6,662",,"May 7, 1937",['United States'],"['RKO Studios - 780 N. Gower Street, Hollywood...",['RKO Radio Pictures'],"['Romantic Comedy', 'Comedy', 'Musical', 'Roma...","['English', 'French']",merged_movies_data_1937.csv
3,5. The Great Garrick,1937,1h 29m,Approved,6.7,,,,,,"October 30, 1937",['United States'],['Warner Brothers Burbank Studios - 4000 Warne...,['Warner Bros.'],"['Comedy', 'Romance']","['English', 'French']",merged_movies_data_1937.csv
4,6. Lost Horizon,1937,2h 12m,Approved,7.6,,"$4,000,000 (estimated)",,,,"September 1, 1937",['United States'],"['Ojai, California, USA']",['Columbia Pictures'],"['Adventure', 'Drama', 'Fantasy', 'Mystery']","['English', 'Mandarin']",merged_movies_data_1937.csv
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28355,591. The Street Avenger,2025,,,,,"$4,000,000 (estimated)",,,,"December 24, 2025",['United States'],"['New Jersey, USA (Studio)']",['Angry Troll Studio'],"['Action', 'Adventure', 'Crime', 'Thriller']",['English'],merged_movies_data_2025.csv
28356,594. Friday the 13th: Blood Loss,2025,1h 30m,,5.4,,"$30,000 (estimated)",,,,"March 16, 2025",['United States'],"['Rochester New York, USA (location)']",['Forever Entertainment'],"['Drama', 'Horror']",['English'],merged_movies_data_2025.csv
28357,595. Killer Clown Girls,2025,1h 30m,,,,,,,,"December 30, 2025",['United States'],,"['Masch Global Media', 'Masch', 'Roman Empire ...",['Horror'],['English'],merged_movies_data_2025.csv
28358,"597. Love, Brooklyn",2025,1h 37m,,6.9,60.0,,,,,"January 27, 2025",['United States'],"['Brooklyn, New York, USA']","['Daughter Films', 'Fireheart Entertainment', ...",['Drama'],['English'],merged_movies_data_2025.csv


In [33]:
competitors_data = df[
    (df['production_company'].str.contains(r'Warner|Disney|Universal', case=False, na=False, regex=True)) 
].copy()

In [34]:
competitors_data.head()

Unnamed: 0,Title,Year,Duration,MPA,Rating,méta_score,budget,opening_weekend_Gross,grossWorldWWide,gross_US_Canada,release_date,countries_origin,filming_locations,production_company,genres,Languages,__source_file
0,1. Snow White and the Seven Dwarfs,1937,1h 23m,Approved,7.6,96.0,"$1,499,000 (estimated)","$6,017,914","$184,960,747","$184,925,486","February 4, 1938",['United States'],['Walt Disney Feature Animation - 500 S. Buena...,"['Walt Disney Animation Studios', 'Walt Disney...","['Fairy Tale', 'Hand-Drawn Animation', 'Advent...",['English'],merged_movies_data_1937.csv
3,5. The Great Garrick,1937,1h 29m,Approved,6.7,,,,,,"October 30, 1937",['United States'],['Warner Brothers Burbank Studios - 4000 Warne...,['Warner Bros.'],"['Comedy', 'Romance']","['English', 'French']",merged_movies_data_1937.csv
17,21. The Life of Emile Zola,1937,1h 56m,Approved,7.1,,,,,,"October 2, 1937",['United States'],"[""Goff Island, Laguna Beach, California, USA (...",['Warner Bros.'],"['Biography', 'Drama']",['English'],merged_movies_data_1937.csv
23,28. They Won't Forget,1937,1h 35m,Approved,7.2,,,,,,"October 9, 1937",['United States'],['Warner Brothers Burbank Studios - 4000 Warne...,['Warner Bros.'],"['Film Noir', 'Legal Drama', 'Drama', 'Mystery']",['English'],merged_movies_data_1937.csv
25,30. Marked Woman,1937,1h 36m,Approved,7.1,73.0,,,,,"April 10, 1937",['United States'],"['Times Square, Manhattan, New York City, New ...","['First National Pictures', 'Warner Bros.']","['Film Noir', 'Crime', 'Drama', 'Thriller']",['English'],merged_movies_data_1937.csv


In [35]:
competitors_data.drop(columns=['__source_file', 'méta_score', 'countries_origin'], inplace=True, errors='ignore')

In [36]:
competitors_data = competitors_data[competitors_data["opening_weekend_Gross"].notna()]

In [37]:
competitors_data.head()

Unnamed: 0,Title,Year,Duration,MPA,Rating,budget,opening_weekend_Gross,grossWorldWWide,gross_US_Canada,release_date,filming_locations,production_company,genres,Languages
0,1. Snow White and the Seven Dwarfs,1937,1h 23m,Approved,7.6,"$1,499,000 (estimated)","$6,017,914","$184,960,747","$184,925,486","February 4, 1938",['Walt Disney Feature Animation - 500 S. Buena...,"['Walt Disney Animation Studios', 'Walt Disney...","['Fairy Tale', 'Hand-Drawn Animation', 'Advent...",['English']
1192,1. Pinocchio,1940,1h 28m,Approved,7.5,"$2,600,000 (estimated)","$3,769,251","$121,892,045","$84,254,167","February 23, 1940",['Walt Disney Feature Animation - 500 S. Buena...,"['Walt Disney Animation Studios', 'Walt Disney...","['Fairy Tale', 'Hand-Drawn Animation', 'Advent...",['English']
1193,2. Fantasia,1940,2h 4m,G,7.7,"$2,280,000 (estimated)","$980,798","$76,411,978","$76,408,097","September 19, 1941","['Stage 1, Walt Disney Studios, 500 South Buen...","['Walt Disney Animation Studios', 'Walt Disney...","['Classic Musical', 'Concert', 'Hand-Drawn Ani...",['English']
2043,1. Bambi,1942,1h 9m,Approved,7.3,"$858,000 (estimated)","$7,216,741","$267,447,150","$102,247,150","August 21, 1942","['Walt Disney Studios, 500 South Buena Vista S...","['Walt Disney Animation Studios', 'Walt Disney...","['Animal Adventure', 'Hand-Drawn Animation', '...",['English']
2044,2. Casablanca,1942,1h 42m,PG,8.5,"$950,000 (estimated)","$181,494","$4,727,083","$4,219,709","January 23, 1943","['Waterman Drive, Van Nuys, Los Angeles, Calif...",['Warner Bros.'],"['Drama', 'Romance', 'War']","['English', 'French', 'German', 'Italian', 'Ru..."


In [38]:
competitors_data["Title"] = (
    competitors_data["Title"]
    .fillna("")
    .astype(str)
    .str.strip()
    .str.replace(r'^\s*\d+\s*[\.\)\-:]\s*', '', regex=True)
)


In [39]:
competitors_data.head()

Unnamed: 0,Title,Year,Duration,MPA,Rating,budget,opening_weekend_Gross,grossWorldWWide,gross_US_Canada,release_date,filming_locations,production_company,genres,Languages
0,Snow White and the Seven Dwarfs,1937,1h 23m,Approved,7.6,"$1,499,000 (estimated)","$6,017,914","$184,960,747","$184,925,486","February 4, 1938",['Walt Disney Feature Animation - 500 S. Buena...,"['Walt Disney Animation Studios', 'Walt Disney...","['Fairy Tale', 'Hand-Drawn Animation', 'Advent...",['English']
1192,Pinocchio,1940,1h 28m,Approved,7.5,"$2,600,000 (estimated)","$3,769,251","$121,892,045","$84,254,167","February 23, 1940",['Walt Disney Feature Animation - 500 S. Buena...,"['Walt Disney Animation Studios', 'Walt Disney...","['Fairy Tale', 'Hand-Drawn Animation', 'Advent...",['English']
1193,Fantasia,1940,2h 4m,G,7.7,"$2,280,000 (estimated)","$980,798","$76,411,978","$76,408,097","September 19, 1941","['Stage 1, Walt Disney Studios, 500 South Buen...","['Walt Disney Animation Studios', 'Walt Disney...","['Classic Musical', 'Concert', 'Hand-Drawn Ani...",['English']
2043,Bambi,1942,1h 9m,Approved,7.3,"$858,000 (estimated)","$7,216,741","$267,447,150","$102,247,150","August 21, 1942","['Walt Disney Studios, 500 South Buena Vista S...","['Walt Disney Animation Studios', 'Walt Disney...","['Animal Adventure', 'Hand-Drawn Animation', '...",['English']
2044,Casablanca,1942,1h 42m,PG,8.5,"$950,000 (estimated)","$181,494","$4,727,083","$4,219,709","January 23, 1943","['Waterman Drive, Van Nuys, Los Angeles, Calif...",['Warner Bros.'],"['Drama', 'Romance', 'War']","['English', 'French', 'German', 'Italian', 'Ru..."


In [40]:
import re
import ast

allowed = {
    "Comedy","Adventure","Drama","Action","Thriller/Suspense","Romantic Comedy","Animation",
    "Unknown","Musical","Documentary","Western","Horror","Black Comedy","Concert/Performance"
}

def first_allowed_genre_pref_animation(s):
    if pd.isna(s):
        return "Unknown"

    # Build a clean list of genre tokens
    if isinstance(s, list):
        parts = [str(x).strip() for x in s]
    else:
        text = str(s).strip()

        if text.startswith("[") and text.endswith("]"):
            try:
                parsed = ast.literal_eval(text)
                if isinstance(parsed, list):
                    parts = [str(x).strip() for x in parsed]
                else:
                    parts = [text]
            except Exception:
                parts = [p.strip(" '\"") for p in re.split(r'[,/|;]', text)]
        else:
            parts = [p.strip(" '\"") for p in re.split(r'[,/|;]', text)]

    # Priority rule
    if "Animation" in parts:
        return "Animation"

    # Otherwise first allowed from left to right
    for p in parts:
        if p in allowed:
            return p

    return "Unknown"

competitors_data["genres"] = competitors_data["genres"].apply(first_allowed_genre_pref_animation)


In [41]:
competitors_data.head()

Unnamed: 0,Title,Year,Duration,MPA,Rating,budget,opening_weekend_Gross,grossWorldWWide,gross_US_Canada,release_date,filming_locations,production_company,genres,Languages
0,Snow White and the Seven Dwarfs,1937,1h 23m,Approved,7.6,"$1,499,000 (estimated)","$6,017,914","$184,960,747","$184,925,486","February 4, 1938",['Walt Disney Feature Animation - 500 S. Buena...,"['Walt Disney Animation Studios', 'Walt Disney...",Animation,['English']
1192,Pinocchio,1940,1h 28m,Approved,7.5,"$2,600,000 (estimated)","$3,769,251","$121,892,045","$84,254,167","February 23, 1940",['Walt Disney Feature Animation - 500 S. Buena...,"['Walt Disney Animation Studios', 'Walt Disney...",Animation,['English']
1193,Fantasia,1940,2h 4m,G,7.7,"$2,280,000 (estimated)","$980,798","$76,411,978","$76,408,097","September 19, 1941","['Stage 1, Walt Disney Studios, 500 South Buen...","['Walt Disney Animation Studios', 'Walt Disney...",Animation,['English']
2043,Bambi,1942,1h 9m,Approved,7.3,"$858,000 (estimated)","$7,216,741","$267,447,150","$102,247,150","August 21, 1942","['Walt Disney Studios, 500 South Buena Vista S...","['Walt Disney Animation Studios', 'Walt Disney...",Animation,['English']
2044,Casablanca,1942,1h 42m,PG,8.5,"$950,000 (estimated)","$181,494","$4,727,083","$4,219,709","January 23, 1943","['Waterman Drive, Van Nuys, Los Angeles, Calif...",['Warner Bros.'],Drama,"['English', 'French', 'German', 'Italian', 'Ru..."


In [42]:
competitors_data["production_company"].value_counts()

production_company
['Warner Bros.']                                                             43
['Universal Pictures']                                                       42
['Walt Disney Pictures', 'Pixar Animation Studios']                          17
['Warner Bros.', 'Village Roadshow Pictures', 'NPV Entertainment']           16
['Walt Disney Animation Studios', 'Walt Disney Productions']                 12
                                                                             ..
['DIC Entertainment', 'Peak Productions', 'Walt Disney Pictures']             1
['Warner Bros.', 'Village Roadshow Pictures', 'Groucho Film Partnership']     1
['Alphaville Films', 'Universal Studios']                                     1
['Castle Rock Entertainment', 'Darkwoods Productions', 'Warner Bros.']        1
['Warner Bros.', 'Winkler Films']                                             1
Name: count, Length: 980, dtype: int64

In [43]:
import re
import ast

# Canonical output label -> regex pattern
allowed_patterns = {
    "Warner Bros.": re.compile(r"\bwarner\s*(bros\.?|brothers)\b", re.IGNORECASE),
    "Universal Pictures": re.compile(r"\buniversal(\s+pictures)?\b", re.IGNORECASE),
    "Walt Disney": re.compile(r"\b(walt\s+disney|disney)\b", re.IGNORECASE),
}

def first_allowed_company_regex(s):
    if pd.isna(s):
        return "Unknown"

    # Build a token list
    if isinstance(s, list):
        parts = [str(x).strip() for x in s]
    else:
        text = str(s).strip()

        # If list-like string: "['A', 'B']"
        if text.startswith("[") and text.endswith("]"):
            try:
                parsed = ast.literal_eval(text)
                if isinstance(parsed, list):
                    parts = [str(x).strip() for x in parsed]
                else:
                    parts = [text]
            except Exception:
                parts = [p.strip(" '\"") for p in re.split(r'[,/|;]', text)]
        else:
            # Split on common separators between companies
            parts = [p.strip(" '\"") for p in re.split(r'[,/|;]', text)]

    # Check each token left-to-right; return first canonical label matched
    for p in parts:
        for canonical, pattern in allowed_patterns.items():
            if pattern.search(p):
                return canonical

    return "Unknown"

competitors_data["production_company"] = (
    competitors_data["production_company"]
    .apply(first_allowed_company_regex)
)


In [44]:
competitors_data.head()

Unnamed: 0,Title,Year,Duration,MPA,Rating,budget,opening_weekend_Gross,grossWorldWWide,gross_US_Canada,release_date,filming_locations,production_company,genres,Languages
0,Snow White and the Seven Dwarfs,1937,1h 23m,Approved,7.6,"$1,499,000 (estimated)","$6,017,914","$184,960,747","$184,925,486","February 4, 1938",['Walt Disney Feature Animation - 500 S. Buena...,Walt Disney,Animation,['English']
1192,Pinocchio,1940,1h 28m,Approved,7.5,"$2,600,000 (estimated)","$3,769,251","$121,892,045","$84,254,167","February 23, 1940",['Walt Disney Feature Animation - 500 S. Buena...,Walt Disney,Animation,['English']
1193,Fantasia,1940,2h 4m,G,7.7,"$2,280,000 (estimated)","$980,798","$76,411,978","$76,408,097","September 19, 1941","['Stage 1, Walt Disney Studios, 500 South Buen...",Walt Disney,Animation,['English']
2043,Bambi,1942,1h 9m,Approved,7.3,"$858,000 (estimated)","$7,216,741","$267,447,150","$102,247,150","August 21, 1942","['Walt Disney Studios, 500 South Buena Vista S...",Walt Disney,Animation,['English']
2044,Casablanca,1942,1h 42m,PG,8.5,"$950,000 (estimated)","$181,494","$4,727,083","$4,219,709","January 23, 1943","['Waterman Drive, Van Nuys, Los Angeles, Calif...",Warner Bros.,Drama,"['English', 'French', 'German', 'Italian', 'Ru..."


In [45]:
competitors_data = competitors_data[competitors_data["production_company"] != "Unknown"]

In [46]:
competitors_data["filming_locations"].value_counts()

filming_locations
['Walt Disney Feature Animation - 500 S. Buena Vista Street, Burbank, California, USA']                                    18
['Los Angeles, California, USA']                                                                                           14
['Pixar Animation Studios - 1200 Park Avenue, Emeryville, California, USA']                                                14
['Vancouver, British Columbia, Canada']                                                                                    12
['Toronto, Ontario, Canada']                                                                                               11
                                                                                                                           ..
['Beaufort, South Carolina, USA']                                                                                           1
['Pinecliffe, Colorado, USA (rock tunnel)']                                                         

In [47]:
# 1) Convert release_date like "August 3, 1938" -> datetime -> "1938-08-03"
competitors_data["release_date"] = pd.to_datetime(
    competitors_data["release_date"],
    format="%B %d, %Y",   # e.g., August 3, 1938
    errors="coerce"       # invalid dates become NaT
)

# If you want the column stored as text in YYYY-MM-DD format:
competitors_data["release_date"] = competitors_data["release_date"].dt.strftime("%Y-%m-%d")

# Convert back to datetime for month extraction (safe/easy for season logic)
competitors_data["release_date"] = pd.to_datetime(competitors_data["release_date"], errors="coerce")

# 2) Create seasons from month
month_to_season = {
    12: "Winter", 1: "Winter", 2: "Winter",
    3: "Spring", 4: "Spring", 5: "Spring",
    6: "Summer", 7: "Summer", 8: "Summer",
    9: "Fall", 10: "Fall", 11: "Fall"
}

competitors_data["seasons"] = competitors_data["release_date"].dt.month.map(month_to_season)

# Optional: if date is missing/invalid, label as Unknown
competitors_data["seasons"] = competitors_data["seasons"].fillna("Unknown")


In [48]:
import re
import ast
import pandas as pd
import pycountry

# Build lookup maps once
name_to_iso3 = {}
for c in pycountry.countries:
    # official/common names
    name_to_iso3[c.name.lower()] = c.alpha_3
    if hasattr(c, "official_name"):
        name_to_iso3[c.official_name.lower()] = c.alpha_3

# common aliases
ALIASES = {
    "usa": "USA", "u.s.a": "USA", "u.s.": "USA", "us": "USA", "united states": "USA",
    "uk": "GBR", "u.k.": "GBR", "united kingdom": "GBR", "england": "GBR",
    "south korea": "KOR", "north korea": "PRK", "puerto rico": "PRI",
    "russia": "RUS", "vietnam": "VNM", "italy": "ITA", "france": "FRA", "germany": "DEU", "spain": "ESP", "jordan": "JOR", "lebanon": "LBN", "egypt": "EGY", "iran": "IRN", "iraq": "IRQ", "syria": "SYR",
    "india": "IND", "china": "CHN", "japan": "JPN", "canada": "CAN", "australia": "AUS", "brazil": "BRA", "mexico": "MEX", "argentina": "ARG", "south africa": "ZAF"
}
name_to_iso3.update(ALIASES)

def normalize_text(x):
    if pd.isna(x): return ""
    s = str(x).strip()
    if s.startswith("[") and s.endswith("]"):
        try:
            parsed = ast.literal_eval(s)
            if isinstance(parsed, list):
                s = " ; ".join(str(i) for i in parsed)
        except Exception:
            pass
    s = re.sub(r"\([^)]*\)", " ", s)      # remove (studio), (location), dates
    s = re.sub(r"[\n\r]+", " ; ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

def extract_countries_broad(value):
    s = normalize_text(value).lower()
    if not s:
        return []

    found = []
    # phrase-level scan against known country names
    for name, iso3 in name_to_iso3.items():
        if re.search(rf"\b{re.escape(name)}\b", s):
            found.append(iso3)

    # dedupe keep order
    out, seen = [], set()
    for c in found:
        if c not in seen:
            seen.add(c)
            out.append(c)

    return out

competitors_data["filming_countries_list"] = competitors_data["filming_locations"].apply(extract_countries_broad)
competitors_data["filming_countries"] = competitors_data["filming_countries_list"].apply(lambda x: ", ".join(x) if x else "Unknown")


In [49]:
competitors_data.head()

Unnamed: 0,Title,Year,Duration,MPA,Rating,budget,opening_weekend_Gross,grossWorldWWide,gross_US_Canada,release_date,filming_locations,production_company,genres,Languages,seasons,filming_countries_list,filming_countries
0,Snow White and the Seven Dwarfs,1937,1h 23m,Approved,7.6,"$1,499,000 (estimated)","$6,017,914","$184,960,747","$184,925,486",1938-02-04,['Walt Disney Feature Animation - 500 S. Buena...,Walt Disney,Animation,['English'],Winter,[USA],USA
1192,Pinocchio,1940,1h 28m,Approved,7.5,"$2,600,000 (estimated)","$3,769,251","$121,892,045","$84,254,167",1940-02-23,['Walt Disney Feature Animation - 500 S. Buena...,Walt Disney,Animation,['English'],Winter,[USA],USA
1193,Fantasia,1940,2h 4m,G,7.7,"$2,280,000 (estimated)","$980,798","$76,411,978","$76,408,097",1941-09-19,"['Stage 1, Walt Disney Studios, 500 South Buen...",Walt Disney,Animation,['English'],Fall,[USA],USA
2043,Bambi,1942,1h 9m,Approved,7.3,"$858,000 (estimated)","$7,216,741","$267,447,150","$102,247,150",1942-08-21,"['Walt Disney Studios, 500 South Buena Vista S...",Walt Disney,Animation,['English'],Summer,[USA],USA
2044,Casablanca,1942,1h 42m,PG,8.5,"$950,000 (estimated)","$181,494","$4,727,083","$4,219,709",1943-01-23,"['Waterman Drive, Van Nuys, Los Angeles, Calif...",Warner Bros.,Drama,"['English', 'French', 'German', 'Italian', 'Ru...",Winter,[USA],USA


In [50]:
def usa_only_if_present(x):
    if pd.isna(x):
        return "Unknown"
    parts = [p.strip().upper() for p in re.split(r"[,\|;/]+", str(x)) if p.strip()]
    if any(p in {"USA","US","U.S.","U.S.A.","UNITED STATES"} for p in parts):
        return "USA"
    return str(x).strip() if str(x).strip() else "Unknown"

competitors_data["filming_countries"] = competitors_data["filming_countries"].apply(usa_only_if_present)

In [51]:
# pip install pycountry pycountry-convert
import re
import pandas as pd
import pycountry
import pycountry_convert as pc

# -----------------------------
# 1) Continent mapping setup
# -----------------------------
CONTINENT_NAME = {
    "AF": "Africa",
    "AS": "Asia",
    "EU": "Europe",
    "NA": "North America",
    "SA": "South America",
    "OC": "Oceania",
    "AN": "Antarctica"
}

ISO3_OVERRIDES = {
    "XKX": "Europe",         # Kosovo
    "PRI": "North America",  # Puerto Rico
    "RUS": "Europe"          # choose Europe for your business logic
}

def iso3_to_continent(iso3):
    if pd.isna(iso3):
        return "Unknown"

    iso3 = str(iso3).strip().upper()
    if not iso3:
        return "Unknown"

    if iso3 in ISO3_OVERRIDES:
        return ISO3_OVERRIDES[iso3]

    try:
        country = pycountry.countries.get(alpha_3=iso3)
        if not country:
            return "Unknown"
        iso2 = country.alpha_2
        cont_code = pc.country_alpha2_to_continent_code(iso2)
        return CONTINENT_NAME.get(cont_code, "Unknown")
    except Exception:
        return "Unknown"

# -----------------------------
# 2) Parse filming_countries -> ISO3 list
# -----------------------------
def split_iso3_list(x):
    """
    Converts strings like:
      'USA, CAN, GBR'
      'USA|CAN'
      'USA; CAN'
    to ['USA','CAN','GBR'].
    """
    if pd.isna(x):
        return []

    parts = [p.strip().upper() for p in re.split(r"[,\|;/]+", str(x)) if p.strip()]
    # keep only 3-letter codes
    parts = [p for p in parts if re.fullmatch(r"[A-Z]{3}", p)]

    # dedupe preserve order
    seen = set()
    out = []
    for p in parts:
        if p not in seen:
            seen.add(p)
            out.append(p)
    return out

# Make sure this column exists in your dataframe
# competitors_data["filming_countries"] should already contain ISO3-like codes e.g. "USA, CAN"
competitors_data["country_iso3_list"] = competitors_data["filming_countries"].apply(split_iso3_list)

# Primary country and continent
competitors_data["country_iso3_primary"] = competitors_data["country_iso3_list"].apply(
    lambda x: x[0] if x else "UNK"
)
competitors_data["continents"] = competitors_data["country_iso3_primary"].apply(iso3_to_continent)

In [54]:
competitors_data.columns

Index(['Title', 'Year', 'Duration', 'MPA', 'Rating', 'budget',
       'opening_weekend_Gross', 'grossWorldWWide', 'gross_US_Canada',
       'release_date', 'production_company', 'genres', 'Languages', 'seasons',
       'filming_countries', 'continents'],
      dtype='object')

In [53]:
competitors_data.drop(columns=['filming_countries_list', 'country_iso3_list', 'country_iso3_primary', 'filming_locations'], inplace=True, errors='ignore')

In [55]:
import numpy as np
import pandas as pd

money_cols = ["budget", "opening_weekend_Gross", "grossWorldWWide", "gross_US_Canada"]

def clean_money_col(series: pd.Series) -> pd.Series:
    s = series.astype(str).str.strip()

    # Standardize missing markers
    s = s.replace(
        ["", "Missing value", "missing value", "N/A", "n/a", "None", "null", "-", "--"],
        np.nan
    )

    # Remove "(estimated)" tag (case-insensitive)
    s = s.str.replace(r"\(estimated\)", "", regex=True, case=False)

    # Remove currency symbols, commas, spaces and any non-numeric except dot/minus
    s = s.str.replace(r"[^0-9.\-]", "", regex=True)

    # Empty leftovers -> NaN
    s = s.replace("", np.nan)

    # Convert
    return pd.to_numeric(s, errors="coerce")

for col in money_cols:
    competitors_data[col] = clean_money_col(competitors_data[col])

# 1) Profit = worldwide gross - budget
competitors_data["profit"] = competitors_data["grossWorldWWide"] - competitors_data["budget"]

# 2) ROI = (profit / budget)
# Avoid division by zero and invalid budget values
competitors_data["roi"] = np.where(
    competitors_data["budget"] > 0,
    competitors_data["profit"] / competitors_data["budget"],
    np.nan
)

# 3) Domestic share = domestic gross / worldwide gross
competitors_data["domestic_share"] = np.where(
    competitors_data["grossWorldWWide"] > 0,
    competitors_data["gross_US_Canada"] / competitors_data["grossWorldWWide"],
    np.nan
)

# 4) International gross share = (worldwide - domestic) / worldwide
# clip(lower=0) prevents tiny negative values from dirty data
intl_gross = (competitors_data["grossWorldWWide"] - competitors_data["gross_US_Canada"]).clip(lower=0)

competitors_data["intl_gross_share"] = np.where(
    competitors_data["grossWorldWWide"] > 0,
    intl_gross / competitors_data["grossWorldWWide"],
    np.nan
)

# Optional: if you want percentages instead of decimals (0-1), uncomment:
competitors_data["roi"] = competitors_data["roi"] * 100
competitors_data["domestic_share"] = competitors_data["domestic_share"] * 100
competitors_data["intl_gross_share"] = competitors_data["intl_gross_share"] * 100
