In [1]:
import pandas as pd
import re 
bio_designers = pd.read_json("data/designer_data_fmd.json", lines=True)

def extract_birth_year(text, min_year=1850, max_year=2025):
    if not isinstance(text, str):
        return None
    years = re.findall(r"\b(1[89]\d{2}|20\d{2})\b", text)  # captures 1800–2099
    years = [int(y) for y in years if min_year <= int(y) <= max_year]
    return min(years) if years else None

bio_designers["year_birth"] = bio_designers["biography"].apply(extract_birth_year)


In [2]:
designers_fmd = bio_designers[bio_designers["year_birth"]>1910].designer_name

In [24]:
import pandas as pd
df = pd.read_parquet("data/vogue_data.parquet")
df = df[df.groupby("fashion_house")["fashion_house"].transform("count") >= 10]

In [27]:
unique_designers = set(name for sublist in df["designer_name"] for name in sublist)

print(unique_designers)
print(f"Total unique designers: {len(unique_designers)}")

{'Anthony Vaccarello', 'Christopher John Rogers', 'Reem Acra', 'Giambattista Valli', 'Pieter Mulier', 'Gaby Aghion', 'Alessandro Sartori', 'Ozwald Boateng', 'Charles Jeffrey', 'John Bartlett', 'Victor Alfaro', 'Kim Jones', 'Demna', 'Catherine Holstein', 'Phoebe Philo', 'Christopher Kane', 'Jens Laugesen', 'Delfina Delettrez', 'Colm Dillane', 'Christophe Decarnin', 'Jen Kao', 'Nensi Dojaka', 'Sharon Wauchob', 'Jenny Packham', 'Gilles Dufour', 'Angela Missoni', 'Yohji Yamamoto', 'David Koma', 'Vera Wang', 'Christophe Lemaire', 'Martine Sitbon', 'Pierre Balmain', 'Marcelo Burlon', 'Alejandro Ingelmo', 'Mike Amiri', 'Craig Natiello', 'Martin Margiela', 'Valentin Yudashkin', 'Simon Holloway', 'Michiko Koshino', 'Jeremy Scott', 'Elsa Peretti', 'Thakoon Panichgul', 'Pamela Love', 'John Varvatos', 'Calvin Klein', 'Ossie Clark', 'Jonny Johansson', 'Rosie Assoulin', 'Donna Karan', 'Anya Hindmarch', 'Virgil Abloh', 'Chado Ralph Rucci', 'Elie Tahari', 'Gianfranco Ferre', 'Mary Katrantzou', 'Wes Go

In [None]:
from collections import Counter

# Flatten all designers across all rows
all_designers = [designer for sublist in df["designer_name"] for designer in sublist]

# Count occurrences
designer_counts = Counter(all_designers)

# Convert to DataFrame for nicer display (optional)
counts_df = pd.DataFrame.from_dict(designer_counts, orient='index', columns=['collection_count'])
counts_df = counts_df.sort_values('collection_count', ascending=False)



                            collection_count
Giorgio Armani                           403
Karl Lagerfeld                           309
Ralph Lauren                             307
Yves Saint Laurent                       284
Marc Jacobs                              269
...                                      ...
Silvio And Giuliana Gerani                 2
Chantal Thomass                            2
Julien Dossena                             2
Andrew Marc                                2
Gaby Aghion                                1

[463 rows x 1 columns]


In [31]:
designers_10plus = {designer for designer, count in designer_counts.items() if count >= 10}
len(designers_10plus )

401

In [4]:
#designer_fmd = pd.read_csv("data/names/designer_data_fmd_names.csv").designer_name
designer_bof = pd.read_json("data/all_designer_data_BOF.json", lines=True).designer_name
designer_wiki = pd.read_csv("data/names/fashion_designers_wikidata.csv").designer_name
additional_designers = pd.read_csv("data/names/additional_designers.csv").designer_name

In [5]:
all_designers = set(list(designers_fmd )+ list(designer_bof )+ list(additional_designers))
len(all_designers)

1484

In [6]:
import pandas as pd
import ahocorasick  # pip install pyahocorasick

# Build automaton
A = ahocorasick.Automaton()
for name in all_designers:
    A.add_word(name.lower(), name)
A.make_automaton()

def find_names(text):
    matches = set()
    for end_idx, orig_name in A.iter(text.lower()):
        matches.add(orig_name)
    return list(matches)

df = df.dropna(subset= ["description"])
df["designer_names"] = df["description"].apply(find_names)

In [None]:
df["year"] = df["year"].astype(int)
# Sort to make shifting easier
df = df.sort_values(["fashion_house", "year"]).reset_index(drop=True)



def propagate_single(row, df):
    fh, yr = row.fashion_house, row.year
    current_names = row.designer_names

    # Helper to compare designer lists ignoring order
    def same_names(list1, list2):
        return set(list1) == set(list2)

    # If multiple designers but more than 2, treat as ambiguous and propagate
    if len(current_names) > 2:
        current_names = []

    # If multiple designers with 2 or fewer names,
    # check if exact same list appears in neighbor years
    elif 1 < len(current_names) <= 2:
        neighbors = df.loc[
            (df.fashion_house == fh) &
            (df.year.isin([yr - 3, yr, yr + 3])) &
            (df.index != row.name)
        ]
        # Check if any neighbor has exactly the same designer list (order ignored)
        repeated = neighbors["designer_names"].apply(
            lambda x: len(x) == len(current_names) and same_names(x, current_names)
        ).any()
        if not repeated:
            current_names = []

    # If single designer but appears only once in the whole fashion house, treat as empty
    if len(current_names) == 1:
        name = current_names[0]
        count_in_house = sum(
            len(names) == 1 and names[0] == name
            for names in df.loc[df.fashion_house == fh, "designer_names"]
        )
        if count_in_house == 1:
            current_names = []

    # If empty now, try to propagate from neighbors with exactly one designer
    if len(current_names) == 0:
        neighbors = df.loc[
            (df.fashion_house == fh) &
            (df.year.isin([yr - 2, yr, yr + 2])) &
            (df.index != row.name)
        ]
        single_neighbors = neighbors[neighbors["designer_names"].apply(lambda x: len(x) == 1)]

        if not single_neighbors.empty:
            single_neighbors = single_neighbors.iloc[
                (single_neighbors["year"] - yr).abs().argsort()
            ]
            return single_neighbors["designer_names"].iloc[0]

    return current_names
    
# Apply propagation
df["designer_name"] = df.apply(lambda r: propagate_single(r, df), axis=1)

In [8]:


# for name in designer_wiki:
#     A.add_word(name.lower(), name)
# A.make_automaton()

# def find_names(text):
#     matches = set()
#     for end_idx, orig_name in A.iter(text.lower()):
#         matches.add(orig_name)
#     return list(matches)

# mask = df["designer_names"].apply(lambda x: len(x) == 0)

# # Apply find_names only on these rows
# df.loc[mask, "designer_names"] = df.loc[mask, "description"].apply(find_names)
# df.loc[mask, "designer_name"] = df.loc[mask].apply(lambda r: propagate_single(r, df), axis=1)

In [9]:
# Aggregate into a set of unique names for each fashion house
designer_per_house = (
    df.groupby("fashion_house")["designer_name"]
      .apply(lambda lists: sorted({name for sublist in lists for name in sublist}))
)

designer_per_house

fashion_house
3 1 Phillip Lim                           [Ossie Clark, Phillip Lim]
A Cold Wall                                            [Samuel Ross]
A F Vandevorst     [Dries Van Noten, Jean Paul Gaultier, Stephen ...
A P C                                                 [Jean Touitou]
Aalto                                             [Tuomas Merikoski]
                                         ...                        
Z Zegna                                         [Alessandro Sartori]
Zac Posen                                                [Zac Posen]
Zankov                                                            []
Zimmermann                             [Nicky and Simone Zimmermann]
Zuhair Murad                                          [Zuhair Murad]
Name: designer_name, Length: 699, dtype: object

In [10]:
house = "Aalto"
designers_for_house = sorted({name for sublist in df.loc[df["fashion_house"] == house, "designer_name"] for name in sublist})
print(designers_for_house)

['Tuomas Merikoski']


In [11]:
df[df["designer_name"].apply(lambda x: "Elsa Peretti" in x)].fashion_house

1950          Area
1951          Area
1952          Area
1953          Area
1959          Area
1960          Area
1963          Area
1964          Area
4982    Chris Benz
4983    Chris Benz
4984    Chris Benz
4985    Chris Benz
9945       Halston
9946       Halston
9947       Halston
9948       Halston
Name: fashion_house, dtype: object

In [19]:
import pandas as pd
import numpy as np

def fill_empty_designer_names(df):
    # Ensure sorted by fashion_house and year
    df = df.sort_values(["fashion_house", "year"]).reset_index(drop=True)

    # Create a copy of the designer_names column to modify
    filled = df["designer_name"].copy()

    # For each fashion house, fill empty designer_names from closest year
    for fh, group in df.groupby("fashion_house"):
        years = group["year"].values
        names = group["designer_name"].values

        # Indices of rows in original df for this fashion house
        indices = group.index.values

        # For each empty designer_names, find closest non-empty
        for i, name_list in zip(indices, names):
            if len(name_list) == 0:
                year = df.at[i, "year"]

                # Find candidates with non-empty designer_names
                non_empty = [(idx, abs(year - df.at[idx, "year"])) 
                             for idx in indices if len(df.at[idx, "designer_name"]) > 0]

                if non_empty:
                    # Pick index with minimum year difference
                    closest_idx = min(non_empty, key=lambda x: x[1])[0]

                    # Assign that designer_names list
                    filled.at[i] = df.at[closest_idx, "designer_name"]

    df["designer_name"] = filled
    return df

# Example usage:
# df = your dataframe
df = fill_empty_designer_names(df)

In [22]:
# Check remaining problematic rows (empty or multiple names)
problem_rows = df[df["designer_name"].apply(lambda x: len(x) ==0)]

print("Remaining problem rows:")
problem_rows.fashion_house.unique()

len(problem_rows)

Remaining problem rows:


1976

In [23]:
df

Unnamed: 0,fashion_house,show,URL,image_urls,location,season,year,category,image_urls_sample,description,editor,publish_date,designer_names,designer_name
0,3 1 Phillip Lim,fall-2006-ready-to-wear,https://www.vogue.com/fashion-shows/fall-2006-...,[https://assets.vogue.com/photos/55c6516608298...,,fall,2006,ready-to-wear,[https://assets.vogue.com/photos/55c6516608298...,While violinists played classic Prince hits in...,Laird Borrelli-Persson,"February 6, 2006",[Phillip Lim],[Phillip Lim]
1,3 1 Phillip Lim,fall-2007-ready-to-wear,https://www.vogue.com/fashion-shows/fall-2007-...,[https://assets.vogue.com/photos/55c6517708298...,,fall,2007,ready-to-wear,[https://assets.vogue.com/photos/55c6517708298...,"""Clothes people wear."" With that deceptively s...",Nicole Phelps,"February 3, 2007",[Phillip Lim],[Phillip Lim]
2,3 1 Phillip Lim,spring-2007-ready-to-wear,https://www.vogue.com/fashion-shows/spring-200...,[https://assets.vogue.com/photos/55c6517108298...,,spring,2007,ready-to-wear,[https://assets.vogue.com/photos/55c6517108298...,Phillip Lim's first-ever runway presentation g...,Nicole Phelps,"September 9, 2006",[Phillip Lim],[Phillip Lim]
3,3 1 Phillip Lim,fall-2008-ready-to-wear,https://www.vogue.com/fashion-shows/fall-2008-...,[https://assets.vogue.com/photos/55c6518908298...,,fall,2008,ready-to-wear,[https://assets.vogue.com/photos/55c6518908298...,He's a designer who often dismisses his clothe...,Meenal Mistry,"February 5, 2008",[Phillip Lim],[Phillip Lim]
4,3 1 Phillip Lim,spring-2008-ready-to-wear,https://www.vogue.com/fashion-shows/spring-200...,[https://assets.vogue.com/photos/55c6518008298...,,spring,2008,ready-to-wear,[https://assets.vogue.com/photos/55c6518008298...,A Mercer Street store! A children's line! Eyew...,Meenal Mistry,"September 8, 2007",[Phillip Lim],[Phillip Lim]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21998,Zuhair Murad,resort-2024,https://www.vogue.com/fashion-shows/resort-202...,[https://assets.vogue.com/photos/649aa1e5c1aeb...,,resort,2024,,[https://assets.vogue.com/photos/649aa22aed92a...,Zuhair Murad has stars in his eyes. One is Tay...,Tina Isaac-Goizé,"June 28, 2023",[Zuhair Murad],[Zuhair Murad]
21999,Zuhair Murad,spring-2025-ready-to-wear,https://www.vogue.com/fashion-shows/spring-202...,[https://assets.vogue.com/photos/66fac98107585...,,spring,2025,ready-to-wear,[https://assets.vogue.com/photos/66fac9a84919c...,The sea is shaping up to be one of spring’s ma...,Tina Isaac-Goizé,"October 2, 2024",[Zuhair Murad],[Zuhair Murad]
22000,Zuhair Murad,spring-2025-ready-to-wear,https://www.vogue.com/fashion-shows/spring-202...,[https://assets.vogue.com/photos/66fac98107585...,,spring,2025,ready-to-wear,[https://assets.vogue.com/photos/66fac9a84919c...,The sea is shaping up to be one of spring’s ma...,Tina Isaac-Goizé,"October 2, 2024",[Zuhair Murad],[Zuhair Murad]
22001,Zuhair Murad,resort-2025,https://www.vogue.com/fashion-shows/resort-202...,[https://assets.vogue.com/photos/6672a4d49ee2d...,,resort,2025,,[https://assets.vogue.com/photos/66705391075f0...,Whether she’s a planner or a last-minute dress...,Tina Isaac-Goizé,"June 17, 2024",[Zuhair Murad],[Zuhair Murad]
