# Titles Parser
Let's grab all the most frequent words from each title, and then use GPT 4 to categorize each token.

In [3]:
import re
import csv
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
import pandas as pd

In [19]:
# Ensure NLTK resources are available
nltk.download("punkt")
nltk.download("stopwords")

# File paths
input_path = "titles.txt"
output_path = "title_tokens.csv"

# Load stopwords
stop_words = set(stopwords.words("english"))


[nltk_data] Downloading package punkt to /Users/lucyzhang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/lucyzhang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [1]:
# Read all lines
with open(input_path, "r", encoding="utf-8") as f:
    text = f.read()

# Tokenize and normalize
tokens = word_tokenize(text.lower())
# Keep only alphabetic words and remove stopwords
filtered_tokens = [word for word in tokens if word.isalpha() and word not in stop_words]

# Count token frequencies
token_counts = Counter(filtered_tokens)

[nltk_data] Downloading package punkt to /Users/lucyzhang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/lucyzhang/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
with open(output_path, "w+", newline="", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["token", "count"])
    for token, count in token_counts.most_common():
        writer.writerow([token, count])

print(f"✅ Token frequencies written to {output_path}")

✅ Token frequencies written to title_tokens.csv


In [3]:
import pandas as pd
df = pd.read_csv("title_tokens.csv")

In [5]:
df.head()

Unnamed: 0,token,count
0,women,327341
1,black,127970
2,sleeve,119554
3,dress,114901
4,long,94059


In [6]:
token_counts

Counter({'women': 327341,
         'black': 127970,
         'sleeve': 119554,
         'dress': 114901,
         'long': 94059,
         'men': 91066,
         'casual': 83010,
         'size': 64886,
         'blue': 63686,
         'womens': 61514,
         'neck': 61368,
         'short': 58271,
         'girls': 52701,
         'white': 47705,
         'shirt': 46730,
         'tops': 46197,
         'top': 44770,
         'silver': 43489,
         'summer': 43367,
         'set': 42261,
         'earrings': 41982,
         'pants': 41458,
         'red': 38183,
         'large': 38111,
         'necklace': 37775,
         'jewelry': 35986,
         'gold': 35782,
         'floral': 35675,
         'print': 35112,
         'l': 35022,
         'high': 34144,
         'party': 32183,
         'loose': 31997,
         'watch': 31631,
         'leather': 31244,
         'fashion': 30908,
         'xl': 30701,
         'sleeveless': 30313,
         'medium': 30252,
         'cotton': 

# OpenAI
Next, let's use GPT 4 to classify each of them

In [22]:
from openai import OpenAI
import os
import json
import re
import time
client = OpenAI(api_key=os.environ.get("OPENAI_INTERVIEW_API_KEY"))

In [88]:
def classify_tokens_with_gpt(token_chunk):
    prompt = f"""
You are a fashion expert. Your task is to classify product-related tokens from fashion item titles into one of the following categories:
- "gender": e.g. men, women's, unisex
- "color": e.g. black, navy, beige
- "category": e.g. boots, sandals, dress, hoodie
- "other": if it doesn't fit any of the above

Return a JSON object where each token maps to one of those 4 categories.

tokens = {token_chunk}
"""
    print(prompt)
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}],
        temperature=0,
    )

    raw = response.choices[0].message.content.strip()
    # Strip ```json ... ``` if it exists
    if raw.startswith("```"):
        raw = re.sub(r"^```json\n?|```$", "", raw).strip()
    
    return json.loads(raw)
    

def classify_top_tokens(df, max_tokens=1000, chunk_size=50):
    # Ensure a 'category' column exists
    if 'category' not in df.columns:
        df['category'] = None

    tokens_to_classify = df[df['category'].isnull()]['token'][:max_tokens]

    for i in range(1000, len(tokens_to_classify), chunk_size):
        chunk = tokens_to_classify.iloc[i:i+chunk_size].tolist()
        try:
            print(f"⏳ Classifying tokens {i}–{i+len(chunk)}...")
            result = classify_tokens_with_gpt(chunk)
            print()
            print(result)
            for token, cat in result.items():
                df.loc[df['token'] == token, 'category'] = cat
            time.sleep(1.5)
        except Exception as e:
            print(f"❌ Error classifying chunk {i}–{i+len(chunk)}: {e}")
            continue

    return df

df = pd.read_csv("title_tokens_with_categories.csv")
df = classify_top_tokens(df, max_tokens=2000)

⏳ Classifying tokens 1000–1050...

You are a fashion expert. Your task is to classify product-related tokens from fashion item titles into one of the following categories:
- "gender": e.g. men, women's, unisex
- "color": e.g. black, navy, beige
- "category": e.g. boots, sandals, dress, hoodie
- "other": if it doesn't fit any of the above

Return a JSON object where each token maps to one of those 4 categories.

tokens = ['blank', 'fenix', 'splice', 'ripple', 'row', 'jewel', 'hockey', 'nautical', 'snowman', 'tunnel', 'clicker', 'crafting', 'machine', 'facial', 'cambridge', 'cafepress', 'brother', 'smith', 'velour', 'lights', 'gen', 'brooches', 'embossed', 'luxe', 'pig', 'tough', 'heavyweight', 'coolibar', 'heat', 'sleeved', 'surplice', 'watchband', 'crimson', 'tab', 'biadani', 'net', 'bifold', 'unlined', 'center', 'cactus', 'seed', 'ivy', 'sweatsuits', 'bust', 'patterns', 'san', 'endless', 'without', 'add', 'shirred']


{'blank': 'other', 'fenix': 'other', 'splice': 'other', 'ripple': '

In [89]:
df.to_csv("title_tokens_with_categories.csv", index=False)
print("✅ Done. You can resume later from this file.")

✅ Done. You can resume later from this file.


In [90]:
df.head(20)

Unnamed: 0,token,count,category
0,women,327341,gender
1,black,127970,color
2,sleeve,119554,other
3,dress,114901,category
4,long,94059,other
5,men,91066,gender
6,casual,83010,other
7,size,64886,other
8,blue,63686,color
9,womens,61514,gender


In [36]:
df[df["category"]=="other"]["token"].unique()

array(['sleeve', 'long', 'casual', 'size', 'neck', 'short', 'summer',
       'set', 'large', 'floral', 'print', 'l', 'high', 'party', 'loose',
       'leather', 'fashion', 'xl', 'sleeveless', 'medium', 'cotton',
       'waist', 'v', 'pockets', 'strap', 'fit', 'small', 'vintage',
       'lace', 'plus', 'shoulder', 'soft', 'steel', 'us', 'color',
       'sterling', 'button', 'beach', 'one', 'adjustable', 'gift',
       'solid', 'pack', 'yoga', 'winter', 'piece', 'workout', 'face',
       'maxi', 'round', 'knit', 'stainless', 'sports', 'lightweight',
       'light', 'front', 'cute', 'running', 'mini', 'slim', 'printed',
       'pocket', 'wedding', 'swing', 'classic', 'style', 'midi',
       'crystal', 'sport', 'plated', 'elastic', 'outdoor', 'charm',
       'little', 'gifts', 'bodycon', 'wrap', 'warm', 'open', 'cover',
       'flower', 'ruffle', 'christmas', 'stretch', 'heart', 'athletic',
       'wide', 'retro', 'sexy', 'compatible', 'fleece', 'dangle', 'crop',
       'metal', 'sun', 'ch

In [9]:
token_df = pd.read_csv("title_tokens_with_categories.csv")

In [10]:
token_df.head()

Unnamed: 0,token,count,category
0,women,327341,gender
1,black,127970,color
2,sleeve,119554,other
3,dress,114901,category
4,long,94059,other


# Enhance products
Now, let's add these back to our product DF

In [6]:
product_df = pd.read_csv("meta_Amazon_Fashion.csv")

In [7]:
product_df.head()

Unnamed: 0,main_category,title,average_rating,rating_number,features,description,price,images,videos,store,categories,details,parent_asin,bought_together,primary_key,image_url
0,AMAZON FASHION,YUEDGE 5 Pairs Men's Moisture Control Cushione...,4.6,16,[],[],,[{'thumb': 'https://m.media-amazon.com/images/...,[],GiveGift,[],{'Package Dimensions': '10.31 x 8.5 x 1.73 inc...,B08BHN9PK5,,yuedge-5-pairs-mens-moisture-control-cushioned...,https://m.media-amazon.com/images/I/41+cCfaVOF...
1,AMAZON FASHION,DouBCQ Women's Palazzo Lounge Wide Leg Casual ...,4.1,7,"['Drawstring closure', 'Machine Wash']",[],,[{'thumb': 'https://m.media-amazon.com/images/...,[],DouBCQ,[],{'Package Dimensions': '15 x 10.2 x 0.4 inches...,B08R39MRDW,,doubcq-womens-palazzo-lounge-wide-leg-casual-f...,https://m.media-amazon.com/images/I/515cR-ta1E...
2,AMAZON FASHION,Pastel by Vivienne Honey Vanilla Girls' Trapez...,4.3,11,"['Zipper closure', 'Hand Wash Only']",[],,[{'thumb': 'https://m.media-amazon.com/images/...,[],Pastel by Vivienne,[],"{'Is Discontinued By Manufacturer': 'No', 'Pac...",B077KJHCJ4,,pastel-by-vivienne-honey-vanilla-girls-trapeze...,https://m.media-amazon.com/images/I/31GwmwNCdA...
3,AMAZON FASHION,Mento Streamtail,2.0,1,"['Thermoplastic Rubber sole', 'High Density Pr...","[""Slip on the Women's Mento and you're ready t...",29.81,[{'thumb': 'https://m.media-amazon.com/images/...,[],Guy Harvey,[],{'Package Dimensions': '11.22 x 4.72 x 4.33 in...,B0811M2JG9,,mento-streamtail-B0811M2JG9,https://m.media-amazon.com/images/I/31P-uHUUIX...
4,AMAZON FASHION,RONNOX Women's 3-Pairs Bright Colored Calf Com...,4.3,3032,"['Pull On closure', 'Size Guide: ""S"" fits calf...",['Ronnox Calf Sleeves - Allowing Your Body to ...,17.99,[{'thumb': 'https://m.media-amazon.com/images/...,"[{'title': ""HONEST Review: RONNOX Women's 3-Pa...",RONNOX,[],"{'Is Discontinued By Manufacturer': 'No', 'Pac...",B07SB2892S,,ronnox-womens-3-pairs-bright-colored-calf-comp...,https://m.media-amazon.com/images/I/51CqMDJOOD...


In [11]:
token_info = {
    token: {"category": category, "count": count}
    for token, category, count in zip(token_df.token, token_df.category, token_df["count"])
}

In [12]:
token_info

{'women': {'category': 'gender', 'count': 327341},
 'black': {'category': 'color', 'count': 127970},
 'sleeve': {'category': 'other', 'count': 119554},
 'dress': {'category': 'category', 'count': 114901},
 'long': {'category': 'other', 'count': 94059},
 'men': {'category': 'gender', 'count': 91066},
 'casual': {'category': 'other', 'count': 83010},
 'size': {'category': 'other', 'count': 64886},
 'blue': {'category': 'color', 'count': 63686},
 'womens': {'category': 'gender', 'count': 61514},
 'neck': {'category': 'other', 'count': 61368},
 'short': {'category': 'other', 'count': 58271},
 'girls': {'category': 'gender', 'count': 52701},
 'white': {'category': 'color', 'count': 47705},
 'shirt': {'category': 'category', 'count': 46730},
 'tops': {'category': 'category', 'count': 46197},
 'top': {'category': 'category', 'count': 44770},
 'silver': {'category': 'color', 'count': 43489},
 'summer': {'category': 'other', 'count': 43367},
 'set': {'category': 'other', 'count': 42261},
 'earr

In [20]:
def tokenize_field(val):
    if isinstance(val, list):
        val = " ".join(val)
    elif not isinstance(val, str):
        return []
    tokens = word_tokenize(val.lower())
    return [t for t in tokens if t.isalpha() and t not in stop_words]

In [21]:
def extract_tokens(row):
    all_tokens = (
        tokenize_field(row.get("title")) +
        tokenize_field(row.get("description")) +
        tokenize_field(row.get("details")) +
        tokenize_field(row.get("features"))
    )
    # Deduplicate while preserving order
    seen = set()
    deduped = []
    for token in all_tokens:
        if token not in seen:
            deduped.append(token)
            seen.add(token)
    return deduped

In [22]:
from collections import defaultdict
def classify_tokens(tokens):
    categorized = defaultdict(list)
    unclassified = []

    for token in tokens:
        info = token_info.get(token)
        if info:
            categorized[info["category"]].append(token)
        else:
            unclassified.append(token)

    # Prioritize unclassified others by global frequency
    sorted_unclassified = sorted(
        set(unclassified),
        key=lambda x: -token_info.get(x, {}).get("count", 0)
    )

    return {
        "gender_tokens": categorized["gender"][:2],
        "color_tokens": categorized["color"][:2],
        "category_tokens": categorized["category"][:2],
        "other_tokens": (categorized["other"] + sorted_unclassified)[:5],
    }

In [23]:
# Apply to DataFrame
def process_product(row):
    tokens = extract_tokens(row)
    row["tokens"] = tokens
    classified = classify_tokens(tokens)
    for key, val in classified.items():
        row[key] = val
    return row


In [35]:
small_df = product_df[:5]

In [36]:
small_df = small_df.apply(process_product, axis=1)

In [37]:
small_df.head()

Unnamed: 0,main_category,title,average_rating,rating_number,features,description,price,images,videos,store,...,details,parent_asin,bought_together,primary_key,image_url,tokens,gender_tokens,color_tokens,category_tokens,other_tokens
0,AMAZON FASHION,YUEDGE 5 Pairs Men's Moisture Control Cushione...,4.6,16,[],[],,[{'thumb': 'https://m.media-amazon.com/images/...,[],GiveGift,...,{'Package Dimensions': '10.31 x 8.5 x 1.73 inc...,B08BHN9PK5,,yuedge-5-pairs-mens-moisture-control-cushioned...,https://m.media-amazon.com/images/I/41+cCfaVOF...,"[yuedge, pairs, men, moisture, control, cushio...",[men],[blue],[socks],"[pairs, control, dry, fit, casual]"
1,AMAZON FASHION,DouBCQ Women's Palazzo Lounge Wide Leg Casual ...,4.1,7,"['Drawstring closure', 'Machine Wash']",[],,[{'thumb': 'https://m.media-amazon.com/images/...,[],DouBCQ,...,{'Package Dimensions': '15 x 10.2 x 0.4 inches...,B08R39MRDW,,doubcq-womens-palazzo-lounge-wide-leg-casual-f...,https://m.media-amazon.com/images/I/515cR-ta1E...,"[doubcq, women, palazzo, lounge, wide, leg, ca...",[women],[blue],"[palazzo, pants]","[lounge, wide, leg, casual, flowy]"
2,AMAZON FASHION,Pastel by Vivienne Honey Vanilla Girls' Trapez...,4.3,11,"['Zipper closure', 'Hand Wash Only']",[],,[{'thumb': 'https://m.media-amazon.com/images/...,[],Pastel by Vivienne,...,"{'Is Discontinued By Manufacturer': 'No', 'Pac...",B077KJHCJ4,,pastel-by-vivienne-honey-vanilla-girls-trapeze...,https://m.media-amazon.com/images/I/31GwmwNCdA...,"[pastel, vivienne, honey, vanilla, girls, trap...",[girls],"[honey, navy]","[trapeze, dress]","[vivienne, easy, removable, large, years]"
3,AMAZON FASHION,Mento Streamtail,2.0,1,"['Thermoplastic Rubber sole', 'High Density Pr...","[""Slip on the Women's Mento and you're ready t...",29.81,[{'thumb': 'https://m.media-amazon.com/images/...,[],Guy Harvey,...,{'Package Dimensions': '11.22 x 4.72 x 4.33 in...,B0811M2JG9,,mento-streamtail-B0811M2JG9,https://m.media-amazon.com/images/I/31P-uHUUIX...,"[mento, streamtail, slip, women, ready, hit, b...",[women],[],"[thong, sandal]","[slip, beach, canvas, straps, soft]"
4,AMAZON FASHION,RONNOX Women's 3-Pairs Bright Colored Calf Com...,4.3,3032,"['Pull On closure', 'Size Guide: ""S"" fits calf...",['Ronnox Calf Sleeves - Allowing Your Body to ...,17.99,[{'thumb': 'https://m.media-amazon.com/images/...,"[{'title': ""HONEST Review: RONNOX Women's 3-Pa...",RONNOX,...,"{'Is Discontinued By Manufacturer': 'No', 'Pac...",B07SB2892S,,ronnox-womens-3-pairs-bright-colored-calf-comp...,https://m.media-amazon.com/images/I/51CqMDJOOD...,"[ronnox, women, bright, colored, calf, compres...",[women],"[pink, green]",[socks],"[calf, compression, tube, sleeves, body]"


In [38]:
product_df = product_df.apply(process_product, axis=1)

In [39]:
product_df.head()

Unnamed: 0,main_category,title,average_rating,rating_number,features,description,price,images,videos,store,...,details,parent_asin,bought_together,primary_key,image_url,tokens,gender_tokens,color_tokens,category_tokens,other_tokens
0,AMAZON FASHION,YUEDGE 5 Pairs Men's Moisture Control Cushione...,4.6,16,[],[],,[{'thumb': 'https://m.media-amazon.com/images/...,[],GiveGift,...,{'Package Dimensions': '10.31 x 8.5 x 1.73 inc...,B08BHN9PK5,,yuedge-5-pairs-mens-moisture-control-cushioned...,https://m.media-amazon.com/images/I/41+cCfaVOF...,"[yuedge, pairs, men, moisture, control, cushio...",[men],[blue],[socks],"[pairs, control, dry, fit, casual]"
1,AMAZON FASHION,DouBCQ Women's Palazzo Lounge Wide Leg Casual ...,4.1,7,"['Drawstring closure', 'Machine Wash']",[],,[{'thumb': 'https://m.media-amazon.com/images/...,[],DouBCQ,...,{'Package Dimensions': '15 x 10.2 x 0.4 inches...,B08R39MRDW,,doubcq-womens-palazzo-lounge-wide-leg-casual-f...,https://m.media-amazon.com/images/I/515cR-ta1E...,"[doubcq, women, palazzo, lounge, wide, leg, ca...",[women],[blue],"[palazzo, pants]","[lounge, wide, leg, casual, flowy]"
2,AMAZON FASHION,Pastel by Vivienne Honey Vanilla Girls' Trapez...,4.3,11,"['Zipper closure', 'Hand Wash Only']",[],,[{'thumb': 'https://m.media-amazon.com/images/...,[],Pastel by Vivienne,...,"{'Is Discontinued By Manufacturer': 'No', 'Pac...",B077KJHCJ4,,pastel-by-vivienne-honey-vanilla-girls-trapeze...,https://m.media-amazon.com/images/I/31GwmwNCdA...,"[pastel, vivienne, honey, vanilla, girls, trap...",[girls],"[honey, navy]","[trapeze, dress]","[vivienne, easy, removable, large, years]"
3,AMAZON FASHION,Mento Streamtail,2.0,1,"['Thermoplastic Rubber sole', 'High Density Pr...","[""Slip on the Women's Mento and you're ready t...",29.81,[{'thumb': 'https://m.media-amazon.com/images/...,[],Guy Harvey,...,{'Package Dimensions': '11.22 x 4.72 x 4.33 in...,B0811M2JG9,,mento-streamtail-B0811M2JG9,https://m.media-amazon.com/images/I/31P-uHUUIX...,"[mento, streamtail, slip, women, ready, hit, b...",[women],[],"[thong, sandal]","[slip, beach, canvas, straps, soft]"
4,AMAZON FASHION,RONNOX Women's 3-Pairs Bright Colored Calf Com...,4.3,3032,"['Pull On closure', 'Size Guide: ""S"" fits calf...",['Ronnox Calf Sleeves - Allowing Your Body to ...,17.99,[{'thumb': 'https://m.media-amazon.com/images/...,"[{'title': ""HONEST Review: RONNOX Women's 3-Pa...",RONNOX,...,"{'Is Discontinued By Manufacturer': 'No', 'Pac...",B07SB2892S,,ronnox-womens-3-pairs-bright-colored-calf-comp...,https://m.media-amazon.com/images/I/51CqMDJOOD...,"[ronnox, women, bright, colored, calf, compres...",[women],"[pink, green]",[socks],"[calf, compression, tube, sleeves, body]"


In [48]:
def format_product(row):
    json_string = f"""{{"gender": {row["gender_tokens"]},"color": {row["color_tokens"]},"category": {row["category_tokens"]},"style": {row["other_tokens"]}}}"""
    row["text_for_embedding"] = json_string
    return row


In [49]:
product_df = product_df.apply(format_product, axis=1)
product_df.head()

Unnamed: 0,main_category,title,average_rating,rating_number,features,description,price,images,videos,store,...,parent_asin,bought_together,primary_key,image_url,tokens,gender_tokens,color_tokens,category_tokens,other_tokens,text_for_embedding
0,AMAZON FASHION,YUEDGE 5 Pairs Men's Moisture Control Cushione...,4.6,16,[],[],,[{'thumb': 'https://m.media-amazon.com/images/...,[],GiveGift,...,B08BHN9PK5,,yuedge-5-pairs-mens-moisture-control-cushioned...,https://m.media-amazon.com/images/I/41+cCfaVOF...,"[yuedge, pairs, men, moisture, control, cushio...",[men],[blue],[socks],"[pairs, control, dry, fit, casual]","{""gender"": ['men'],""color"": ['blue'],""category..."
1,AMAZON FASHION,DouBCQ Women's Palazzo Lounge Wide Leg Casual ...,4.1,7,"['Drawstring closure', 'Machine Wash']",[],,[{'thumb': 'https://m.media-amazon.com/images/...,[],DouBCQ,...,B08R39MRDW,,doubcq-womens-palazzo-lounge-wide-leg-casual-f...,https://m.media-amazon.com/images/I/515cR-ta1E...,"[doubcq, women, palazzo, lounge, wide, leg, ca...",[women],[blue],"[palazzo, pants]","[lounge, wide, leg, casual, flowy]","{""gender"": ['women'],""color"": ['blue'],""catego..."
2,AMAZON FASHION,Pastel by Vivienne Honey Vanilla Girls' Trapez...,4.3,11,"['Zipper closure', 'Hand Wash Only']",[],,[{'thumb': 'https://m.media-amazon.com/images/...,[],Pastel by Vivienne,...,B077KJHCJ4,,pastel-by-vivienne-honey-vanilla-girls-trapeze...,https://m.media-amazon.com/images/I/31GwmwNCdA...,"[pastel, vivienne, honey, vanilla, girls, trap...",[girls],"[honey, navy]","[trapeze, dress]","[vivienne, easy, removable, large, years]","{""gender"": ['girls'],""color"": ['honey', 'navy'..."
3,AMAZON FASHION,Mento Streamtail,2.0,1,"['Thermoplastic Rubber sole', 'High Density Pr...","[""Slip on the Women's Mento and you're ready t...",29.81,[{'thumb': 'https://m.media-amazon.com/images/...,[],Guy Harvey,...,B0811M2JG9,,mento-streamtail-B0811M2JG9,https://m.media-amazon.com/images/I/31P-uHUUIX...,"[mento, streamtail, slip, women, ready, hit, b...",[women],[],"[thong, sandal]","[slip, beach, canvas, straps, soft]","{""gender"": ['women'],""color"": [],""category"": [..."
4,AMAZON FASHION,RONNOX Women's 3-Pairs Bright Colored Calf Com...,4.3,3032,"['Pull On closure', 'Size Guide: ""S"" fits calf...",['Ronnox Calf Sleeves - Allowing Your Body to ...,17.99,[{'thumb': 'https://m.media-amazon.com/images/...,"[{'title': ""HONEST Review: RONNOX Women's 3-Pa...",RONNOX,...,B07SB2892S,,ronnox-womens-3-pairs-bright-colored-calf-comp...,https://m.media-amazon.com/images/I/51CqMDJOOD...,"[ronnox, women, bright, colored, calf, compres...",[women],"[pink, green]",[socks],"[calf, compression, tube, sleeves, body]","{""gender"": ['women'],""color"": ['pink', 'green'..."


In [50]:
product_df.to_csv("meta_Amazon_Fashion_enhanced.csv", index=False)

In [53]:
len(list(product_df["text_for_embedding"].unique()))

632256

In [54]:
grouped_df = product_df.groupby("text_for_embedding")["primary_key"].apply(list).reset_index()
grouped_df.rename(columns={"primary_key": "pks"}, inplace=True)

# Write the grouped DataFrame to a new CSV
# grouped_df.to_csv("texts_for_embedding.csv", index=False)

In [55]:
grouped_df.head()

Unnamed: 0,text_for_embedding,pks
0,"{""gender"": ['adult', 'adults'],""color"": ['ash'...",[ash-ketchum-adult-costume---jacket-version-B0...
1,"{""gender"": ['adult', 'adults'],""color"": ['blac...",[rubies-black-light-activated-pink-fishnet-leg...
2,"{""gender"": ['adult', 'adults'],""color"": ['blac...",[forum-novelties-zebra-suspenders-B00I5RJUQK]
3,"{""gender"": ['adult', 'adults'],""color"": ['blac...",[black-lives-matter-adult-size-silicone-bracel...
4,"{""gender"": ['adult', 'adults'],""color"": ['blac...",[adult-dalmatian-costume-black-spotted-dalmati...


In [56]:
grouped_df.to_csv("texts_for_embedding.csv", index=False)

In [60]:
list(grouped_df["text_for_embedding"].unique())[1000:1500]

['{"gender": [\'adult\', \'men\'],"color": [\'pink\'],"category": [\'glasses\', \'goggles\'],"style": [\'swimming\', \'set\', \'polarized\', \'uv\', \'protective\']}',
 '{"gender": [\'adult\', \'men\'],"color": [\'pink\'],"category": [\'goggle\', \'swim\'],"style": [\'swimming\', \'anti\', \'protection\', \'case\', \'x\']}',
 '{"gender": [\'adult\', \'men\'],"color": [\'pink\'],"category": [\'goggles\', \'cap\'],"style": [\'swimming\', \'nose\', \'clip\', \'ear\', \'anti\']}',
 '{"gender": [\'adult\', \'men\'],"color": [\'pink\'],"category": [\'goggles\', \'plugs\'],"style": [\'swimming\', \'ear\', \'pu\', \'anti\', \'uv\']}',
 '{"gender": [\'adult\', \'men\'],"color": [\'pink\'],"category": [\'goggles\'],"style": [\'swimming\', \'comfortable\', \'large\', \'frame\', \'type\']}',
 '{"gender": [\'adult\', \'men\'],"color": [\'pink\'],"category": [\'hat\', \'trucker\'],"style": [\'cool\', \'office\', \'show\', \'merchandise\', \'x\']}',
 '{"gender": [\'adult\', \'men\'],"color": [\'pink\

In [63]:

# "{""gender"": ['adult', 'men'],""color"": ['black'],""category"": ['jewelry', 'rings'],""style"": ['straight', 'cz', 'nose', 'studs', 'crystal']}",['2pc-18g-black-straight-cz-nose-bone-studs-2mm-crystal-gem-surgical-steel-nostril-jewelry-B0B64XTJ1R']

small = product_df[product_df["primary_key"]=="2pc-18g-black-straight-cz-nose-bone-studs-2mm-crystal-gem-surgical-steel-nostril-jewelry-B0B64XTJ1R"]

In [67]:
small

Unnamed: 0,main_category,title,average_rating,rating_number,features,description,price,images,videos,store,...,parent_asin,bought_together,primary_key,image_url,tokens,gender_tokens,color_tokens,category_tokens,other_tokens,text_for_embedding
400190,AMAZON FASHION,2pc 18g Black Straight CZ Nose Bone Studs 2mm ...,4.2,38,"['Ball closure', 'SUPERIOR 316LVM SURGICAL STE...",['These nose rings bone studs 18g are designed...,10.99,[{'thumb': 'https://m.media-amazon.com/images/...,[{'title': 'EVONIX Premium Body Piercing Jewel...,Evonix,...,B0B64XTJ1R,,2pc-18g-black-straight-cz-nose-bone-studs-2mm-...,https://m.media-amazon.com/images/I/41otdMchNi...,"[black, straight, cz, nose, bone, studs, cryst...","[adult, men]",[black],"[jewelry, rings]","[straight, cz, nose, studs, crystal]","{""gender"": ['adult', 'men'],""color"": ['black']..."


In [66]:
small["title"].unique()

array(['2pc 18g Black Straight CZ Nose Bone Studs 2mm Crystal Gem Surgical Steel Nostril Jewelry'],
      dtype=object)

In [68]:
small.to_dict()

{'main_category': {400190: 'AMAZON FASHION'},
 'title': {400190: '2pc 18g Black Straight CZ Nose Bone Studs 2mm Crystal Gem Surgical Steel Nostril Jewelry'},
 'average_rating': {400190: 4.2},
 'rating_number': {400190: 38},
 'features': {400190: "['Ball closure', 'SUPERIOR 316LVM SURGICAL STEEL - These bone nose stud black are hypoallergenic, lead-free and nickel-free. The construction of the cz nose bone is corrosion resistant, bio-compatible and easy to clean. The bright polished finish of these nose bone studs 2mm is tarnish-resistant and non-fading', 'SPECIFICATIONS - Crystal: 2 x Aurora Borealis crystal CZ gems that are bright and very clear (no fogginess) - they twinkle and sparkle in the light, looking cute and pretty whilst capturing attention. Length: 7mm. Gauge: 18g - 1mm. Ball End: 2mm gem / crystal. These 2mm nose bone are designed and tested for quality. Get the most out of your piercing with these 18 gauge nose bone', 'PIERCING STYLE - These 18g nose bone can be used for 