<a href="https://colab.research.google.com/github/ze11ey/StackTest/blob/main/Untitled0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
ROOT = "/content/drive/MyDrive/Sneaker-Heat-Dataset/Sneaker‑Heat-500"

import os, glob
print("Files under ROOT:", os.listdir(ROOT))
print("Output folder OK?:", os.path.isdir(f"{ROOT}/output"))


Files under ROOT: ['output', 'grail_whitelist.yaml', 'secrets.env', 'Untitled0.ipynb']
Output folder OK?: True


In [3]:
!pip -q install python-dotenv
from dotenv import load_dotenv
import os

load_dotenv(f"{ROOT}/secrets.env", override=True)

print("RAPIDAPI_KEY present? ", bool(os.getenv("RAPIDAPI_KEY")))
print("REDDIT_CLIENT_ID present?", bool(os.getenv("REDDIT_CLIENT_ID")))
print("REDDIT_SECRET present?   ", bool(os.getenv("REDDIT_SECRET")))


RAPIDAPI_KEY present?  True
REDDIT_CLIENT_ID present? True
REDDIT_SECRET present?    True


In [4]:
!pip -q install pyyaml
import yaml, pprint

with open(f"{ROOT}/grail_whitelist.yaml") as f:
    grails = yaml.safe_load(f)

print("Loaded", len(grails), "grails:")
pprint.pprint(grails[:5])


Loaded 25 grails:
['Louis Vuitton x Nike Air Force 1 Low “Virgil Abloh” (2022)',
 'Nike MAG “Back to the Future” (2011/2016)',
 'Nike SB Dunk Low “Paris” (2003)',
 'Air Jordan 1 “Solid Silver” (1995)',
 'Nike Dunk High “Wu-Tang” F&F (1999)']


In [5]:
# ------------- 3‑A · pull StockX top list -------------------
import requests, pandas as pd, time

def stockx_top(limit=2000):
    host = "stockx-api.p.rapidapi.com"
    url  = f"https://{host}/stockx/products"
    rows, page, per = [], 1, 100
    while len(rows) < limit:
        r = requests.get(
                url,
                headers={"X-RapidAPI-Key": os.getenv("RAPIDAPI_KEY"),
                         "X-RapidAPI-Host": host},
                params={"page": page,
                        "resultsPerPage": per,
                        "sort": "market_value",
                        "order": "DESC"},
                timeout=20,
             ).json().get("data", [])
        if not r:
            break
        rows.extend(r)
        page += 1
        time.sleep(0.25)
    return pd.DataFrame(rows)

stockx_df = stockx_top()
print("Pulled rows:", len(stockx_df))
# preview with the correct field names the API returns
print(stockx_df[['title', 'slug', 'avg_price', 'weekly_orders']].head())


Pulled rows: 1000
                                               title  \
0                   Jordan 3 Retro Pure Money (2025)   
1                     Nike Air Max 95 OG Levis Black   
2  Jordan 1 Retro High OG Shattered Backboard (2025)   
3                  Nike Air Max 95 OG Levis Obsidian   
4                    Nike Kobe 6 Protro Total Orange   

                                                slug   avg_price  \
0                 air-jordan-3-retro-pure-money-2025  216.250000   
1                     nike-air-max-95-og-levis-black  299.818182   
2  air-jordan-1-retro-high-og-shattered-backboard...  271.543478   
3                  nike-air-max-95-og-levis-obsidian  533.461538   
4                            nike-kobe-6-protro-wnba  335.148148   

   weekly_orders  
0            803  
1            475  
2            407  
3            390  
4            311  


In [6]:
# ------------- 3‑B · tidy StockX data -----------------
keep_cols = {
    "title":          "title",
    "slug":           "urlKey",
    "avg_price":      "market_value",   # already in dollars
    "weekly_orders":  "weekly_orders",
}

stockx_df_clean = (stockx_df[list(keep_cols)]
                     .rename(columns=keep_cols))

# msrp unavailable here → set NaN so later merge won’t fail
import numpy as np, pandas as pd
stockx_df_clean["msrp"] = np.nan

print(stockx_df_clean.head())
print("\nColumns now:", stockx_df_clean.columns.tolist())


                                               title  \
0                   Jordan 3 Retro Pure Money (2025)   
1                     Nike Air Max 95 OG Levis Black   
2  Jordan 1 Retro High OG Shattered Backboard (2025)   
3                  Nike Air Max 95 OG Levis Obsidian   
4                    Nike Kobe 6 Protro Total Orange   

                                              urlKey  market_value  \
0                 air-jordan-3-retro-pure-money-2025    216.250000   
1                     nike-air-max-95-og-levis-black    299.818182   
2  air-jordan-1-retro-high-og-shattered-backboard...    271.543478   
3                  nike-air-max-95-og-levis-obsidian    533.461538   
4                            nike-kobe-6-protro-wnba    335.148148   

   weekly_orders  msrp  
0            803   NaN  
1            475   NaN  
2            407   NaN  
3            390   NaN  
4            311   NaN  

Columns now: ['title', 'urlKey', 'market_value', 'weekly_orders', 'msrp']


In [7]:
# ------------- 4‑A · coverage check -----------------
def norm(t):                       # simple case‑fold for comparison
    return str(t).strip().lower()

stockx_titles = set(stockx_df_clean["title"].map(norm))

missing = [g for g in grails if norm(g) not in stockx_titles]
print(f"{len(missing)} grails NOT in StockX pull:")
for g in missing:
    print(" •", g)


25 grails NOT in StockX pull:
 • Louis Vuitton x Nike Air Force 1 Low “Virgil Abloh” (2022)
 • Nike MAG “Back to the Future” (2011/2016)
 • Nike SB Dunk Low “Paris” (2003)
 • Air Jordan 1 “Solid Silver” (1995)
 • Nike Dunk High “Wu-Tang” F&F (1999)
 • UNDFTD x Air Jordan 4 “Undefeated” (2005)
 • Nike Air Yeezy 2 “Red October” (2014)
 • Air Jordan 11 “Jeter” (2017)
 • Concepts x Nike SB Dunk Low “Yellow Lobster” (2009)
 • Nike SB Dunk Low “Staple NYC Pigeon” (2005)
 • Eminem x Carhartt x Air Jordan 4 (2015)
 • Nike SB Dunk Low “What The Dunk” (2007)
 • Air Jordan 4 “Manila” (2020)
 • Cactus Plant Flea Market x Nike Dunk Low “Spiral Sage” (2020)
 • DJ Khaled x Air Jordan 3 “Grateful” (2017)
 • Air Jordan 1 High “Dior” (2020)
 • Travis Scott x PlayStation x Nike Dunk Low (2020)
 • Off-White x Air Jordan 1 “Chicago” (2017)
 • Chanel x Pharrell x Adidas NMD Hu (2017)
 • Nike Air Yeezy 1 “Grammy” Prototype (2008)
 • Nike SB Dunk Low “Freddy Krueger” Sample (2007)
 • Air Jordan 1 High “Colett

In [8]:
# create minimal rows for missing grails
import pandas as pd, numpy as np

extra_rows = pd.DataFrame({
    "title":         missing,
    "urlKey":        [np.nan]*len(missing),
    "market_value":  [np.nan]*len(missing),
    "weekly_orders": [np.nan]*len(missing),
    "msrp":          [np.nan]*len(missing),
})

master_df = pd.concat([stockx_df_clean, extra_rows], ignore_index=True)
print("Master size after merge:", len(master_df))
master_df.head()


Master size after merge: 1025


Unnamed: 0,title,urlKey,market_value,weekly_orders,msrp
0,Jordan 3 Retro Pure Money (2025),air-jordan-3-retro-pure-money-2025,216.25,803.0,
1,Nike Air Max 95 OG Levis Black,nike-air-max-95-og-levis-black,299.818182,475.0,
2,Jordan 1 Retro High OG Shattered Backboard (2025),air-jordan-1-retro-high-og-shattered-backboard...,271.543478,407.0,
3,Nike Air Max 95 OG Levis Obsidian,nike-air-max-95-og-levis-obsidian,533.461538,390.0,
4,Nike Kobe 6 Protro Total Orange,nike-kobe-6-protro-wnba,335.148148,311.0,


In [9]:
# --- quick helper‑presence check ---
for fn in ["reddit_upvotes", "google_trends_pop",
           "tiktok_vid_count", "instagram_post_count",
           "twitter_sentiment"]:
    print(f"{fn:22} →", "OK" if fn in globals() else "MISSING")


reddit_upvotes         → MISSING
google_trends_pop      → MISSING
tiktok_vid_count       → MISSING
instagram_post_count   → MISSING
twitter_sentiment      → MISSING


In [10]:
# ------------- Step 5 · enrich master_df -------------------
import time, numpy as np, pandas as pd

# helper list for iteration
enrich_cols = ["reddit_upvotes", "google_trends_pop",
               "tiktok_vid_count", "instagram_post_count",
               "twitter_sentiment"]

# add missing columns as NaN
for c in enrich_cols:
    if c not in master_df.columns:
        master_df[c] = np.nan

N = 100            # <-- set to len(master_df) when ready for full run
start = master_df[master_df["reddit_upvotes"].isna()].index.min()  # resume
end   = min(start + N, len(master_df))

print(f"Enriching rows {start} .. {end-1}")

for i in range(start, end):
    row = master_df.loc[i]
    tag = row["title"]            # simplistic tag; tweak if desired

    master_df.at[i, "reddit_upvotes"]      = reddit_upvotes(tag)
    master_df.at[i, "google_trends_pop"]   = google_trends_pop(tag)
    master_df.at[i, "tiktok_vid_count"]    = tiktok_vid_count(tag)
    master_df.at[i, "instagram_post_count"]= instagram_post_count(tag)
    master_df.at[i, "twitter_sentiment"]   = twitter_sentiment(tag)

    if i % 25 == 0:
        print(f"  … processed {i - start + 1}/{N}")
        master_df.to_csv(f"{ROOT}/output/master_enriched_step5.csv", index=False)

        time.sleep(0.5)   # small pause to respect rate limits

print("✓ Step 5 partial run finished")
master_df.to_csv(f"{ROOT}/output/master_enriched_step5.csv", index=False)
master_df.head()


Enriching rows 0 .. 99


NameError: name 'reddit_upvotes' is not defined