# 📊 Kasparro — Applied AI Market Intelligence (Kaggle Notebook)

**Author:** Viresh Nagouda

This notebook:
- Cleans & unifies **Google Play (Kaggle)** + **Apple App Store (RapidAPI)** data
- Generates **structured insights** with **confidence scores**
- Exports **combined_apps.csv**, **insights.json**, **report.html**
- Includes a **CLI** here and a **Streamlit app** (for local run)

**Run order:** top → bottom.


In [13]:
# --- Kaggle Secrets (Gemini + RapidAPI) ---
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()

# If you stored your Gemini key in Kaggle Secrets as "GOOGLE_API_KEY", fetch it here:
GEMINI_API_KEY = user_secrets.get_secret("GOOGLE_API_KEY")  # Google AI Studio key
RAPIDAPI_KEY   = user_secrets.get_secret("RAPIDAPI_KEY")    # Optional

# We'll prefer Gemini (Google) and ignore OPENAI unless you add one later.
OPENAI_API_KEY = None
USE_GEMINI = bool(GEMINI_API_KEY)

print("Gemini key available:", USE_GEMINI)
print("RapidAPI key available:", bool(RAPIDAPI_KEY))


Gemini key available: True
RapidAPI key available: True


In [8]:
import os, json, math, time, textwrap
from typing import Optional, Dict, Any
import numpy as np
import pandas as pd

# Detect Kaggle output directory
OUTPUT_DIR = "/kaggle/working" if os.path.exists("/kaggle/working") else "out"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Paths for Kaggle vs local
KAGGLE_ANDROID_PATH = "/kaggle/input/google-play-store-apps/googleplaystore.csv"
LOCAL_ANDROID_PATH  = "/kaggle/input/google-play-store-apps/googleplaystore.csv"
ANDROID_PATH = KAGGLE_ANDROID_PATH if os.path.exists(KAGGLE_ANDROID_PATH) else LOCAL_ANDROID_PATH

# Optional Phase 5 Excel (attach via Add Data if available)
PHASE5_XLSX_CANDIDATES = [
    "/kaggle/input/kasparro-phase5-d2c-synthetic-dataset",
    "/kaggle/input/phase5-d2c/Kasparro_Phase5_D2C_Synthetic_Dataset.xlsx",
    "/kaggle/input/kasparro-d2c/Kasparro_Phase5_D2C_Synthetic_Dataset.xlsx",
    "Kasparro_Phase5_D2C_Synthetic_Dataset.xlsx",
]
PHASE5_XLSX_PATH = next((p for p in PHASE5_XLSX_CANDIDATES if os.path.exists(p)), None)

print("OUTPUT_DIR:", OUTPUT_DIR)
print("Phase 5 Excel found:", bool(PHASE5_XLSX_PATH))


OUTPUT_DIR: /kaggle/working
Phase 5 Excel found: True


In [9]:
# Safe to run even if already installed; will be quick or no-op offline.
try:
    import google.generativeai as genai
except:
    try:
        !pip -q install google-generativeai
        import google.generativeai as genai
    except Exception as e:
        print("Could not install google-generativeai (likely internet disabled). Using fallbacks.", e)


In [10]:
if not os.path.exists(ANDROID_PATH):
    raise FileNotFoundError("Google Play dataset not found. Attach `lava18/google-play-store-apps` via Add Data.")

raw_gp = pd.read_csv(ANDROID_PATH)
print("Raw Google Play shape:", raw_gp.shape)
raw_gp.head()


Raw Google Play shape: (10841, 13)


Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [11]:
def parse_price(x):
    if pd.isna(x): return 0.0
    s = str(x).strip().replace("$","")
    try: return float(s)
    except: return 0.0

def parse_installs(x):
    if pd.isna(x): return np.nan
    s = str(x).replace("+","").replace(",","").strip()
    try: return int(s)
    except: return np.nan

def parse_size(x):
    if pd.isna(x): return np.nan
    s = str(x).strip().upper()
    mult = 1.0
    if s.endswith("M"): mult = 1_000_000; s = s[:-1]
    elif s.endswith("K"): mult = 1_000;    s = s[:-1]
    s = s.replace(",","")
    try:
        val = float(s)*mult
        return val/(1024*1024)  # → MB
    except: return np.nan

def normalize_category(cat):
    if pd.isna(cat): return "Unknown"
    c = str(cat).strip().title().replace("&"," & ")
    return " ".join(c.split())

def clean_google_play(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df = df[df["App"].notna() & df["Category"].notna()]
    df = df.drop_duplicates(subset=["App","Category","Reviews","Installs"], keep="first")

    df["price_usd"]        = df["Price"].apply(parse_price)
    df["installs"]         = df["Installs"].apply(parse_installs)
    df["size_mb"]          = df["Size"].apply(parse_size)
    df["rating"]           = pd.to_numeric(df["Rating"], errors="coerce")
    df["reviews_count"]    = pd.to_numeric(df["Reviews"], errors="coerce")
    df["primary_category"] = df["Category"].apply(normalize_category)
    df["app_name"]         = df["App"].astype(str).str.strip()
    df["content_rating"]   = df["Content Rating"].astype(str).str.strip()
    df["last_updated"]     = pd.to_datetime(df["Last Updated"], errors="coerce")
    df["platform"]         = "android"

    cols = ["app_name","platform","primary_category","rating","reviews_count",
            "price_usd","installs","size_mb","content_rating","last_updated"]
    return df[cols].reset_index(drop=True)

gp = clean_google_play(raw_gp)
print("Cleaned Google Play:", gp.shape)
gp.head()


Cleaned Google Play: (10356, 10)


Unnamed: 0,app_name,platform,primary_category,rating,reviews_count,price_usd,installs,size_mb,content_rating,last_updated
0,Photo Editor & Candy Camera & Grid & ScrapBook,android,Art_And_Design,4.1,159.0,0.0,10000.0,18.119812,Everyone,2018-01-07
1,Coloring book moana,android,Art_And_Design,3.9,967.0,0.0,500000.0,13.35144,Everyone,2018-01-15
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",android,Art_And_Design,4.7,87510.0,0.0,5000000.0,8.296967,Everyone,2018-08-01
3,Sketch - Draw & Paint,android,Art_And_Design,4.5,215644.0,0.0,50000000.0,23.841858,Teen,2018-06-08
4,Pixel Draw - Number Art Coloring Book,android,Art_And_Design,4.3,967.0,0.0,100000.0,2.670288,Everyone,2018-06-20


In [16]:
# ============= Robust Apple App Store fetch =============
import warnings
warnings.filterwarnings("ignore")  # silence harmless display warnings

try:
    import requests
except Exception:
    requests = None

def _itunes_search(term: str, country: str="us", limit:int=1):
    """
    Try Apple's iTunes Search API first (no key needed).
    Docs: https://itunes.apple.com/search
    """
    if requests is None:
        raise RuntimeError("requests not available")
    url = "https://itunes.apple.com/search"
    params = {"term": term, "country": country, "entity": "software", "limit": limit}
    r = requests.get(url, params=params, timeout=15)
    r.raise_for_status()
    return r.json()

APPSTORE_HOST = "appstore-scrapper-api.p.rapidapi.com"  # RapidAPI (may vary by provider)

def _rapidapi_search(term: str, country: str="us"):
    """
    Try RapidAPI provider (optional). Endpoint paths differ by provider.
    If your plan uses another path, swap it here.
    """
    if (requests is None) or (RAPIDAPI_KEY is None):
        raise RuntimeError("RapidAPI not available")
    url = f"https://{APPSTORE_HOST}/search"
    headers = {"x-rapidapi-key": RAPIDAPI_KEY, "x-rapidapi-host": APPSTORE_HOST}
    params = {"term": term, "country": country, "lang": "en_us"}
    r = requests.get(url, headers=headers, params=params, timeout=15)
    r.raise_for_status()
    return r.json()

def appstore_search(term: str, country: str="us") -> Dict[str, Any]:
    """
    1) iTunes Search API
    2) RapidAPI (optional)
    3) Mock fallback
    """
    # 1) iTunes official API
    try:
        data = _itunes_search(term, country, limit=1)
        results = data.get("results", [])
        if results:
            r = results[0]
            return {"results": [{
                "trackName": r.get("trackName"),
                "primaryGenreName": r.get("primaryGenreName"),
                "averageUserRating": r.get("averageUserRating"),   # may be None on search
                "userRatingCount": r.get("userRatingCount"),
                "price": r.get("price", 0.0),
                "isFree": (r.get("price", 0.0) in [0, 0.0]),
                "contentAdvisoryRating": r.get("contentAdvisoryRating"),
                "currentVersionReleaseDate": r.get("currentVersionReleaseDate"),
                "fileSizeBytes": r.get("fileSizeBytes"),
            }]}
    except Exception as e:
        print("iTunes Search API failed → trying RapidAPI:", e)

    # 2) RapidAPI provider
    try:
        data = _rapidapi_search(term, country)
        return data
    except Exception as e:
        print("RapidAPI error → using mock:", e)

    # 3) Mock fallback
    return {"results": [{
        "trackName": f"{term.title()} App (Mock)",
        "primaryGenreName": "Utilities",
        "averageUserRating": 4.2,
        "userRatingCount": 5432,
        "price": 0.99,
        "isFree": False,
        "contentAdvisoryRating": "9+",
        "currentVersionReleaseDate": "2024-02-10T10:00:00Z",
        "fileSizeBytes": str(80*1024*1024),
    }]}

def transform_appstore_result(r: Dict[str, Any]) -> Dict[str, Any]:
    try:
        size_mb = float(r.get("fileSizeBytes", 0)) / (1024*1024) if r.get("fileSizeBytes") else None
    except:
        size_mb = None

    return {
        "app_name":         r.get("trackName"),
        "platform":         "ios",
        "primary_category": r.get("primaryGenreName"),
        "rating":           r.get("averageUserRating"),
        "reviews_count":    r.get("userRatingCount"),
        "price_usd":        float(r.get("price", 0.0)) if r.get("price") is not None else 0.0,
        "installs":         np.nan,  # Apple doesn't expose installs
        "size_mb":          size_mb,
        "content_rating":   r.get("contentAdvisoryRating"),
        "last_updated":     pd.to_datetime(r.get("currentVersionReleaseDate"), errors="coerce"),
    }

sample_terms = ["notion", "slack", "spotify"]
ios_rows = []
for t in sample_terms:
    data = appstore_search(t)
    for r in data.get("results", [])[:1]:
        ios_rows.append(transform_appstore_result(r))

ios_df = pd.DataFrame(ios_rows)
print("iOS sample rows:", ios_df.shape)
ios_df.head()


iOS sample rows: (3, 10)


Unnamed: 0,app_name,platform,primary_category,rating,reviews_count,price_usd,installs,size_mb,content_rating,last_updated
0,"Notion: Notes, Tasks, AI",ios,Productivity,4.78633,67430,0.0,,310.94043,4+,2025-09-23 16:54:53+00:00
1,Slack,ios,Business,3.89415,29524,0.0,,398.038086,17+,2025-09-24 19:01:15+00:00
2,Spotify: Music and Podcasts,ios,Music,4.78774,37050671,0.0,,220.545898,12+,2025-09-24 14:06:51+00:00


In [17]:
combined = pd.concat([gp, ios_df], ignore_index=True, axis=0)
combined_path = os.path.join(OUTPUT_DIR, "combined_apps.csv")
combined.to_csv(combined_path, index=False)
combined_path


'/kaggle/working/combined_apps.csv'

In [18]:
def zscore(series: pd.Series):
    return (series - series.mean()) / (series.std(ddof=0) + 1e-9)

cat_stats = combined.groupby("primary_category").agg(
    n_apps=("app_name","count"),
    avg_rating=("rating","mean"),
    med_installs=("installs", lambda s: np.nanmedian(pd.to_numeric(s, errors="coerce"))),
    sum_reviews=("reviews_count","sum"),
).reset_index()

cat_stats["z_installs"]  = zscore(np.log1p(cat_stats["med_installs"].fillna(0)))
cat_stats["z_rating"]    = zscore(cat_stats["avg_rating"].fillna(cat_stats["avg_rating"].median()))
cat_stats["z_comp"]      = zscore(-cat_stats["n_apps"])  # fewer apps → better
cat_stats["opportunity_score"] = (
    0.5*cat_stats["z_installs"] + 0.4*cat_stats["z_rating"] + 0.1*cat_stats["z_comp"]
)

# Confidence: min(1, sqrt(n/50)) * variance penalty on rating dispersion
var_penalty = 1 - (cat_stats["avg_rating"].fillna(0).std() / 5.0)
var_penalty = np.clip(var_penalty, 0.6, 1.0)
cat_stats["confidence"] = np.minimum(1.0, np.sqrt(cat_stats["n_apps"]/50.0)) * var_penalty

top_cats = cat_stats.sort_values("opportunity_score", ascending=False).head(10)
top_cats


Unnamed: 0,primary_category,n_apps,avg_rating,med_installs,sum_reviews,z_installs,z_rating,z_comp,opportunity_score,confidence
0,1.9,1,19.0,,0.0,-3.158025,5.821449,0.818108,0.831378,0.084853
10,Entertainment,111,4.136036,5000000.0,47570716.0,1.300582,-0.203333,0.513021,0.62026,0.6
26,Photography,322,4.182895,3000000.0,204297410.0,1.152927,-0.184339,-0.072191,0.495509,0.6
9,Education,130,4.375969,1000000.0,23165500.0,0.835372,-0.106081,0.460324,0.421286,0.6
34,Weather,82,4.244,1000000.0,14604735.0,0.835372,-0.159572,0.593453,0.413203,0.6
28,Shopping,223,4.252239,1000000.0,94862756.0,0.835372,-0.156232,0.202387,0.375432,0.6
29,Social,280,4.254918,1000000.0,533576829.0,0.835372,-0.155146,0.044297,0.360057,0.6
33,Video_Players,175,4.06375,1000000.0,110380188.0,0.835372,-0.232632,0.335516,0.358185,0.6
7,Communication,366,4.151466,1000000.0,601273552.0,0.835372,-0.197078,-0.194225,0.319432,0.6
17,House_And_Home,80,4.164706,500000.0,2794772.0,0.635017,-0.191712,0.599,0.300724,0.6


In [33]:
import itertools

def llm_gemini_summary(prompt: str) -> str:
    if not GEMINI_API_KEY:
        return "(Gemini disabled)\n" + prompt
    try:
        import google.generativeai as genai
        genai.configure(api_key=GEMINI_API_KEY)

        # Get models that actually support generateContent
        try:
            models = genai.list_models()
            supported = [m.name for m in models if "generateContent" in getattr(m, "supported_generation_methods", [])]
        except Exception:
            # If listing fails (sometimes in Kaggle), fall back to a reasonable shortlist
            supported = []

        # Preferred order; we'll intersect with what's supported if we could list
        preferred = [
            "models/gemini-2.0-flash-latest",
            "models/gemini-2.0-flash",
            "models/gemini-2.0-pro-latest",
            "models/gemini-2.0-pro",
            "models/gemini-2.0-pro",
        ]

        # Build final candidate list
        candidates = preferred
        if supported:
            # Normalize names (SDK sometimes returns "models/<id>")
            supported_set = set(supported)
            pref_norm = [m if m.startswith("models/") else f"models/{m}" for m in preferred]
            # keep preferred that exist, then add any other supported as fallback
            candidates = [m for m in pref_norm if m in supported_set] + [m for m in supported if m not in pref_norm]

        last_err = None
        for mid in candidates:
            try:
                model = genai.GenerativeModel(mid)
                resp = model.generate_content(
f"""You are a growth analyst. Based on these category lines, give 4 crisp, actionable recommendations (bullets). Be concise.

{prompt}
""",
                    request_options={"timeout": 60}
                )
                # v0.8.x: resp.text usually populated; stitch if not
                if getattr(resp, "text", None):
                    return resp.text
                parts = []
                for c in getattr(resp, "candidates", []) or []:
                    for p in getattr(getattr(c, "content", None), "parts", []) or []:
                        t = getattr(p, "text", "") or ""
                        if t: parts.append(t)
                if parts:
                    return "\n".join(parts)
                last_err = "Empty response"
            except Exception as e:
                last_err = e
                continue

        return f"(Gemini error fallback) {last_err}\n\n{prompt}"
    except Exception as e:
        return f"(Gemini import/config error) {e}\n\n{prompt}"


In [34]:
# Drop garbage categories like "1.9" (no letters / too short)
mask_valid_cat = combined["primary_category"].astype(str).str.contains(r"[A-Za-z]", na=False)
combined_clean = combined[mask_valid_cat & (combined["primary_category"].str.len() >= 2)].copy()

# Recompute on cleaned data
def zscore(series: pd.Series):
    return (series - series.mean()) / (series.std(ddof=0) + 1e-9)

cat_stats = combined_clean.groupby("primary_category").agg(
    n_apps=("app_name","count"),
    avg_rating=("rating","mean"),
    med_installs=("installs", lambda s: np.nanmedian(pd.to_numeric(s, errors="coerce"))),
    sum_reviews=("reviews_count","sum"),
).reset_index()

cat_stats["z_installs"]  = zscore(np.log1p(cat_stats["med_installs"].fillna(0)))
cat_stats["z_rating"]    = zscore(cat_stats["avg_rating"].fillna(cat_stats["avg_rating"].median()))
cat_stats["z_comp"]      = zscore(-cat_stats["n_apps"])
cat_stats["opportunity_score"] = 0.5*cat_stats["z_installs"] + 0.4*cat_stats["z_rating"] + 0.1*cat_stats["z_comp"]

var_penalty = 1 - (cat_stats["avg_rating"].fillna(0).std() / 5.0)
var_penalty = np.clip(var_penalty, 0.6, 1.0)
cat_stats["confidence"] = np.minimum(1.0, np.sqrt(cat_stats["n_apps"]/50.0)) * var_penalty

top_cats = cat_stats.sort_values("opportunity_score", ascending=False).head(10)
top_cats


Unnamed: 0,primary_category,n_apps,avg_rating,med_installs,sum_reviews,z_installs,z_rating,z_comp,opportunity_score,confidence
8,Education,130,4.375969,1000000.0,23165500.0,0.870535,1.124998,0.482186,0.933485,0.971001
33,Weather,82,4.244,1000000.0,14604740.0,0.870535,0.201161,0.61471,0.577203,0.971001
27,Shopping,223,4.252239,1000000.0,94862760.0,0.870535,0.258836,0.225421,0.561344,0.971001
28,Social,280,4.254918,1000000.0,533576800.0,0.870535,0.277592,0.068048,0.553109,0.971001
9,Entertainment,111,4.136036,5000000.0,47570720.0,1.415973,-0.55463,0.534643,0.539599,0.971001
25,Photography,322,4.182895,3000000.0,204297400.0,1.242854,-0.2266,-0.04791,0.525996,0.971001
0,Art_And_Design,65,4.358065,100000.0,1714440.0,0.090191,0.999659,0.661645,0.511124,0.971001
15,Health_And_Fitness,306,4.26145,500000.0,30845190.0,0.635627,0.323321,-0.003735,0.446769,0.971001
14,Game,1121,4.281285,1000000.0,1415537000.0,0.870535,0.462171,-2.253882,0.394748,0.971001
23,Parenting,60,4.3,100000.0,958331.0,0.090191,0.593184,0.67545,0.349914,0.971001


In [35]:
summary_rows = format_summary_rows(top_cats)
llm_summary = llm_gemini_summary(summary_rows)
print(llm_summary)


* **Prioritize Education:** High score and confidence indicate strong potential. Focus on improving discoverability within the education category.
* **Explore Entertainment, Photography:**  Median installs are significantly higher than others, suggesting substantial market reach. Analyze the competitive landscape, focusing on user acquisition strategies to improve positioning in these categories.
* **Investigate Art_And_Design, Parenting:**  These categories have relatively low competition but also significantly lower install numbers.  Assess user engagement and refine marketing strategies to drive growth in these niche categories.
* **Analyze Game category:**  High competition, moderate score.  Investigate strategies to gain a competitive edge, likely focusing on specific game genres or unique features.



In [37]:
def format_summary_rows(df: pd.DataFrame) -> str:
    lines = []
    for _, r in df.iterrows():
        ri = []
        if pd.notna(r["avg_rating"]):   ri.append(f"avg rating ~ {r['avg_rating']:.2f}")
        if pd.notna(r["med_installs"]):
            mi = r["med_installs"]
            ri.append(f"median installs ~ {int(mi) if not math.isnan(mi) else 'NA'}")
        ri.append(f"competition n={int(r['n_apps'])}")
        lines.append(f"- **{r['primary_category']}** → score={r['opportunity_score']:.2f}, conf={r['confidence']:.2f} ({', '.join(ri)})")
    return "\n".join(lines)

summary_rows = format_summary_rows(top_cats)

def llm_gemini_summary(prompt: str) -> str:
    try:
        import google.generativeai as genai
        genai.configure(api_key=GEMINI_API_KEY)
        model = genai.GenerativeModel("gemini-2.0-flash")
        resp = model.generate_content(
            f"""You are a growth analyst. Based on these category lines, give 4 crisp, actionable recommendations (bullets). Be concise.

{prompt}
"""
        )
        return resp.text
    except Exception as e:
        return f"(Gemini error fallback) {str(e)}\n\n{prompt}"

if GEMINI_API_KEY:
    llm_summary = llm_gemini_summary(summary_rows)
else:
    llm_summary = "(LLM disabled) Focus on categories with high installs & ratings but fewer competing apps:\n" + summary_rows

print(llm_summary)


Here are four actionable recommendations based on the category analysis:

*   **Prioritize Education:** Invest heavily in the Education category. High score, high confidence, good ratings, and substantial installs suggest a strong, addressable market.
*   **Explore Art & Design or Parenting (Niche Opportunities):** Despite lower scores, Art & Design and Parenting have low competition and high average ratings, indicating potential for targeted, high-quality apps that cater to specific user needs.
*   **Avoid direct competition in Game or Photography:** The Game and Photography categories are overcrowded and have relatively low scores, making them challenging markets to penetrate without significant differentiation or a substantial marketing budget.
*   **Carefully Evaluate Weather, Shopping, and Social (Highly Competitive):** These categories show moderate scores but high competition. Any entry here requires a compelling unique selling proposition (USP) and a strong marketing strategy.


In [38]:
insights = {
    "generated_at_utc": pd.Timestamp.utcnow().isoformat(),
    "methodology": {
        "opportunity_score": "0.5*z_installs + 0.4*z_rating + 0.1*z_comp (z_comp uses -n_apps)",
        "confidence": "min(1, sqrt(n_apps/50)) * variance_penalty_on_rating_dispersion"
    },
    "top_categories": top_cats.to_dict(orient="records"),
    "llm_summary": llm_summary
}
insights_path = os.path.join(OUTPUT_DIR, "insights.json")
with open(insights_path, "w") as f:
    json.dump(insights, f, indent=2, default=str)
insights_path


'/kaggle/working/insights.json'

In [39]:
def render_html_report(combined_df: pd.DataFrame, top_cats_df: pd.DataFrame, llm_summary: str) -> str:
    head = """
    <html><head><meta charset="utf-8"><title>Kasparro Market Intelligence Report</title>
    <style>
    body { font-family: Arial, sans-serif; margin: 24px; }
    h1,h2 { margin-top: 1.1em; }
    table { border-collapse: collapse; width: 100%; margin-top: 12px; }
    th, td { border: 1px solid #ddd; padding: 8px; font-size: 14px; }
    th { background: #f6f6f6; }
    code, pre { background: #f3f3f3; padding: 8px; display: block; white-space: pre-wrap; }
    </style></head><body>
    """
    title = "<h1>Kasparro — AI-Powered Market Intelligence</h1>"
    meta  = f"<p><small>Generated: {pd.Timestamp.utcnow().strftime('%Y-%m-%d %H:%M UTC')}</small></p>"
    top_html = top_cats_df.to_html(index=False)
    sample = combined_df.sample(min(10, len(combined_df)), random_state=42).sort_index().to_html(index=False)
    body = f"""
    <h2>Recommendations</h2>
    <pre>{llm_summary}</pre>
    <h2>Top Categories (Opportunity Ranking)</h2>
    {top_html}
    <h2>Combined Dataset — Sample</h2>
    {sample}
    """
    return head + title + meta + body + "</body></html>"

html = render_html_report(combined, top_cats, llm_summary)
report_path = os.path.join(OUTPUT_DIR, "report.html")
with open(report_path, "w", encoding="utf-8") as f:
    f.write(html)
report_path


'/kaggle/working/report.html'

In [40]:
def query_insights(category: Optional[str]=None, top_k: int=5):
    if category:
        sub = combined[combined["primary_category"].str.lower() == category.lower()]
        if sub.empty:
            print("No data for category:", category)
            return
        display(sub.describe(include="all"))
    else:
        display(top_cats.head(top_k)[["primary_category","opportunity_score","confidence","avg_rating","med_installs","n_apps"]])

# Try:
query_insights()  # top-5


Unnamed: 0,primary_category,opportunity_score,confidence,avg_rating,med_installs,n_apps
8,Education,0.933485,0.971001,4.375969,1000000.0,130
33,Weather,0.577203,0.971001,4.244,1000000.0,82
27,Shopping,0.561344,0.971001,4.252239,1000000.0,223
28,Social,0.553109,0.971001,4.254918,1000000.0,280
9,Entertainment,0.539599,0.971001,4.136036,5000000.0,111


In [41]:
app_py = """
import json, pandas as pd, streamlit as st
st.set_page_config(page_title='Kasparro Market Intelligence', layout='wide')
st.title('Kasparro — AI-Powered Market Intelligence')

df  = pd.read_csv('combined_apps.csv')
ins = json.load(open('insights.json'))

st.subheader('Top Categories')
st.dataframe(pd.DataFrame(ins['top_categories']))

st.subheader('Browse Combined Dataset')
st.dataframe(df.head(200))
"""

with open(os.path.join(OUTPUT_DIR,"app.py"), "w", encoding="utf-8") as f:
    f.write(app_py)

print("Wrote app.py to", OUTPUT_DIR)


Wrote app.py to /kaggle/working
