# Phone recommendation from free-text

This adds a lightweight NLP parser and a scoring-based recommender on top of `mobile_phone_2025.csv`.

Approach:
- Parse user text (budget, brand hints, needs like gaming/camera/battery/display/5G).
- Map dataset columns dynamically (aliases like camera_quality, battery_life, ram, storage, price, brand, etc.).
- Compute aspect scores and a weighted overall score; filter by constraints (e.g., budget).
- Return top suggestions with a short explanation. 

In [41]:
# Data loading
import pandas as pd
import numpy as np

data = pd.read_csv('mobile_phone_2025.csv', encoding='latin1')
print(f"Loaded {len(data)} rows, {len(data.columns)} columns from mobile_phone_2025.csv")

Loaded 930 rows, 15 columns from mobile_phone_2025.csv


In [42]:
# Utilities: column normalization and feature helpers
import re
from typing import Dict, Any, List, Tuple

# Normalizer
norm = lambda s: re.sub(r"[^a-z0-9]+", "_", str(s).strip().lower())

# Create a normalized lookup for dataset columns
_original_cols = list(data.columns)
_norm_map = {c: norm(c) for c in _original_cols}
_rev_norm = {v: k for k, v in _norm_map.items()}

# Known aliases for common features (expanded for this dataset)
ALIASES = {
    "brand": ["brand", "manufacturer", "oem", "company", "company_name"],
    "price": [
        "price", "mrp", "cost", "street_price", "launched_price",
        "launched_price_india", "launched_price_pakistan", "launched_price_usa",
        "launched_price_china", "launched_price_dubai",
    ],
    "camera": ["camera_quality", "camera", "rear_camera", "main_camera_mp", "camera_mp", "back_camera"],
    "battery": ["battery_life", "battery", "battery_capacity", "battery_mah"],
    "ram": ["ram", "memory_ram_gb"],
    "storage": ["storage", "rom", "storage_gb"],
    "display": ["display", "screen", "display_size", "screen_size", "refresh_rate", "screen_refresh"],
    "chipset": ["chipset", "soc", "processor", "cpu_score"],
    "weight": ["weight", "mass", "mobile_weight"],
    "charging": ["charging", "charge_watts", "fast_charge"],
    "front_camera": ["front_camera", "selfie_camera_mp"],
    "fiveg": ["5g", "five_g", "supports_5g", "is_5g"],
}

# Resolve the first alias that exists in the dataset (with fuzzy fallback)

def resolve_col(key: str) -> str | None:
    candidates = [key]
    if key in ALIASES:
        candidates = ALIASES[key] + [key]
    # Normalize all candidates
    norms = [norm(c) for c in candidates]
    # 1) direct normalized match
    for n in norms:
        if n in _rev_norm:
            return _rev_norm[n]
    # 2) fuzzy contains match
    for n in norms:
        for k in _rev_norm.keys():
            if n and (n in k or k in n):
                return _rev_norm[k]
    # 3) exact original column names match
    for cand in candidates:
        for col in _original_cols:
            if str(col).strip().lower() == cand.lower():
                return col
    return None

# Safe numeric coercion

def to_num(x):
    try:
        if pd.isna(x):
            return np.nan
        # remove commas and non-digit (keep dot)
        if isinstance(x, str):
            y = re.sub(r"[^0-9.]+", "", x)
            return float(y) if y != "" else np.nan
        return float(x)
    except Exception:
        return np.nan

# Min-max scaling to [0,1]

def minmax(series: pd.Series, higher_is_better: bool = True) -> pd.Series:
    s = series.astype(float)
    vmin, vmax = np.nanmin(s.values), np.nanmax(s.values)
    if not np.isfinite(vmin) or not np.isfinite(vmax) or vmax == vmin:
        return pd.Series([0.0] * len(s), index=s.index)
    scaled = (s - vmin) / (vmax - vmin)
    if not higher_is_better:
        scaled = 1.0 - scaled
    return scaled

# Currency selection helpers for price
CURRENCY_PREF = {
    'bdt': ['india', 'pakistan', 'usa', 'dubai', 'china', 'price'],
    'inr': ['india', 'price'],
    'usd': ['usa', 'price'],
    'pkr': ['pakistan', 'price'],
    'aed': ['dubai', 'price'],
    'cny': ['china', 'price'],
}
COL_CURRENCY_BY_NAME = {
    'india': 'inr',
    'pakistan': 'pkr',
    'usa': 'usd',
    'dubai': 'aed',
    'china': 'cny'
}
# approximate conversion to BDT (very rough; for filtering only)
BDT_RATE = {'bdt':1.0,'inr':1.3,'usd':120.0,'pkr':0.40,'aed':33.0,'cny':16.0}


def choose_price_column(budget_currency: str = 'bdt') -> Tuple[str | None, str | None]:
    keys = list(_rev_norm.keys())
    price_like = [k for k in keys if 'price' in k]
    if not price_like:
        return None, None
    prefs = CURRENCY_PREF.get(budget_currency, ['price'])
    # rank by preference
    for tag in prefs:
        for k in price_like:
            if tag in k:
                col = _rev_norm[k]
                # detect currency from tag if possible
                cur = COL_CURRENCY_BY_NAME.get(tag, budget_currency)
                return col, cur
    # fallback
    col = _rev_norm[price_like[0]]
    return col, budget_currency

# Quickly check which important columns exist
AVAILABLE = {k: resolve_col(k) for k in [
    "brand","price","camera","battery","ram","storage","display","chipset","front_camera","charging","fiveg","weight"
]}
print({k: v for k, v in AVAILABLE.items() if v is not None})

# Price column inferred for budget ops (computed later per request as needed)


{'brand': 'Company Name', 'price': 'Launched Price (Pakistan)', 'camera': 'Back Camera', 'battery': 'Battery Capacity', 'ram': 'RAM', 'display': 'Screen Size', 'chipset': 'Processor', 'front_camera': 'Front Camera', 'weight': 'Mobile Weight'}


In [43]:
# Lightweight NLP parsing of user request into structured preferences
import math

BRAND_VOCAB = None
if AVAILABLE.get("brand") is not None:
    # build simple brand list from data
    brands = data[AVAILABLE["brand"]].dropna().astype(str).str.strip().str.lower().unique().tolist()
    BRAND_VOCAB = sorted({b for b in brands if b and b != 'nan'})

CURRENCY_WORDS = {
    'taka': 'bdt', 'tk': 'bdt', 'bdt': 'bdt',
    'rs': 'inr', 'rupees': 'inr', 'inr': 'inr',
    'usd': 'usd', 'dollar': 'usd', '$': 'usd'
}

INTENT_KEYWORDS = {
    'gaming': ["gaming", "game", "gamer", "pubg", "genshin", "high fps", "smooth"],
    'camera': ["camera", "photo", "photography", "pictures", "selfie", "video", "stabilization"],
    'battery': ["battery", "battery life", "long lasting", "screen on", "charge", "mah", "power"],
    'display': ["display", "screen", "amoled", "oled", "ips", "refresh", "hz", "bright", "hdr"],
    'performance': ["performance", "fast", "responsive", "lag", "processor", "chipset", "soc"],
    'fiveg': ["5g", "five g", "5-g"],
    'value': ["budget", "cheap", "value", "bang for buck", "under", "within"]
}

BRAND_ALIASES = {
    # simple normalization (e.g., mi->xiaomi)
    "mi": "xiaomi",
    "redmi": "xiaomi",
    "poco": "xiaomi",
    "realme": "realme",
    "samsung": "samsung",
    "apple": "apple",
    "iphone": "apple",
    "oneplus": "oneplus",
    "oppo": "oppo",
    "vivo": "vivo",
    "motorola": "motorola",
    "google": "google",
    "pixel": "google",
    "infinix": "infinix",
    "tecno": "tecno"
}

budget_regex = re.compile(r"(?:under|below|within|around|upto|up to|budget|price)\s*[:=]?\s*([\d,.]+)\s*(tk|taka|bdt|inr|rs|usd|dollar|\$)?", re.I)
number_only_budget = re.compile(r"\b([\d]{4,7})\b")  # fallback if user just writes a large number


def parse_user_text(text: str) -> Dict[str, Any]:
    t = text.lower()
    # budget
    budget = None
    currency = 'bdt'  # default to BDT if in Bangladesh context
    m = budget_regex.search(t)
    if m:
        budget_val = m.group(1)
        cur = m.group(2)
        if cur:
            currency = CURRENCY_WORDS.get(cur.lower(), currency)
        try:
            budget = float(re.sub(r"[^0-9.]", "", budget_val))
        except Exception:
            budget = None
    else:
        # fallback: a large number without currency likely budget
        m2 = number_only_budget.search(t)
        if m2:
            try:
                budget = float(m2.group(1))
            except Exception:
                budget = None

    # brands
    brands_mentioned: List[str] = []
    if BRAND_VOCAB:
        for w in set(re.findall(r"[a-zA-Z0-9+]+", t)):
            wn = BRAND_ALIASES.get(w, w)
            if wn in BRAND_VOCAB:
                brands_mentioned.append(wn)
    brands_mentioned = list(dict.fromkeys(brands_mentioned))  # de-dup preserve order

    # intents/weights
    weights = {
        'gaming': 0.0,
        'camera': 0.0,
        'battery': 0.0,
        'display': 0.0,
        'performance': 0.0,
        'fiveg': 0.0,
        'value': 0.0
    }
    for k, kws in INTENT_KEYWORDS.items():
        for kw in kws:
            if kw in t:
                weights[k] += 1.0
    # normalize weights; if all zero, set balanced defaults
    if sum(weights.values()) == 0:
        weights = {k: 1.0 for k in weights}
    else:
        s = sum(weights.values())
        weights = {k: v / s for k, v in weights.items()}

    return {
        'budget': budget,
        'currency': currency,
        'brands': brands_mentioned,
        'weights': weights
    }

# quick smoke test
print(parse_user_text("I want a gaming phone under 30000 tk with great battery and good camera. Prefer Samsung."))

{'budget': 30000.0, 'currency': 'bdt', 'brands': ['samsung'], 'weights': {'gaming': 0.25, 'camera': 0.25, 'battery': 0.25, 'display': 0.0, 'performance': 0.0, 'fiveg': 0.0, 'value': 0.25}}


In [44]:
# Recommendation engine
from dataclasses import dataclass

@dataclass
class Rec:
    name: str
    score: float
    price: Any
    brand: Any
    reasons: List[str]

# Try to identify a name column for the phone model
NAME_COL_GUESSES = [
    'model', 'phone', 'name', 'device', 'title', 'product', 'variant', 'model_name'
]
NAME_COL = None
for g in NAME_COL_GUESSES:
    c = resolve_col(g)
    if c is not None:
        NAME_COL = c
        break
if NAME_COL is None:
    # fallback: first column that is object dtype and not brand
    obj_cols = [c for c in _original_cols if data[c].dtype == 'object' and c != AVAILABLE.get('brand')]
    NAME_COL = obj_cols[0] if obj_cols else _original_cols[0]

# Define aspect score builders, robust to missing columns

def build_aspect_scores(df: pd.DataFrame) -> Dict[str, pd.Series]:
    scores = {}
    # performance: prefer higher chipset/cpu score, ram
    perf_cols = []
    if AVAILABLE.get('chipset'): perf_cols.append(AVAILABLE['chipset'])
    if AVAILABLE.get('ram'): perf_cols.append(AVAILABLE['ram'])
    if perf_cols:
        perf = sum([minmax(df[c].map(to_num)) for c in perf_cols]) / len(perf_cols)
    else:
        perf = pd.Series([0.0]*len(df), index=df.index)
    scores['performance'] = perf

    # gaming: performance + display refresh if present
    game_cols = []
    if AVAILABLE.get('display'):
        # try to detect a refresh rate column by name pattern if display is numeric; else 0
        cand = resolve_col('refresh_rate') or resolve_col('screen_refresh') or AVAILABLE['display']
        if cand:
            game_cols.append(cand)
    game_parts = [perf]
    for c in game_cols:
        if c in df.columns:
            game_parts.append(minmax(df[c].map(to_num)))
    scores['gaming'] = sum(game_parts) / len(game_parts)

    # camera: rear camera + front camera if present
    cam_parts = []
    if AVAILABLE.get('camera'):
        cam_parts.append(minmax(df[AVAILABLE['camera']].map(to_num)))
    if AVAILABLE.get('front_camera') and AVAILABLE['front_camera'] in df.columns:
        cam_parts.append(minmax(df[AVAILABLE['front_camera']].map(to_num)))
    scores['camera'] = sum(cam_parts)/len(cam_parts) if cam_parts else pd.Series([0.0]*len(df), index=df.index)

    # battery: capacity + charging speed if any
    bat_parts = []
    if AVAILABLE.get('battery'):
        bat_parts.append(minmax(df[AVAILABLE['battery']].map(to_num)))
    if AVAILABLE.get('charging') and AVAILABLE['charging'] in df.columns:
        bat_parts.append(minmax(df[AVAILABLE['charging']].map(to_num)))
    scores['battery'] = sum(bat_parts)/len(bat_parts) if bat_parts else pd.Series([0.0]*len(df), index=df.index)

    # display: size/refresh/brightness proxies
    disp_parts = []
    if AVAILABLE.get('display'):
        disp_parts.append(minmax(df[AVAILABLE['display']].map(to_num)))
    scores['display'] = sum(disp_parts)/len(disp_parts) if disp_parts else pd.Series([0.0]*len(df), index=df.index)

    # fiveg: binary preference if column exists
    fiveg_col = AVAILABLE.get('fiveg')
    if fiveg_col and fiveg_col in df.columns:
        five = df[fiveg_col].astype(str).str.lower().isin(['1','true','yes','y','5g','supported']).astype(float)
        scores['fiveg'] = five
    else:
        scores['fiveg'] = pd.Series([0.0]*len(df), index=df.index)

    # value: placeholder; will be recomputed in recommend_phones with currency-aware prices
    scores['value'] = perf

    return scores

ASPECT_SCORES = build_aspect_scores(data)


def recommend_phones(text: str, top_k: int = 5) -> List[Rec]:
    parsed = parse_user_text(text)
    weights = parsed['weights']
    df = data.copy()

    # Choose price column and map to BDT for fair comparison
    price_col, price_cur = choose_price_column(parsed.get('currency', 'bdt'))
    reasons_global = []
    price_bdt = None
    if price_col is not None:
        raw_price = df[price_col].map(to_num)
        cur = price_cur or parsed.get('currency', 'bdt')
        rate = BDT_RATE.get(cur, 1.0)
        price_bdt = raw_price * rate
        reasons_global.append(f"Using price column '{price_col}' assumed currency {cur.upper()} with conversion to BDT (rate≈{rate}).")

    # Budget filter if available
    if parsed['budget'] and price_bdt is not None:
        df = df[price_bdt <= parsed['budget'] * 1.05]  # small tolerance
        price_bdt = price_bdt.loc[df.index]
        reasons_global.append(f"Filtered to budget ≤ {int(parsed['budget'])} BDT.")

    # Brand preference boost
    brand_boost = pd.Series([1.0]*len(df), index=df.index)
    if parsed['brands'] and AVAILABLE.get('brand') and not df.empty:
        brand_col = AVAILABLE['brand']
        brand_boost = df[brand_col].astype(str).str.lower().apply(lambda b: 1.08 if BRAND_ALIASES.get(b, b) in parsed['brands'] else 1.0)
        reasons_global.append(f"Boosting preferred brands: {', '.join(parsed['brands'])}.")

    # FiveG hard preference: if text explicitly asks 5g, filter to fiveg==True if possible
    if not df.empty and weights.get('fiveg', 0) > 0.15 and AVAILABLE.get('fiveg') and AVAILABLE['fiveg'] in df.columns:
        five_col = AVAILABLE['fiveg']
        mask_5g = df[five_col].astype(str).str.lower().isin(['1','true','yes','y','5g','supported'])
        if mask_5g.any():
            df = df[mask_5g]
            if price_bdt is not None:
                price_bdt = price_bdt.loc[df.index]
            reasons_global.append("Filtered to 5G-capable phones.")

    if df.empty:
        return []

    # Rebuild aspect scores limited to filtered DF index
    local_scores = {k: v.loc[df.index] for k, v in ASPECT_SCORES.items()}

    # Recompute 'value' using currency-aware inverse price if available
    if price_bdt is not None:
        inv_price = minmax(price_bdt, higher_is_better=False)
        local_scores['value'] = (inv_price + local_scores['performance']) / 2

    # Weighted sum
    aspects = ['gaming','camera','battery','display','performance','value','fiveg']
    total = None
    for a in aspects:
        if a in local_scores:
            comp = local_scores[a] * weights.get(a, 0)
            total = comp if total is None else total + comp
    total = total.fillna(0.0) * brand_boost

    df = df.assign(__score=total)
    top = df.sort_values('__score', ascending=False).head(top_k)

    # Build explanations per item
    recs: List[Rec] = []
    for idx, row in top.iterrows():
        reasons = [] + reasons_global
        # strongest aspects
        aspect_contribs: List[Tuple[str, float]] = []
        for a in aspects:
            if a in local_scores:
                aspect_contribs.append((a, float(local_scores[a].loc[idx] * weights.get(a, 0))))
        aspect_contribs.sort(key=lambda x: x[1], reverse=True)
        for a, val in aspect_contribs[:3]:
            if val > 0:
                pretty = a.capitalize()
                reasons.append(f"Strong for {pretty}.")
        # add key specs snippets
        brand_val = row.get(AVAILABLE.get('brand'), '')
        price_val = row.get(price_col, '') if price_col is not None else row.get(AVAILABLE.get('price'), '')
        cam_val = row.get(AVAILABLE.get('camera'), '') if AVAILABLE.get('camera') else ''
        bat_val = row.get(AVAILABLE.get('battery'), '') if AVAILABLE.get('battery') else ''
        ram_val = row.get(AVAILABLE.get('ram'), '') if AVAILABLE.get('ram') else ''
        stor_val = row.get(AVAILABLE.get('storage'), '') if AVAILABLE.get('storage') else ''
        reasons.append(f"Specs: RAM {ram_val}, Storage {stor_val}, Camera {cam_val}, Battery {bat_val}.")

        name = str(row.get(NAME_COL, brand_val))
        recs.append(Rec(name=name, score=float(row['__score']), price=price_val, brand=brand_val, reasons=reasons))

    return recs

# Pretty print helper

def print_recommendations(text: str, top_k: int = 5):
    recs = recommend_phones(text, top_k=top_k)
    if not recs:
        print("No phones matched the request. Consider increasing budget or relaxing constraints.")
        return
    print(f"Top {len(recs)} suggestions:")
    for i, r in enumerate(recs, 1):
        price_str = r.price if pd.isna(r.price) else str(r.price)
        print(f"{i}. {r.name} ({r.brand}) — Score: {r.score:.3f} — Price: {price_str}")
        for reason in r.reasons:
            print(f"   - {reason}")

In [46]:
# Example run using your prompt
example_text = (
    "good battery life, and decent performance within 10000"
)
print(example_text)
print()
print_recommendations(example_text, top_k=3)

good battery life, and decent performance within 10000

Top 3 suggestions:
1. Smart HD 32GB (Infinix) — Score: 0.288 — Price: INR 5,999
   - Using price column 'Launched Price (India)' assumed currency INR with conversion to BDT (rate≈1.3).
   - Filtered to budget ≤ 10000 BDT.
   - Strong for Battery.
   - Strong for Value.
   - Strong for Performance.
   - Specs: RAM 2GB, Storage , Camera 8MP, Battery 5,000mAh.
2. Pop 9 64GB (Tecno) — Score: 0.229 — Price: INR 6,999
   - Using price column 'Launched Price (India)' assumed currency INR with conversion to BDT (rate≈1.3).
   - Filtered to budget ≤ 10000 BDT.
   - Strong for Battery.
   - Strong for Value.
   - Strong for Performance.
   - Specs: RAM 3GB, Storage , Camera 8MP, Battery 5,000mAh.
3. Pop 8 64GB (Tecno) — Score: 0.229 — Price: INR 6,999
   - Using price column 'Launched Price (India)' assumed currency INR with conversion to BDT (rate≈1.3).
   - Filtered to budget ≤ 10000 BDT.
   - Strong for Battery.
   - Strong for Value.
  