<a href="https://colab.research.google.com/github/trahinhasan/Marketplace-Product-Title-Quality-Classification/blob/main/marketplace_product_title_quality_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the product CSV into a DataFrame.
# NOTE(review): this is an absolute Colab path (/content/sample_data); presumably the
# CSV has to be uploaded there before the notebook can run - confirm.
df = pd.read_csv("/content/sample_data/walmart_products_free_dataset.csv")

In [None]:
# (rows, columns) of the freshly loaded frame.
df.shape

In [None]:
# List the available columns; only 'title' is used from here on.
df.columns

In [None]:
# Keep only the title column; labels and features below are all derived from it.
df = df[['title']]

In [None]:
# Count missing values per column before dropping them.
df.isnull().sum()

In [None]:
# Drop rows without a title: clean_title() calls .lower() and would fail on NaN.
df = df.dropna(subset=['title'])

In [None]:
# Keep a single row per distinct raw title.
df = df.drop_duplicates(subset=["title"])

In [None]:
# Eyeball 10 random titles (no seed is set, so the rows differ on every run).
df.sample(10)

In [None]:
# First rows of the de-duplicated title frame.
df.head()

In [None]:
import re

def clean_title(text):
  """Normalize a raw product title for keyword matching.

  The title is lowercased, every character outside a-z, 0-9 and space is
  replaced by a space, runs of whitespace are collapsed to one space, and
  leading/trailing whitespace is removed.

  Args:
    text: Raw title string.

  Returns:
    The normalized title string.
  """
  lowered = text.lower()
  # Anything that is not a lowercase ASCII letter, digit or space becomes a space.
  alnum_only = re.sub(r'[^a-z0-9 ]', ' ', lowered)
  return re.sub(r'\s+', ' ', alnum_only).strip()


In [None]:
# Add the normalized title as a new column. assign() returns a fresh DataFrame
# instead of writing into a frame that was produced by earlier slicing/filtering,
# which avoids a possible SettingWithCopyWarning and keeps the cell safe to re-run.
df = df.assign(clean_title=df['title'].apply(clean_title))

In [None]:
# Lowercase keyword vocabularies that assign_label() and extract_features()
# look for in the cleaned titles.
brands = ['samsung','apple','nike','sony','lg','hp','dell']
colors = ['black','white','red','blue','green','silver']
materials = ['cotton','leather','plastic','steel','wood']



In [None]:
def assign_label(title, brand_terms=None, color_terms=None, material_terms=None):
    """Assign a heuristic quality label to a cleaned product title.

    One point is scored for each kind of signal found in the title: a brand
    term, a color term, a material term, and at least one digit.

    Vocabulary terms are matched as whole words. A plain substring check would
    count 'lg' inside "algae" or 'red' inside "powered" and inflate the score.

    Args:
        title: Lowercased, normalized title string (as produced by clean_title).
        brand_terms: Brand vocabulary; defaults to the notebook-level `brands`.
        color_terms: Color vocabulary; defaults to the notebook-level `colors`.
        material_terms: Material vocabulary; defaults to the notebook-level
            `materials`.

    Returns:
        "Good" when the score is >= 3 and the title has at least 5 tokens,
        "Mediocre" when the score is >= 1 and it has at least 3 tokens,
        otherwise "Bad".
    """
    # Fall back to the notebook-level vocabularies when none are supplied.
    brand_terms = brands if brand_terms is None else brand_terms
    color_terms = colors if color_terms is None else color_terms
    material_terms = materials if material_terms is None else material_terms

    tokens = title.split()
    length = len(tokens)

    # Space-pad the re-joined tokens so that `" term " in padded` only matches
    # complete words (multi-word terms keep working as well).
    padded = " " + " ".join(tokens) + " "

    def has_any(terms):
        return any((" " + term + " ") in padded for term in terms)

    brand_present = has_any(brand_terms)
    color_present = has_any(color_terms)
    material_present = has_any(material_terms)
    number_present = any(char.isdigit() for char in title)

    # Booleans sum as 0/1, so the score is the number of signal kinds present.
    score = sum([brand_present, color_present, material_present, number_present])

    if score >= 3 and length >= 5:
        return "Good"
    elif score >= 1 and length >= 3:
        return "Mediocre"
    else:
        return "Bad"


In [None]:
# Label every cleaned title with the heuristic rule. assign() returns a fresh
# DataFrame rather than writing into a frame derived from earlier filtering,
# which avoids a possible SettingWithCopyWarning and keeps the cell safe to re-run.
df = df.assign(label=df['clean_title'].apply(assign_label))


In [None]:
# Class balance of the heuristic labels.
df['label'].value_counts()


In [None]:
def extract_features(title, brand_terms=None, color_terms=None, material_terms=None):
    """Turn a cleaned product title into numeric model features.

    Vocabulary terms are matched as whole words. A plain substring check would
    count 'lg' inside "algae" or 'red' inside "powered".

    Args:
        title: Lowercased, normalized title string (as produced by clean_title).
        brand_terms: Brand vocabulary; defaults to the notebook-level `brands`.
        color_terms: Color vocabulary; defaults to the notebook-level `colors`.
        material_terms: Material vocabulary; defaults to the notebook-level
            `materials`.

    Returns:
        pd.Series with:
            title_length   - number of whitespace-separated tokens
            brand_count    - number of distinct brand terms present
            color_count    - number of distinct color terms present
            material_count - number of distinct material terms present
            number_count   - number of digit characters in the title
    """
    # Fall back to the notebook-level vocabularies when none are supplied.
    brand_terms = brands if brand_terms is None else brand_terms
    color_terms = colors if color_terms is None else color_terms
    material_terms = materials if material_terms is None else material_terms

    tokens = title.split()
    # Space-pad the re-joined tokens so that `" term " in padded` only matches
    # complete words (multi-word terms keep working as well).
    padded = " " + " ".join(tokens) + " "

    def count_present(terms):
        # Each vocabulary term counts at most once, however often it occurs.
        return sum((" " + term + " ") in padded for term in terms)

    return pd.Series({
        'title_length': len(tokens),
        'brand_count': count_present(brand_terms),
        'color_count': count_present(color_terms),
        'material_count': count_present(material_terms),
        'number_count': sum(char.isdigit() for char in title)
    })


In [None]:
# Build the feature table: extract_features returns a Series per title, which
# apply() expands into one row per title and one column per feature key.
features = df['clean_title'].apply(extract_features)


In [None]:
# Model inputs (numeric title features) and target (heuristic quality label).
# NOTE(review): the labels come from assign_label, which uses the same
# keyword/digit/length signals as the features, so the classifier is learning
# to reproduce that rule rather than an independent ground truth.
X = features
y = df['label']


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; random_state fixes the shuffle seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 200 trees, each capped at depth 10; random_state fixes the seed.
model = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    random_state=42
)

# Fit on the training split only; the test split is kept for evaluation.
model.fit(X_train, y_train)


In [None]:
from sklearn.metrics import classification_report

# Predict on the held-out split and print the per-class classification report.
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))


In [None]:
import pandas as pd

# Feature importances of the fitted forest, indexed by feature name and
# displayed from most to least important.
importance = pd.Series(model.feature_importances_, index=X.columns)
importance.sort_values(ascending=False)


In [None]:
import pickle

# Persist the fitted model to the current working directory.
# Only unpickle this file from a trusted source: pickle.load can execute arbitrary code.
with open("title_quality_model.pkl", "wb") as f:
    pickle.dump(model, f)
