In [36]:
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
import csv

# Input locations. Swap in the commented-out small file for quick experiments.
categories_file_name = r'/workspace/datasets/product_data/categories/categories_0001_abcat0010000_to_pcmcat99300050000.xml'
# categories_file_name = r'/workspace/datasets/product_data/categories/categories.small.xml'
queries_file_name = r'/workspace/datasets/train.csv'

# The root category, named Best Buy with id cat00000, doesn't have a parent.
root_category_id = 'cat00000'

tree = ET.parse(categories_file_name)
root = tree.getroot()

# Parse the category XML file to map each category id to its parent category id in a dataframe.
# For every element under the XML root, the last id listed in its <path> is
# taken as the category itself and the id just before it as its parent.
categories = []
parents = []
for child in root:
    # NOTE: no local named `id` here -- it would shadow the builtin `id()` for
    # every later cell, and the value was never used.
    cat_path_ids = [cat.find('id').text for cat in child.find('path')]
    leaf_id = cat_path_ids[-1]
    # The root has no parent, so it gets no row in the mapping.
    if leaf_id != root_category_id:
        categories.append(leaf_id)
        parents.append(cat_path_ids[-2])
parents_df = pd.DataFrame({'category': categories, 'parent': parents})
parents_df

Unnamed: 0,category,parent
0,abcat0010000,cat00000
1,abcat0011000,abcat0010000
2,abcat0011001,abcat0011000
3,abcat0011002,abcat0011000
4,abcat0011003,abcat0011000
...,...,...
4634,pcmcat97200050013,cat15205
4635,pcmcat97200050015,cat15063
4636,pcmcat99000050001,pcmcat50000050006
4637,pcmcat99000050002,pcmcat99000050001


In [38]:
# Load the query log, keeping only the two columns used below.
# `usecols` avoids parsing the other columns; the explicit re-selection fixes
# the column order, which `usecols` alone takes from the file.
df = pd.read_csv(queries_file_name, usecols=['category', 'query'])[['category', 'query']]
# Keep only queries whose category appears in the parsed category list.
# `.copy()` makes the filtered frame independent, so the column assignments
# made to `df` in later cells do not raise SettingWithCopyWarning.
df = df[df['category'].isin(categories)].copy()
df

Unnamed: 0,category,query
0,abcat0101001,Televisiones Panasonic 50 pulgadas
1,abcat0101001,Sharp
2,pcmcat193100050014,nook
3,abcat0101001,rca
4,abcat0101005,rca
...,...,...
1865264,pcmcat247400050000,ttv
1865265,pcmcat218000050000,incase
1865266,pcmcat248500050020,ds games
1865267,pcmcat209000050008,Archos


In [39]:
import re
from nltk.corpus import stopwords
from nltk import \
    SnowballStemmer, \
    word_tokenize

# Shared text-normalisation resources, built once and reused for every query.
# English stop words are held in a set for fast membership tests.
stop_words = set(stopwords.words('english'))
stemmer = SnowballStemmer("english")

def transform_query(s: str):
    """Normalise a raw query string.

    The text is lower-cased, single and double quote characters are removed,
    and the result is tokenised with ``word_tokenize``. Tokens present in
    ``stop_words`` are dropped and the remaining ones are stemmed with
    ``stemmer``.

    Returns:
        The stemmed tokens joined by single spaces.
    """
    unquoted = re.sub('[\'"]', '', s.lower())
    stems = (
        stemmer.stem(word)
        for word in word_tokenize(unquoted)
        if word not in stop_words
    )
    return " ".join(stems)

# Overwrite each raw query with its normalised (lower-cased, stemmed) form.
df["query"] = df["query"].map(transform_query)
df

Unnamed: 0,category,query
0,abcat0101001,television panason 50 pulgada
1,abcat0101001,sharp
2,pcmcat193100050014,nook
3,abcat0101001,rca
4,abcat0101005,rca
...,...,...
1865264,pcmcat247400050000,ttv
1865265,pcmcat218000050000,incas
1865266,pcmcat248500050020,ds game
1865267,pcmcat209000050008,archo


In [40]:
# Work on an independent copy so that `df` keeps the original categories
# while `d` has its categories rewritten further down.
# For a quick experiment, use a sample instead: df.head(100) or df.head(10_000).
d = df.copy()  # DataFrame.copy() is deep by default
d

Unnamed: 0,category,query
0,abcat0101001,television panason 50 pulgada
1,abcat0101001,sharp
2,pcmcat193100050014,nook
3,abcat0101001,rca
4,abcat0101005,rca
...,...,...
1865264,pcmcat247400050000,ttv
1865265,pcmcat218000050000,incas
1865266,pcmcat248500050020,ds game
1865267,pcmcat209000050008,archo


In [58]:
# Attach to every row the number of queries recorded for that row's category.
d["nb_queries"] = d.groupby("category")["query"].transform("size")
d

Unnamed: 0,category,query,nb_queries
0,abcat0101001,television panason 50 pulgada,80213
1,abcat0101001,sharp,80213
2,pcmcat193100050014,nook,13826
3,abcat0101001,rca,80213
4,abcat0101005,rca,1042
...,...,...,...
1865264,pcmcat247400050000,ttv,79245
1865265,pcmcat218000050000,incas,6685
1865266,pcmcat248500050020,ds game,2376
1865267,pcmcat209000050008,archo,74258


In [59]:
# Threshold used below: rows whose category has fewer queries than this are
# reassigned to the parent category.
min_queries = 1000

# Number of DISTINCT categories still below the threshold.
# `.nunique()` counts categories; `.count()` would count query rows instead,
# which does not match what the variable name promises.
nb_categories_having_less_than_min_queries = (
    d.loc[d["nb_queries"] < min_queries, "category"].nunique()
)
nb_categories_having_less_than_min_queries

0

In [57]:
def get_parent(category_code: str):
    """Return the parent category id of ``category_code``.

    The lookup is done against ``parents_df`` (columns ``category`` and
    ``parent``). If a category is listed more than once, the first row wins.

    Args:
        category_code: Category id to look up.

    Returns:
        The parent's category id, or ``category_code`` itself when it is the
        root category (the root has no parent).

    Raises:
        IndexError: If ``category_code`` is not present in ``parents_df``.
            The exception type matches what the previous ``.values[0]``
            lookup raised; the message now names the offending id.
    """
    # Root category: no parent
    if category_code == root_category_id:
        return category_code
    # Single .loc[rows, column] selection instead of chained indexing.
    matches = parents_df.loc[parents_df['category'] == category_code, 'parent']
    if matches.empty:
        raise IndexError(f"unknown category id: {category_code!r}")
    return matches.iat[0]
# get_parent("abcat0101001")

def fn(series):
    """Choose the category label for one row.

    Args:
        series: A row exposing ``"category"`` and ``"nb_queries"`` by key.

    Returns:
        The parent category (via ``get_parent``) when the row's
        ``nb_queries`` is below ``min_queries``; otherwise the row's own
        category, unchanged.
    """
    below_threshold = series["nb_queries"] < min_queries
    own_category = series["category"]
    return get_parent(own_category) if below_threshold else own_category

# Roll under-populated categories up to their parent, repeating until every
# remaining category has at least `min_queries` queries (or is the root).
# One pass is not enough: a parent that absorbs its children can itself still
# be below the threshold, so the counts are recomputed and the step repeated.
# The lookup is vectorised (Series.map) rather than a row-wise apply.
parent_by_category = (
    parents_df.drop_duplicates("category")  # first row wins for duplicate ids
    .set_index("category")["parent"]
)
# Each pass moves a row at most one level up the tree, so the number of known
# categories bounds the number of passes needed; exceeding it means the
# parent table is cyclic.
for rollup_pass in range(len(parent_by_category) + 1):
    d["nb_queries"] = d.groupby("category")["query"].transform("size")
    too_small = (d["nb_queries"] < min_queries) & (d["category"] != root_category_id)
    if not too_small.any():
        break
    current = d.loc[too_small, "category"]
    rolled_up = current.map(parent_by_category)
    missing = rolled_up.isna()
    if missing.any():
        raise KeyError(
            f"no parent found for categories: {sorted(current[missing].unique())}"
        )
    # Positional assignment: `rolled_up` has exactly one value per selected row.
    d.loc[too_small, "category"] = rolled_up.to_numpy()
else:
    raise RuntimeError("category roll-up did not converge; check parents_df for cycles")
d

Unnamed: 0,category,query,nb_queries
0,abcat0101001,television panason 50 pulgada,80213
1,abcat0101001,sharp,80213
2,pcmcat193100050014,nook,13826
3,abcat0101001,rca,80213
4,abcat0101005,rca,1042
...,...,...,...
1865264,pcmcat247400050000,ttv,79245
1865265,pcmcat218000050000,incas,6685
1865266,pcmcat248500050020,ds game,2376
1865267,pcmcat209000050008,archo,74258


In [60]:
# Compare the number of distinct categories before (df) and after (d) roll-up.
nb_original_categories = df["category"].nunique(dropna=False)
nb_rolled_up_categories = d["category"].nunique(dropna=False)
print("categ", nb_original_categories, "parent", nb_rolled_up_categories)

categ 1486 parent 388


In [61]:
output_file_name = r'/workspace/datasets/labeled_query_data.min1000.txt'

# Build one "__label__<category> <query>" line per row.
# The lines come from `d`, which holds the rolled-up categories; building them
# from `df` would write the original categories and ignore the roll-up.
labeled_queries = ('__label__' + d['category'] + ' ' + d['query']).to_frame('output')
# QUOTE_NONE with an escape character writes each line without CSV quoting;
# any '|' separator character inside a query is escaped instead.
labeled_queries.to_csv(output_file_name, header=False, sep='|', escapechar='\\', quoting=csv.QUOTE_NONE, index=False)
"ok"


'ok'