In [12]:
import pandas as pd
import glob
# Read all .csv file in this directory and merge them into one dataframe
# Then save the dataframe to a new csv file

# Get all .csv files in the current directory
csv_files = glob.glob('*_products.csv')

# Read all .csv files and merge them into one dataframe
df = pd.concat([pd.read_csv(file) for file in csv_files])

# Save the dataframe to a new csv file
df.to_csv('merged_products_count.csv', index=False)


In [13]:
# Group by full_gpc and count the number of products, and order by count
df.full_gpc = df.full_gpc.str.lower()
df = df.groupby('full_gpc').size().reset_index(name='count').sort_values(by='count', ascending=False)


In [14]:
df.to_csv('product_counts.csv', index=False)

In [15]:
# Find all full_gpc that  contains upper case letter
# Load facebook product category txt
file = "fb_product_categories_en_US.csv"
fb_categories = pd.read_csv(file)


In [16]:
# Find all rows that are not in the fb_categories
df[~df['full_gpc'].isin(fb_categories.category)]['full_gpc'].values

array(['sporting goods > outdoor recreation > water sports & boating > boating rafting & paddling',
       "clothing & accessories > clothing > men's clothing > tops > t-shirts",
       'sporting goods > outdoor recreation > water sports & boating > boating rafting & paddling > row boats',
       ...,
       'sporting goods > exercise & fitness > exercise equipment mats',
       'sporting goods > exercise & fitness > exercise mat storage racks',
       'home > home goods > kitchen & dining > small kitchen appliances > other small kitchen appliances'],
      shape=(1212,), dtype=object)

In [17]:
df[df['full_gpc'].isin(fb_categories.category)]['full_gpc']

2399                                                other
2350                                        media > books
427     clothing & accessories > clothing > unisex clo...
501     clothing & accessories > clothing > women's cl...
554     clothing & accessories > clothing accessories ...
                              ...                        
2022    home improvement > building supplies & hardwar...
277     clothing & accessories > clothing > baby cloth...
2673    sporting goods > exercise & fitness > cardio e...
274     clothing & accessories > clothing > baby clothing
1201    home > cleaning supplies > party supplies > ca...
Name: full_gpc, Length: 2138, dtype: object

In [18]:
# and make it lower case
# df['full_gpc'] = df['full_gpc'].str.lower()
# parse the full_gpc to a list of categories
df['full_gpc_list'] = df['full_gpc'].str.split(' > ')


In [19]:
# For each full_gpc list change it multiple columns and kepp the count
tdf = pd.DataFrame(df['full_gpc_list'].tolist(), index=df.index)
# Order the columns by the [full_gpc, 0, 1, 2, 3, 4, 5, count]
df = pd.concat([df, tdf], axis=1)
df = df[["full_gpc", 0, 1, 2, 3, 4, 5, "count"]]

In [20]:
# Group by full_gpc and sum the number of products, and order by count
df = df.groupby('full_gpc').sum()
# Sorted by 0, 1, 2, 3, 4, 5
df = df.sort_values(by=[0, 1, 2, 3, 4, 5])
df = df.reset_index()



In [22]:
df.to_csv('product_counts_lv1_lv2_lv3_lv4_lv5_lv6.csv', index=False)

In [26]:
len(df[0].unique())

35

In [40]:
# Read the cvs file of labeled data
file = "Product_FPC_Counts - Sheet2.csv"
labeled_data = pd.read_csv(file)

labeled_data.drop(columns=['Unnamed: 10'], inplace=True)
labeled_data.head()



Unnamed: 0,full_gpc,0,1,2,3,4,5,count,% of total products,Selected Category
0,<category_from_taxonomy>,<category_from_taxonomy>,0,0,0,0,0,9,0.000481%,0
1,animals & pet supplies > pet supplies,animals & pet supplies,pet supplies,0,0,0,0,6,0.000320%,0
2,animals & pet supplies > pet supplies > bird s...,animals & pet supplies,pet supplies,bird supplies,0,0,0,2,0.000107%,0
3,animals & pet supplies > pet supplies > dog su...,animals & pet supplies,pet supplies,dog supplies,0,0,0,7,0.000374%,0
4,animals & pet supplies > pet supplies > fish s...,animals & pet supplies,pet supplies,fish supplies,0,0,0,5,0.000267%,1


In [45]:
# Read ad taxonomy
file = "ad_taxonomy_count.csv"
ad_taxonomy = pd.read_csv(file)

ad_taxonomy.head()




Unnamed: 0,keep,category,line item count
0,1.0,antiques & collectibles,88
1,1.0,antiques & collectibles > collectible appliances,0
2,1.0,antiques & collectibles > collectible coins & ...,208
3,1.0,antiques & collectibles > collectible electronics,2
4,1.0,antiques & collectibles > collectible furniture,0


In [62]:
# Taxonomy sync by removing 2.1% of products that have hallucinated categories or not in the facebook product category
product_df = df[df['full_gpc'].isin(ad_taxonomy.category)]

In [69]:
product_df.rename(columns={'full_gpc': 'category'}, inplace=True)

print(ad_taxonomy.shape, product_df.shape)

(2965, 3) (2138, 8)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  product_df.rename(columns={'full_gpc': 'category'}, inplace=True)


In [73]:
# ad_taxonomy LEFT Join product_df
join_df = pd.merge(ad_taxonomy, product_df, on='category', how='left')
# rename count to product_count
join_df.rename(columns={'count': 'product_count'}, inplace=True)
# Reorgnize the columns 
join_df = join_df[['keep','category', 'line item count','product_count']]

In [80]:
# Renam
join_df.to_csv('ad_product_counts_taxonomy_joined.csv', index=False)

# Construct final taxonomy based on code



In [82]:
# Rename the file name
join_df.rename(columns={'category': 'fpt'}, inplace=True)
join_df['count'] = join_df['product_count'] + join_df['line item count'] # This is the the naive method of doing the counting and it dominates by product_count.
join_df[['fpt', 'count']].to_csv('taxonomy_sanitized_input.csv', index=False)

In [84]:
import numpy as np
join_df.replace(np.nan, 0, inplace=True)
join_df[['fpt', 'count']].to_csv('taxonomy_sanitized_input.csv', index=False)

In [101]:
# Plot another graph to normalize the count by the total count
join_df['line item count'] = join_df['line item count'] / join_df['line item count'].sum()
join_df['product_count'] = join_df['product_count'] / join_df['product_count'].sum()
join_df['count'] = (join_df['line item count'] + join_df['product_count']) * 1000000 # Sum after normalization
join_df[['fpt', 'count']].to_csv('taxonomy_sanitized_input_normalized.csv', index=False)


# Taxonomy Validation

In [None]:
# Read taxonomy_out_normalized.csv and taxonomy_out_sum.csv
tax_norm = pd.read_csv('taxonomy_out_normalized.csv')
tax_sum = pd.read_csv('taxonomy_out_sum.csv')


In [117]:
# Parse those two taxonomies and see if they align on the first level
tax_norm['fpt_lv1'] = tax_norm['id'].str.split(' > ').str[0]
tax_sum['fpt_lv1'] = tax_sum['id'].str.split(' > ').str[0]

tax_norm['fpt_lv2'] = tax_norm['id'].str.split(' > ').str[1]
tax_sum['fpt_lv2'] = tax_sum['id'].str.split(' > ').str[1]

tax_norm['fpt_lv3'] = tax_norm['id'].str.split(' > ').str[2]
tax_sum['fpt_lv3'] = tax_sum['id'].str.split(' > ').str[2]




In [None]:
assert len(tax_norm.fpt_lv1.unique()) == len(tax_sum.fpt_lv1.unique()), "Disagree on lv1"
assert len(tax_norm.fpt_lv2.unique()) == len(tax_sum.fpt_lv2.unique()), "Disagree on lv2"

# Agree on both lv1 and lv2 

In [126]:
# Check the difference
# what is in norm but not in sum
print(tax_norm[~tax_norm['id'].isin(tax_sum['id'])]['id'].values)




['antiques & collectibles > collectible coins & paper money'
 'antiques & collectibles > collectible home goods'
 'antiques & collectibles > collectible sports memorabilia'
 'antiques & collectibles > collectible toys'
 'baby products > toys > baby toys'
 "clothing & accessories > clothing > men's clothing > tops > blazers & sports coats"
 'electronics > accessories > blank media'
 'electronics > cameras > camera accessories'
 'electronics > cameras > camera drones'
 'electronics > cameras > digital cameras'
 'electronics > video game consoles & video games > video game accessories'
 'electronics > video game consoles & video games > video game consoles'
 'food & beverages > beverages > juice'
 'food & beverages > beverages > milk'
 'food & beverages > beverages > powdered beverage mixes'
 'food & beverages > beverages > soda'
 'food & beverages > beverages > water'
 'food & beverages > food > bakery & bread'
 'food & beverages > food > dairy, eggs & cheese'
 'food & beverages > food >

In [129]:
# what is in sum but not in norm
print(tax_sum[~tax_sum['id'].isin(tax_norm['id'])]['id'].values)



['auto parts & accessories > trailer parts & accessories'
 'baby products > baby gear' 'baby products > baby transport'
 'baby products > nursery'
 'clothing & accessories > clothing > baby clothing > other baby clothing & accessories'
 "clothing & accessories > clothing > men's clothing > sleepwear"
 'clothing & accessories > clothing > uniforms & work clothing > other uniforms & work clothing'
 'clothing & accessories > clothing > uniforms & work clothing > pants & shorts'
 'clothing & accessories > clothing > uniforms & work clothing > shirts'
 "clothing & accessories > clothing > women's clothing > activewear > sports bras"
 "clothing & accessories > clothing > women's clothing > tops > suits & blazers"
 "clothing & accessories > clothing accessories > boys' accessories"
 "clothing & accessories > clothing accessories > girls' accessories"
 "clothing & accessories > clothing accessories > men's accessories > belt buckles"
 "clothing & accessories > clothing accessories > men's acce