## Create manifest file for use in SageMaker GroundTruth for crowd-sourced annotations

Use the ML model to find the top-5 EIF matches for every product. Generate a manifest file in JSON format with this information to get annotations with the Amazon SageMaker Ground Truth service.

In [1]:
# Use a GPU instance if possible, the code can take a few minutes to run otherwise.
from tqdm import tqdm
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, util

import activity_config
from caml.eio import naics

In [2]:
# Read activities. An activity can be a product, material, or process (e.g transport, manufacturing)
activity_df = pd.read_csv(activity_config.activity_file)
# examine activity data
activity_df.head()

Unnamed: 0,activity_description,sale_price
0,carrot,0.19
1,banana,0.3
2,piano,523.1
3,backpack,75.2
4,keyboard,25.8


In [3]:
# exact logic may change depending on the activity data format
# additional cleaning of data can be added if needed, e.g. removing special characters
activity_list = activity_df.activity_description.values
len(activity_list)

5

In [4]:
naics_df = naics.get_naics_data()
naics_list = naics_df.naics_desc.values
print(len(naics_list))
naics_df.head()

(1016, 4)
(20082, 2)
19723


Unnamed: 0,naics_desc,naics_code,naics_title,co2e_per_dollar,bea_code
0,AM radio stations,515112,Radio Stations,0.11,515100
1,ATMs (automatic teller machines) installation,238290,Other Building Equipment Contractors,0.245,"230302,233230,233412,2334A0,230301,233411,2332..."
2,ATMs (automatic teller machines) manufacturing,334118,Computer Terminal and Other Computer Periphera...,0.183,334118
3,Abattoirs,311611,Animal (except Poultry) Slaughtering,1.279,31161A
4,Abortion clinics,621410,Family Planning Centers,0.116,621400


In [17]:
model = SentenceTransformer('all-mpnet-base-v2')
activity_embedding = model.encode(activity_list)
naics_embedding = model.encode(naics_list)
cosine_scores = util.cos_sim(activity_embedding, naics_embedding)
sorted_cs, indices = cosine_scores.sort(dim=1, descending=True)

In [21]:
result_df = pd.DataFrame()
ranked_eifs_list = []
for activity_ix in tqdm(range(len(activity_df))):
    activity_text = activity_df.iloc[activity_ix].activity_description
    sorted_activity_cs = sorted_cs[activity_ix].cpu().numpy()
    naics_ix = indices[activity_ix].cpu().numpy()

    result_df.loc[activity_ix, 'activity'] = activity_text    
    result_df.loc[activity_ix, 'naics_code'] = naics_df.loc[naics_ix[0], 'naics_code']
    result_df.loc[activity_ix, 'naics_desc'] = naics_df.loc[naics_ix[0], 'naics_desc']
    result_df.loc[activity_ix, 'naics_title'] = naics_df.loc[naics_ix[0], 'naics_title']
    result_df.loc[activity_ix, 'bea_code'] = naics_df.loc[naics_ix[0], 'bea_code']
    result_df.loc[activity_ix, 'co2e_per_dollar'] = naics_df.loc[naics_ix[0], 'co2e_per_dollar']
    result_df.loc[activity_ix, 'cosine_score'] = float("{:.3f}".format(sorted_activity_cs[0]))

    # Create a ranked list
    similarity_scores = pd.DataFrame(index=np.arange(20))
    for i in range(20):
        similarity_scores.loc[i,'cosine_score'] = float("{:.8f}".format(sorted_activity_cs[i]))
        similarity_scores.loc[i, 'bea_code'] = naics_df.loc[naics_ix[i], 'bea_code']
        similarity_scores.loc[i, 'naics_desc'] = naics_df.loc[naics_ix[i], 'naics_desc']
        similarity_scores.loc[i, 'naics_title'] = naics_df.loc[naics_ix[i], 'naics_title']
        similarity_scores.loc[i, 'co2e_per_dollar'] = naics_df.loc[naics_ix[i], 'co2e_per_dollar']

    similarity_scores['activity_text'] = activity_text
    aggregated_scores = similarity_scores.groupby('bea_code').first()
    aggregated_scores['votes'] = similarity_scores.groupby('bea_code').size()
    aggregated_scores = aggregated_scores.sort_values(['cosine_score', 'votes'], ascending=False).reset_index().head()
    ranked_eifs_list += [aggregated_scores]

100%|██████████| 5/5 [00:00<00:00, 24.71it/s]


In [22]:
# Summary of EIFs chosen. Note that the model can make mistakes.
# If all the results look wrong, either there is a bug in the code,
# or the activity descriptions are not clear enough
result_df

Unnamed: 0,activity,naics_code,naics_desc,naics_title,bea_code,co2e_per_dollar,cosine_score
0,carrot,311991.0,"Carrots, cut, peeled or sliced fresh, manufact...",Perishable Prepared Food Manufacturing,311990,0.603,0.626
1,banana,111339.0,Banana farming,Other Noncitrus Fruit Farming,111300,0.5,0.665
2,piano,451140.0,Piano stores,Musical Instrument and Supplies Stores,4B0000,0.163,0.601
3,backpack,314999.0,"Bags, sleeping, manufacturing",All Other Miscellaneous Textile Product Mills,314900,0.263,0.517
4,keyboard,334118.0,"Keyboards, computer peripheral equipment, manu...",Computer Terminal and Other Computer Periphera...,334118,0.183,0.62


In [30]:
#Ranked list of EIFs for one of the activities sampled from the given list
ranked_eifs_list[np.random.choice(range(len(activity_list)))]

Unnamed: 0,bea_code,cosine_score,naics_desc,naics_title,co2e_per_dollar,activity_text,votes
0,311990,0.626165,"Carrots, cut, peeled or sliced fresh, manufact...",Perishable Prepared Food Manufacturing,0.603,carrot,2
1,111200,0.561532,"Carrot farming, field, bedding plant and seed ...",Other Vegetable (except Potato) and Melon Farming,0.588,carrot,2
2,311420,0.450194,Fruit pickling,Fruit and Vegetable Canning,0.518,carrot,4
3,111300,0.432884,Orange groves,Orange Groves,0.5,carrot,8
4,111900,0.428126,Peanut farming,Peanut Farming,1.431,carrot,2
