In [2]:
from IPython.display import display
import pandas as pd
pd.set_option('display.max_rows', 500)

data = pd.read_json('H6.json')['results']
df_normalized = pd.json_normalize(data)
#df_normalized.to_excel('H6.xlsx')

In [3]:
# Make 'object' type to 'string'
dat = pd.DataFrame(df_normalized)

colnames = dat.columns.to_list()
for colname in colnames:
    if dat[colname].dtype == 'object':
        dat[colname] = dat[colname].astype('string')

print(dat.info())
display(dat)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6940 entries, 0 to 6939
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   id                6940 non-null   string
 1   text              6940 non-null   string
 2   parent            6940 non-null   string
 3   isLeaf            6940 non-null   string
 4   aggrlevel         6940 non-null   int64 
 5   standardUnitAbbr  6940 non-null   string
dtypes: int64(1), string(5)
memory usage: 325.4 KB
None


Unnamed: 0,id,text,parent,isLeaf,aggrlevel,standardUnitAbbr
0,TOTAL,Total - All H6 commodities,#,0,0,
1,01,01 - Animals; live,TOTAL,0,2,
2,0101,"0101 - Horses, asses, mules and hinnies; live",01,0,4,
3,010121,"010121 - Horses; live, pure-bred breeding animals",0101,1,6,u
4,010129,"010129 - Horses; live, other than pure-bred br...",0101,1,6,u
...,...,...,...,...,...,...
6935,970610,970610 - Antiques; of an age exceeding 250 years,9706,1,6,kg
6936,970690,970690 - Antiques; of an age exceeding 100 yea...,9706,1,6,kg
6937,99,99 - Commodities not specified according to kind,TOTAL,0,2,
6938,9999,9999 - Commodities not specified according to ...,99,0,4,


In [4]:
# Finding relevant HS codes (to start keyword collection for tech/misc entities)
# source: https://www.trade-tariff.service.gov.uk/find_commodity

a = '8401' #Machinery and mechanical appliances, boilers, nuclear reactors; parts thereof
b = '2844' #Radioactive chemical elements and radioactive isotopes (including the fissile or fertile chemical elements and isotopes); and their compounds; mixtures and residues containing these products)
c = '2845' #Isotopes other than those of heading no. 2844; compounds, inorganic or organic, of such isotopes, whether or not chemically defined

hs_raw = dat['text'][dat['parent'].isin([a, b, c])].tolist()

In [34]:
# Find nouns using NER (not effective)
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

model_name = "dslim/distilbert-NER"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

ner_pipeline = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple")

nouns = []
for text in hs_raw:
    results = ner_pipeline(text)
    if(results):
        print(results)


[{'entity_group': 'MISC', 'score': 0.7782796, 'word': 'U', 'start': 9, 'end': 10}, {'entity_group': 'MISC', 'score': 0.85282105, 'word': '##rani', 'start': 10, 'end': 14}]
[{'entity_group': 'MISC', 'score': 0.9161987, 'word': 'U', 'start': 9, 'end': 10}, {'entity_group': 'MISC', 'score': 0.96477515, 'word': '##rani', 'start': 10, 'end': 14}]
[{'entity_group': 'MISC', 'score': 0.91679347, 'word': 'U', 'start': 9, 'end': 10}, {'entity_group': 'MISC', 'score': 0.955081, 'word': '##rani', 'start': 10, 'end': 14}]
[{'entity_group': 'ORG', 'score': 0.69930845, 'word': 'Bo', 'start': 9, 'end': 11}, {'entity_group': 'MISC', 'score': 0.5393184, 'word': '##ron', 'start': 11, 'end': 14}]
[{'entity_group': 'MISC', 'score': 0.820883, 'word': 'Li', 'start': 9, 'end': 11}, {'entity_group': 'MISC', 'score': 0.7036883, 'word': '##thi', 'start': 11, 'end': 14}]
[{'entity_group': 'MISC', 'score': 0.7275891, 'word': 'He', 'start': 9, 'end': 11}, {'entity_group': 'MISC', 'score': 0.47019863, 'word': '##liu

In [14]:
# Using POS (most promising)
import spacy
nlp = spacy.load("en_core_web_sm")

# Get single words
nouns = []
for text in hs_raw:
    doc = nlp(text)
    for token in doc:
        if token.pos_ == "NOUN":
            nouns.append(token.text)

# Get compound words
compound_nouns = []
for text in hs_raw:
    doc = nlp(text)
    compounds = []
    for token in doc:
        if token.dep_ == 'compound':
            compounds.append((token.text, token.head.text))
    for compound in compounds:
        compound_nouns.append(' '.join(compound))

keywords = set(nouns + compound_nouns)

for e in keywords:
    print(e)

fuel
separation
Fuel
compounds
water
curium-240
no
radium-223
reactors
residues
uranium compounds
elements
cartridges
einsteinium-253
apparatus
isotopes
gadolinium-148
dispersions
fuel elements
deuterium
thorium
.
uranium
Fuel elements
tritium
plutonium
cermets
Lithium
alloys
mixtures
deuterium oxide
products
parts
oxide


In [17]:
# TF-IDF attempt
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = dat['text'].tolist()
text = '\n'.join(dat['text'][dat['parent'].isin([a, b, c])].tolist())
corpus.append(text)

from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_features=10000)
tfidf_matrix = vectorizer.fit_transform(corpus)
feature_names = vectorizer.get_feature_names_out()
tfidf_scores = tfidf_matrix[-1].toarray().flatten()
keywords = {feature_names[i]: tfidf_scores[i] for i in tfidf_scores.argsort()[-50:][::-1]}
for k in keywords:
    print(k)


machines
machinery
including
metal
working
compounds
uranium
cermets
elements
making
alloys dispersions
tools
ceramic products
heading
machine
dispersions
gas
machine tools
products
products mixtures
engines
including cermets
isotopes
cermets ceramic
compounds alloys
dispersions including
water
ceramic
radioactive
alloys
similar
mixtures containing
mixtures
mechanical
printing
sewing machines
cutting
steam
boilers
u235
machines heading
finishing
data
isotopes compounds
enriched
radioactive elements
elements isotopes
tools including
8447
use
