In [2]:
import wikipediaapi
wiki_wiki = wikipediaapi.Wikipedia('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36', 'en')

In [3]:
# Create a baseline corpus
import requests
import json

def fetch_article_titles(length=1500):
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "format": "json",
        "list": "random",
        "rnnamespace": 0, 
        "rnlimit": length
    }
    response = requests.get(url, params=params)
    data = response.json()
    titles = [article['title'] for article in data['query']['random']]
    return titles

def make_corpus(titles, min_length=250, target_number=200, specialised_search=False):
    corpus = []
    i = 0
    for title in titles:
        if title == 'small modular reactor': #title of target article
            continue
        page = wiki_wiki.page(title)
        text = page.text
        if (text) and len(text.split(' ')) >= min_length:
            flag = True
            if (specialised_search):
                if 'nuclear' not in text:
                    flag = False
            if (flag):
                corpus.append(text)
                i += 1
                if i >= target_number:
                    break
    print(f'Collected {i} articles of minimum {min_length} words')
    return corpus

In [18]:
titles = fetch_article_titles()

corpus = make_corpus(titles)

with open('corpus.txt', 'w', encoding='utf-8') as file:
    for text in corpus:
        file.write(text.replace('\n', ' ') + '\n\n')

Collected 200 articles of minimum 250 words


In [22]:
page = wiki_wiki.page('small modular reactor')
text = page.text
corpus.append(text)

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_features=10000)
tfidf_matrix = vectorizer.fit_transform(corpus)

# Target article is the last one in the list
feature_names = vectorizer.get_feature_names_out()
tfidf_scores = tfidf_matrix[-1].toarray().flatten()
keywords = {feature_names[i]: tfidf_scores[i] for i in tfidf_scores.argsort()[-30:][::-1]}
for k in keywords:
    print(k)

reactors
smr
smrs
reactor
nuclear
power
safety
waste
fuel
energy
designs
plant
mwe
cost
thermal
neutron
modular
large
radioactive
licensing
construction
smr designs
conventional
nuscale
electricity
nuclear power
power plant
small
generation
2023


In [11]:
new_text = '''
284410 - Uranium; natural uranium and its compounds, alloys, dispersions (including cermets), ceramic products and mixtures containing natural uranium or natural uranium compounds
284420 - Uranium; enriched in U235, plutonium, their compounds, alloys dispersions (including cermets), ceramic products and mixtures containing uranium enriched in U235, plutonium or compounds of these products
284430 - Uranium; depleted in U235, thorium, their compounds, alloys, dispersions (including cermets), ceramic products and mixtures containing uranium depleted in U235, thorium; compounds of these products
284441 - Radioactive elements; tritium and its compounds; alloys, dispersions (including cermets), ceramic products and mixtures containing tritium or its compounds
284442 - Radioactive elements; actinium-225, 227, californium-253, curium-240, 241, 242, 243, 244, einsteinium-253, 254, gadolinium-148, polonium-208, 209, 210, radium-223, uranium-230 or 232, compounds; alloys, dispersions, ceramic products and mixtures
284443 - Radioactive elements, isotopes and compounds; other alloys, dispersions (including cermets), ceramic products and mixtures containing these elements, isotopes or compounds
284444 - Radioactive elements, isotopes, compounds n.e.c. in heading no. 2844, alloys, dispersions (including cermets), ceramic products, mixtures containing the elements, isotopes, compounds; radioactive residues
284450 - Spent (irradiated) fuel elements (cartridges) of nuclear reactors
284510 - Heavy water (deuterium oxide)
284520 - Boron enriched in boron-10 and its compounds
284530 - Lithium enriched in lithium-6 and its compounds
284540 - Helium-3
284590 - Isotopes other than those of heading no. 2844, 2845.20, 2845.30 or 2845.40; compounds, inorganic or organic, of such isotopes, whether or not chemically defined
8401 - Nuclear reactors; fuel elements (cartridges), non-irradiated, for nuclear reactors, machinery and apparatus for isotopic separation
8402 - Boilers; steam or other vapour generating (other than central heating hot water boilers, capable also of producing low pressure steam), super heated water boilers
8403 - Central heating boilers; excluding those of heading no. 8402
8404 - Auxiliary plant for use with boilers of heading no. 8402 or 8403; e.g. economisers, super-heaters, soot removers, gas recoverers), condensers for steam or other vapour power units
8405 - Generators for producer or water gas with or without their purifiers acetylene gas generators and similar water process gas generators, with or without their purifiers
8406 - Turbines; steam and other vapour turbines
8407 - Reciprocating or rotary internal combustion piston engines
8408 - Compression-ignition internal combustion piston engines (diesel or semi-diesel engines)
8409 - Parts suitable for use solely or principally with the engines of heading no. 8407 or 8408
8410 - Turbines; hydraulic water wheels and regulators therefor
8411 - Turbo-jets, turbo-propellers and other gas turbines
8412 - Engines and motors; n.e.c. (e.g. reaction engines, hydraulic power engines, pneumatic power engines)
8413 - Pumps; for liquids, whether or not fitted with measuring device, liquid elevators
8414 - Air or vacuum pumps, air or other gas compressors and fans; ventilating or recycling hoods incorporating a fan; gas-tight biological safety cabinets whether or not fitted with filters
8415 - Air conditioning machines; comprising a motor driven fan and elements for changing the temperature and humidity, including those machines in which the humidity cannot be separately regulated
8416 - Furnace burners for liquid fuel, for pulverised solid fuel or for gas; mechanical grates, mechanical ash dischargers and similar appliances
8417 - Furnaces and ovens; industrial or laboratory, including incinerators, non-electric
8418 - Refrigerators, freezers and other refrigerating or freezing equipment, electric or other; heat pumps other than air conditioning machines of heading no. 8415
8419 - Machinery, plant (not domestic), or laboratory equipment; electrically heated or not, (excluding items in 85.14) for the treatment of materials by a process involving change of temperature; including instantaneous or non electric storage water heaters
8420 - Machines; calendering or other rolling machines, for other than metal or glass and cylinders therefor
8421 - Centrifuges, including centrifugal dryers; filtering or purifying machinery and apparatus for liquids or gases
8422 - Dish washing machines; machinery for cleaning, drying, filling, closing, sealing, capsuling or labelling bottles, cans, boxes, bags, etc, machinery for aerating beverages
8423 - Weighing machines; excluding balances of a sensitivity of 5cg or better, including weight operated counting or checking machines and weights of all kinds
8424 - Mechanical appliances for projecting, dispersing or spraying liquids or powders; fire extinguishers, spray guns, steam, sand blasting machines
8425 - Pulley tackle and hoists other than skip hoists; winches and capstans; jacks
8426 - Derricks, cranes, including cable cranes, mobile lifting frames, straddle carriers and works trucks fitted with a crane
8427 - Fork-lift and other works trucks; fitted with lifting or handling equipment
8428 - Lifting, handling, loading or unloading machinery; n.e.c. in heading no. 8425, 8426 or 8427 (e.g. lifts, escalators, conveyors, teleferics)
8429 - Bulldozers, graders, levellers, scrapers, angledozers, mechanical shovels, excavators, shovel loaders, tamping machines and road rollers, self-propelled
8430 - Moving, grading, levelling, scraping, excavating, tamping, compacting, extracting or boring machinery, for earth, minerals, or ores; pile drivers and extractors; snow ploughs and snow blowers
8431 - Machinery parts; used solely or principally with the machinery of heading no. 8425 to 8430
8432 - Agricultural, horticultural or forestry machinery for soil preparation or cultivation; lawn or sports-ground rollers
8433 - Harvesting and threshing machinery, straw and fodder balers, grass or hay mowers; machines for cleaning, sorting or grading eggs, fruit or other agricultural produce, other than machinery of heading no 8437
8434 - Milking machines and dairy machinery
8435 - Presses, crushers and similar machinery; used in the manufacture of wine, cider, fruit juices or similar beverages
8436 - Agricultural, horticultural, forestry, poultry-keeping, bee-keeping machinery; including germination plant fitted with mechanical or thermal equipment; poultry incubators and brooders
8437 - Machines for cleaning, sorting, grading seed, grain, dried leguminous vegetables; machinery used in the milling industry for the working of cereals or dried leguminous vegetables, not farm type machinery
8438 - Machinery n.e.c. in this chapter, for the industrial preparation or manufacture of food or drink; other than machinery for extraction or preparation of animal or fixed vegetable or microbial fats or oils
8439 - Machinery; for making pulp of fibrous cellulosic material, or for making or finishing paper or paperboard
8440 - Book-binding machinery; including book-sewing machines
8441 - Machines; for making up paper pulp, paper or paperboard, including cutting machines of all kinds
8442 - Machinery, apparatus and equipment (excluding machines of headings 8456 to 8465) for preparing or making printing components; plates, cylinders and other printing components; lithographic stones prepared for printing purposes
8443 - Printing machinery; used for printing by means of plates, cylinders and other printing components of heading 84.42; other printers, copying machines and facsimile machines, whether or not combined; parts and accessories thereof
8444 - Textile machinery; for extruding, drawing, texturing or cutting man-made textile materials
8445 - Textile machinery; spinning, doubling, twisting machines, textile reeling or winding machines and machines for preparing textile yarns for use on machines of heading no. 8446 and 8447
8446 - Weaving machines (looms)
8447 - Knitting machines, stitch-bonding machines and machines for making gimped yarn, tulle, lace, embroidery, trimmings, braid or net and machines for tufting
8448 - Machinery, auxiliary; for use with machines of heading no. 8444 to 8447 (e.g. dobbies, jacquards, automatic stop motions, shuttle changing mechanisms) parts, accessories for machines of heading no. 8444, 8447
8449 - Machinery; for manufacture or finishing felt or non-wovens in the piece or in shapes, including machinery for making felt hats, blocks for making hats
8450 - Household or laundry-type washing machines; including machines which both wash and dry
8451 - Machinery (not of heading no. 8450) for washing, cleaning, wringing, drying, ironing, pressing, bleaching, dyeing, dressing, finishing, coating or impregnating textile yarn, fabrics or made up articles
8452 - Sewing machines; other than book-sewing machines of heading no. 8440; furniture, bases and covers specially designed for sewing machines; sewing machine needles
8453 - Machinery for preparing, tanning or working hides, skins or leather or for making or repairing footwear or other articles of hides, skins or leather, other than sewing machines
8454 - Converters, ladles, ingot moulds and casting machines; of a kind used metallurgy or in metal foundries
8455 - Metal-rolling mills and rolls therefor
8456 - Machine-tools; for working any material by removal of material, by laser or other light or photon beam, ultrasonic, electro-discharge, electro-chemical, electron beam, ionic-beam, or plasma arc processes; water-jet cutting machines
8457 - Machining centres, unit construction machines (single station) and multi-station transfer machines for working metal
8458 - Lathes for removing metal
8459 - Machine-tools; (including way-type unit head machines) for drilling, boring, milling, threading or tapping by removing metal, other than lathes of heading no. 8458
8460 - Machine-tools; for deburring, sharpening, grinding, honing, lapping, polishing or otherwise finishing metal, sintered metal carbides or cermets by means of grinding stones, abrasives or polishing products
8461 - Machine-tools; for planing, shaping, slotting, broaching, gear cutting and grinding, finishing, sawing, cutting off and other tools working by removing metal, sintered metal carbides or cermets n.e.c.
8462 - Machine-tools (including presses) for working metal by forging, hammering or die forging (excluding rolling mills); machine-tools (including presses, slitting lines and cut-to-length lines) for working metal by bending, folding, straightening, flattening,
8463 - Machine-tools; n.e.c. for working metal, sintered metal carbides or cermets without removing material
8464 - Machine-tools; for working stone, ceramics, concrete, asbestos-cement or like mineral materials or for cold working glass
8465 - Machine-tools; (including machines for nailing, stapling, glueing or otherwise assembling) for working wood, cork, bone, hard plastics or rubber or similar hard materials
8466 - Parts & accessories suited for use only/mainly with machines of headings 8456-8465, including work/tool holders, self-opening dieheads, dividing heads & other special attachments for the machines; tool holders for any type of tool for working in the hand
8467 - Tools; for working in the hand, pneumatic, hydraulic or with self-contained electric or non-electric motor
8468 - Machinery and apparatus for soldering, brazing, welding, whether or not capable of cutting, other than those of heading no. 8515; gas-operated surface tempering machines and appliances
8470 - Calculating machines and pocket-size data recording, reproducing and displaying machines with calculating functions; accounting machines, postage-franking machines, ticket-issuing machines and similar, incorporating a calculating device; cash registers
8471 - Automatic data processing machines and units thereof, magnetic or optical readers, machines for transcribing data onto data media in coded form and machines for processing such data, not elsewhere specified or included
8472 - Office machines; not elsewhere classified
8473 - Machinery; parts and accessories (other than covers, carrying cases and the like) suitable for use solely or principally with machines of headings 84.70 to 84.72
8474 - Machinery for sorting, screening, separating, washing, crushing, grinding, mixing or kneading earth, stone, ores in solid form, shaping, moulding machinery for solid mineral fuels
8475 - Machines; for assembling electric or electronic lamps, tubes, valves, flashbulbs, in glass envelopes, machines for manufacturing or hot working glass or glassware
8476 - Automatic goods-vending machines (e.g. postage stamp, cigarette, food or beverage machines), including money-changing machines
8477 - Machinery; for working rubber or plastics or for the manufacture of products from these materials, n.e.c. in this chapter
8478 - Machinery; for preparing or making up tobacco, n.e.c. in this chapter
8479 - Machinery and mechanical appliances; having individual functions, n.e.c. in this chapter
8480 - Moulding boxes for metal foundry, moulding patterns, moulds for metals (excluding ingot moulds), metal carbides, glass, mineral materials, rubber or plastics
8481 - Taps, cocks, valves and similar appliances for pipes, boiler shells, tanks, vats or the like, including pressure-reducing valves and thermostatically controlled valves
8482 - Ball or roller bearings
8483 - Transmission shafts (including cam and crank) and cranks; bearing housings and plain shaft bearings; gears and gearing; ball or roller screws; gear boxes and other speed changers; flywheels and pulleys; clutches and shaft couplings
8484 - Gaskets and similar joints of metal sheeting combined with other material or of two or more layers of metal; sets or assortments of gaskets and similar joints, dissimilar in composition, put up in pouches, envelopes or similar packings; mechanical seals
8485 - Machines for additive manufacturing
8486 - Machines and apparatus of a kind used solely or principally for the manufacture of semiconductor boules or wafers, semiconductor devices, electronic integrated circuits or flat panel displays; machines & apparatus specified in note 11 (C) to this Chapter
'''

In [24]:
# Try with HS keywords
new_corpus = corpus
new_corpus.append(new_text)

vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_features=10000)
tfidf_matrix = vectorizer.fit_transform(new_corpus)
feature_names = vectorizer.get_feature_names_out()
tfidf_scores = tfidf_matrix[-1].toarray().flatten()
keywords = {feature_names[i]: tfidf_scores[i] for i in tfidf_scores.argsort()[-30:][::-1]}
for k in keywords:
    print(k)

machines
machinery
heading
metal
including
compounds
tools
cermets
machine tools
products
working
machine
elements
uranium
alloys dispersions
dispersions
ceramic
ceramic products
products mixtures
mechanical
mixtures
alloys
similar
engines
gas
textile
including cermets
apparatus
compounds alloys
sewing


In [4]:
titles = fetch_article_titles(length=500000000)
corpus = make_corpus(titles, min_length=100, target_number=200, specialised_search=True)
# weird approach. let's ignore this

Collected 11 articles of minimum 100 words


In [35]:
# More specialised Wiki TF-IDF:
def get_all_articles_in_category(category_name):
    def fetch_articles(category, articles):
        url = "https://en.wikipedia.org/w/api.php"
        params = {
            "action": "query",
            "list": "categorymembers",
            "cmtitle": f"Category:{category}",
            "cmlimit": "max",  # use 'max' to get the maximum allowed by the API
            "format": "json"
        }

        while True:
            response = requests.get(url, params=params)
            data = response.json()
            for article in data['query']['categorymembers']:
                if article['ns'] == 0:  # ns=0 for articles
                    articles.add(article['title'])
                elif article['ns'] == 14:  # ns=14 for categories
                    subcategory = article['title'].replace("Category:", "")
                    fetch_articles(subcategory, articles)
            if 'continue' in data:
                params['cmcontinue'] = data['continue']['cmcontinue']
            else:
                break

    articles = set()
    fetch_articles(category_name, articles)
    return articles

In [39]:
category_names = [
    'Energy', 'Nuclear Energy', 'Renewable Energy', 'Sustainable Energy', 'Energy Policy', 'Thermal Energy',
    'Technology', 'Nuclear Technology', 'Industrial Technology',
    'Engineering', 'Nuclear Engineering',
    'Science', 'Physics', 'Electricity',
    'Nuclear Safety', 'Nuclear Regulation', 'Nuclear Proliferation',
    'Environment', 'Sustainable Development',
    'Electricity', 'Power Generation',
    'Chemistry', 'Physics',
    ]

titles = set()
for name in category_names:
    article_titles = get_articles_in_category(name)
    titles.update(article_titles)

print(len(titles))

347


In [40]:
for t in titles:
    print(t)

Electrophobia (anxiety condition)
Bug (engineering)
Core–shell semiconductor nanocrystal
Short circuit
Etymology of electricity
Superconducting nanowire single-photon detector
List of science and technology awards for women
Toroidal solenoid
Digital ecology
Geometry index
Life-cycle greenhouse gas emissions of energy sources
NET Power Demonstration Facility
Maximum power principle
Chemputation
Energy informatics
Allotropy
Force control
Superelectrophilic anion
Liquid nitrogen wash
Margham
Gilchrist–Thomas process
Nucleation
Chromogen
List of chemistry mnemonics
Theoretical chemistry
Oxhydroelectric effect
Mathematical chemistry
Standby power
Registered Scientist
Advanced superionic conductor
Conductive metal−organic frameworks
Disclination
Electro-immobilisation
Double layer forces
Isoelectric (electric potential)
Environmental chemistry
Power-off testing
Pedersen current
Generalized renewal process
Actinide chemistry
Index of energy articles
Energy customer switching
Energy
Cononsolve

In [41]:
def make_corpus(titles):
    corpus = []
    for title in titles:
        if title == 'small modular reactor': #title of target article
            continue
        page = wiki_wiki.page(title)
        text = page.text
        corpus.append(text)
    return corpus

In [42]:
corpus = make_corpus(titles)

In [43]:
page = wiki_wiki.page('small modular reactor')
text = page.text
corpus.append(text)

In [44]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 3), max_features=10000)
tfidf_matrix = vectorizer.fit_transform(corpus)
feature_names = vectorizer.get_feature_names_out()
tfidf_scores = tfidf_matrix[-1].toarray().flatten()
keywords = {feature_names[i]: tfidf_scores[i] for i in tfidf_scores.argsort()[-40:][::-1]}
for k in keywords:
    print(k)

smr
smrs
reactors
nuclear
reactor
mwe
safety
power
designs
fuel
waste
smr designs
licensing
nuscale
modular
plant
mwh
neutron
power plant
nuscale power
2023
energy
nuclear fuel
radioactive waste
light water
proliferation
radioactive
construction
cost
nuclear power
conventional
costs
neutron reactors
nuclear energy
spent
desalination
deployment
fast
project
site


In [45]:
# Use POS to filter keywords
import spacy
nlp = spacy.load("en_core_web_sm")

# Get single words
nouns = []
for text in keywords:
    doc = nlp(text)
    for token in doc:
        if token.pos_ == "NOUN":
            nouns.append(token.text)

# Get compound words
compound_nouns = []
for text in keywords:
    doc = nlp(text)
    compounds = []
    for token in doc:
        if token.dep_ == 'compound' or (token.head.dep_ == 'ROOT' and token.dep_ == 'attr'):
            compounds.append((token.text, token.head.text))
    for compound in compounds:
        compound_nouns.append(' '.join(compound))

keywords_filtered = set(nouns + compound_nouns)

for e in keywords_filtered:
    print(e)

project
neutron reactors
reactors
smr designs
power
waste
proliferation
deployment
energy
water
designs
neutron
nuscale power
construction
costs
desalination
plant
safety
power plant
reactor
fuel
smr
site


In [46]:
import pickle
with open('Wiki_keywords.pkl', 'wb') as file:
    pickle.dump(keywords, file)