In [1]:
!pip install nltk
!pip install gensim



In [2]:
import nltk
import numpy as np
import pandas as pd

In [3]:

nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("punkt")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [4]:

def jaccard_similarity(set_a,set_b):
  # 積集合の要素数を計算
  num_intersection = len(set.intersection(set_a, set_b))
  # 和集合の要素数を計算
  num_union = len(set.union(set_a, set_b))
  #Jaccard係数を算出　空集合の時は1を出力
  try:
      return float(num_intersection) / num_union
  except ZeroDivisionError:
      return 1.0

In [5]:
set_a = set(['a', 'an', 'and', 'apple', 'apples', 'buy', 'i', 'like', 'strawberries', 'tomorrow', 'will'])
set_b = set(['an', 'and', 'apple', 'apples', 'bought', 'eat', 'i', 'some', 'strawberries', 'tomorrow', 'will'])
set_c = set(['basketball', 'day', 'every', 'i', 'jordan', 'like', 'michael', 'play'])

print("jaccard(a, b) = ", jaccard_similarity(set_a, set_b)) #Jaccard係数を計算
print("jaccard(a, c) = ", jaccard_similarity(set_a, set_c))
print("jaccard(b, c) = ", jaccard_similarity(set_b, set_c))

jaccard(a, b) =  0.5714285714285714
jaccard(a, c) =  0.11764705882352941
jaccard(b, c) =  0.05555555555555555


In [6]:
def dice_similarity(set_a, set_b):
  num_intersection =  len(set.intersection(set_a, set_b))
  sum_nums = len(set_a) + len(set_b)
  try:
    return 2 * num_intersection / sum_nums
  except ZeroDivisionError:
    return 1.0

In [7]:
set_a = set(['a', 'an', 'and', 'apple', 'apples', 'buy', 'i', 'like', 'strawberries', 'tomorrow', 'will'])
set_b = set(['an', 'and', 'apple', 'apples', 'bought', 'eat', 'i', 'some', 'strawberries', 'tomorrow', 'will'])
set_c = set(['basketball', 'day', 'every', 'i', 'jordan', 'like', 'michael', 'play'])

print("dice(a, b) = ", dice_similarity(set_a, set_b))
print("dice(a, c) = ", dice_similarity(set_a, set_c))
print("dice(b, c) = ", dice_similarity(set_b, set_c))

dice(a, b) =  0.7272727272727273
dice(a, c) =  0.21052631578947367
dice(b, c) =  0.10526315789473684


In [8]:
def simpson_similarity(list_a, list_b):
  num_intersection = len(set.intersection(set(list_a), set(list_b)))
  min_num = min(len(set(list_a)), len(set(list_b)))
  try:
    return num_intersection / min_num
  except ZeroDivisionError:
    if num_intersection == 0:
      return 1.0
    else:
      return 0

In [9]:
set_a = set(['a', 'an', 'and', 'apple', 'apples', 'buy', 'i', 'like', 'strawberries', 'tomorrow', 'will'])
set_b = set(['an', 'and', 'apple', 'apples', 'bought', 'eat', 'i', 'some', 'strawberries', 'tomorrow', 'will'])
set_c = set(['basketball', 'day', 'every', 'i', 'jordan', 'like', 'michael', 'play'])

print("simpson(a, b) = ", simpson_similarity(set_a, set_b)) 
print("simpson(a, c) = ", simpson_similarity(set_a, set_c)) 
print("simpson(b, c) = ", simpson_similarity(set_b, set_c))

simpson(a, b) =  0.7272727272727273
simpson(a, c) =  0.25
simpson(b, c) =  0.125


In [10]:
set_a = set(['a', 'an', 'and', 'apple', 'apples', 'buy', 'i', 'like', 'strawberries', 'tomorrow', 'will'])
set_b = set(['an', 'and', 'apple', 'apples', 'bought', 'eat', 'i', 'some', 'strawberries', 'tomorrow', 'will'])
set_c = set(['basketball', 'day', 'every', 'i', 'jordan', 'like', 'michael', 'play'])
set_d = set(['basketball', 'day', 'a', 'an', 'and', 'will', 'play', 'eat', 'i']) # 大きめの集合を作って試してみよう

print("jaccard similarity:")
print(jaccard_similarity(set_d, set_a))
print(jaccard_similarity(set_d, set_b))
print(jaccard_similarity(set_d, set_c))

print("dice similarity:")
print(dice_similarity(set_d, set_a))
print(dice_similarity(set_d, set_b))
print(dice_similarity(set_d, set_c))

print("simpson similarity:")
print(simpson_similarity(set_d, set_a))
print(simpson_similarity(set_d, set_b))
print(simpson_similarity(set_d, set_c))

jaccard similarity:
0.3333333333333333
0.3333333333333333
0.3076923076923077
dice similarity:
0.5
0.5
0.47058823529411764
simpson similarity:
0.5555555555555556
0.5555555555555556
0.5


In [11]:
def euclidean_distance(list_a, list_b):
  diff_vec = np.array(list_a) - np.array(list_b)
  return np.linalg.norm(diff_vec)

In [12]:
bow_a = [1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 2, 0, 1, 0, 0, 0, 1, 1, 1]  
bow_b = [1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 2, 0, 0, 0, 0, 1, 1, 1, 1]  
bow_c = [0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 2, 1, 1, 1, 1, 0, 0, 0, 0]  

print("euclidean_distance(bow_a, bow_b) = ",euclidean_distance(bow_a, bow_b))
print("euclidean_distance(bow_a, bow_c) = ",euclidean_distance(bow_a, bow_c))
print("euclidean_distance(bow_b, bow_c) = ",euclidean_distance(bow_b, bow_c))

euclidean_distance(bow_a, bow_b) =  2.23606797749979
euclidean_distance(bow_a, bow_c) =  3.7416573867739413
euclidean_distance(bow_b, bow_c) =  4.123105625617661


In [13]:
def minkowski_distance(list_a, list_b, p):
  n = len(list_a)
  list_d = []
  for i in range(n):
    list_d.append(list_a[i]-list_b[i])
  return np.linalg.norm(list_d, ord=p)

In [14]:

# p=0
print(minkowski_distance(bow_a, bow_b, 0))
print(minkowski_distance(bow_a, bow_c, 0))
print(minkowski_distance(bow_b, bow_c, 0))
# p=1
print(minkowski_distance(bow_a, bow_b, 1))
print(minkowski_distance(bow_a, bow_c, 1))
print(minkowski_distance(bow_b, bow_c, 1))

# p=2
print(minkowski_distance(bow_a, bow_b, 2))
print(minkowski_distance(bow_a, bow_c, 2))
print(minkowski_distance(bow_b, bow_c, 2))

# p=3
print(minkowski_distance(bow_a, bow_b, 3))
print(minkowski_distance(bow_a, bow_c, 3))
print(minkowski_distance(bow_b, bow_c, 3))

5.0
14.0
17.0
5.0
14.0
17.0
2.23606797749979
3.7416573867739413
4.123105625617661
1.7099759466766968
2.4101422641752297
2.571281590658235


In [15]:
documents=["I like apples and a strawberries. I will buy an apple tomorrow @Fresco.",
           "I bought some apples and strawberries. I will eat an apple <b>tomorrow.</b>",
           "I play basketball every day. I like Michael Jordan (born February 17, 1963)."]


import re

def cleaning_text(text):
    # @の削除
    pattern1 = '@'
    text = re.sub(pattern1, '', text) 
    # <b>タグの削除
    pattern2 = '<.*?>'
    text = re.sub(pattern2, '', text)    
    # ()内を削除
    pattern3 = '\(.*?\)'
    text = re.sub(pattern3, '', text)
    return text
  

for text in documents:
    print(cleaning_text(text))

I like apples and a strawberries. I will buy an apple tomorrow Fresco.
I bought some apples and strawberries. I will eat an apple tomorrow.
I play basketball every day. I like Michael Jordan .


In [16]:
text = '<p><b>Natural language processing</b> (<b>NLP</b>) is a subfield of <a href="/wiki/Computer_science" title="Computer science">computer science</a>, <a href="/wiki/Information_engineering_(field)" title="Information engineering (field)">information engineering</a>, and <a href="/wiki/Artificial_intelligence" title="Artificial intelligence">artificial intelligence</a> concerned with the interactions between computers and human (natural) languages, in particular how to program computers to process and analyze large amounts of <a href="/wiki/Natural_language" title="Natural language">natural language</a> data.</p>'

print(cleaning_text(text))

Natural language processing  is a subfield of computer science, information engineering, and artificial intelligence concerned with the interactions between computers and human  languages, in particular how to program computers to process and analyze large amounts of natural language data.


In [17]:
#以下にwikipediaの各元素のアブストラクトを示す.

d = ["","","","","","","","","","","","","","","","","",""]

d[0] = "Hydrogen is the chemical element with the symbol H and atomic number 1. With a standard atomic weight of 1.008, hydrogen is the lightest element in the periodic table. Hydrogen is the most abundant chemical substance in the Universe, constituting roughly 75% of all baryonic mass.[7][note 1] Non-remnant stars are mainly composed of hydrogen in the plasma state. The most common isotope of hydrogen, termed protium (name rarely used, symbol 1H), has one proton and no neutrons. The universal emergence of atomic hydrogen first occurred during the recombination epoch (Big Bang). At standard temperature and pressure, hydrogen is a colorless, odorless, tasteless, non-toxic, nonmetallic, highly combustible diatomic gas with the molecular formula H2. Since hydrogen readily forms covalent compounds with most nonmetallic elements, most of the hydrogen on Earth exists in molecular forms such as water or organic compounds. Hydrogen plays a particularly important role in acid–base reactions because most acid-base reactions involve the exchange of protons between soluble molecules. In ionic compounds, hydrogen can take the form of a negative charge (i.e., anion) when it is known as a hydride, or as a positively charged (i.e., cation) species denoted by the symbol H+. The hydrogen cation is written as though composed of a bare proton, but in reality, hydrogen cations in ionic compounds are always more complex. As the only neutral atom for which the Schrödinger equation can be solved analytically,[8] study of the energetics and bonding of the hydrogen atom has played a key role in the development of quantum mechanics. Hydrogen gas was first artificially produced in the early 16th century by the reaction of acids on metals. In 1766–81, Henry Cavendish was the first to recognize that hydrogen gas was a discrete substance,[9] and that it produces water when burned, the property for which it was later named: in Greek, hydrogen means water-former. Industrial production is mainly from steam reforming natural gas, and less often from more energy-intensive methods such as the electrolysis of water.[10] Most hydrogen is used near the site of its production, the two largest uses being fossil fuel processing (e.g., hydrocracking) and ammonia production, mostly for the fertilizer market. Hydrogen is problematic in metallurgy because it can embrittle many metals,[11] complicating the design of pipelines and storage tanks.[12]"
d[1] = "Helium (from Greek: ἥλιος, romanized: Helios, lit. 'Sun') is a chemical element with the symbol He and atomic number 2. It is a colorless, odorless, tasteless, non-toxic, inert, monatomic gas, the first in the noble gas group in the periodic table.[a] Its boiling point is the lowest among all the elements. Helium is the second lightest and second most abundant element in the observable universe (hydrogen is the lightest and most abundant). It is present at about 24% of the total elemental mass, which is more than 12 times the mass of all the heavier elements combined. Its abundance is similar to this in both the Sun and in Jupiter. This is due to the very high nuclear binding energy (per nucleon) of helium-4, with respect to the next three elements after helium. This helium-4 binding energy also accounts for why it is a product of both nuclear fusion and radioactive decay. Most helium in the universe is helium-4, the vast majority of which was formed during the Big Bang. Large amounts of new helium are being created by nuclear fusion of hydrogen in stars Helium is named for the Greek Titan of the Sun, Helios. It was first detected as an unknown, yellow spectral line signature in sunlight, during a solar eclipse in 1868 by Georges Rayet,[11] Captain C. T. Haig,[12] Norman R. Pogson,[13] and Lieutenant John Herschel,[14] and was subsequently confirmed by French astronomer, Jules Janssen.[15] Janssen is often jointly credited with detecting the element, along with Norman Lockyer. Janssen recorded the helium spectral line during the solar eclipse of 1868, while Lockyer observed it from Britain. Lockyer was the first to propose that the line was due to a new element, which he named. The formal discovery of the element was made in 1895 by two Swedish chemists, Per Teodor Cleve and Nils Abraham Langlet, who found helium emanating from the uranium ore, cleveite, which is now not regarded as a separate mineral species but as a variety of uraninite.[16][17] In 1903, large reserves of helium were found in natural gas fields in parts of the United States, which is by far the largest supplier of the gas today. Liquid helium is used in cryogenics (its largest single use, absorbing about a quarter of production), particularly in the cooling of superconducting magnets, with the main commercial application being in MRI scanners. Helium's other industrial uses—as a pressurizing and purge gas, as a protective atmosphere for arc welding, and in processes such as growing crystals to make silicon wafers—account for half of the gas produced. A well-known but minor use is as a lifting gas in balloons and airships.[18] As with any gas whose density differs from that of air, inhaling a small volume of helium temporarily changes the timbre and quality of the human voice. In scientific research, the behavior of the two fluid phases of helium-4 (helium I and helium II) is important to researchers studying quantum mechanics (in particular the property of superfluidity) and to those looking at the phenomena, such as superconductivity, produced in matter near absolute zero. On Earth, it is relatively rare—5.2 ppm by volume in the atmosphere. Most terrestrial helium present today is created by the natural radioactive decay of heavy radioactive elements (thorium and uranium, although there are other examples), as the alpha particles emitted by such decays consist of helium-4 nuclei. This radiogenic helium is trapped with natural gas in concentrations as great as 7% by volume, from which it is extracted commercially by a low-temperature separation process called fractional distillation. Previously, terrestrial helium—a non-renewable resource because once released into the atmosphere, it readily escapes into space—was thought to be in increasingly short supply.[19][20] However, recent studies suggest that helium produced deep in the earth by radioactive decay can collect in natural gas reserves in larger than expected quantities,[21] in some cases, having been released by volcanic activity.[22]"
d[2] = "Lithium (from Greek: λίθος, romanized: lithos, lit. 'stone') is a chemical element with the symbol Li and atomic number 3. It is a soft, silvery-white alkali metal. Under standard conditions, it is the lightest metal and the lightest solid element. Like all alkali metals, lithium is highly reactive and flammable, and must be stored in mineral oil. When cut, it exhibits a metallic luster, but moist air corrodes it quickly to a dull silvery gray, then black tarnish. It never occurs freely in nature, but only in (usually ionic) compounds, such as pegmatitic minerals, which were once the main source of lithium. Due to its solubility as an ion, it is present in ocean water and is commonly obtained from brines. Lithium metal is isolated electrolytically from a mixture of lithium chloride and potassium chloride. The nucleus of the lithium atom verges on instability, since the two stable lithium isotopes found in nature have among the lowest binding energies per nucleon of all stable nuclides. Because of its relative nuclear instability, lithium is less common in the solar system than 25 of the first 32 chemical elements even though its nuclei are very light: it is an exception to the trend that heavier nuclei are less common.[2] For related reasons, lithium has important uses in nuclear physics. The transmutation of lithium atoms to helium in 1932 was the first fully man-made nuclear reaction, and lithium deuteride serves as a fusion fuel in staged thermonuclear weapons.[3] Lithium and its compounds have several industrial applications, including heat-resistant glass and ceramics, lithium grease lubricants, flux additives for iron, steel and aluminium production, lithium batteries, and lithium-ion batteries. These uses consume more than three quarters of lithium production. Lithium is present in biological systems in trace amounts; its functions are uncertain. Lithium salts have proven to be useful as a mood-stabilizing drug in the treatment of bipolar disorder in humans."
d[3] = "Beryllium is a chemical element with the symbol Be and atomic number 4. It is a relatively rare element in the universe, usually occurring as a product of the spallation of larger atomic nuclei that have collided with cosmic rays. Within the cores of stars, beryllium is depleted as it is fused into heavier elements. It is a divalent element which occurs naturally only in combination with other elements in minerals. Notable gemstones which contain beryllium include beryl (aquamarine, emerald) and chrysoberyl. As a free element it is a steel-gray, strong, lightweight and brittle alkaline earth metal. In structural applications, the combination of high flexural rigidity, thermal stability, thermal conductivity and low density (1.85 times that of water) make beryllium metal a desirable aerospace material for aircraft components, missiles, spacecraft, and satellites.[6] Because of its low density and atomic mass, beryllium is relatively transparent to X-rays and other forms of ionizing radiation; therefore, it is the most common window material for X-ray equipment and components of particle detectors.[6] The high thermal conductivities of beryllium and beryllium oxide have led to their use in thermal management applications. When added as an alloying element to aluminium, copper (notably the alloy beryllium copper), iron or nickel beryllium improves many physical properties.[example needed][6] Tools made of beryllium copper alloys are strong and hard and do not create sparks when they strike a steel surface. Beryllium does not form oxides until it reaches very high temperatures. The commercial use of beryllium requires the use of appropriate dust control equipment and industrial controls at all times because of the toxicity of inhaled beryllium-containing dusts that can cause a chronic life-threatening allergic disease in some people called berylliosis.[7]"
d[4] = "Boron is a chemical element with the symbol B and atomic number 5. Produced entirely by cosmic ray spallation and supernovae and not by stellar nucleosynthesis, it is a low-abundance element in the Solar System and in the Earth's crust.[11] Boron is concentrated on Earth by the water-solubility of its more common naturally occurring compounds, the borate minerals. These are mined industrially as evaporites, such as borax and kernite. The largest known boron deposits are in Turkey, the largest producer of boron minerals. Elemental boron is a metalloid that is found in small amounts in meteoroids but chemically uncombined boron is not otherwise found naturally on Earth. Industrially, very pure boron is produced with difficulty because of refractory contamination by carbon or other elements. Several allotropes of boron exist: amorphous boron is a brown powder; crystalline boron is silvery to black, extremely hard (about 9.5 on the Mohs scale), and a poor electrical conductor at room temperature. The primary use of elemental boron is as boron filaments with applications similar to carbon fibers in some high-strength materials. Boron is primarily used in chemical compounds. About half of all boron consumed globally is an additive in fiberglass for insulation and structural materials. The next leading use is in polymers and ceramics in high-strength, lightweight structural and refractory materials. Borosilicate glass is desired for its greater strength and thermal shock resistance than ordinary soda lime glass. Boron as sodium perborate is used as a bleach. A small amount of boron is used as a dopant in semiconductors, and reagent intermediates in the synthesis of organic fine chemicals. A few boron-containing organic pharmaceuticals are used or are in study. Natural boron is composed of two stable isotopes, one of which (boron-10) has a number of uses as a neutron-capturing agent. In biology, borates have low toxicity in mammals (similar to table salt), but are more toxic to arthropods and are used as insecticides. Boric acid is mildly antimicrobial, and several natural boron-containing organic antibiotics are known.[12] Boron is an essential plant nutrient and boron compounds such as borax and boric acid are used as fertilizers in agriculture, although it's only required in small amounts, with excess being toxic. Boron compounds play a strengthening role in the cell walls of all plants. There is no consensus on whether boron is an essential nutrient for mammals, including humans, although there is some evidence it supports bone health."
d[5] = "Carbon (from Latin: carbo coal) is a chemical element with the symbol C and atomic number 6. It is nonmetallic and tetravalent—making four electrons available to form covalent chemical bonds. It belongs to group 14 of the periodic table.[13] Three isotopes occur naturally, 12C and 13C being stable, while 14C is a radionuclide, decaying with a half-life of about 5,730 years.[14] Carbon is one of the few elements known since antiquity.[15] Carbon is the 15th most abundant element in the Earth's crust, and the fourth most abundant element in the universe by mass after hydrogen, helium, and oxygen. Carbon's abundance, its unique diversity of organic compounds, and its unusual ability to form polymers at the temperatures commonly encountered on Earth enables this element to serve as a common element of all known life. It is the second most abundant element in the human body by mass (about 18.5%) after oxygen.[16] The atoms of carbon can bond together in diverse ways, resulting in various allotropes of carbon. The best known allotropes are graphite, diamond, and buckminsterfullerene.[17] The physical properties of carbon vary widely with the allotropic form. For example, graphite is opaque and black while diamond is highly transparent. Graphite is soft enough to form a streak on paper (hence its name, from the Greek verb γράφειν which means to write), while diamond is the hardest naturally occurring material known. Graphite is a good electrical conductor while diamond has a low electrical conductivity. Under normal conditions, diamond, carbon nanotubes, and graphene have the highest thermal conductivities of all known materials. All carbon allotropes are solids under normal conditions, with graphite being the most thermodynamically stable form at standard temperature and pressure. They are chemically resistant and require high temperature to react even with oxygen. The most common oxidation state of carbon in inorganic compounds is +4, while +2 is found in carbon monoxide and transition metal carbonyl complexes. The largest sources of inorganic carbon are limestones, dolomites and carbon dioxide, but significant quantities occur in organic deposits of coal, peat, oil, and methane clathrates. Carbon forms a vast number of compounds, more than any other element, with almost ten million compounds described to date,[18] and yet that number is but a fraction of the number of theoretically possible compounds under standard conditions. For this reason, carbon has often been referred to as the king of the elements.[19]"
d[6] = "Nitrogen is the chemical element with the symbol N and atomic number 7. It was first discovered and isolated by Scottish physician Daniel Rutherford in 1772. Although Carl Wilhelm Scheele and Henry Cavendish had independently done so at about the same time, Rutherford is generally accorded the credit because his work was published first. The name nitrogène was suggested by French chemist Jean-Antoine-Claude Chaptal in 1790 when it was found that nitrogen was present in nitric acid and nitrates. Antoine Lavoisier suggested instead the name azote, from the Greek ἀζωτικός no life, as it is an asphyxiant gas; this name is instead used in many languages, such as French, Russian, Romanian and Turkish, and appears in the English names of some nitrogen compounds such as hydrazine, azides and azo compounds. Nitrogen is the lightest member of group 15 of the periodic table, often called the pnictogens. It is a common element in the universe, estimated at about seventh in total abundance in the Milky Way and the Solar System. At standard temperature and pressure, two atoms of the element bind to form dinitrogen, a colourless and odorless diatomic gas with the formula N2. Dinitrogen forms about 78% of Earth's atmosphere, making it the most abundant uncombined element. Nitrogen occurs in all organisms, primarily in amino acids (and thus proteins), in the nucleic acids (DNA and RNA) and in the energy transfer molecule adenosine triphosphate. The human body contains about 3% nitrogen by mass, the fourth most abundant element in the body after oxygen, carbon, and hydrogen. The nitrogen cycle describes movement of the element from the air, into the biosphere and organic compounds, then back into the atmosphere. Many industrially important compounds, such as ammonia, nitric acid, organic nitrates (propellants and explosives), and cyanides, contain nitrogen. The extremely strong triple bond in elemental nitrogen (N≡N), the second strongest bond in any diatomic molecule after carbon monoxide (CO),[2] dominates nitrogen chemistry. This causes difficulty for both organisms and industry in converting N2 into useful compounds, but at the same time means that burning, exploding, or decomposing nitrogen compounds to form nitrogen gas releases large amounts of often useful energy. Synthetically produced ammonia and nitrates are key industrial fertilisers, and fertiliser nitrates are key pollutants in the eutrophication of water systems. Apart from its use in fertilisers and energy-stores, nitrogen is a constituent of organic compounds as diverse as Kevlar used in high-strength fabric and cyanoacrylate used in superglue. Nitrogen is a constituent of every major pharmacological drug class, including antibiotics. Many drugs are mimics or prodrugs of natural nitrogen-containing signal molecules: for example, the organic nitrates nitroglycerin and nitroprusside control blood pressure by metabolizing into nitric oxide. Many notable nitrogen-containing drugs, such as the natural caffeine and morphine or the synthetic amphetamines, act on receptors of animal neurotransmitters."
d[7] = "Oxygen is the chemical element with the symbol O and atomic number 8. It is a member of the chalcogen group in the periodic table, a highly reactive nonmetal, and an oxidizing agent that readily forms oxides with most elements as well as with other compounds. After hydrogen and helium, oxygen is the third-most abundant element in the universe by mass. At standard temperature and pressure, two atoms of the element bind to form dioxygen, a colorless and odorless diatomic gas with the formula 2. Diatomic oxygen gas constitutes 20.95% of the Earth's atmosphere. As compounds including oxides, the element makes up almost half of the Earth's crust.[2] Dioxygen provides the energy released in combustion[3] and aerobic cellular respiration,[4] and many major classes of organic molecules in living organisms contain oxygen atoms, such as proteins, nucleic acids, carbohydrates, and fats, as do the major constituent inorganic compounds of animal shells, teeth, and bone. Most of the mass of living organisms is oxygen as a component of water, the major constituent of lifeforms. Oxygen is continuously replenished in Earth's atmosphere by photosynthesis, which uses the energy of sunlight to produce oxygen from water and carbon dioxide. Oxygen is too chemically reactive to remain a free element in air without being continuously replenished by the photosynthetic action of living organisms. Another form (allotrope) of oxygen, ozone (3), strongly absorbs ultraviolet UVB radiation and the high-altitude ozone layer helps protect the biosphere from ultraviolet radiation. However, ozone present at the surface is a byproduct of smog and thus a pollutant. Oxygen was isolated by Michael Sendivogius before 1604, but it is commonly believed that the element was discovered independently by Carl Wilhelm Scheele, in Uppsala, in 1773 or earlier, and Joseph Priestley in Wiltshire, in 1774. Priority is often given for Priestley because his work was published first. Priestley, however, called oxygen dephlogisticated air, and did not recognize it as a chemical element. The name oxygen was coined in 1777 by Antoine Lavoisier, who first recognized oxygen as a chemical element and correctly characterized the role it plays in combustion. Common uses of oxygen include production of steel, plastics and textiles, brazing, welding and cutting of steels and other metals, rocket propellant, oxygen therapy, and life support systems in aircraft, submarines, spaceflight and diving."
d[8] = "Fluorine is a chemical element with the symbol F and atomic number 9. It is the lightest halogen and exists as a highly toxic pale yellow diatomic gas at standard conditions. As the most electronegative element, it is extremely reactive, as it reacts with all other elements, except for argon, neon, and helium. Among the elements, fluorine ranks 24th in universal abundance and 13th in terrestrial abundance. Fluorite, the primary mineral source of fluorine which gave the element its name, was first described in 1529; as it was added to metal ores to lower their melting points for smelting, the Latin verb fluo meaning flow gave the mineral its name. Proposed as an element in 1810, fluorine proved difficult and dangerous to separate from its compounds, and several early experimenters died or sustained injuries from their attempts. Only in 1886 did French chemist Henri Moissan isolate elemental fluorine using low-temperature electrolysis, a process still employed for modern production. Industrial production of fluorine gas for uranium enrichment, its largest application, began during the Manhattan Project in World War II. Owing to the expense of refining pure fluorine, most commercial applications use fluorine compounds, with about half of mined fluorite used in steelmaking. The rest of the fluorite is converted into corrosive hydrogen fluoride en route to various organic fluorides, or into cryolite, which plays a key role in aluminium refining. Molecules containing a carbon–fluorine bond often have very high chemical and thermal stability; their major uses are as refrigerants, electrical insulation and cookware, the last as PTFE (Teflon). Pharmaceuticals such as atorvastatin and fluoxetine contain C−F bonds. The fluoride ion from dissolved fluoride salts inhibits dental cavities, and so finds use in toothpaste and water fluoridation. Global fluorochemical sales amount to more than US$15 billion a year. Fluorocarbon gases are generally greenhouse gases with global-warming potentials 100 to 23,500 times that of carbon dioxide, SF6 having the highest global warming potential of any known substance. Organofluorine compounds often persist in the environment due to the strength of the carbon–fluorine bond. Fluorine has no known metabolic role in mammals; a few plants and sea sponges synthesize organofluorine poisons (most often monofluoroacetates) that help deter predation.[13]"
d[9] = "Neon is a chemical element with the symbol Ne and atomic number 10. It is a noble gas.[10] Neon is a colorless, odorless, inert monatomic gas under standard conditions, with about two-thirds the density of air. It was discovered (along with krypton and xenon) in 1898 as one of the three residual rare inert elements remaining in dry air, after nitrogen, oxygen, argon and carbon dioxide were removed. Neon was the second of these three rare gases to be discovered and was immediately recognized as a new element from its bright red emission spectrum. The name neon is derived from the Greek word, νέον, neuter singular form of νέος (neos), meaning new. Neon is chemically inert, and no uncharged neon compounds are known. The compounds of neon currently known include ionic molecules, molecules held together by van der Waals forces and clathrates. During cosmic nucleogenesis of the elements, large amounts of neon are built up from the alpha-capture fusion process in stars. Although neon is a very common element in the universe and solar system (it is fifth in cosmic abundance after hydrogen, helium, oxygen and carbon), it is rare on Earth. It composes about 18.2 ppm of air by volume (this is about the same as the molecular or mole fraction) and a smaller fraction in Earth's crust. The reason for neon's relative scarcity on Earth and the inner (terrestrial) planets is that neon is highly volatile and forms no compounds to fix it to solids. As a result, it escaped from the planetesimals under the warmth of the newly ignited Sun in the early Solar System. Even the outer atmosphere of Jupiter is somewhat depleted of neon, although for a different reason.[11] Neon gives a distinct reddish-orange glow when used in low-voltage neon glow lamps, high-voltage discharge tubes and neon advertising signs.[12][13] The red emission line from neon also causes the well known red light of helium–neon lasers. Neon is used in some plasma tube and refrigerant applications but has few other commercial uses. It is commercially extracted by the fractional distillation of liquid air. Since air is the only source, it is considerably more expensive than helium."
d[10] = "Sodium is a chemical element with the symbol Na (from Latin natrium) and atomic number 11. It is a soft, silvery-white, highly reactive metal. Sodium is an alkali metal, being in group 1 of the periodic table. Its only stable isotope is 23Na. The free metal does not occur in nature, and must be prepared from compounds. Sodium is the sixth most abundant element in the Earth's crust and exists in numerous minerals such as feldspars, sodalite, and rock salt (NaCl). Many salts of sodium are highly water-soluble: sodium ions have been leached by the action of water from the Earth's minerals over eons, and thus sodium and chlorine are the most common dissolved elements by weight in the oceans. Sodium was first isolated by Humphry Davy in 1807 by the electrolysis of sodium hydroxide. Among many other useful sodium compounds, sodium hydroxide (lye) is used in soap manufacture, and sodium chloride (edible salt) is a de-icing agent and a nutrient for animals including humans. Sodium is an essential element for all animals and some plants. Sodium ions are the major cation in the extracellular fluid (ECF) and as such are the major contributor to the ECF osmotic pressure and ECF compartment volume.[citation needed] Loss of water from the ECF compartment increases the sodium concentration, a condition called hypernatremia. Isotonic loss of water and sodium from the ECF compartment decreases the size of that compartment in a condition called ECF hypovolemia. By means of the sodium-potassium pump, living human cells pump three sodium ions out of the cell in exchange for two potassium ions pumped in; comparing ion concentrations across the cell membrane, inside to outside, potassium measures about 40:1, and sodium, about 1:10. In nerve cells, the electrical charge across the cell membrane enables transmission of the nerve impulse—an action potential—when the charge is dissipated; sodium plays a key role in that activity."
d[11] = "Magnesium is a chemical element with the symbol Mg and atomic number 12. It is a shiny gray solid which bears a close physical resemblance to the other five elements in the second column (group 2, or alkaline earth metals) of the periodic table: all group 2 elements have the same electron configuration in the outer electron shell and a similar crystal structure. Magnesium is the ninth most abundant element in the universe.[9][10] It is produced in large, aging stars from the sequential addition of three helium nuclei to a carbon nucleus. When such stars explode as supernovas, much of the magnesium is expelled into the interstellar medium where it may recycle into new star systems. Magnesium is the eighth most abundant element in the Earth's crust[11] and the fourth most common element in the Earth (after iron, oxygen and silicon), making up 13% of the planet's mass and a large fraction of the planet's mantle. It is the third most abundant element dissolved in seawater, after sodium and chlorine.[12] Magnesium occurs naturally only in combination with other elements, where it invariably has a +2 oxidation state. The free element (metal) can be produced artificially, and is highly reactive (though in the atmosphere, it is soon coated in a thin layer of oxide that partly inhibits reactivity – see passivation). The free metal burns with a characteristic brilliant-white light. The metal is now obtained mainly by electrolysis of magnesium salts obtained from brine, and is used primarily as a component in aluminium-magnesium alloys, sometimes called magnalium or magnelium. Magnesium is less dense than aluminium, and the alloy is prized for its combination of lightness and strength. Magnesium is the eleventh most abundant element by mass in the human body and is essential to all cells and some 300 enzymes.[13] Magnesium ions interact with polyphosphate compounds such as ATP, DNA, and RNA. Hundreds of enzymes require magnesium ions to function. Magnesium compounds are used medicinally as common laxatives, antacids (e.g., milk of magnesia), and to stabilize abnormal nerve excitation or blood vessel spasm in such conditions as eclampsia.[13]"
d[12] = "Aluminium (aluminum in American and Canadian English) is a chemical element with the symbol Al and atomic number 13. It is a silvery-white, soft, non-magnetic and ductile metal in the boron group. By mass, aluminium makes up about 8% of the Earth's crust, where it is the third most abundant element (after oxygen and silicon) and also the most abundant metal. Occurrence of aluminium decreases in the Earth's mantle below, however. The chief ore of aluminium is bauxite. Aluminium metal is highly reactive, such that native specimens are rare and limited to extreme reducing environments. Instead, it is found combined in over 270 different minerals.[7] Aluminium is remarkable for its low density and its ability to resist corrosion through the phenomenon of passivation. Aluminium and its alloys are vital to the aerospace industry[8] and important in transportation and building industries, such as building facades and window frames.[9] The oxides and sulfates are the most useful compounds of aluminium.[8] Despite its prevalence in the environment, no known form of life uses aluminium salts metabolically, but aluminium is well tolerated by plants and animals.[10] Because of these salts' abundance, the potential for a biological role for them is of continuing interest, and studies continue."
d[13] = "Silicon is a chemical element with the symbol Si and atomic number 14. It is a hard, brittle crystalline solid with a blue-grey metallic lustre, and is a tetravalent metalloid and semiconductor. It is a member of group 14 in the periodic table: carbon is above it; and germanium, tin, and lead are below it. It is relatively unreactive. Because of its high chemical affinity for oxygen, it was not until 1823 that Jöns Jakob Berzelius was first able to prepare it and characterize it in pure form. Its oxides form a family of anions known as silicates. Its melting and boiling points of 1414 °C and 3265 °C respectively are the second-highest among all the metalloids and nonmetals, being only surpassed by boron. Silicon is the eighth most common element in the universe by mass, but very rarely occurs as the pure element in the Earth's crust. It is most widely distributed in dusts, sands, planetoids, and planets as various forms of silicon dioxide (silica) or silicates. More than 90% of the Earth's crust is composed of silicate minerals, making silicon the second most abundant element in the Earth's crust (about 28% by mass) after oxygen. Most silicon is used commercially without being separated, and often with little processing of the natural minerals. Such use includes industrial construction with clays, silica sand, and stone. Silicates are used in Portland cement for mortar and stucco, and mixed with silica sand and gravel to make concrete for walkways, foundations, and roads. They are also used in whiteware ceramics such as porcelain, and in traditional quartz-based soda-lime glass and many other specialty glasses. Silicon compounds such as silicon carbide are used as abrasives and components of high-strength ceramics. Silicon is the basis of the widely used synthetic polymers called silicones. The late 20th century to early 21st century has been described as the Silicon Age (also known as the Digital Age or Information Age) due to elemental silicon having a large impact on the modern world economy. The relatively small portion of very highly purified elemental silicon used in semiconductor electronics (< 10%) is essential to the metal–oxide–silicon (MOS) transistors and integrated circuit chips used in most modern technology (such as computers and cell phones, for example). The most widely used silicon device is the MOSFET (metal–oxide–silicon field-effect transistor), which has been manufactured in larger numbers than any other device in history. Free silicon is also used in the steel refining, aluminium-casting, and fine chemical industries (often to make fumed silica). Silicon is an essential element in biology, although only traces are required by animals. However, various sea sponges and microorganisms, such as diatoms and radiolaria, secrete skeletal structures made of silica. Silica is deposited in many plant tissues.[9]"
d[14] = "Phosphorus is a chemical element with the symbol P and atomic number 15. Elemental phosphorus exists in two major forms, white phosphorus and red phosphorus, but because it is highly reactive, phosphorus is never found as a free element on Earth. It has a concentration in the Earth's crust of about one gram per kilogram (compare copper at about 0.06 grams). In minerals, phosphorus generally occurs as phosphate. Elemental phosphorus was first isolated as white phosphorus in 1669. White phosphorus emits a faint glow when exposed to oxygen – hence the name, taken from Greek mythology, Φωσφόρος meaning light-bearer (Latin Lucifer), referring to the Morning Star, the planet Venus. The term phosphorescence, meaning glow after illumination, derives from this property of phosphorus, although the word has since been used for a different physical process that produces a glow. The glow of phosphorus is caused by oxidation of the white (but not red) phosphorus — a process now called chemiluminescence. Together with nitrogen, arsenic, antimony, and bismuth, phosphorus is classified as a pnictogen. Phosphorus is essential for life. Phosphates (compounds containing the phosphate ion, PO43−) are a component of DNA, RNA, ATP, and phospholipids. Elemental phosphorus was first isolated from human urine, and bone ash was an important early phosphate source. Phosphate mines contain fossils because phosphate is present in the fossilized deposits of animal remains and excreta. Low phosphate levels are an important limit to growth in some aquatic systems. The vast majority of phosphorus compounds mined are consumed as fertilisers. Phosphate is needed to replace the phosphorus that plants remove from the soil, and its annual demand is rising nearly twice as fast as the growth of the human population. Other applications include organophosphorus compounds in detergents, pesticides, and nerve agents."
d[15] = "Sulfur (in British English, sulphur) is a chemical element with the symbol S and atomic number 16. It is abundant, multivalent, and nonmetallic. Under normal conditions, sulfur atoms form cyclic octatomic molecules with a chemical formula S8. Elemental sulfur is a bright yellow, crystalline solid at room temperature. Sulfur is the tenth most common element by mass in the universe, and the fifth most common on Earth. Though sometimes found in pure, native form, sulfur on Earth usually occurs as sulfide and sulfate minerals. Being abundant in native form, sulfur was known in ancient times, being mentioned for its uses in ancient India, ancient Greece, China, and Egypt. Historically and in literature sulfur is also called brimstone,[4] which means burning stone.[5] Today, almost all elemental sulfur is produced as a byproduct of removing sulfur-containing contaminants from natural gas and petroleum. The greatest commercial use of the element is the production of sulfuric acid for sulfate and phosphate fertilizers, and other chemical processes. The element sulfur is used in matches, insecticides, and fungicides. Many sulfur compounds are odoriferous, and the smells of odorized natural gas, skunk scent, grapefruit, and garlic are due to organosulfur compounds. Hydrogen sulfide gives the characteristic odor to rotting eggs and other biological processes. Sulfur is an essential element for all life, but almost always in the form of organosulfur compounds or metal sulfides. Three amino acids (cysteine, cystine, and methionine) and two vitamins (biotin and thiamine) are organosulfur compounds. Many cofactors also contain sulfur, including glutathione, thioredoxin, and iron–sulfur proteins. Disulfides, S–S bonds, confer mechanical strength and insolubility of the protein keratin, found in outer skin, hair, and feathers. Sulfur is one of the core chemical elements needed for biochemical functioning and is an elemental macronutrient for all living organisms."
d[16] = "Chlorine is a chemical element with the symbol Cl and atomic number 17. The second-lightest of the halogens, it appears between fluorine and bromine in the periodic table and its properties are mostly intermediate between them. Chlorine is a yellow-green gas at room temperature. It is an extremely reactive element and a strong oxidising agent: among the elements, it has the highest electron affinity and the third-highest electronegativity on the Pauling scale, behind only oxygen and fluorine. The most common compound of chlorine, sodium chloride (common salt), has been known since ancient times. Around 1630, chlorine gas was first synthesised in a chemical reaction, but not recognised as a fundamentally important substance. Carl Wilhelm Scheele wrote a description of chlorine gas in 1774, supposing it to be an oxide of a new element. In 1809, chemists suggested that the gas might be a pure element, and this was confirmed by Sir Humphry Davy in 1810, who named it from Ancient Greek: χλωρός, romanized: khlôros, lit. 'pale green' based on its colour. Because of its great reactivity, all chlorine in the Earth's crust is in the form of ionic chloride compounds, which includes table salt. It is the second-most abundant halogen (after fluorine) and twenty-first most abundant chemical element in Earth's crust. These crustal deposits are nevertheless dwarfed by the huge reserves of chloride in seawater. Elemental chlorine is commercially produced from brine by electrolysis, predominantly in the chlor-alkali process. The high oxidising potential of elemental chlorine led to the development of commercial bleaches and disinfectants, and a reagent for many processes in the chemical industry. Chlorine is used in the manufacture of a wide range of consumer products, about two-thirds of them organic chemicals such as polyvinyl chloride, and many intermediates for the production of plastics and other end products which do not contain the element. As a common disinfectant, elemental chlorine and chlorine-generating compounds are used more directly in swimming pools to keep them clean and sanitary. Elemental chlorine at high concentrations is extremely dangerous and poisonous for all living organisms. As a chemical warfare agent, chlorine was first used in World War I as a poison gas weapon. In the form of chloride ions, chlorine is necessary to all known species of life. Other types of chlorine compounds are rare in living organisms, and artificially produced chlorinated organics range from inert to toxic. In the upper atmosphere, chlorine-containing organic molecules such as chlorofluorocarbons have been implicated in ozone depletion. Small quantities of elemental chlorine are generated by oxidation of chloride to hypochlorite in neutrophils as part of the immune response against bacteria."
d[17] = "Argon is a chemical element with the symbol Ar and atomic number 18. It is in group 18 of the periodic table and is a noble gas.[6] Argon is the third-most abundant gas in the Earth's atmosphere, at 0.934% (9340 ppmv). It is more than twice as abundant as water vapor (which averages about 4000 ppmv, but varies greatly), 23 times as abundant as carbon dioxide (400 ppmv), and more than 500 times as abundant as neon (18 ppmv). Argon is the most abundant noble gas in Earth's crust, comprising 0.00015% of the crust. Nearly all of the argon in the Earth's atmosphere is radiogenic argon-40, derived from the decay of potassium-40 in the Earth's crust. In the universe, argon-36 is by far the most common argon isotope, as it is the most easily produced by stellar nucleosynthesis in supernovas. The name argon is derived from the Greek word ἀργόν, neuter singular form of ἀργός meaning lazy or inactive, as a reference to the fact that the element undergoes almost no chemical reactions. The complete octet (eight electrons) in the outer atomic shell makes argon stable and resistant to bonding with other elements. Its triple point temperature of 83.8058 K is a defining fixed point in the International Temperature Scale of 1990. Argon is produced industrially by the fractional distillation of liquid air. Argon is mostly used as an inert shielding gas in welding and other high-temperature industrial processes where ordinarily unreactive substances become reactive; for example, an argon atmosphere is used in graphite electric furnaces to prevent the graphite from burning. Argon is also used in incandescent, fluorescent lighting, and other gas-discharge tubes. Argon makes a distinctive blue-green gas laser. Argon is also used in fluorescent glow starters."

In [18]:
import re

def cleaning_text(text):
    text = re.sub('\(.*?\)', '', text)
    text = re.sub('\[.*?\]', '', text)    
    return text

In [19]:
print(cleaning_text(d[0]))

Hydrogen is the chemical element with the symbol H and atomic number 1. With a standard atomic weight of 1.008, hydrogen is the lightest element in the periodic table. Hydrogen is the most abundant chemical substance in the Universe, constituting roughly 75% of all baryonic mass. Non-remnant stars are mainly composed of hydrogen in the plasma state. The most common isotope of hydrogen, termed protium , has one proton and no neutrons. The universal emergence of atomic hydrogen first occurred during the recombination epoch . At standard temperature and pressure, hydrogen is a colorless, odorless, tasteless, non-toxic, nonmetallic, highly combustible diatomic gas with the molecular formula H2. Since hydrogen readily forms covalent compounds with most nonmetallic elements, most of the hydrogen on Earth exists in molecular forms such as water or organic compounds. Hydrogen plays a particularly important role in acid–base reactions because most acid-base reactions involve the exchange of pro

In [20]:
def tokenize_text(text):
  text = re.sub('[.,]', '', text)
  return text.split()

In [21]:
for text in d:
  text = cleaning_text(text)
  print(tokenize_text(text))

['Hydrogen', 'is', 'the', 'chemical', 'element', 'with', 'the', 'symbol', 'H', 'and', 'atomic', 'number', '1', 'With', 'a', 'standard', 'atomic', 'weight', 'of', '1008', 'hydrogen', 'is', 'the', 'lightest', 'element', 'in', 'the', 'periodic', 'table', 'Hydrogen', 'is', 'the', 'most', 'abundant', 'chemical', 'substance', 'in', 'the', 'Universe', 'constituting', 'roughly', '75%', 'of', 'all', 'baryonic', 'mass', 'Non-remnant', 'stars', 'are', 'mainly', 'composed', 'of', 'hydrogen', 'in', 'the', 'plasma', 'state', 'The', 'most', 'common', 'isotope', 'of', 'hydrogen', 'termed', 'protium', 'has', 'one', 'proton', 'and', 'no', 'neutrons', 'The', 'universal', 'emergence', 'of', 'atomic', 'hydrogen', 'first', 'occurred', 'during', 'the', 'recombination', 'epoch', 'At', 'standard', 'temperature', 'and', 'pressure', 'hydrogen', 'is', 'a', 'colorless', 'odorless', 'tasteless', 'non-toxic', 'nonmetallic', 'highly', 'combustible', 'diatomic', 'gas', 'with', 'the', 'molecular', 'formula', 'H2', 'Sin

In [22]:
d_tokenized = [[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[]]
for i in range(len(d)):
  d_tokenized[i] = tokenize_text(cleaning_text(d[i]))

In [23]:
print(d_tokenized[0])

['Hydrogen', 'is', 'the', 'chemical', 'element', 'with', 'the', 'symbol', 'H', 'and', 'atomic', 'number', '1', 'With', 'a', 'standard', 'atomic', 'weight', 'of', '1008', 'hydrogen', 'is', 'the', 'lightest', 'element', 'in', 'the', 'periodic', 'table', 'Hydrogen', 'is', 'the', 'most', 'abundant', 'chemical', 'substance', 'in', 'the', 'Universe', 'constituting', 'roughly', '75%', 'of', 'all', 'baryonic', 'mass', 'Non-remnant', 'stars', 'are', 'mainly', 'composed', 'of', 'hydrogen', 'in', 'the', 'plasma', 'state', 'The', 'most', 'common', 'isotope', 'of', 'hydrogen', 'termed', 'protium', 'has', 'one', 'proton', 'and', 'no', 'neutrons', 'The', 'universal', 'emergence', 'of', 'atomic', 'hydrogen', 'first', 'occurred', 'during', 'the', 'recombination', 'epoch', 'At', 'standard', 'temperature', 'and', 'pressure', 'hydrogen', 'is', 'a', 'colorless', 'odorless', 'tasteless', 'non-toxic', 'nonmetallic', 'highly', 'combustible', 'diatomic', 'gas', 'with', 'the', 'molecular', 'formula', 'H2', 'Sin

In [24]:
from nltk.corpus import wordnet as wn

def lemmatize_word(word):
  word=word.lower()

  lemma = wn.morphy(word)
  if lemma is None:
    return word
  else:
    return lemma

In [25]:
for i in range(len(d)):
  for j in range(len(d_tokenized[i])):
    d_tokenized[i][j] = lemmatize_word(d_tokenized[i][j])

print(d_tokenized[0])

['hydrogen', 'be', 'the', 'chemical', 'element', 'with', 'the', 'symbol', 'h', 'and', 'atomic', 'number', '1', 'with', 'a', 'standard', 'atomic', 'weight', 'of', '1008', 'hydrogen', 'be', 'the', 'light', 'element', 'in', 'the', 'periodic', 'table', 'hydrogen', 'be', 'the', 'most', 'abundant', 'chemical', 'substance', 'in', 'the', 'universe', 'constitute', 'roughly', '75%', 'of', 'all', 'baryonic', 'mass', 'non-remnant', 'star', 'are', 'mainly', 'compose', 'of', 'hydrogen', 'in', 'the', 'plasma', 'state', 'the', 'most', 'common', 'isotope', 'of', 'hydrogen', 'term', 'protium', 'ha', 'one', 'proton', 'and', 'no', 'neutron', 'the', 'universal', 'emergence', 'of', 'atomic', 'hydrogen', 'first', 'occur', 'during', 'the', 'recombination', 'epoch', 'at', 'standard', 'temperature', 'and', 'pressure', 'hydrogen', 'be', 'a', 'colorless', 'odorless', 'tasteless', 'non-toxic', 'nonmetallic', 'highly', 'combustible', 'diatomic', 'gas', 'with', 'the', 'molecular', 'formula', 'h2', 'since', 'hydrogen

In [26]:
def remove_stopwords(word, stopwordset):
  if word in stopwordset:
    return None
  else:
    return word

In [27]:
en_stop = nltk.corpus.stopwords.words('english')

for i in range(len(d)):
  for j in range(len(d_tokenized[i])):
    d_tokenized[i][j] = remove_stopwords(d_tokenized[i][j], en_stop)

print(d_tokenized[0])

['hydrogen', None, None, 'chemical', 'element', None, None, 'symbol', 'h', None, 'atomic', 'number', '1', None, None, 'standard', 'atomic', 'weight', None, '1008', 'hydrogen', None, None, 'light', 'element', None, None, 'periodic', 'table', 'hydrogen', None, None, None, 'abundant', 'chemical', 'substance', None, None, 'universe', 'constitute', 'roughly', '75%', None, None, 'baryonic', 'mass', 'non-remnant', 'star', None, 'mainly', 'compose', None, 'hydrogen', None, None, 'plasma', 'state', None, None, 'common', 'isotope', None, 'hydrogen', 'term', 'protium', 'ha', 'one', 'proton', None, None, 'neutron', None, 'universal', 'emergence', None, 'atomic', 'hydrogen', 'first', 'occur', None, None, 'recombination', 'epoch', None, 'standard', 'temperature', None, 'pressure', 'hydrogen', None, None, 'colorless', 'odorless', 'tasteless', 'non-toxic', 'nonmetallic', 'highly', 'combustible', 'diatomic', 'gas', None, None, 'molecular', 'formula', 'h2', 'since', 'hydrogen', 'readily', 'form', 'coval

In [28]:
# 前処理はこれで終了. 
# d_tokenizedに各文書の単語のリストが格納されている.
# まず各文書を集合として管理し, ヤッカード係数・ダイス係数・シンプソン係数を計算する.

In [29]:
s = [set(),set(),set(),set(),set(),set(),set(),set(),set(),set(),set(),set(),set(),set(),set(),set(),set(),set()]
for i in range(18):
  for j in range(len(d_tokenized[i])):
    s[i].add(d_tokenized[i][j])

print(s[0])

{'market', 'discrete', 'light', 'reaction', 'standard', 'term', 'atomic', 'roughly', 'compound', 'mass', 'neutron', '75%', 'cation', 'substance', 'protium', 'exist', 'named:', 'many', 'electrolysis', None, 'problematic', 'soluble', 'reform', 'tank', 'form', 'bare', 'tasteless', 'mostly', 'water', 'species', 'means', 'organic', 'henry', 'odorless', 'recombination', 'equation', 'acid–base', '1008', 'fuel', 'natural', 'ha', 'proton', 'state', 'study', 'schrödinger', 'non-toxic', 'wa', 'key', 'universal', 'exchange', 'metal', 'common', 'metallurgy', 'h', 'produce', 'gas', 'elements', 'quantum', 'acid', 'two', 'molecular', 'highly', 'occur', 'covalent', 'use', 'universe', 'pipeline', 'play', 'cavendish', 'pressure', 'solve', 'often', '1766–81', 'atom', 'le', 'energetics', 'water-former', 'neutral', '1', 'abundant', 'non-remnant', 'formula', 'constitute', 'since', 'readily', 'role', '16th', 'century', 'diatomic', 'know', 'first', 'chemical', 'write', 'star', 'later', 'table', 'storage', 'com

In [58]:
#ヤッカード係数.

import numpy as np
ja = np.zeros((18,18))

for i in range(18):
  for j in range(18):
    if i==j:
      ja[i][j]=0
      continue
    else:
      ja[i][j]=len(s[i]&s[j])/len(s[i]|s[j])

ma = 0
ma_i = 0
ma_j = 0

for i in range(18):
  for j in range(18):
    if(ja[i][j]>ma):
      ma = ja[i][j]
      ma_i = i
      ma_j = j

print(ma,ma_i,ma_j)

0.23102310231023102 6 7


In [59]:
#ダイス係数.

import numpy as np
dsc = np.zeros((18,18))

for i in range(18):
  for j in range(18):
    if i==j:
      dsc[i][j]=0
      continue
    else:
      dsc[i][j]=2*len(s[i]&s[j])/(len(s[i])+len(s[j]))

ma = 0
ma_i = 0
ma_j = 0

for i in range(18):
  for j in range(18):
    if(dsc[i][j]>ma):
      ma = dsc[i][j]
      ma_i = i
      ma_j = j

print(ma,ma_i,ma_j)

0.3753351206434316 6 7


In [60]:
#シンプソン係数.

import numpy as np
overlap = np.zeros((18,18))

for i in range(18):
  for j in range(18):
    if i==j:
      overlap[i][j]=0
      continue
    else:
      overlap[i][j]=len(s[i]&s[j])/min(len(s[i]), len(s[j]))

ma = 0
ma_i = 0
ma_j = 0

for i in range(18):
  for j in range(18):
    if(overlap[i][j]>ma):
      ma = overlap[i][j]
      ma_i = i
      ma_j = j

print(ma,ma_i,ma_j)

0.4093567251461988 6 7


In [34]:
#今までは文書を集合の形で表現していた. 次に文書をベクトルで表現する. ベクトル表現では1-gramのBoWおよびtf-idfを用いる.

In [35]:
# Bowモデル.

dic = {}
list_words = []

for i in range(18):
  for j in range(len(d_tokenized[i])):
    if d_tokenized[i][j] not in list_words:
      list_words.append(d_tokenized[i][j])
      dic[d_tokenized[i][j]] = len(list_words)-1

bow = np.zeros((18,len(list_words)))

for i in range(18):
  for j in range(len(d_tokenized[i])):
    bow[i][dic[d_tokenized[i][j]]] += 1  

In [36]:
print(list_words)

['hydrogen', None, 'chemical', 'element', 'symbol', 'h', 'atomic', 'number', '1', 'standard', 'weight', '1008', 'light', 'periodic', 'table', 'abundant', 'substance', 'universe', 'constitute', 'roughly', '75%', 'baryonic', 'mass', 'non-remnant', 'star', 'mainly', 'compose', 'plasma', 'state', 'common', 'isotope', 'term', 'protium', 'ha', 'one', 'proton', 'neutron', 'universal', 'emergence', 'first', 'occur', 'recombination', 'epoch', 'temperature', 'pressure', 'colorless', 'odorless', 'tasteless', 'non-toxic', 'nonmetallic', 'highly', 'combustible', 'diatomic', 'gas', 'molecular', 'formula', 'h2', 'since', 'readily', 'form', 'covalent', 'compound', 'elements', 'earth', 'exist', 'water', 'organic', 'play', 'particularly', 'important', 'role', 'acid–base', 'reaction', 'acid-base', 'involve', 'exchange', 'soluble', 'molecule', 'ionic', 'take', 'negative', 'charge', 'know', 'hydride', 'positively', 'species', 'denote', 'h+', 'cation', 'write', 'though', 'bare', 'reality', 'always', 'comple

In [37]:
print(bow)

[[ 19. 144.   2. ...   0.   0.   0.]
 [  1. 258.   1. ...   0.   0.   0.]
 [  0. 120.   2. ...   0.   0.   0.]
 ...
 [  1.  99.   4. ...   0.   0.   0.]
 [  0. 179.   6. ...   0.   0.   0.]
 [  0. 113.   2. ...   1.   1.   1.]]


In [38]:
#ユークリッド距離を計算する

def euc(list_a, list_b):
  diff_vec = (list_a) - (list_b)
  return np.linalg.norm(diff_vec)

In [39]:
euc_dis = np.zeros((18,18))

for i in range(18):
  for j in range(18):
    euc_dis[i][j]=euc(bow[i], bow[j])

m = 1e15
m_i = 0
m_j = 0

for i in range(18):
  for j in range(18):
    if i==j:
      continue
    if(euc_dis[i][j]<m):
      m = euc_dis[i][j]
      m_i = i
      m_j = j

In [40]:
print(m,m_i,m_j)

26.40075756488817 8 11


In [41]:
# 文書8と文書11(フッ素とマグネシウム)が最も近い.

In [42]:
# ミンコフスキー距離(p=0,1,3)で計算する.

def mnk(list_a, list_b, p):
  diff_vec = (list_a) - (list_b)
  return np.linalg.norm(diff_vec, ord=p)

In [43]:
mnk_0 = np.zeros((18,18))
mnk_1 = np.zeros((18,18))
mnk_3 = np.zeros((18,18))

for i in range(18):
  for j in range(18):
    mnk_0[i][j]=mnk(bow[i], bow[j],0)
    mnk_1[i][j]=mnk(bow[i], bow[j],1)
    mnk_3[i][j]=mnk(bow[i], bow[j],3)

m_0 = 1e15
mi_0 = 0
mj_0 = 0
m_1 = 1e15
mi_1 = 0
mj_1 = 0
m_3 = 1e15
mi_3 = 0
mj_3 = 0

for i in range(18):
  for j in range(18):
    if i==j:
      continue
    if(mnk_0[i][j]<m_0):
      m_0 = mnk_0[i][j]
      mi_0 = i
      mj_0 = j
    if(mnk_1[i][j]<m_1):
      m_1 = mnk_1[i][j]
      mi_1 = i
      mj_1 = j
    if(mnk_3[i][j]<m_3):
      m_3 = mnk_3[i][j]
      mi_3 = i
      mj_3 = j

In [44]:
print(m_0, mi_0, mj_0)
print(m_1, mi_1, mj_1)
print(m_3, mi_3, mj_3)

175.0 10 12
244.0 3 12
14.706902831145593 8 11


In [45]:
# p=0では10と12, p=1では3と12, p=3ではp=2と同じく8と11が最も類似度が高いと計算された.

In [76]:
#コサイン類似度を計算する.

def cos_sim(a,b):
  inner_product = np.dot(a,b)
  size_a = np.linalg.norm(a, ord=2)
  size_b = np.linalg.norm(b, ord=2)
  return inner_product / (size_a * size_b)

In [77]:
cossim = np.zeros((18,18))

for i in range(18):
  for j in range(18):
    cossim[i][j]=cos_sim(bow[i], bow[j])

m = 1e15
m_i = 0
m_j = 0

for i in range(18):
  for j in range(18):
    if i==j:
      continue
    if(cossim[i][j]<m):
      m = cossim[i][j]
      m_i = i
      m_j = j

In [48]:
print(m,m_i,m_j)

0.9561927889684569 10 14


In [49]:
#tf-idfを計算する.



In [50]:
print(d_tokenized)

[['hydrogen', None, None, 'chemical', 'element', None, None, 'symbol', 'h', None, 'atomic', 'number', '1', None, None, 'standard', 'atomic', 'weight', None, '1008', 'hydrogen', None, None, 'light', 'element', None, None, 'periodic', 'table', 'hydrogen', None, None, None, 'abundant', 'chemical', 'substance', None, None, 'universe', 'constitute', 'roughly', '75%', None, None, 'baryonic', 'mass', 'non-remnant', 'star', None, 'mainly', 'compose', None, 'hydrogen', None, None, 'plasma', 'state', None, None, 'common', 'isotope', None, 'hydrogen', 'term', 'protium', 'ha', 'one', 'proton', None, None, 'neutron', None, 'universal', 'emergence', None, 'atomic', 'hydrogen', 'first', 'occur', None, None, 'recombination', 'epoch', None, 'standard', 'temperature', None, 'pressure', 'hydrogen', None, None, 'colorless', 'odorless', 'tasteless', 'non-toxic', 'nonmetallic', 'highly', 'combustible', 'diatomic', 'gas', None, None, 'molecular', 'formula', 'h2', 'since', 'hydrogen', 'readily', 'form', 'cova

In [51]:
def tfidf_vectorizer(docs):
  def tf(word2id, doc):
    term_counts = np.zeros(len(word2id))
    for term in word2id.keys():
      term_counts[word2id[term]] = doc.count(term)
    tf_values = list(map(lambda x: x/sum(term_counts), term_counts))
    return tf_values
  
  def idf(word2id, docs):
    idf = np.zeros(len(word2id))
    for term in word2id.keys():
      idf[word2id[term]] = np.log(len(docs) / sum([bool(term in doc) for doc in docs]))
    return idf
  
  word2id = {}
  for doc in docs:
    for w in doc:
      if w not in word2id:
        word2id[w] = len(word2id)
  
  return [[_tf*_idf for _tf, _idf in zip(tf(word2id, doc), idf(word2id, docs))] for doc in docs], word2id

In [52]:
tfidf_vector, word2id = tfidf_vectorizer(d_tokenized)
print(tfidf_vector)
print(word2id.items())

[[0.04957118941429883, 0.0, 0.0, 0.0, 0.0, 0.007984452369878907, 0.0, 0.0, 0.006069681152862484, 0.005218019938347245, 0.006069681152862484, 0.007984452369878907, 0.0026090099691736227, 0.0016237200687903841, 0.0022401387188296373, 0.0011200693594148187, 0.008309819871692123, 0.0013604322792756746, 0.006069681152862484, 0.007984452369878907, 0.007984452369878907, 0.007984452369878907, 0.0016237200687903841, 0.007984452369878907, 0.003034840576431242, 0.012139362305724969, 0.008309819871692123, 0.006069681152862484, 0.004154909935846061, 0.0006942387521571993, 0.003034840576431242, 0.006069681152862484, 0.007984452369878907, 0.0032474401375807683, 0.003034840576431242, 0.02395335710963672, 0.007984452369878907, 0.006069681152862484, 0.007984452369878907, 0.004871160206371152, 0.0013604322792756746, 0.007984452369878907, 0.007984452369878907, 0.0019147712170164233, 0.003538491285806807, 0.004154909935846061, 0.003538491285806807, 0.006069681152862484, 0.006069681152862484, 0.009899223586

In [53]:
print(tfidf_vector[0])

[0.04957118941429883, 0.0, 0.0, 0.0, 0.0, 0.007984452369878907, 0.0, 0.0, 0.006069681152862484, 0.005218019938347245, 0.006069681152862484, 0.007984452369878907, 0.0026090099691736227, 0.0016237200687903841, 0.0022401387188296373, 0.0011200693594148187, 0.008309819871692123, 0.0013604322792756746, 0.006069681152862484, 0.007984452369878907, 0.007984452369878907, 0.007984452369878907, 0.0016237200687903841, 0.007984452369878907, 0.003034840576431242, 0.012139362305724969, 0.008309819871692123, 0.006069681152862484, 0.004154909935846061, 0.0006942387521571993, 0.003034840576431242, 0.006069681152862484, 0.007984452369878907, 0.0032474401375807683, 0.003034840576431242, 0.02395335710963672, 0.007984452369878907, 0.006069681152862484, 0.007984452369878907, 0.004871160206371152, 0.0013604322792756746, 0.007984452369878907, 0.007984452369878907, 0.0019147712170164233, 0.003538491285806807, 0.004154909935846061, 0.003538491285806807, 0.006069681152862484, 0.006069681152862484, 0.0098992235868

In [54]:
#ベクトルが得られたので, 同様に類似度を計算していく.

def sim_calc(func):
  sim = np.zeros((18,18))
  for i in range(18):
    for j in range(18):
      sim[i][j]=func(tfidf_vector[i], tfidf_vector[j])
  m = 1e15
  mi = 0
  mj = 0 
  for i in range(18):
    for j in range(18):
      if i==j:
        continue
      if m > sim[i][j]:
        m = sim[i][j]
        mi = i
        mj = j
  print(m,mi,mj)

In [55]:
#ミンコフスキー距離をリスト用に再定義する

def mnk(list_a, list_b, p):
  diff_vec = []
  for i in range(len(list_a)):
    diff_vec.append((list_a[i]) - (list_b[i]))
  return np.linalg.norm(diff_vec, ord=p)

In [56]:
def mnk_0(x,y):
  return mnk(x,y,0)
def mnk_1(x,y):
  return mnk(x,y,1)
def mnk_2(x,y):
  return mnk(x,y,2)
def mnk_3(x,y):
  return mnk(x,y,3)
sim_calc(mnk_0)
sim_calc(mnk_1)
sim_calc(mnk_2)
sim_calc(mnk_3)

180.0 3 12
1.6972788421209961 6 7
0.12173307956340505 1 5
0.057211222308122416 1 7


In [57]:
sim_calc(cos_sim)

0.004008277735094699 9 10


In [100]:
#NoneをカットしたBoWモデル.
bow_delete_none = np.zeros((18,len(list_words)-1))

for i in range(18):
  bow_delete_none[i] = np.insert(bow[i, 2:], 0, bow[i][0])

In [102]:
def sim_calc_bow(func):
  sim = np.zeros((18,18))
  for i in range(18):
    for j in range(18):
      sim[i][j]=func(bow_delete_none[i], bow_delete_none[j])
  m = 1e15
  mi = 0
  mj = 0 
  for i in range(18):
    for j in range(18):
      if i==j:
        continue
      if m > sim[i][j]:
        m = sim[i][j]
        mi = i
        mj = j
  print(m,mi,mj)

In [104]:
def mnk_0(x,y):
  return mnk(x,y,0)
def mnk_1(x,y):
  return mnk(x,y,1)
def mnk_2(x,y):
  return mnk(x,y,2)
def mnk_3(x,y):
  return mnk(x,y,3)
sim_calc_bow(mnk_0)
sim_calc_bow(mnk_1)
sim_calc_bow(mnk_2)
sim_calc_bow(mnk_3)
sim_calc_bow(cos_sim)

174.0 10 12
224.0 3 12
21.77154105707724 3 12
12.707470752764678 8 12
0.06357144337895855 9 10
