<strong>Date :</strong> Created on 8 February 2021| Updated on 20 March 2021 </strong>

<strong>Group 2 - Hydrogen vehicles 
    
@author : </strong>Théo SACCAREAU

<strong>scraping_V1.4
    
Description :</strong> The purpose of this notebook is to retrieve the desired information from the hydrogen-related articles on the Core site.

# Install / Download / Import Librairies

In [1]:
# Scraping librairies
import requests
from bs4 import BeautifulSoup

# Text librairy
import re

# Useful librairies
from tqdm import tqdm
import pandas as pd
import numpy as np

# Part 1 - Scraping of general information (id, title, date, authors and language). 
In this first part, we will retrieve information that can be accessed directly from the search page (without the need to "click" on the article link). 

In [2]:
# Create a session object
with requests.Session() as s:

    # Opening the "df_infos.csv" file (rather creation as it does not exist) in write mode.
    # This file will store the scraper information (which avoids re-executing the code each time).
    with open("df_infos.csv", "w") as outf:

        # Writing the header line.
        outf.write("Id;Date;Auteur;Titre;Langue\n")

        # Browsing the pages to retrieve information from 11450 articles (100 articles per page)
        for page in tqdm(range(0, 11450, 100)):

            # Research equation on title and abstract only (not on all the content)
            research_eq = "(vehicle* OR transport OR transports OR train OR trains OR tractor OR bike* \
            OR bicycle* OR boat OR boats OR ship OR ships OR *plane OR *planes OR aircraft* OR car \
            OR cars OR truck* OR lorry* OR bus OR automobile* OR motor* OR rocket*) \
            AND hydrogen AND (ecolog* OR climat* OR pollution* OR environment* OR \"renewable energy\" \
            OR emission* OR \"carbon neutrality\" OR \"global warming\" OR conservation* OR sustaina*)"

            # Data to make the request
            request_data = {"basicQuery": {
                "count": 100,  # 100 articles per page
                "searchCriteria": "title : (" + research_eq + ") abstract:(" + research_eq + ")",
                "offset": page,  # page number
                "sortByDate": False}  # we sort by relevance and not by date
            }

            # Sending the POST request with the request data.
            res = s.post("https://core.ac.uk/search/api/search",
                         json=request_data)

            # Storage of results (in .json format)
            json = res.json()

            # Separation of different information :

            # (1) Identifiants
            ids = [elem['id'] for elem in json['results']]

            # (2) Dates
            dates = [elem['datePublished'][:10]
                     if 'datePublished' in elem else '' for elem in json['results']]

            # (3) Authors
            authors = [elem['authorsString'].replace('\n', '').replace('\r', '').replace('/', '').replace(',', '/')
                       if 'authorsString' in elem else '' for elem in json['results']]

            # (4) Titles
            titles = [elem['title'].replace('\n', '').replace(';', ',').replace('\r', '')
                      if 'title' in elem else '' for elem in json['results']]

            # (5) Language
            languages = [elem['language']['name']
                         if 'language' in elem else '' for elem in json['results']]

            # For each of the 100 articles on the current page ... 
            for identifiant, date, author, title, language in zip(ids, dates, authors, titles, languages):
                # ... writing to the .csv file. 
                outf.write(identifiant + ';' + date + ';' + author +
                           ';' + title + ';' + language + '\n')

100%|██████████| 115/115 [15:14<00:00,  7.95s/it]


# Part 2 - Scraping of specific information (keywords and abstracts).
Thanks to the recovered identifiers in part 1, we can access the content of the articles. In particular, we retrieve the keywords and summaries (if present). 

In [3]:
# To avoid re-executing the code from part 1 (15 min), we read the .csv file into a dataframe.
df1 = pd.read_csv("df_infos.csv", sep=';', index_col=False, encoding='utf-8')

# Any duplicates are removed
df1 = df1.drop_duplicates()

# Format
df1.shape

(8801, 5)

Once the duplicates are removed, only 8801 articles (out of 11450) remain.

In [9]:
# Create a session object
with requests.Session() as s:

    # Opening the "df_content.csv" file (rather creation as it does not exist) in write mode.
    # This file will store the scraper information (which avoids re-executing the code each time).
    with open("df_content.csv", "w") as outf:

        # Writing the header line.
        outf.write("Id;MotCle;Abstract\n")

        # For each identifiant
        for id in tqdm(df1['Id']):

            # Sending the request
            url = "https://core.ac.uk/display/" + str(id) + "?recSetID="
            res = s.get(url)
            soup = BeautifulSoup(res.text)

            # (1) Keywords
            try:
                # Search for the "Topics" area
                divs = soup.find('div', {'class': 'article_sum'})
                topic = divs.find('div', {'class': None}).text.replace(
                    "\r", "").replace('\n', '').strip()

                # We remove the string "Topics" to keep only the keywords
                topic = ' '.join(topic.split())[8:]

                # The list of keywords can be separated by ";", ",", ".", "and" or "AND".
                if (';' in topic and ',' not in topic):
                    topic = topic.split(';')
                elif (',' in topic and ';' not in topic):
                    topic = topic.split(',')
                elif (';' in topic and ',' in topic):
                    tempo = topic.split(';')
                    topic = []
                    for i in tempo:
                        topic.extend(i.split(','))
                elif ('.' in topic):
                    topic = topic.split('.')
                elif (' - ' in topic):
                    topic = topic.split(' - ')
                elif ('and' in topic):
                    topic = topic.split('and')
                elif ('AND' in topic):
                    topic = topic.split('AND')
                else:
                    topic = [topic]

                # In addition, some keywords can be in the summary (at the end with the mention "Keywords: ")
                abstract = soup.find('p',  {'class': 'abstract'})
                if (abstract is not None):
                    abstract = abstract.text.strip().replace("\r", "").replace('\n', '')
                    if ("Keywords" in abstract):
                        index = abstract.rfind("Keywords:") + len("Keywords:")
                        keywords = abstract[index:].split(',')
                        topic.extend(keywords)

            # If the "topics" area does not exist, we search only in the abstracts 
            except:
                topic = []
                abstract = soup.find('p',  {'class': 'abstract'})
                
                if (abstract is not None):
                    abstract = abstract.text.strip().replace("\r", "").replace('\n', '')

                    # The keywords can be :
                    # - either at the end of the summary preceded by the words "Keywords"; or
                    if ("Keywords" in abstract):
                        index = abstract.rfind("Keywords:") + len("Keywords:")
                        keywords = abstract[index:].split(',')
                        topic.extend(keywords)

                    # - or at the end of the summary in a list (in the last sentences)
                    else:
                        index = abstract.rfind('.') + 1
                        
                        # The list of keywords can be separated by ";"
                        if (';' in abstract[index:]):
                            abstract_pv = abstract[index:].split(';')
                            topic = [topic for topic in abstract_pv]

                        # or the list of keywords can be separated by ","
                        elif (',' in abstract[index:]):
                            abstract_v = abstract[index:].split(',')

                            # # The "," may be present in a normal sentence, does not mean that it is necessarily a list.
                            isTopics = True
                            for i in abstract_v:
                                if (len(i) >= 40):
                                    isTopics = False
                            if (isTopics):
                                topic = [topic for topic in abstract_v]

            # (2) Abstract
            try:
                # We check that there is a summary.
                abstract = soup.find('p',  {'class': 'abstract'}).text.strip().replace(
                    "\r", "").replace('\n', '').replace(';', ',')
            except:
                # If not, we return an empty string.
                abstract = ''

            # The list of keywords is combined into a single string separated by '/'.
            topic = [word for word in topic if len(word) > 3]
            chaine = ""
            for i in topic:
                chaine = chaine + i + "/"

            # Writing to the file 
            outf.write(str(id) + ';' + chaine + ';' + abstract + '\n')

100%|██████████| 8801/8801 [2:27:42<00:00,  1.01s/it]   


# Part 3 - Extraction of information from abstracts and titles
In part 1, we have collected the titles of the articles and in part 2, the abstracts. From this information, we will look for the vehicles, fields, technologies, etc. mentioned in the articles.

In [5]:
# To avoid re-executing the code from part 1 (15 min), we read the .csv file into a dataframe.
df2 = pd.read_csv("df_content.csv", sep=';', index_col=False, encoding='utf-8')

# Any duplicates are removed
df2 = df2.drop_duplicates()

# Articles that do not have an abstract are not kept
df2 = df2.dropna(subset=['Abstract'])

# Format
df2.shape

(8439, 3)

After deleting the articles without an abstract, only 8439 articles remain. 

In [6]:
# A join is made between the two dataframes.
# The join concerns the identifiers (only column in common between the two dataframes).
# The 'df' dataframe will contain all the information needed to populate the database. 
df = pd.merge(df1, df2)

# Any duplicates are removed
df = df.drop_duplicates()

# Format
df.shape

(8439, 7)

In the end, we will work on a corpus of 8439 articles. 

In [7]:
# Titles and abstracts are extracted into two variables. 
# To make it easier to find information, we put everything in lower case. 
# This avoids having to differentiate between "Car" and "car". 
title = df['Titre'].apply(lambda x : x if (x is np.nan) else x.lower())
abstract = df['Abstract'].apply(lambda x : x if (x is np.nan) else x.lower())

# Now that the abstracts are temporarily stored in a variable, they can be deleted from
# the dataframe (they will not be stored in the database). 
df = df.drop(columns=['Abstract'])

## Part 3-1) An attribute is associated with a single vocabulary word.

In [10]:
def extraction_info1(list_att_BD: list, list_voc : list, titles: pd.Series, abstracts: pd.Series) -> list:
    """ Documentation :
            - Function that checks for each article whether the summary or title contains a word 
            from the vocabulary list passed in parameter. 

        Parameters: 
            - list_att_BD : list of attributes that will be inserted in the database. 
            - list_voc : list of vocabulary to be searched in the abstracts or titles. 
            - titles : titles of articles 
            - abstracts : abtracts of the articles 
        
        Output : 
            - res : list containing for each article the list of attributes extracted from the abstract or title. 

    """
    res = []

    # The list of titles and summaries is browsed simultaneously. 
    for title, abstract in zip(tqdm(titles), abstracts):
        try:
            # The list of all the attributes contained in the summary is retrieved in "list_abs". 
            list_abs = [item_BD for item, item_BD in zip(
                list_voc, list_att_BD) if re.findall(item, abstracts) != []]
        except:
            # If no match, empty list 
            list_abs = []
        try:
            # Ditto for titles
            list_tit = [item_BD for item, item_BD in zip(
                list_voc, list_att_BD) if re.findall(item, title) != []]
        except:
            # If no match, empty list 
            list_tit = []

        # The results of the two lists are concatenated. 
        list_res = list_abs + list_tit

        # Any duplicates are removed 
        list_res = list(set(list_res))  

        # The list of attributes is combined into a single string separated by '/'.
        chaine_res = ""
        for item in list_res:
            chaine_res = chaine_res + item + "/"

        # We add this string to the final "res" list
        res.append(chaine_res)

    return res

### 3-1-1) Countries

In [11]:
list_countries = ["algeria", "angola", "(benin|dahomey)", "botswana", "burkina", "burundi", "cameroon", "cape verde", "central african republic", "( |,|\.|\-)chad( |,|\.|\-)", "comoros", "( |,|\.|\-)congo( |,|\.|\-)", "djibouti",
                  "egypt", "equatorial guinea", "eritrea", "ethiopia", "( |,|\.|\-)gabon( |,|\.|\-)", "gambia", "ghana", "guinea", "guinea(\-| )bissau", "ivory coast", "kenya", "lesotho", "liberia", "libya",
                  "madagascar", "malawi", "( |,|\.|\-)mali( |,|\.|\-)", "mauritania", "mauritius", "morocco", "mozambique", "namibia", "( |,|\.|\-)niger( |,|\.|\-)", "nigeria", "rwanda", "sao tome and principe", "senegal",
                  "seychelles", "sierra leone", "somalia", "south(\-| )africa", "( |,|\.|\-)sudan( |,|\.|\-)", "swaziland", "tanzania", "( |,|\.|\-)togo( |,|\.|\-)", "tunisia", "uganda", "zambia", "zimbabwe", "albania", "andorra",
                  "armenia", "austria", "azerbaijan", "belarus", "belgium", "bosnia", "bulgaria", "croatia", "cyprus", "(czech republic|czechia)", "denmark", "estonia", "finland",
                  "france", "georgia", "germany", "greece", "hungary", "iceland", "ireland", "italy", "latvia", "liechtenstein", "lithuania", "luxembourg",
                  "macedonia", "( |,|\.|\-)malta( |,|\.|\-)", "moldova", "monaco", "montenegro", "netherlands", "( |,|\.|\-)norway( |,|\.|\-)", "poland", "portugal", "romania", "san marino", "serbia", "slovakia", "slovenia",
                  "(espana|spain)", "sweden", "switzerland", "ukraine", "(united(\-| )kingdom|england|scotland|wales|northern ireland)", "vatican city", "antigua and barbuda", "bahamas", "barbados", "belize", "canada",
                  "costa rica", "( |,|\.|\-)cuba( |,|\.|\-)", "dominica( |,|\.|\-)", "dominican republic", "el salvador", "grenada", "guatemala", "haiti", "honduras", "jamaica", "mexico", "nicaragua", "panama",
                  "saint kitts and nevis", "saint lucia", "saint vincent and the grenadines", "trinidad and tobago", "(( |,|\.|\-)us( |,|\.|\-)|usa|united(\-| )states)", "argentina", "bolivia", "brazil", "( |,|\.|\-)chile( |,|\.|\-)",
                  "colombia", "ecuador", "guyana", "paraguay", "( |,|\.|\-)peru( |,|\.|\-)", "suriname", "uruguay", "venezuela", "afghanistan", "bahrain", "bangladesh", "bhutan", "brunei", "(burma|myanmar)",
                  "cambodia", "china", "(east timor|timor(\-| )leste)", "india", "indonesia", "( |,|\.|\-)iran( |,|\.|\-)", "( |,|\.|\-)iraq( |,|\.|\-)", "israel", "japan", "( |,|\.|\-)jordan( |,|\.|\-)", "kazakhstan",
                  "(republic of korea|south(\-| )korea|korea, south)", "(north(\-| )korea|korea, north)", "kuwait", "kyrgyzstan", "laos", "lebanon", "malaysia", "maldives", "mongolia",
                  "( |,|\.|\-)nepal( |,|\.|\-)", "( |,|\.|\-)oman( |,|\.|\-)", "pakistan", "palestine", "philippines", "qatar", "russia", "saudi arabia", "singapore", "sri lanka", "syria", "tajikistan", "thailand", "turkey",
                  "turkmenistan", "united arab emirates", "uzbekistan", "vietnam", "yemen", "australia", "fiji", "kiribati", "marshall islands", "micronesia", "( |,|\.|\-)nauru( |,|\.|\-)", "new(\-| )zealand",
                  "palau( |,|\.|\-)", "papua new guinea", "samoa", "solomon islands", "tonga", "tuvalu", "vanuatu", "democratic republic of congo"]

list_countries_BD = ["Algérie", "Angola", "Bénin", "Botswana", "Burkina Faso", "Burundi", "Cameroun", "Cap vert",
                     "République centrafricaine", "Tchad", "Comores", "Congo", "Djibouti", "Egypte",
                     "Guinée équatoriale", "Erythree", "Ethiopie", "Gabon", "Gambie", "Ghana", "Guinée",
                     "Guinée-Bissau", "Côte d'Ivoire", "Kenya", "Lesotho", "Liberia", "Libye", "Madagascar",
                     "Malawi", "Mali", "Mauritanie", "Ile Maurice", "Maroc", "Mozambique", "Namibie", "Niger",
                     "Nigeria", "Rwanda", "Sao Tomé-et-Principe", "Sénégal", "Seychelles", "Sierra Leone",
                     "Somalie", "Afrique du Sud", "Soudan", "Swaziland", "Tanzanie", "Togo", "Tunisie", "Ouganda",
                     "Zambie", "Zimbabwe", "Albanie", "Andorre", "Arménie", "Autriche", "Azerbaijan",
                     "Biélorussie", "Belgique", "Bosnie", "Bulgarie", "Croatie", "Chypre", "République Tchèque",
                     "Danemark", "Estonie", "Finlande", "France",
                     "Géorgie", "Allemagne", "Grèce", "Hongrie", "Islande", "Irelande", "Italie", "Lettonie",
                     "Liechtenstein", "Lituanie", "Luxembourg", "Macédoine", "Malte", "Moldavie", "Monaco",
                     "Montenegro", "Pays-Bas", "Norvège", "Pologne", "Portugal", "Roumanie", "Saint-Marin",
                     "Serbie", "Slovaquie", "Slovénie", "Espagne", "Suède", "Suisse", "Ukraine", "Royaume-Uni",
                     "Vatican", "Antigua-et-Barbuda", "Bahamas", "La Barbade", "Belize", "Canada", "Costa Rica",
                     "Cuba", "Dominique", "République dominicaine", "El Salvador", "Grenade", "Guatemala", "Haiti",
                     "Honduras", "Jamaique", "Mexique", "Nicaragua", "Paname", "Saint-Christophe et Niévès",
                     "Saint Lucie", "Saint Vincent et les Grenadines", "Trinidad et Tobago", "Etats-Unis",
                     "Argentine", "Bolivie", "Brésil", "Chili", "Colombie", "Equateur", "Guyane", "Paraguay",
                     "Pérou", "Suriname", "Uruguay", "Venezuela", "Afghanistan", "Bahrein", "Bangladesh",
                     "Bhoutan", "Brunei", "Myanmar", "Cambodge", "Chine", "Timor oriental", "Inde", "Indonésie",
                     "Iran", "Irak", "Israël", "Japon", "Jordanie", "Kazakhstan", "Corée du Sud", "Corée du Nord",
                     "Koweït", "Kyrgyzstan", "Laos", "Liban", "Malaysie", "Maldives", "Mongolie", "Népal", "Oman",
                     "Pakistan", "Palestine", "Philippines", "Qatar", "Russie", "Arabie Saoudite", "Singapour",
                     "Sri Lanka", "Syrie", "Tajikistan", "Thaïlande", "Turquie", "Turkmenistan",
                     "Emirats Arabes Unis", "Ouzbekistan", "Viêtnam", "Yémen", "Australie", "Fiji",
                     "République de Kiribati", "Iles Marshall", "Micronésie", "Nauru", "Nouvelle-Zélande", "Palau",
                     "Papouasie Nouvelle Guinée", "Iles Samoa", "Iles Salomon", "Tonga", "Tuvalu", "Vanuatu",
                     "République Democratique du Congo"]

df['Pays'] = extraction_info1(
    list_countries_BD, list_countries, title, abstract)

100%|██████████| 8439/8439 [00:02<00:00, 3162.19it/s]


### 3-1-2) Vehicles

In [12]:
list_vehicles = [" train", " tractor", " (bike|bicycle)", " (boat|ship|vessel|ferry|narrowboat|barge|liner|yacht)",
                 " (aeroplane| plane|airplane|aircraft| jet)", " (car|automobile|saloon|van|pickup)",
                 " (truck|lorry|semitrailer|rig|juggernaut|hgv|heavy goods vehicle)", " (bus|coach)", " rocket",
                 " helicopter", "(( | motor)bike| motorcycle)", " (tram|streetcar)", "submarine"]
# We want a space (end of word), a comma, an s or a full stop after our words.
list_vehicles = [vehi + "( |s|,|\.)" for vehi in list_vehicles]

list_vehicles_BD = ["Train", "Tracteur", "Vélo", "Bateau", "Avion", "Voiture", "Camion", "Bus", "Fusée",
                    "Hélicoptère", "Moto", "Tramway", "Sous-marin"]

df['Vehicule'] = extraction_info1(
    list_vehicles_BD, list_vehicles, title, abstract)

100%|██████████| 8439/8439 [00:00<00:00, 22629.95it/s]


### 3-1-3) Brands

In [13]:
list_brands = ["hyundai", "toyota", "renault", "honda", "airbus", "boeing", "thales", "mercedes",
               "( |')audi( |'|\.|,)", "( |'|\.|,)kia( |'|\.|,)", "river( |\-|)simple", "nissan",
               "( |'|\.|,)ford( |'|\.|,)",
               "daimler", "alstom", "bmw", "hopium", "peugeot", "volkswagen", "general motors", "( |'|\.|,)psa( |'|\.|,)",
               "(roland gumpert|apollo automobil|gmbh)", "mazda", "aston martin", "pininfarina", "suzuki",
               "volvo", "( |'|\.|,)opel( |'|\.|,)", "dassault", "cessna", "bombardier", " mig( |'|\.|,)",
               "diamond aircraft", "zeroavia", "rolls-royce", "( |'|\.|,)eviation( |'|\.|,)", "(gknpz|khrounitchev)",
               "spacex", "avio( |'|\.|,)", "ariane", "united launch alliance", "mcdonnell douglas", "mitsubishi",
               "isro", "ioujnoïe", "citroen", "( |'|\.|,)fiat( |'|\.|,)", "( |'|\.|,)lancia( |'|\.|,)", "skoda",
               "yamaha", "( |'|\.|,)ktm( |'|\.|,)", "kawasaki", "( |'|\.|,)ducati( |'|\.|,)", "suzuki"]

list_brands_BD = ["Hyundai", "Toyota", "Renault", "Honda", "Airbus", "Boeing", "Thalès", "Mercedes", "Audi", "Kia",
                  "Riversimple", "Nissan", "Ford", "Daimler", "Alstom", "BMW", "Hopium", "Peugeot", "Volkswagen",
                  "General Motors", "PSA", "Roland Gumpert", "Mazda", "Aston Martin", "Pininfarina", "Suzuki",
                  "Volvo", "Opel", "Dassault", "Cessna", "Bombardier", "MiG", "Diamond Aircraft", "ZeroAvia",
                  "Rolls-Royce", "Eviation", "Khrounitchev", "SpaceX", "Avio", "ArianeGroup",
                  "United Launch Alliance", "McDonnell Douglas", "Mitsubishi Heavy Industries", "ISRO",
                  "Ioujnoie", "Citroën", "Fiat", "Lancia", "Skoda", "Yamaha", "KTM", "Kawasaki", "Ducati",
                  "Suzuki"]

df['Marque'] = extraction_info1(list_brands_BD, list_brands, title, abstract)

100%|██████████| 8439/8439 [00:00<00:00, 10827.96it/s]


## Part 3-2) An attribute is associated with several vocabulary words.
For instance : "militar", "army", "soldie", "warfare", "armed forces" for the domain "military"

In [16]:
def extraction_info2(list_att_BD: list, list_list_voc: list, titles: pd.Series, abstracts: pd.Series) -> list:
    """ Documentation :
            - Function that checks for each article whether the summary or title contains a word 
            from the vocabulary list passed in parameter. 
            The difference with the previous function is that here an attribute is associated with several vocabulary words. 


        Parameters: 
            - list_att_BD : list of attributes that will be inserted in the database. 
            - list_list_voc : list containing the vocabulary lists to be searched in the abstracts or titles. 
            - titles : titles of articles 
            - abstracts : abtracts of the articles 

        Output : 
            - res : list containing for each article the list of attributes extracted from the abstract or title. 

    """
    res = []

    # The list of titles and summaries is browsed simultaneously.
    for title, abstract in zip(tqdm(titles), abstracts):

        # Each vocabulary list associated with an attribute is browsed
        list_tempo = []
        for att_BD, list_voc in zip(list_att_BD, list_list_voc):

            # # The list of all the attributes contained in the summary is retrieved in "list_abs".
            try:
                list_abs = [
                    voc for voc in list_voc if re.findall(voc, abstract) != []]
            except:
                list_abs = []

            # Ditto for titles
            try:
                list_tit = [
                    voc for voc in list_voc if re.findall(voc, title) != []]
            except:
                list_tit = []

            # The results of the two lists are concatenated.
            list_res = list_abs + list_tit

            # If the list contains at least one item then the attribute is mentioned in the summary or title
            if (len(list_res) > 0):
                list_tempo.append(att_BD)

        # The list of attributes is combined into a single string separated by '/'.
        chaine_res = ""
        for item in list_tempo:
            chaine_res = chaine_res + item + "/"

        # We add this string to the final "res" list
        res.append(chaine_res)

    return res

### 3-2-1) Engines

In [17]:
combustion_int = ["internal( |\-)combustion engine", "i.c.e", "burn(s| ) * fuel * cylinder(s| )",
                  "reciprocating engine", "piston engine"]

reaction = ["jet (engine|propulsion)", "turbojet", "propellant ", "projection * fluid", "turbofan",
            "ramjet", "pulse jet"]

pile_combustible = ["fuel cell", "electrochemical cell", "oxidi(z|s)ation", "oxidizing agent",
                    "redox", "chemical energy", "anode", "cathode", "electrolyte"]

hybride = ["hybrid", "hydrogen addition on"]

list_engines = [combustion_int, reaction, pile_combustible, hybride]
list_engines_BD = ["Combustion interne",
                   "Réaction",  "Pile à combustible", "Hybride"]

df['Moteur'] = extraction_info2(list_engines_BD, list_engines, title, abstract)

100%|██████████| 8439/8439 [00:01<00:00, 5362.21it/s]


### 3-2-2) Storages

In [18]:
gaz = ["compress", "gaseous", "increase * pressure", "high( |-)pressure", "ch2", "cgh2",
       "under( |-)pressure", "storage density", "hydrogen tanks"]

liquide = ["liqu(e|i)f", "liquid (hydrogen|state|form)", "LH2", "cooled ", "critical point", "33( |)k",
           "-253(| )°c", "252(,|.)87", "20(.|,)28( |)k", "parahydrogen", "orthohydrogen", "low temperature",
           "cooling", "chilled"]

solide = ["solid (hydrogen|state|form)", "a(b|d)sorption",
          "metal alloy", "hydride", "alanate"]

list_storages = [gaz, liquide, solide]
list_storages_BD = ['Gaz', 'Liquide', 'Solide']

df['Stockage'] = extraction_info2(
    list_storages_BD, list_storages, title, abstract)

100%|██████████| 8439/8439 [00:01<00:00, 5362.01it/s]


### 3-2-3) Types of production 

In [20]:
# (1) Hydrocarbons
SMR = ["smr", "steam (methane |)reforming", "nickel catalyst", "endothermic reaction", "iron oxide",
       "reaction * (hydrocarbons|fuels) * water"]

methane_pyrolysis = ["pyrolysis", "bubble column",
                     "molten metal catalyst", "solid carbon"]

partial_oxidation = ["partial oxidation", "( |\()pox( |\)|,|\.)", "substoichiometric",
                     "(fuel\-air|fuel\-oxygen) mixture", "water-gas shift reaction",
                     "partial. combusted", "partial oxidation reactor"]

plasma_reforming = ["plasma reforming", "kvaerner", "cb&h", "carbon black", "plasma arc waste disposal",
                    "plasma gasification", "plasma converter"]

coal_gasification = ["coal gasification", "break molecular bonds in coal", "coal * water * (air|oxygen)",
                     "gaseous mix of hydrogen and carbon monoxide"]

# (2) Water
water_electrolysis = ["electrolysis", "(split|decompose) water", "water (splitting|decomposition)",
                      "potential difference", "electrolyser", "electrolyte", "polymer", "amkaline"]

electrochemically = ["electrochemically", "(methanol|ethanol|formic acid[glycerol) * electrolys",
                     "sulfur-iodine cycle", "s-i cycle", "sulfur * iodine", ]

radiolysis = ["radiolysis", "radiolytically"]

thermolysis = ["thermolysis"]

thermochemical = ["thermochemical", "heat sources * chemical reactions"]

ferrosilicon = ["ferrosilicon"]

algae = ["photobiological (water|) splitting", "algae",
         "photobioreactor", "photosynthesis"]

photocatalytic = ["photocatalytic", "solar energy to hydrogen", "photoelectrochemical cell",
                  "artificial photosynthesis"]

# (3) Biohydrgen
fermentative = ["bioreactor", "bacteria", "fermentative", "organic (substrate|compound)", "biohydrogen",
                "fermentation"]

enzymatic = ["enzyma", "sugars"]

biocatalysed_electrolysis = ["electrohydrogenesis", "microbial fuel cell", "biocatalysed electrolysis",
                             "electrohydrogenesis", "electrolysis * microbes"]


list_type_prod = [SMR, methane_pyrolysis, partial_oxidation, plasma_reforming, coal_gasification,
                 water_electrolysis, electrochemically, radiolysis, thermolysis, thermochemical,
                 ferrosilicon, algae, photocatalytic]
list_type_prod_BD = ["SMR", "Pyrolyse du méthane", "Oxydation partielle", "Reformage plasma",
                     "Gazéification du charbon", "Electrolyse", "Electrochimie", "Radiolyse", "Thermolyse",
                     "Thermochimie", "Ferrosilicium", "Culture d'algues", "Fission photocatalytique",
                     "Fermentation", "Production enzymatique", "Electrolyse biocatalysée"]


df['Production'] = extraction_info2(
    list_type_prod_BD, list_type_prod, title, abstract)

100%|██████████| 8439/8439 [00:04<00:00, 1767.98it/s]


### 3-2-4) Domains

In [21]:
militaire = ["militar", "army", "soldie", "warfare", "armed forces",
             "navy", "air force", "nato", "defence", "regiment", "fighting"]  # NATO

agriculture = ["agricultur", "cultivat", "agronomy", "harvest", "crops", "tractor", "sowing", "seeding",
               "gmo", "harvest", "plough", "plow", "irrigation", "insecticid", "herbicide",
               "weedkiller", "seed", "farm", "fertilizer", "fertiliser", "cultur"]

astronautique = [" space", "astronautics", "orbit", "atmosphere", "space shuttle", "rocket", "landing",
                 "aerodynamic", "propulsion", "thrust", "cosmonaut", "jet engine", "thales", "boeing",
                 "take-off ", "takeoff ", "(gknpz|khrounitchev)", "( |'|\.|,)nasa( |'|\.|,)", "spacex",
                 "avio( |'|\.|,)", "ariane", "united launch alliance", "mcdonnell douglas",
                 "isro", "ioujnoïe"]

aéronautique = ["(aeroplane| plane|airplane)", "aircraft", "aeronautic", "aviation", "helicopter",
                "aerodynamic", "boeing", "airbus", "(transonic|(in|)compressible) flow",
                "take-off ", "takeoff ", "thales", "dassault", "cessna", "bombardier"]

transport_march = ["charter", " (truck|lorry|semitrailer|rig|juggernaut|hgv|heavy goods vehicle)( |s|,|\.)",
                   " trailer", " container", " freight", " cargo", " shipment", " merchant ship", "service road",
                   " logistics", " handling", " hefting", " merchandise", " goods", " transporter", " haulier",
                   "long-haul * transportation", "haulage contractor", " carr(ying|ier)", "wagon"]

transport_pers = ["(public|mass|urban|school) (transport|transit)", " passenger", " occupant", " traverl(l|)er",
                  " voyager", "carrriage", " (bike|bicycle|yacht|ferry|jet|tram|streetcar|( | motor)bike| motorcycle|\
                         car|automobile|saloon|van|pickup|bus|coach|underground|subwat|metro)( |s|,|\.)", ]

politique = [" decree( |s|,|\.)", " law( |s|,|\.)", " legislation", "carbon tax", "paris agreement", "unfccc", " cop",
             " protocol", "cmp [0-9]", "cop [0-9]", "climate change conference", "subsidy"]

automobile = ["automobile", " (saloon|van|pickup|car)( |s|,|\.)", " motoring( |s|,|\.)", "tailpipe",
              "exhaust pipe", "horsepower", "dealership", "motor rac(ing|e)", "hyundai", "toyota",
              "renault", "honda", "mercedes", "( |')audi( |'|\.|,)", "( |'|\.|,)kia( |'|\.|,)",
              "river( |\-|)simple", "nissan", "( |'|\.|,)ford( |'|\.|,)",
              "daimler", "bmw", "hopium", "peugeot", "volkswagen", "general motors", "( |'|\.|,)psa( |'|\.|,)",  "(roland gumpert|apollo automobil|gmbh)",
              "mazda", "aston martin", "pininfarina", "suzuki", "volvo", "( |'|\.|,)opel( |'|\.|,)", "rolls-royce", "citroen", "( |'|\.|,)fiat( |'|\.|,)",
              "( |'|\.|,)lancia( |'|\.|,)", "skoda", "yamaha", "( |'|\.|,)ktm( |'|\.|,)", "kawasaki", "ducati", "suzuki"]

ferroviaire = [" rail", " train( |s|,|\.)", "alstom", "locomotive",
               "interurbain", "wagon", "freight car", "carrriage", ]

energies_renouv = ["solar (panel|energy)", "wind (power|energy)", "hydropower",
                   "biofuel", "renewable (energy|resource)", "biomass", "firewood", "tidal energy"]

energies_fossiles = ["fossil (energy|fuel)", " gas ", " oil ",
                     "petroleum", " coal ", " fuels ", " hydrocarbons"]

performance = [" horsepower", " record ", " potency ", " optimum ", " maximum ", " performance ", "battery life",
               "yield", "profitability", "cost-effectiveness", "financial viability", "investment"]

chimie = [" chemi*", " substances", " elementary form", " matter ", " atomic ", "microscop*", "catalyst",
          "colloid", "isomer", "reactant", "satureted", "solubility", "solvent", "(hetero|homo)geneous mixture"]


list_domains = [militaire, agriculture, astronautique, aéronautique, transport_march, transport_pers,
                politique, automobile, ferroviaire, energies_renouv, energies_fossiles, performance, chimie]

list_domains_BD = ['Militaire', 'Agriculture', 'Astronautique', "Aéronautique", "Transport Marchandises",
                   "Transport Personnes", "Politique", "Automobile", "Ferroviaire", "Energies Renouvelables",
                   "Energies Fossiles", "Performance", "Chimie"]


df['Domaine'] = extraction_info2(
    list_domains_BD, list_domains, title, abstract)

100%|██████████| 8439/8439 [00:15<00:00, 549.49it/s]


In [22]:
# The results are checked 
df.head()

Unnamed: 0,Id,Date,Auteur,Titre,Langue,MotCle,Pays,Vehicule,Marque,Moteur,Stockage,Production,Domaine
0,6334032,,Hezlin Ashraf-Ball and Andrew J. Oswald and Ja...,Hydrogen Transport and the Spatial Requirement...,English,Renewable energy / wind power / land use / ene...,,,,,Gaz/,,Energies Renouvelables/Energies Fossiles/
1,48223,2009-05-01,Andrew J. Oswald and James I. Oswald and Hezli...,Hydrogen transport and the spatial requirement...,English,,,,,,Gaz/,,Energies Renouvelables/Energies Fossiles/
2,52955835,2011-01-01,PeiYuan Hsu and Xu Yang and Joshua L. Dibia an...,Design of Residential Hydrogen Fueling System ...,,Faculty research day/,,,,,Gaz/,,Automobile/Energies Renouvelables/Energies Fos...
3,70657172,2008-09-24,Maria Antónia Travassos and A. I. Correia de S...,Penetration of hydrogen technologies: study on...,,Road transport-Portugal/ Pollutant emissions/ ...,Portugal/,,,,,,Transport Personnes/Politique/Automobile/
4,40069044,2007-01-01,Ivo Veldhuis,Application of hydrogen marine systems in high...,,130 - Mechanical/ industrial/ civil and marine...,,,,,Liquide/,,Aéronautique/Transport Marchandises/Transport ...


In [23]:
# Saving the DataFrame to a file 
df.to_csv("df_scraping.csv", index=False)