# Filtering the data from OpenAlex

I first import the modules I will need.

In [1]:
import json
from tqdm import tqdm
from datetime import datetime
import pickle
import pandas as pd

I define some variables for further use.

In [2]:
# Publication years, as strings, from 2012 through 2022 inclusive.
my_years = [str(year) for year in range(2012, 2023)]

# English month names in calendar order, so that index 0 is January.
my_months = [
    'January', 'February', 'March',
    'April', 'May', 'June',
    'July', 'August', 'September',
    'October', 'November', 'December',
]

In [3]:
# Technologies from Sarah's work, mapped from technology name to concept id.
#
# The names double as lookup keys into `raw_data` in the filtering loop, so
# their spelling must stay exactly as written here (including
# 'Zero-knowlegde proof').
# Commented-out entries carry an empty id -- presumably no matching concept
# was found for them; confirm before re-enabling.
concept_ids = {
    'Authentication protocol': 'C21564112',
    'Biometrics': 'C184297639',
    'Blockchain': 'C2779687700',
    # 'Database Encryption': '',
    'Differential Privacy': 'C23130292',
    'Digital rights management': 'C537843408',
    'Digital signature': 'C118463975',
    'Disk Encryption': 'C9368797',
    'Distributed algorithm': 'C130120984',
    'Electronic voting': 'C2780612046',
    # 'Email encryption': '',
    'Functional encryption': 'C2780746774',
    'Hardware acceleration': 'C13164978',
    'Hardware security module': 'C39217717',
    'Hash function': 'C99138194',
    'Homomorphic encryption': 'C158338273',
    'Identity management': 'C555379026',
    # 'Identity-based encryption': '',
    'Key management': 'C17886624',
    'Link encryption': 'C69254412',
    'Post-quantum cryptography': 'C108277079',
    # 'Private set intersection': '',
    'Public-key cryptography': 'C203062551',
    'Quantum key distribution': 'C95466800',
    'Quantum cryptography': 'C144901912',
    'Random number generation': 'C201866948',
    # 'Searchable symmetric encryption': '',
    'Symmetric-key algorithm': 'C65302260',
    'Threshold cryptosystem': 'C123744220',
    'Trusted Computing': 'C2776831232',
    # 'Trusted execution environment': '',
    'Tunneling protocol': 'C76885553',
    'Zero-knowlegde proof': 'C176329583',
}

I load the raw data I scraped from OpenAlex, so that I can filter it.

In [4]:
# Read the raw JSON dump from disk. The filtering loop indexes the result by
# technology name and iterates over the papers stored under each name.
with open('data_creation_variables/raw_data.json', "rb") as raw_file:
    raw_data = json.load(raw_file)

I define some counters that I will update while filtering, to get a rough picture of how complete my data is.

In [5]:
# Tallies of papers lacking a given field; each one is incremented by the
# filtering loop when the corresponding field is empty or None.
(
    number_missing_referencedwork,
    number_missing_abstracts,
    number_missing_ids,
    number_missing_titles,
    number_missing_authors,
    number_missing_date,
    number_missing_concepts,
    number_missing_citing_works,
) = (0,) * 8

# Ids of the papers already visited, used by the filtering loop to skip
# papers that appear under more than one technology.
listofpaper = list()

I define the dictionary that I will fill and later turn into a dataframe.

In [6]:
# Column-oriented container for the papers that pass the filter: one list
# per column, all of them extended in parallel by the filtering loop.
filtered_columns = (
    'id',
    'title',
    'publication_date',
    'authorships',
    'referenced_works',
    'abstract_inverted_index',
    'concepts',
    'year',
    'month',
    'work_citing_this_paper',
)

# The comprehension creates a fresh, independent list for every column.
filtered_data = {column: [] for column in filtered_columns}

Now I filter my data by iterating over it.

In [7]:
# Filter the raw records, one technology at a time.
#
# For every technology in `concept_ids`, walk the papers stored under
# raw_data[<technology name>] and:
#   1. skip papers already seen under an earlier technology;
#   2. skip papers published outside the year window below;
#   3. count which fields are missing (the number_missing_* tallies);
#   4. append the papers that have a date, a title, at least one author and
#      at least one concept to `filtered_data`.

# Publication-year window, both bounds inclusive.
# NOTE(review): the comment that used to sit on this filter said "only papers
# above 2011", and `my_years` starts at 2012, but the condition that actually
# ran was `year > 2001 and year < 2023`. That executed behaviour is kept
# unchanged here -- confirm which lower bound is intended.
first_year = 2002
last_year = 2022

# Set mirror of `listofpaper`. Testing membership against the list rescans it
# for every paper, which makes the de-duplication quadratic in the number of
# papers; a set lookup is constant time on average. `listofpaper` is still
# appended to, so it keeps the same content and order as before.
seen_paper_ids = set(listofpaper)

for concept in tqdm(concept_ids):
    for paper in raw_data[concept]:
        paper_id = paper['id']
        if paper_id in seen_paper_ids:
            continue
        seen_paper_ids.add(paper_id)
        listofpaper.append(paper_id)

        # A missing date has to be handled before parsing: strptime raises
        # TypeError on None, so without this guard the loop would crash and
        # the missing-date tally could never be incremented.
        publication_date = paper['publication_date']
        if publication_date is None:
            number_missing_date += 1
            continue

        dt = datetime.strptime(publication_date, '%Y-%m-%d')
        year = dt.year
        if not first_year <= year <= last_year:
            continue

        # Completeness statistics, computed only for papers inside the window.
        if len(paper['referenced_works']) == 0:
            number_missing_referencedwork += 1
        if len(paper['cited_by_api_url']) == 0:
            number_missing_citing_works += 1
        abstract = paper['abstract_inverted_index']
        if abstract is None or abstract == {}:
            number_missing_abstracts += 1
        if paper_id is None:
            number_missing_ids += 1
        if paper['title'] is None:
            number_missing_titles += 1
        if len(paper['authorships']) == 0:
            number_missing_authors += 1
        if len(paper['concepts']) == 0:
            number_missing_concepts += 1

        # Keep only papers with all the information needed downstream; the
        # others are dropped. (The date is already known to be present here.)
        if (paper['title'] is None
                or len(paper['authorships']) == 0
                or len(paper['concepts']) == 0):
            continue

        row = {
            'id': paper_id,
            'title': paper['title'],
            'publication_date': publication_date,
            'authorships': paper['authorships'],
            'referenced_works': paper['referenced_works'],
            'abstract_inverted_index': abstract,
            'concepts': paper['concepts'],
            'year': year,
            # dt.month is 1-based, my_months is 0-based.
            'month': my_months[dt.month - 1],
            # Stored as a one-element list holding the cited_by_api_url value.
            'work_citing_this_paper': [paper['cited_by_api_url']],
        }
        for column, value in row.items():
            filtered_data[column].append(value)

100%|███████████████████████████████████████████████████████████████████████████████| 27/27 [1:37:16<00:00, 216.18s/it]


I save all my data and print some statistics I have computed so far.

In [8]:
# Serialise the filtered columns to disk with pickle (binary mode required).
with open("data_creation_variables/filtered_data", "wb") as pickle_file:
    pickle.dump(filtered_data, pickle_file)

In [9]:
# Report the completeness tallies gathered by the filtering loop.
# Each label is printed immediately followed by its count, so any spacing
# after the colon comes from the label text itself.
summary_lines = (
    ('Papers with missing referenced works: ', number_missing_referencedwork),
    ('Papers with missing abstracts:', number_missing_abstracts),
    ('Papers with missing id:', number_missing_ids),
    ('Papers with missing titles:', number_missing_titles),
    ('Papers with missing authors:', number_missing_authors),
    ('Papers with missing concepts:', number_missing_concepts),
    ('Papers with missing publication date:', number_missing_date),
    ('Papers with missing citing works: ', number_missing_citing_works),
)
for label, count in summary_lines:
    print(label + str(count))

Papers with missing referenced works: 113738
Papers with missing abstracts:62682
Papers with missing id:0
Papers with missing titles:79
Papers with missing authors:7466
Papers with missing concepts:0
Papers with missing publication date:0
Papers with missing citing works: 0
