In [1]:
import glob
import json
import pandas as pd
from tqdm import tqdm

import os

# Remember the directory the notebook was launched from and make it the working
# directory; the relative "./..." paths used further down resolve against it.
path = os.getcwd()
os.chdir(path)

## Data extraction

### If 'result' DataFrame needed:

In [None]:
# data structure
# .
# ├── aps-dataset-metadata-2013
# │   ├── PR
# │   ├── PRA
# │   ├── PRB
# │   ...
# │   └── RMP
# ├── json_to_csv.py
# ├── result_PR1.csv
# ├── result_PR2.csv
# ├── ...
# ├── result_RMP1.csv
# └── result.csv

import glob
import json
import pandas as pd
from tqdm import tqdm

def json_to_df(json_data): # json file to DataFrame in a particular format
    """Flatten one article metadata record into a per-author DataFrame.

    Parameters
    ----------
    json_data : dict
        Parsed JSON metadata of a single article. The keys read here are
        "id", "authors" (a list of dicts with optional "name" / "surname"),
        "date", "articleType", "journal" (a dict with an "id" entry) and
        "classificationSchemes" (a dict with an optional "pacs" list).

    Returns
    -------
    pandas.DataFrame
        One row per author with the columns doi, name, order, numAuthor,
        is_alpha, year, articleType, journal and pacs00 ... pacs90.
        A record without an "authors" key yields a single placeholder row
        whose name and order are empty strings.
    """
    json_cols = json_data.keys()
    result = dict()

    # Article-level fields are kept as scalars so that pandas broadcasts them
    # over the author rows. A one-element list default such as [""] only works
    # for single-author records and raises "All arrays must be of the same
    # length" for every other author count.
    result["doi"] = json_data["id"] if "id" in json_cols else ""

    if "authors" in json_cols:
        authors = json_data["authors"]
        numAuthor = len(authors)
        names = []
        surnames = []

        for author in authors:
            if "name" in author.keys():
                names.append(author["name"].lower().replace(" ", ""))
            else:
                names.append("")

            # Authors without a surname are left out of the ordering check.
            if "surname" in author.keys():
                surnames.append(author["surname"].lower())

        result["name"] = names
        result["order"] = list(range(1, numAuthor + 1))
        result["numAuthor"] = numAuthor

        # alphabetical order: only flagged for papers with at least 4 authors
        result["is_alpha"] = numAuthor >= 4 and surnames == sorted(surnames)
    else:
        # Single placeholder row for a record that lists no authors.
        result["name"] = [""]
        result["order"] = [""]
        result["numAuthor"] = 0
        result["is_alpha"] = False

    result["year"] = json_data["date"][:4] if "date" in json_cols else ""
    result["articleType"] = json_data["articleType"] if "articleType" in json_cols else ""
    result["journal"] = json_data["journal"]["id"] if "journal" in json_cols else ""

    # One indicator column per leading PACS digit: pacs00, pacs10, ..., pacs90.
    for i in range(10):
        result["pacs" + str(i) + "0"] = 0

    if "classificationSchemes" in json_cols and "pacs" in json_data["classificationSchemes"].keys():
        digits = {str(i) for i in range(10)}
        for pac in json_data["classificationSchemes"]["pacs"]:
            # Slicing (instead of indexing) tolerates a missing or empty id.
            first_char = (pac.get("id") or "")[:1]
            if first_char in digits:
                result["pacs" + first_char + "0"] = 1

    return pd.DataFrame(result)

def init_df():
    """Return an empty DataFrame that carries the 'result' column layout."""
    article_cols = ["doi", "name", "order", "numAuthor", "is_alpha",
                    "year", "articleType", "journal"]
    # pacs00, pacs10, ..., pacs90
    pacs_cols = ["pacs" + str(digit) + "0" for digit in range(10)]

    return pd.DataFrame(columns=article_cols + pacs_cols)

# 'aps-dataset-metadata-2013' json files to each dataframe, then to .csv
#
# Every sub-directory of the dataset is walked recursively and its records are
# written to ./result_<dir><k>.csv, starting a new file after every 10000
# directory entries visited (entries that are not .json files count as well).
entries_per_chunk = 10000

for file_path in glob.glob("./aps-dataset-metadata-2013/*"):
    # basename copes with both "/" and "\\" separators in the glob result
    dir_name = os.path.basename(file_path)
    frames = []  # per-article frames of the current chunk
    i = 0  # stays 0 if the glob below yields nothing

    for i, file_name in tqdm(enumerate(glob.glob(file_path + "/**", recursive=True), 1)):
        if file_name.endswith(".json"):
            with open(file_name, "r", encoding="utf-8") as j:
                # Collect frames and concatenate once per chunk:
                # DataFrame.append was removed in pandas 2.0 and re-copied
                # the whole frame on every call.
                frames.append(json_to_df(json.load(j)))

        if i % entries_per_chunk == 0:
            file_id = dir_name + str(i // entries_per_chunk)
            result = pd.concat(frames, ignore_index=True) if frames else init_df()
            result.to_csv(f"./result_{file_id}.csv", index=False) # dataframe to csv
            frames = []

    # dataframe to csv (save) -- the remainder after the last full chunk
    file_id = dir_name + str(i // entries_per_chunk + 1)
    result = pd.concat(frames, ignore_index=True) if frames else init_df()
    result.to_csv(f"./result_{file_id}.csv", index=False)

# csvs to one csv
#
# Only the chunk files written by the export loop (result_<dir><k>.csv) are
# read. Matching every "*.csv" in the working directory would also pull in
# the surname_*.csv chunks and any previously saved result.csv.
# NOTE(review): the merged frame is only held in memory here; it is not
# written back to disk by this cell.
result_chunks = [
    pd.read_csv(file_name)
    for file_name in tqdm(sorted(glob.glob("./result_*.csv")))
]

# Concatenate once instead of growing the frame inside the loop.
result = pd.concat(result_chunks) if result_chunks else init_df()

### If 'surnames' DataFrame needed:

In [None]:
def json_to_df_sur(json_data):
    """Build a (name, surname) table for the authors of one article record.

    Names are lower-cased with spaces removed, surnames are lower-cased, and
    a missing entry becomes an empty string. A record without an "authors"
    key produces a single row of empty strings.
    """
    if "authors" not in json_data.keys():
        return pd.DataFrame({"name": [""], "surname": [""]})

    given_names = []
    family_names = []

    for person in json_data["authors"]:
        fields = person.keys()
        given_names.append(
            person["name"].lower().replace(" ", "") if "name" in fields else ""
        )
        family_names.append(
            person["surname"].lower() if "surname" in fields else ""
        )

    return pd.DataFrame({"name": given_names, "surname": family_names})

def init_df_sur():
    """Return an empty DataFrame with the 'name' and 'surname' columns."""
    return pd.DataFrame(columns=["name", "surname"])

# Every sub-directory of the dataset is walked recursively and its author
# names are written to ./surname_<dir><k>.csv, starting a new file after every
# 10000 directory entries visited (entries that are not .json files count too).
entries_per_chunk = 10000

for file_path in glob.glob("./aps-dataset-metadata-2013/*"):
    # basename copes with both "/" and "\\" separators in the glob result
    dir_name = os.path.basename(file_path)
    frames = []  # per-article frames of the current chunk
    i = 0  # stays 0 if the glob below yields nothing

    for i, file_name in tqdm(enumerate(glob.glob(file_path + "/**", recursive=True), 1)):
        if file_name.endswith(".json"):
            with open(file_name, "r", encoding="utf-8") as j:
                # Collect frames and concatenate once per chunk:
                # DataFrame.append was removed in pandas 2.0 and re-copied
                # the whole frame on every call.
                frames.append(json_to_df_sur(json.load(j)))

        if i % entries_per_chunk == 0:
            file_id = dir_name + str(i // entries_per_chunk)
            surs = pd.concat(frames, ignore_index=True) if frames else init_df_sur()
            surs.to_csv(f"./surname_{file_id}.csv", index=False)
            frames = []

    # save the remainder after the last full chunk
    file_id = dir_name + str(i // entries_per_chunk + 1)
    surs = pd.concat(frames, ignore_index=True) if frames else init_df_sur()
    surs.to_csv(f"./surname_{file_id}.csv", index=False)

# Merge the surname chunk files into one DataFrame.
#
# Only surname_<dir><k>.csv files are read. Matching every "*.csv" in the
# working directory would also pull in the result_*.csv chunks, whose columns
# differ.
surname_chunks = [
    pd.read_csv(file_name)
    for file_name in tqdm(sorted(glob.glob("./surname_*.csv")))
]

# Concatenate once instead of growing the frame inside the loop.
surs = pd.concat(surname_chunks) if surname_chunks else init_df_sur()