In [None]:
import glob
import json
import pandas as pd
from tqdm import tqdm

import os

# Remember the current working directory in `path` and (re-)enter it;
# the relative "./..." paths used below are resolved against this directory.
path = os.getcwd()
os.chdir(path)

## Data extraction

### If 'result' DataFrame needed:

In [None]:
# data structure
# .
# ├── aps-dataset-metadata-2013
# │   └── PR
# │   └── PRA
# │   └── PRB
# │   ...
# │   └── RMP
# ├── json_to_csv.py
# ├── result_PR1.csv
# ├── result_PR2.csv
# ├── ...
# ├── result_RMP1.csv
# └── result.csv

import glob
import json
import pandas as pd
from tqdm import tqdm

def json_to_df(json_data):
    """Flatten one article-metadata record (a parsed JSON dict) into a DataFrame.

    The frame has one row per author. Article-level fields (doi, numAuthor,
    is_alpha, year, articleType, journal and the ten pacsX0 flags) are scalars
    that pandas repeats on every author row.

    Parameters
    ----------
    json_data : dict
        Parsed JSON of a single article. Keys read here: "id", "authors"
        (list of dicts with optional "name" / "surname"), "date",
        "articleType", "journal" (dict with "id") and
        "classificationSchemes" (dict with optional "pacs" list).

    Returns
    -------
    pandas.DataFrame
        Columns: doi, name, order, numAuthor, is_alpha, year, articleType,
        journal, pacs00 ... pacs90. A record without an "authors" key yields
        a single placeholder row; an empty "authors" list yields zero rows.
    """
    result = dict()

    # Missing article-level fields fall back to the scalar "" (not [""]):
    # a length-1 list cannot be aligned with a multi-author name list and
    # made pd.DataFrame raise "All arrays must be of the same length".
    result["doi"] = json_data["id"] if "id" in json_data else ""

    if "authors" in json_data:
        authors = json_data["authors"]
        numAuthor = len(authors)

        # given names are lower-cased with inner spaces removed
        names = [
            author["name"].lower().replace(" ", "") if "name" in author else ""
            for author in authors
        ]
        # authors without a surname are left out of the ordering check
        surnames = [
            author["surname"].lower() for author in authors if "surname" in author
        ]

        result["name"] = names
        result["order"] = list(range(1, numAuthor + 1))
        result["numAuthor"] = numAuthor

        # alphabetical order: only flagged for papers with at least 4 authors
        result["is_alpha"] = numAuthor >= 4 and surnames == sorted(surnames)
    else:
        # placeholder row so the article still shows up in the output
        result["name"] = [""]
        result["order"] = [""]
        result["numAuthor"] = 0
        result["is_alpha"] = False

    # first four characters of the date string
    result["year"] = json_data["date"][:4] if "date" in json_data else ""
    result["articleType"] = json_data["articleType"] if "articleType" in json_data else ""
    result["journal"] = json_data["journal"]["id"] if "journal" in json_data else ""

    # one 0/1 flag per leading digit of the PACS ids: pacs00, pacs10, ..., pacs90
    for i in range(10):
        result["pacs" + str(i) + "0"] = 0

    schemes = json_data.get("classificationSchemes") or {}
    for pac in schemes.get("pacs") or []:
        # slicing (not [0]) so an empty or missing id is skipped instead of
        # raising IndexError / KeyError
        leading = str(pac.get("id") or "")[:1]
        if leading and leading in "0123456789":
            result["pacs" + leading + "0"] = 1

    return pd.DataFrame(result)

def init_df():
    """Return an empty DataFrame with the article/author columns plus pacs00 ... pacs90."""
    meta_columns = [
        "doi", "name", "order", "numAuthor", "is_alpha",
        "year", "articleType", "journal",
    ]
    pacs_columns = [f"pacs{digit}0" for digit in range(10)]
    return pd.DataFrame(columns=meta_columns + pacs_columns)

# 'aps-dataset-metadata-2013' json files to each dataframe, then to .csv
# One result_<dir><k>.csv is written per 10000 visited paths of each journal
# directory, plus a final file holding the remainder.
for file_path in glob.glob("./aps-dataset-metadata-2013/*"):
    # basename/normpath instead of split("/") so other path separators work too
    dir_name = os.path.basename(os.path.normpath(file_path))
    # Per-article frames of the current chunk. They are concatenated once per
    # chunk: DataFrame.append was removed in pandas 2.0 and re-copied the whole
    # frame on every call.
    frames = []
    i = 0  # keeps `i` defined (and not stale) when the glob below yields nothing

    for i, file_name in tqdm(enumerate(glob.glob(file_path + "/**", recursive=True), 1)):
        if file_name.endswith(".json"):
            # explicit encoding: the platform default is not UTF-8 everywhere
            with open(file_name, "r", encoding="utf-8") as j:
                article_df = json_to_df(json.load(j))
            if not article_df.empty:
                frames.append(article_df)

        # `i` counts every visited path (directories included), not only json files
        if i % 10000 == 0:
            file_id = dir_name + str(i // 10000)
            result = pd.concat(frames, ignore_index=True) if frames else init_df()
            result.to_csv(f"./result_{file_id}.csv", index=False) # dataframe to csv
            frames = [] # start the next chunk

    # dataframe to csv (save whatever is left after the last full chunk)
    file_id = dir_name + str(i//10000 + 1)
    result = pd.concat(frames, ignore_index=True) if frames else init_df()
    result.to_csv(f"./result_{file_id}.csv", index=False)

# csvs to one csv
# Only the per-chunk result_*.csv files are merged. Globbing "./*" also picked
# up every other csv in the working directory (e.g. a result.csv from an
# earlier run, or unrelated inputs) and mixed them into `result`.
chunk_files = sorted(glob.glob("./result_*.csv"))  # sorted for a reproducible row order

# read everything first and concatenate once instead of growing the frame per file
chunk_frames = [pd.read_csv(file_name) for file_name in tqdm(chunk_files)]
result = pd.concat(chunk_frames, ignore_index=True) if chunk_frames else init_df()
# NOTE(review): a later cell reads "./result.csv", but nothing here writes it — confirm
# whether `result.to_csv("./result.csv", index=False)` is expected at this point.

### If 'surnames' DataFrame needed:

In [None]:
def json_to_df_sur(json_data):
    """Build a two-column (name, surname) DataFrame with one row per author.

    Names are lower-cased with spaces removed, surnames are lower-cased.
    An author lacking either key gets "" in that column, and a record with
    no "authors" key yields a single row of empty strings.
    """
    if "authors" not in json_data:
        return pd.DataFrame({"name": [""], "surname": [""]})

    given_names = []
    family_names = []
    for person in json_data["authors"]:
        given = person["name"].lower().replace(" ", "") if "name" in person else ""
        family = person["surname"].lower() if "surname" in person else ""
        given_names.append(given)
        family_names.append(family)

    return pd.DataFrame({"name": given_names, "surname": family_names})

def init_df_sur():
    """Return an empty DataFrame with the columns "name" and "surname"."""
    return pd.DataFrame(columns=["name", "surname"])

# One surname_<dir><k>.csv is written per 10000 visited paths of each journal
# directory, plus a final file holding the remainder.
for file_path in glob.glob("./aps-dataset-metadata-2013/*"):
    # basename/normpath instead of split("/") so other path separators work too
    dir_name = os.path.basename(os.path.normpath(file_path))
    # Per-article frames of the current chunk. They are concatenated once per
    # chunk: DataFrame.append was removed in pandas 2.0 and re-copied the whole
    # frame on every call.
    frames = []
    i = 0  # keeps `i` defined (and not stale) when the glob below yields nothing

    for i, file_name in tqdm(enumerate(glob.glob(file_path + "/**", recursive=True), 1)):
        if file_name.endswith(".json"):
            # explicit encoding: the platform default is not UTF-8 everywhere
            with open(file_name, "r", encoding="utf-8") as j:
                article_df = json_to_df_sur(json.load(j))
            if not article_df.empty:
                frames.append(article_df)

        # `i` counts every visited path (directories included), not only json files
        if i % 10000 == 0:
            file_id = dir_name + str(i // 10000)
            surs = pd.concat(frames, ignore_index=True) if frames else init_df_sur()
            surs.to_csv(f"./surname_{file_id}.csv", index=False)
            frames = []  # start the next chunk

    # save whatever is left after the last full chunk
    file_id = dir_name + str(i//10000 + 1)
    surs = pd.concat(frames, ignore_index=True) if frames else init_df_sur()
    surs.to_csv(f"./surname_{file_id}.csv", index=False)

# Merge only the per-chunk surname_*.csv files. Globbing "./*" also picked up
# every other csv in the working directory (result_*.csv, input tables, ...)
# and mixed their rows into `surs`.
surname_files = sorted(glob.glob("./surname_*.csv"))  # sorted for a reproducible row order

# read everything first and concatenate once instead of growing the frame per file
surname_frames = [pd.read_csv(file_name) for file_name in tqdm(surname_files)]
surs = pd.concat(surname_frames, ignore_index=True) if surname_frames else init_df_sur()

## Data Preprocessing: special character handling

This step has already been performed; the cells below are kept to document the methodology and do not need to be re-run.

In [37]:
# Each table is sorted by its name column, re-indexed from 0, and lower-cased.

# sbc: sorted by 'author', then the author column is lower-cased
sbc = (
    pd.read_csv("./sbc_edited.csv")
    .sort_values(by=["author"])
    .reset_index(drop=True)
)
sbc["author"] = sbc["author"].str.lower()

# gender: sorted by 'Name', then every column is cast to str and lower-cased
gender = (
    pd.read_csv("./gender_edited.csv")
    .sort_values(by=["Name"])
    .apply(lambda column: column.astype(str).str.lower())
    .reset_index(drop=True)
)

# result: sorted by 'name', then the name column is lower-cased
result = (
    pd.read_csv("./result.csv")
    .sort_values(by=["name"])
    .reset_index(drop=True)
)
result["name"] = result["name"].str.lower()

# surs: sorted by 'name' only, no case change
surs = (
    pd.read_csv("./surnames.csv")
    .sort_values(by=["name"])
    .reset_index(drop=True)
)

In [None]:
# example:

# Collect every character occurring in a surname that is non-ASCII or
# non-alphabetic (the union of the two scans this cell used to run).
# Values are iterated directly instead of via surs['surname'][x]: label lookup
# returns a Series when index labels repeat, and the swallowed AttributeError
# then silently skipped those rows.
conv = sorted({
    c
    for surname in surs['surname']
    if isinstance(surname, str)  # NaN / non-string cells are skipped
    for c in surname
    if not c.isascii() or not c.isalpha()
})

# change target to appropriate characters
# (one replacement per entry of `conv`, in the same order; '' deletes the character)
target = [''] * len(conv)

for s, t in zip(conv, target):
    # regex=False: the collected characters include punctuation such as '.' or '('
    # that must be matched literally, whatever the pandas default for `regex` is
    surs['surname'] = surs['surname'].str.replace(s, t, regex=False)
    print(f'replaced: {s} to {t}')

## Data Preprocessing: Name matching

### Fuzzymatching (not recommended)

In [None]:
%%capture
pip install fuzzymatcher

This package is somewhat better optimized than the fuzzywuzzy package. However, I do not recommend this approach: it yields only a marginal improvement in matching, while its time and memory costs are extremely high. Only partial documentation is provided here.

In [None]:
# The package is installed in the cell above but was never imported, so this
# cell raised NameError on a fresh kernel.
import fuzzymatcher

# Fuzzy left join of sbc onto gender: sbc.author is matched against gender.Name.
f = fuzzymatcher.fuzzy_left_join(sbc, gender, left_on = "author", right_on = "Name")
# Join the match result back onto sbc using the shared 'author' and 'id' columns.
sbc_fuzz = pd.merge(sbc, f, on=['author', 'id'])

In [None]:
# Opposite direction: gender is the left table, so gender.Name is matched
# against sbc.author.
f2 = fuzzymatcher.fuzzy_left_join(
    gender,
    sbc,
    left_on="Name",
    right_on="author",
)
# Join the match result back onto gender using the shared 'Name' and 'Gender' columns.
g_fuzz = gender.merge(f2, on=["Name", "Gender"])

In [None]:
# best match score threshold to 0.25
MIN_MATCH_SCORE = 0.25


def _confident_matches(matches):
    """Keep rows with best_match_score >= MIN_MATCH_SCORE, sorted ascending by
    score, without the '__id_left' / '__id_right' columns and with a fresh index."""
    return (
        matches[matches['best_match_score'] >= MIN_MATCH_SCORE]
        .sort_values('best_match_score')
        # errors="ignore": re-running the cell must not fail once the id
        # columns have already been dropped
        .drop(['__id_left', '__id_right'], axis=1, errors="ignore")
        .reset_index(drop=True)
    )


sbc_fuzz = _confident_matches(sbc_fuzz)
g_fuzz = _confident_matches(g_fuzz)

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the same way
fuzzy = pd.concat([sbc_fuzz, g_fuzz]).reset_index(drop=True)

In [None]:
# Two views of the matched pairs, both keyed by an 'author' column:
# fuzzy_left keeps the original 'author' values, fuzzy_right promotes the
# matched 'Name' to 'author'. 'Gender' is renamed to 'gender' in both.
fuzzy_left = fuzzy.drop(['Name'], axis=1).rename(columns={'Gender': 'gender'})
fuzzy_right = fuzzy.drop(['author'], axis=1).rename(
    columns={'Name': 'author', 'Gender': 'gender'})

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the same way
nameinfo = pd.concat([fuzzy_left, fuzzy_right]).reset_index(drop=True)
# bracket indexing instead of attribute assignment, which silently creates a
# plain attribute when the column does not exist
nameinfo['id'] = nameinfo['id'].astype(int)

### Surname matching