In [1]:
import pandas as pd
import numpy as np
import json as json

## Converting the metadata and pdf_parses JSONL files to data frames

In [2]:
metadata_path = '20200705v1/sample/metadata/sample.jsonl'

In [3]:
df=pd.read_json(metadata_path, lines=True)

In [4]:
df.keys()

Index(['paper_id', 'title', 'authors', 'abstract', 'year', 'arxiv_id',
       'acl_id', 'pmc_id', 'pubmed_id', 'doi', 'venue', 'journal', 'mag_id',
       'mag_field_of_study', 'outbound_citations', 'inbound_citations',
       'has_outbound_citations', 'has_inbound_citations', 'has_pdf_parse',
       's2_url', 'has_pdf_body_text', 'has_pdf_parsed_abstract',
       'has_pdf_parsed_body_text', 'has_pdf_parsed_bib_entries',
       'has_pdf_parsed_ref_entries'],
      dtype='object')

In [5]:
pdf_parses_path = '20200705v1/sample/pdf_parses/sample.jsonl'

In [6]:
df_pdf = pd.read_json(pdf_parses_path, lines=True)

In [7]:
df_pdf.keys()

Index(['paper_id', '_pdf_hash', 'abstract', 'body_text', 'bib_entries',
       'ref_entries'],
      dtype='object')

## Keeping only the 'paper_id', 'abstract' and 'mag_field_of_study' columns and dropping rows that contain NaN values

In [8]:
df_abstract = df.filter(['paper_id', 'abstract', 'mag_field_of_study'])

In [9]:
df_abstract.keys()

Index(['paper_id', 'abstract', 'mag_field_of_study'], dtype='object')

In [10]:
df_abstract.isna().sum()

paper_id                0
abstract              512
mag_field_of_study     47
dtype: int64

In [11]:
# Rebind to the result instead of mutating in place: df_abstract was derived
# from df, and a plain reassignment keeps the cell free of chained-assignment
# warnings while producing the same rows.
df_abstract = df_abstract.dropna()

In [12]:
df_abstract.isna().sum()

paper_id              0
abstract              0
mag_field_of_study    0
dtype: int64

In [13]:
df_abstract.columns

Index(['paper_id', 'abstract', 'mag_field_of_study'], dtype='object')

In [14]:
df_abstract.shape

(468, 3)

## Selecting rows whose 'mag_field_of_study' value is one of Computer Science, Physics, Medicine, Biology, Chemistry or Mathematics

In [15]:
df_abstract.mag_field_of_study.head(3)

21    [Medicine]
43    [Medicine]
51    [Medicine]
Name: mag_field_of_study, dtype: object

In [16]:
# Keep only the first listed field of study for each paper.
# The list-type guard makes this cell safe to re-run: `.str[0]` applied to
# already-extracted strings would truncate e.g. 'Medicine' to 'M'.
# An empty list maps to NaN, matching what `.str[0]` returns for it.
df_abstract = df_abstract.assign(
    mag_field_of_study=df_abstract['mag_field_of_study'].map(
        lambda fields: (fields[0] if fields else np.nan) if isinstance(fields, list) else fields
    )
)
df_abstract.mag_field_of_study.head(3)

21    Medicine
43    Medicine
51    Medicine
Name: mag_field_of_study, dtype: object

In [17]:
df_abstract.mag_field_of_study.unique()

array(['Medicine', 'Engineering', 'Materials Science', 'Chemistry',
       'Physics', 'Geology', 'Political Science', 'Business', 'Economics',
       'Art', 'Mathematics', 'Biology', 'Computer Science', 'Psychology',
       'Geography', 'Sociology', 'History', 'Philosophy'], dtype=object)

In [18]:
# Restrict the abstracts to the six target disciplines; rows with any other
# field of study are left out of df_abst_pmbccm.
target_fields = ['Computer Science', 'Physics', 'Medicine', 'Biology', 'Chemistry', 'Mathematics']
in_target_fields = df_abstract['mag_field_of_study'].isin(target_fields)
df_abst_pmbccm = df_abstract.loc[in_target_fields]

In [19]:
df_abst_pmbccm.mag_field_of_study.unique()

array(['Medicine', 'Chemistry', 'Physics', 'Mathematics', 'Biology',
       'Computer Science'], dtype=object)

## merge metadata and pdf_parses

In [20]:
# Left join: every filtered metadata row is kept, and papers without a PDF
# parse get NaN in the columns coming from df_pdf.
df_merged = pd.merge(df_abst_pmbccm, df_pdf, how='left', on='paper_id')

In [21]:
df_merged.columns

Index(['paper_id', 'abstract_x', 'mag_field_of_study', '_pdf_hash',
       'abstract_y', 'body_text', 'bib_entries', 'ref_entries'],
      dtype='object')

In [22]:
### dropping columns of no relevance here

In [23]:
# Keep the metadata abstract (abstract_x) and the parsed body text; the PDF
# hash, bibliography/reference entries and the parsed abstract are dropped.
# Rebinding (rather than inplace=True) keeps the step a plain expression.
df_merged = df_merged.drop(columns=['_pdf_hash', 'bib_entries', 'ref_entries', 'abstract_y'])
df_merged.columns

Index(['paper_id', 'abstract_x', 'mag_field_of_study', 'body_text'], dtype='object')

In [24]:
df_merged.isna().sum()

paper_id                0
abstract_x              0
mag_field_of_study      0
body_text             250
dtype: int64

In [25]:
df_merged.head(10)

Unnamed: 0,paper_id,abstract_x,mag_field_of_study,body_text
0,77491955,"OBJECTIVE: To report a case of a large, bilate...",Medicine,
1,77493472,Leukotrienes (LTs) are important lipid mediato...,Medicine,
2,77494170,A NDB bull weighing 350kg underwent a ruffian ...,Medicine,
3,77495155,Background. The vertebral genesis of many func...,Medicine,
4,77496281,Purpose: This study was conducted to determine...,Medicine,
5,77497951,Introduction: Elderly patients (pts) with FL d...,Medicine,
6,77498969,--- Introduction. The most common endocrine di...,Medicine,
7,94551157,Abstract A semi-empirical formula for the disc...,Chemistry,
8,94551239,Abstract Scanning probe-based memories have de...,Chemistry,[]
9,94551428,Abstract The influence of Na 2 HPO 4 ·12H 2 O ...,Chemistry,


## Creating a function that reduces each body_text entry to the concatenated values of its 'text' keys

In [26]:
df_merged.body_text.isnull()[0]

True

In [27]:
# NaN is never equal to itself, so True here means row 1 has no body_text
# (missing after the left merge).
df_merged.body_text[1]!=df_merged.body_text[1]

True

In [28]:
# A body_text value that is present compares equal to itself, so this
# evaluates to False for row 95.
df_merged.body_text[95]!=df_merged.body_text[95]

False

In [29]:
def get_text(y):
    """Flatten one parsed ``body_text`` value into a single string.

    Parameters
    ----------
    y : sequence of dict, str, float or None
        Either a sequence of paragraph dicts that each carry a ``'text'`` key,
        a missing-value marker (``NaN`` or ``None``), or a string that has
        already been flattened.

    Returns
    -------
    str or float
        ``np.nan`` when ``y`` is missing. A string input is returned
        unchanged. Otherwise the paragraph texts joined together, each one
        preceded by a single space (a non-empty result therefore starts with
        a space, and an empty sequence yields ``''``).

    Raises
    ------
    KeyError
        If a paragraph dict has no ``'text'`` key.
    """
    # Already flattened (e.g. the cell applying this function was re-run):
    # pass the string through instead of failing on character indexing.
    if isinstance(y, str):
        return y
    # None is treated as missing; otherwise it would reach the loop and raise.
    if y is None:
        return np.nan
    # NaN is the only scalar unequal to itself. Containers are excluded from
    # the comparison because `arr != arr` is element-wise and has no single
    # truth value.
    if not isinstance(y, (list, tuple, np.ndarray)) and y != y:
        return np.nan
    # Single-pass join instead of repeated string concatenation.
    return ''.join(' ' + paragraph['text'] for paragraph in y)

In [30]:
# Replace each body_text entry with its flattened text (NaN stays NaN).
df_merged['body_text'] = df_merged['body_text'].map(get_text)

## merged file is converted to a feather file

In [31]:
# Write the merged frame (paper_id, abstract_x, mag_field_of_study, body_text)
# to a Feather file inside the sample directory.
df_merged.to_feather('20200705v1/sample/df_merged.feather')

In [32]:
ls 20200705v1/sample/

df_merged.feather  metadata/          pdf_parses/
