## Setup 

In [26]:
import pandas as pd
import numpy as np

In [27]:
import os
os.chdir('C:/Users/syjan/Desktop/암연구소 인턴')

In [129]:
df = pd.read_csv('pubmed_task.csv', header=1, encoding='euc-kr')
df.columns

Index(['NUM', 'PMID', 'Title', 'Unnamed: 3', 'Authors', 'Citation',
       'First Author', 'Journal/Book', 'Publication Year', 'Create Date',
       'PMCID', 'NIHMS ID', 'DOI', 'J', 'S', 'C', 'J.1', 'S.1', 'C.1', 'J.2',
       'S.2', 'C.2'],
      dtype='object')

## Filter conditions and assign appropriate numbers


In [134]:
def filter_pubmed(df):
    df1 = df.copy()
    
    df1.S = np.nan
    #Condition 1: Articles published before 1970
    df1.loc[df1['Publication Year'] < 1970, 'S'] = 1
    print('Condition 1: excluded ', df1.query("S==1").shape[0], 'rows')

    #Condition 2: Not an original article 
    unoriginal = 'comment|letter|editorial|case report|image|video|review|meta-analysis|meta analysis|rare case|rare association|one case of'
    df1.loc[df1.Title.str.contains(unoriginal, case=False) & df1.S.isnull(), 'S'] = 2
    df1.loc[df1.Title.str.contains("A case of", case=True) & df1.S.isnull(), 'S'] = 2
    print('Condition 2: excluded ', df1.query("S==2").shape[0], 'rows')

    #Condition 3: Not pulmonary TB 
    lung_related = 'pulmonary|chest|lung'
    other_organs = '|'.join(['meningitis', 'TBM','colon', 'kidney', 'bone', 'musculoskeletal', 'lymph', 'abdomen', 'ocular', 
                'liver', 'spleen', 'abdominal', 'intestine', 'kidney', 'bladder', 'urogenital', 'renal','tuberculoma'])
    df1.loc[-df1.Title.str.contains(lung_related, case=False)& 
       df1.Title.str.contains(other_organs, case=False) & 
       df1.S.isnull() , 'S'] = 3
    print('Condition 3: excluded ', df1.query("S==3").shape[0], 'rows')

    #Condition 4: Studies on TB vaccine 
    vaccine_related = 'BCG|TB vaccination|TB vaccine'
    df1.loc[df1.Title.str.contains(vaccine_related, case=False) & df1.S.isnull(), 'S'] = 4
    print('Condition 4: excluded ', df1.query("S==4").shape[0], 'rows')

    #Condition 5: Cost analyses
    cost_related ='performance yield|cost analysis|cost analyses|economic analysis|economic analyses|cost-effectiveness'
    df1.loc[df1.Title.str.contains("Cost ", case=True) & df1.S.isnull(), 'S'] = 2
    df1.loc[df1.Title.str.contains(cost_related, case=False) & df1.S.isnull(), 'S'] = 5
    print('Condition 5: excluded ', df1.query("S==5").shape[0], 'rows')

    #Condition 6: Not human
    animals = "bovine|cattle|animal tuberculosis|animal TB|raccoon|primates|horses|dogs|elk|rhesus macaque"
    df1.loc[df1.Title.str.contains(animals, case=False) & df1.S.isnull(), 'S'] = 6
    print('Condition 6: excluded ', df1.query("S==6").shape[0], 'rows')
    
    return df1

In [135]:
new_df = filter_pubmed(df)
new_df[['Title', 'Publication Year', 'S']].head()

Condition 1: excluded  497 rows
Condition 2: excluded  870 rows
Condition 3: excluded  695 rows
Condition 4: excluded  39 rows
Condition 5: excluded  19 rows
Condition 6: excluded  4 rows


Unnamed: 0,Title,Publication Year,S
0,Four-Month Rifapentine Regimens with or withou...,2021,
1,Pulmonary Tuberculosis: Role of Radiology in D...,2017,
2,Endobronchial tuberculosis,2004,
3,Chest ultrasound compared to chest X-ray for p...,2019,
4,Endobronchial tuberculosis: an overview,2011,


In [136]:
print("In total,",new_df.shape[0] - new_df[new_df.S.isnull()].shape[0],"rows were deleted out of", df.shape[0])

In total, 2130 rows were deleted out of 7437


In [137]:
#Check numbers
new_df.S.value_counts(dropna=False).reset_index(name='n').sort_values(by='index').rename(columns={'index':'criteria'}).fillna('included')

Unnamed: 0,criteria,n
3,1,497
1,2,876
2,3,695
4,4,39
5,5,19
6,6,4
0,included,5307


## Check with examples

In [138]:
# Condition 1 example
new_df[['Title', 'Publication Year', 'S']].query("S==1").head(5)

Unnamed: 0,Title,Publication Year,S
210,Mobile chest radiography service for general p...,1968,1.0
230,THE EVALUATION AND PREPARATION FOR SURGERY OF ...,1963,1.0
234,OBSERVATIONS ON THE PROTECTIVE EFFECT OF BCG V...,1964,1.0
266,COCCIDIOIDOMYCOSIS. LONG-TERM TREATMENT WITH A...,1964,1.0
274,Immigration in the midlands,1965,1.0


In [139]:
# Condition 2 example
new_df.query("S==2").Title.tolist()[:5]

['Chest X-ray and chest CT findings in patients diagnosed with pulmonary tuberculosis following solid organ transplantation: a systematic review',
 'The effect of a tuberculosis chest X-ray image reference set on non-expert reader performance',
 'Severe pulmonary tuberculosis complicated with insidious pulmonary thromboembolism: a case report and literature review',
 'A Rare Case of a Tuberculosis Patient with Sarcoidosis',
 'Diagnostic accuracy of chest radiography for the diagnosis of tuberculosis (TB) and its role in the detection of latent TB infection: a systematic review']

In [140]:
# Condition 3 example
new_df.query("S==3").Title.tolist()[:5]

['Renal Tuberculosis: The Masquerader',
 'Diagnostic potential of interferon-gamma release assay to detect latent tuberculosis infection in kidney transplant recipients',
 'Isoniazid- and streptomycin-resistant miliary tuberculosis complicated by intracranial tuberculoma in a Japanese infant',
 'Ocular tuberculosis: a clinicopathologic and molecular study',
 'Everolimus-induced activation of latent Mycobacterium tuberculosis infection in a patient with metastatic renal cell carcinoma']

In [141]:
# Condition 4 example
new_df.query("S==4").Title.tolist()[:5]

['The potential of imaging tools as correlates of infection and disease for new TB vaccine development',
 '[Tuberculin-BCG therapy in combined treatment of patients with pulmonary tuberculosis]',
 '[Miliary pulmonary tuberculosis following intravesical BCG-therapy]',
 'Tuberculosis in children and BCG vaccination in North Sumatra',
 '[Pulmonary infiltrates and fever after intravesical instillation of BCG]']

In [142]:
# Condition 5 example
new_df.query("S==5").Title.tolist()[:5]

['Economic analysis of tuberculosis diagnostic tests in disease control: how can it be modelled and what additional information is needed?',
 'The role and performance of chest X-ray for the diagnosis of tuberculosis: a cost-effectiveness analysis in Nairobi, Kenya',
 'Tuberculosis screening of travelers to higher-incidence countries: a cost-effectiveness analysis',
 'Cost-effectiveness of tuberculosis screening and isoniazid treatment in the TB/HIV in Rio (THRio) Study',
 'Cost-effectiveness of polymerase chain reaction versus Ziehl-Neelsen smear microscopy for diagnosis of tuberculosis in Kenya']

In [143]:
# Condition 6 example
new_df.query("S==6").Title.tolist()

['Establishment of an aerosol challenge model of tuberculosis in rhesus macaques and an evaluation of endpoints for vaccine testing',
 'Re-activation of bovine tuberculosis in a patient treated with infliximab',
 'Bovine tuberculosis is more prevalent in cattle owned by farmers with active tuberculosis in central Ethiopia',
 'Human bovine tuberculosis - remains in the differential']

In [56]:
#write as csv
new_df.to_csv('pubmed_task_0704.csv',index=False)

PermissionError: [Errno 13] Permission denied: 'pubmed_task_0704.csv'