In [1]:
import pandas as pd
import time as time

import numpy as np

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
import tensorflow as tf

In [4]:
pd.set_option('display.max_colwidth', None)
pd.set_option("display.max_rows", None, "display.max_columns", None)

## Load Datasets

In [5]:
s = time.time()
articles_df = pd.read_excel('data/articles_countries.xlsx', index_col=0, sheet_name='articles_countries')
e = time.time()
print("Multicore Loading Time = {}".format(e-s))

print(len(articles_df))
#clin_ner.info(verbose=True)

Multicore Loading Time = 12.908427476882935
32472


In [6]:
s = time.time()
size_df = pd.read_excel('data/final_sizes.xlsx', index_col=0, sheet_name='final_sizes')
e = time.time()
print("Multicore Loading Time = {}".format(e-s))

print(len(size_df))
#coarse_ner.info(verbose=True)

Multicore Loading Time = 5.36381459236145
32474


In [7]:
s = time.time()
ner_df = pd.read_excel('data/final_ner.xlsx', index_col=0, sheet_name='final_ner')
e = time.time()
print("Multicore Loading Time = {}".format(e-s))

print(len(ner_df))
#coarse_ner.info(verbose=True)

Multicore Loading Time = 28.081473112106323
32472


In [8]:
s = time.time()
affil_df = pd.read_csv('data/comm_affil.csv', index_col=0)
e = time.time()
print("Multicore Loading Time = {}".format(e-s))

print(len(ner_df))
#coarse_ner.info(verbose=True)

Multicore Loading Time = 0.22621870040893555
32472


## Clean base_data 

In [9]:
base_data = articles_df.copy()

In [10]:
base_data.isna().sum()

pmid                          0
doi                        4853
title                         0
abstract                      0
article_date               8351
pubmed_date                   0
article_type                  0
lang                          0
journal                       0
journal_short                 0
journal_country              45
authors                     785
author_affils             10248
keywords                  15194
mesh_terms                 8612
references_pmids          17098
feature                       0
include                       0
include_raw                   0
include_fuzzy                 0
affil_countries             102
affil_countries_unique      102
affil_first_country       10416
affil_last_country        10422
dtype: int64

In [11]:
## using first affiliation as country
## fill first with data from last author, then with pubmed country metadata
base_data['affil_first_country'] = base_data['affil_first_country'].fillna(base_data['affil_last_country'])
base_data['affil_first_country'] = base_data['affil_first_country'].fillna(base_data['journal_country'])

base_data['affil_first_country'] = base_data['affil_first_country'].astype('string')

In [12]:
##clean countries
base_data["affil_first_country"].replace({"England": "United Kingdom", 
                                             "Wales": "United Kingdom", 
                                             "Scotland": "United Kingdom", 
                                             "China (Republic : 1949- )" : "Taiwan"}, inplace=True)

base_data['affil_first_country'].value_counts()

United States             12118
China                      4773
United Kingdom             2696
Germany                    1217
South Korea                1182
Netherlands                1092
Japan                       810
India                       809
Canada                      718
Italy                       585
Spain                       518
Taiwan                      440
France                      413
Switzerland                 398
Australia                   396
Turkey                      363
Iran                        346
Brazil                      283
Ireland                     225
Israel                      190
Singapore                   190
Sweden                      169
Pakistan                    164
Greece                      154
Poland                      128
Saudi Arabia                122
Austria                     120
Finland                     120
Denmark                     120
Belgium                     111
Portugal                    109
Mexico  

In [13]:
base_data['countries_lc'] = base_data['affil_countries_unique'].str.lower()

In [14]:
## lmic_flag_all

lmic_list = ['afghanistan', 'albania', 'algeria', 'angola', 'antigua', 'barbuda', 'argentina', 'armenia', 'china',
             'azerbaijan', 'bangladesh', 'belarus', 'belize', 'benin', 'bhutan', 'bolivia', 'bosnia', 'herzegovina', 
             'botswana', 'brazil', 'burkina', 'faso', 'burundi', 'verde', 'cambodia', 'cameroon', 'africa', 'chad', 
             'colombia', 'comoros', 'congo', 'costa rica', 'ivoire', 'cuba', 'djibouti', 'dominica', 'dominica', 
             'ecuador', 'egypt', 'salvador', 'guinea', 'eritrea', 'eswatini', 'ethiopia', 'fiji', 'gabon', 'gambia', 
             'georgia', 'ghana', 'grenada', 'guatemala', 'guinea', 'guyana', 'haiti', 'honduras', 'india', 
             'indonesia', 'iran', 'iraq', 'jamaica', 'jordan', 'kazakhstan', 'kenya', 'kiribati', 'dpr', 'north korea', 
             'kosovo', 'kyrgyzstan', 'lao', 'lebanon', 'lesotho', 'liberia', 'libya', 'macedonia', 'madagascar', 'malawi', 
             'malaysia', 'maldives', 'mali', 'marshall', 'mauritania', 'mauritius', 'mexico', 'micronesia', 'moldova', 
             'mongolia', 'montenegro', 'montserrat', 'morocco', 'mozambique', 'myanmar', 'namibia', 'nauru', 'nepal', 
             'nicaragua', 'niger', 'nigeria', 'niue', 'pakistan', 'palau', 'panama', 'papua', 'paraguay', 'peru', 
             'philippines', 'rwanda', 'helena', 'samoa', 'príncipe', 'senegal', 'serbia', 'sierra leone', 'solomon', 
             'somalia', 'south africa', 'sudan', 'sri lanka', 'saint lucia', 'saint vincent', 'grenadines', 'sudan', 
             'suriname', 'syria', 'tajikistan', 'tanzania', 'thailand', 'timor', 'togo', 'tokelau', 'tonga', 'tunisia', 
             'turkey', 'turkmenistan', 'tuvalu', 'uganda', 'ukraine', 'uzbekistan', 'vanuatu', 'venezuela', 'vietnam', 
             'wallis', 'west bank', 'gaza', 'palestine', 'yemen', 'zambia', 'zimbabwe', 'low-income', 'middle-income', 
             'lmic', 'scarce']

lmic_lower_list = ['afghanistan', 'algeria', 'angola', 'antigua', 'barbuda',
             'azerbaijan', 'bangladesh', 'belize', 'benin', 'bhutan', 'bolivia', 'bosnia', 'herzegovina', 
             'burkina', 'faso', 'burundi', 'verde', 'cambodia', 'cameroon', 'africa', 'chad', 
             'comoros', 'congo', 'ivoire', 'djibouti',
             'ecuador', 'egypt', 'salvador', 'eritrea', 'eswatini', 'ethiopia', 'fiji', 'gabon', 'gambia', 
             'ghana', 'grenada', 'guatemala', 'guinea', 'guyana', 'haiti', 'honduras', 'india', 
             'indonesia', 'iran', 'jordan', 'kazakhstan', 'kenya', 'kiribati', 'dpr', 'north korea', 
             'kyrgyzstan', 'lao', 'lesotho', 'liberia', 'macedonia', 'madagascar', 'malawi', 
             'mali', 'mauritania', 'mexico', 'micronesia', 'moldova', 
             'mongolia', 'montenegro', 'montserrat', 'morocco', 'mozambique', 'myanmar', 'nauru', 'nepal', 
             'nicaragua', 'niger', 'nigeria', 'niue', 'pakistan', 'palau', 'panama', 'papua', 'paraguay', 'peru', 
             'philippines', 'rwanda', 'helena', 'samoa', 'príncipe', 'senegal', 'sierra leone', 'solomon', 
             'somalia', 'south africa', 'sudan', 'sri lanka', 'saint lucia', 'saint vincent', 'grenadines', 'sudan', 
             'suriname', 'syria', 'tajikistan', 'tanzania', 'thailand', 'timor', 'togo', 'tokelau', 'tonga', 'tunisia', 
             'uganda', 'ukraine', 'uzbekistan', 'vanuatu', 'venezuela', 'vietnam', 
             'wallis', 'west bank', 'gaza', 'palestine', 'yemen', 'zambia', 'zimbabwe', 'low-income', 'middle-income', 
             'lmic', 'scarce']

In [15]:
base_data['lmic_author_flag'] = np.where(base_data['countries_lc'].str.contains('iran'), "1", "0")
base_data['lmic_author_lower_flag'] = np.where(base_data['countries_lc'].str.contains('iran'), "1", "0")
base_data['lmic_china_flag'] = np.where(base_data['countries_lc'].str.contains('china'), "1", "0")

for x in lmic_list:
    base_data['lmic_author_flag'] = np.where(base_data['countries_lc'].str.contains(x), "1", base_data['lmic_author_flag'])
    
for y in lmic_lower_list:
    base_data['lmic_author_lower_flag'] = np.where(base_data['countries_lc'].str.contains(x), "1", base_data['lmic_author_lower_flag'])

In [16]:
## new column for year, and year+month
base_data['year'] = base_data['pubmed_date'].dt.year
base_data['date'] = base_data['pubmed_date']

In [17]:
## drop not needed for analysis

base_data = base_data.drop(['doi', 'article_date', 'lang', 'journal', 'authors', 
                            'keywords', 'pubmed_date', 'mesh_terms', 'references_pmids', 'feature', 'include_fuzzy',
                           'journal_country', 'author_affils', 'affil_countries', 'affil_last_country',
                           'countries_lc', 'affil_countries_unique', 'include_raw'], axis=1)

base_data = base_data.rename(columns={"include": "mature_flag"})

In [18]:
##mask countries below first 12
occurrence = base_data['affil_first_country'].value_counts()

condition = occurrence < 440
mask = occurrence[condition].index
mask_dict = dict.fromkeys(mask, 'Other')

base_data['affil_country_masked'] = base_data['affil_first_country'].replace(mask_dict)

#total_country_output = base_data['affil_country_masked'].value_counts().rename_axis('country').reset_index(name='counts')
#total_country_output

In [19]:
base_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32472 entries, 0 to 161524
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   pmid                    32472 non-null  int64         
 1   title                   32472 non-null  object        
 2   abstract                32472 non-null  object        
 3   article_type            32472 non-null  object        
 4   journal_short           32472 non-null  object        
 5   mature_flag             32472 non-null  int64         
 6   affil_first_country     32470 non-null  object        
 7   lmic_author_flag        32472 non-null  object        
 8   lmic_author_lower_flag  32472 non-null  object        
 9   lmic_china_flag         32472 non-null  object        
 10  year                    32472 non-null  int64         
 11  date                    32472 non-null  datetime64[ns]
 12  affil_country_masked    32470 non-null  objec

In [20]:
base_data['mature_flag'].value_counts()

0    31305
1     1167
Name: mature_flag, dtype: int64

In [21]:
base_data = base_data.sort_index()
base_data.tail(15)

Unnamed: 0,pmid,title,abstract,article_type,journal_short,mature_flag,affil_first_country,lmic_author_flag,lmic_author_lower_flag,lmic_china_flag,year,date,affil_country_masked
161465,9470396,Decision support for psychiatric diagnosis based on a simple questionnaire.,"This paper compares two classifiers: Pseudo Bayesian and Neural Network for assisting in making diagnoses of psychiatric patients based on a simple yes/no questionnaire which is provided at the outpatient's first visit to the hospital. The classifiers categorize patients into three most commonly seen ICD classes, i.e. schizophrenic, emotional and neurotic disorders. One hundred completed questionnaires were utilized for constructing and evaluating the classifiers. Average correct decision rates were 73.3% for the Pseudo Bayesian Classifier and 77.3% for the Neural Network classifier. These rates were higher than the rate which an experienced psychiatrist achieved based on the same restricted data as the classifiers utilized. These classifiers may be effectively utilized for assisting psychiatrists in making their final diagnoses.",Comparative Study,Methods Inf Med,0,Germany,0,0,0,1998,1998-02-21,Germany
161479,9431637,Classification of low back pain from dynamic motion characteristics using an artificial neural network.,"Data were collected from 183 subjects who were randomly assigned to the training and test groups. During testing of the classification system, knowledge of the low back pain condition or motion characteristics of the patients in the test group was not made available to the system.",Journal Article,Spine (Phila Pa 1976),0,United States,0,0,0,1998,1998-02-12,United States
161482,9458164,Improving morphology-based malignancy grading schemes in astrocytic tumors by means of computer-assisted techniques.,"We propose an original methodology which improves the accuracy of the prognostic values associated with conventional morphologically-based classifications in supratentorial astrocytic tumors in the adult. This methodology may well help neuropathologists, who must determine the aggressiveness of astrocytic tumors on the basis of morphological criteria. The proposed methodology comprises two distinct steps, i.e. i) the production of descriptive quantitative variables (related to DNA ploidy level and morphonuclear aspects) by means of computer-assisted microscopy and ii) data analysis based on an artificial intelligence-related method, i.e. the decision tree approach. Three prognostic problems were considered on a series of 250 astrocytic tumors including 39 astrocytomas (AST), 47 anaplastic astrocytomas (ANA) and 164 glioblastomas (GBM) identified in accordance with the WHO classification. These three problems concern i) variations in the aggressiveness level of the high-grade tumors (ANA and GBM), ii) the detection of the aggressive as opposed to the less aggressive low-grade astrocytomas (AST), and iii) the detection of the aggressive as opposed to the less aggressive anaplastic astrocytomas (ANA). Our results show that the proposed computer-aided methodology improves conventional prognosis based on conventional morphologically-based classifications. In particular, this methodology enables some reference points to be established on the biological continuum according to the sequence AST-->ANA-->GBM.",Journal Article,Brain Pathol,0,Switzerland,0,0,0,1998,1998-02-11,Other
161485,9457438,Ventilation mode recognition using artificial neural networks.,"This study investigated the capabilities of artificial neural networks to identify spontaneous and pressure support ventilation modes from gas flow and airway pressure signals. After receiving written informed consent, flow and pressure waveforms were recorded from 13 patients undergoing general anesthesia. During analysis, the inspiratory phase of each breath was extracted and normalized in amplitude and wavelength. Neural networks were configured to input flow, pressure, or both waveforms and to output the ventilatory mode. Neural network training was accomplished with data from 500 breaths obtained from 7 patients. Neural network performance was tested with 433 breaths from the remaining 6 patients. Networks using flow, pressure, and both waveforms recognized correctly 78% (337), 97% (423), and 100% (433) of the test waveforms, respectively. Results indicate that neural networks can be used effectively for breathing pattern recognition and encourage the application of neural networks in other types of respiratory pattern recognition problems.",Clinical Trial,Comput Biomed Res,0,United States,0,0,0,1998,1998-02-11,United States
161486,9457435,Automated detection of hereditary syndromes using data mining.,"Computer-based data mining methodology applied to family history clinical data can algorithmically create highly accurate, clinically oriented hereditary disease pattern recognizers. For the example of hereditary colon cancer, the data mining's selection of relevant factors to assess for hereditary colon cancer was statistically significant (P < 0.05). All final recognizer-formulated patterns of hereditary colon cancer were independently confirmed by a clinical expert. Applied to previously analyzed family histories, the recognizer identified the definitive hereditary histories, correctly responded negatively to the putative hereditary histories, and correctly responded negatively to empirically elevated colon cancer risk situations. This capability facilitates patient selection for DNA studies in search of gene mutations. When genetic mutations are included as parameters in a patient database for a genetic disease, the process yields an expert system which characterizes variations in clinical disease presentations in terms of genetic mutations. Such information can greatly improve the efficiency of gene testing.",Journal Article,Comput Biomed Res,0,United States,0,0,0,1998,1998-02-11,United States
161490,9456211,Neural network assessment of perioperative cardiac risk in vascular surgery patients.,"Neural networks were developed to predict perioperative cardiac complications with data from 567 vascular surgery patients. Neural network scores were based on cardiac risk factors and dipyridamole thallium results. These scores were converted into likelihood ratios that predicted cardiac risk. The prognostic accuracy of the neural networks was similar to that of logistic regression models (ROC areas 76.0% vs 75.8%), but their calibration was better. Logistic regression overestimated event rates in a group of high-risk patients (predicted event rate, 64%; observed rate 30%; n=50, p<0.001). On a validation set of 514 patients, the neural networks still had ROC similar areas to those of logistic regression (68.3% vs 67.5%), but logistic regression again overestimated event rates for a group of high-risk patients. The calibration difference was reflected in the Hosmer-Lemeshow chi-square statistic (18.6 for the neural networks, 45.0 for logistic regression). The neural networks successfully estimated perioperative cardiac risk with better calibration than comparable logistic regression models.",Journal Article,Med Decis Making,0,United States,0,0,0,1998,1998-02-10,United States
161496,9453525,TACHY: an expert system for the management of supraventricular tachycardia in the elderly.,"Many physicians find the management of supraventricular tachyarrhythmia (SVT) in the elderly complex and challenging. With the use of artificial intelligence theory, we developed an interactive computer expert system, TACHY, to recommend therapies and warn physicians of potential therapeutic side effects.",Journal Article,Am Heart J,0,United States,0,0,0,1998,1998-02-07,United States
161502,9450258,Detection of ECG waveforms by neural networks.,"In this study, ECG waveform detection was performed by using artificial neural networks (ANNs). Initially, the R peak of the QRS complex is detected, and then feature vectors are formed by using the amplitudes of the significant frequency components of the DFT spectrum. Grow and Learn (GAL) and Kohonen networks are comparatively investigated to detect four different ECG waveforms. The comparative performance results of GAL and Kohonen networks are reported.",Journal Article,Med Eng Phys,0,United Kingdom,0,0,0,1998,1998-02-05,United Kingdom
161506,9445150,Neural network analysis of breast cancer from MRI findings.,To evaluate how much the experience of radiologists affects the performance of an artificial neural network (ANN) trained by two highly experienced radiologists.,Comparative Study,Radiat Med,1,Japan,0,0,0,1998,1998-01-28,Japan
161507,9440819,Automated classification of patients with chronic lymphocytic leukemia and immunocytoma from flow cytometric three-color immunophenotypes.,"The goal of this study was the discrimination between chronic lymphocytic leukemia (B-CLL), clinically more aggressive lymphoplasmocytoid immunocytoma (LP-IC) and other low-grade non-Hodgkin's lymphomas (NHL) of the B-cell type by automated analysis of flow cytometric immunophenotypes CD45/14/20, CD4/8/3, kappa/CD19/5, lambda/CD19/5 and CD10/23/19 from peripheral blood and bone marrow aspirate leukocytes using the multiparameter classification program CLASSIF1. The immunophenotype list mode files were exhaustively evaluated by combined lymphocyte, monocyte, and granulocyte (LMG) analysis. The results were introduced into databases and automatically classified in a standardized way. The resulting triple matrix classifiers are laboratory and instrument independent, error tolerant, and robust in the classification of unknown test samples. Practically 100% correct individual patient classification was achievable, and most manually unclassifiable patients were unambiguously classified. It is of interest that the single lambda/CD19/5 antibody triplet provided practically the same information as the full set of the five antibody triplets. This demonstrates that standardized classification can be used to optimize immunophenotype panels. On-line classification of test samples is accessible on the Internet: http://www.biochem.mpg.de/valet/leukaem1.html Immunophenotype panels are usually devised for the detection of the frequency of abnormal cell populations. As shown by computer classification, most the highly discriminant information is, however, not contained in percentage frequency values of cell populations, but rather in total antibody binding, antibody binding ratios, and relative antibody surface density parameters of various lymphocyte, monocyte, and granulocyte cell populations.",Journal Article,Cytometry,0,United States,0,0,0,1998,1998-01-24,United States


## Clean ner_data 

In [22]:
ner_data = ner_df.drop(['countries', 'text', 'lmic_flag'], axis=1)

In [23]:
ner_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32472 entries, 0 to 77604
Data columns (total 83 columns):
 #   Column               Non-Null Count  Dtype
---  ------               --------------  -----
 0   algo_neural_net      32472 non-null  int64
 1   algo_support_vector  32472 non-null  int64
 2   algo_regression      32472 non-null  int64
 3   algo_decision_tree   32472 non-null  int64
 4   algo_discriminant    32472 non-null  int64
 5   algo_naive_bayes     32472 non-null  int64
 6   algo_transfer        32472 non-null  int64
 7   algo_federated       32472 non-null  int64
 8   algo_k_nearest       32472 non-null  int64
 9   algo_unsupervised    32472 non-null  int64
 10  feat_imaging         32472 non-null  int64
 11  feat_xr              32472 non-null  int64
 12  feat_ct              32472 non-null  int64
 13  feat_mri             32472 non-null  int64
 14  feat_eeg             32472 non-null  int64
 15  feat_ecg             32472 non-null  int64
 16  feat_us              3

In [24]:
ner_data = ner_data.sort_index()
ner_data.tail(15)

Unnamed: 0_level_0,algo_neural_net,algo_support_vector,algo_regression,algo_decision_tree,algo_discriminant,algo_naive_bayes,algo_transfer,algo_federated,algo_k_nearest,algo_unsupervised,feat_imaging,feat_xr,feat_ct,feat_mri,feat_eeg,feat_ecg,feat_us,feat_echo,feat_histo,feat_oct,feat_mamm,feat_endoscop,feat_derm,feat_gene,feat_bio,feat_nlp,feat_ehr,feat_sensor,feat_phone,subspec_icu,subspec_ed,spec_id,subspec_sepsis,subspec_hiv,subspec_cov19,subspec_tb,subspec_malaria,spec_derm,subspec_dermca,spec_onc,subspec_rx,subspec_gynonc,subspec_lungca,subspec_brainca,subspec_gica,subspec_hepca,subspec_prosca,subspec_renalca,subspec_haemonc,subspec_breast,spec_psych,subspec_suicide,spec_msk,subspec_frac,spec_rheum,spec_gi,spec_hep,spec_resp,subspec_pneum,spec_neuro,subspec_epilep,subspec_cva,subspec_alzh,spec_cvs,subspec_ihd,subspec_hf,spec_endo,subspec_dm,spec_eye,subspec_retina,spec_haem,spec_obs,spec_renal,subspec_ackd,spec_paeds,spec_dent,spec_audio,spec_pubh,subspec_bci,subspec_prosth,subspec_assist,subspec_activity,subspec_arrhyt
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1
161465,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
161479,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
161482,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
161485,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
161486,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
161490,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
161496,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
161502,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
161506,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
161507,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0


In [25]:
#create null file for manual review

null = ner_data.loc[(ner_data==0).all(axis=1)]

In [26]:
#null['text'] = ner_df['text'].copy() 

In [27]:
#null.to_csv('ner_null.csv')

len(null)

919

In [28]:
## INTENSIVE CARE MEDICINE / icu
## EMERGENCY MEDICINE / ed
## INFECTIONS [C01] / id
    #### SEPSIS / sepsis
    #### COVID-19 / cov19
    #### MALARIA / malaria
    #### HIV / hiv
    #### TB / tb
## DERMATOLOGY [C17] / derm
    ####SKIN CANCERS / dermca
## NEOPLASMS [C04] / onc
    #### RADIOTHERAPY / rx
    #### LUNG / lungca
    #### NEURO / neuroca
    #### GI / gica
    #### HPB / hepca
    #### GYNAE / gynonc
    #### PROSTATE / prosca
    #### RENAL / renalca
    #### HAEM / haemonc
## BREAST / breast (<- almost entirely onc)
## PSYCHIATRY / psych
    #### SUICIDE / suicide
## MUSCULOSKELETAL [C05] / msk
    #### FRACTURE / frac
## CONNECTIVE TISSUE [C17] / rheum
## GASTROINTESTINAL [C06] / gi
## HEPATOLOGY & BILIARY [C06] / hep
## RESPIRATORY [C08] / resp
    #### PNEUMONIA / pneum
## NERVOUS SYSTEM [C10] / neuro
    #### STROKE / cva
    #### SEIZURE / epilep
    #### DEMENTIA / alzh
## CARDIOVASCULAR [C14] / cvs
    #### ISCHAEMIC HEART DISEASE / ihd
    #### CARDIAC FAILURE / hf
## ENDOCRINE [C19] / endo
    #### DIABETES / dm
## OPHTHALMOLOGY [C11] / eye
    #### RETINOPATHIES / retina
## HAEMATOLOGIC [C15] / haem
## GYNAE/OBSTETRIC [C13] / obs
## NEPHROLOGY [C12] / renal
    #### ACUTE & CHRONIC KIDNEY / ackd
## PAEDIATRICS / paeds
## STOMATOGNATHIC [C07] / dental
## AUDIOLOGY [C09] / ent
## PUBLIC HEALTH / pubh
## BCI
## PROSTHESIS CONTROL
## WHEELCHAIR CONTROL
## HOME SENSORS

## Clean size_data

In [29]:
size_df['ptdata_size'] = size_df[['patient_size', 'n_size']].max(axis=1)

In [30]:
size_data = size_df[['ptdata_size', 'feature_size']].copy().astype(int)

In [31]:
size_data = size_data.replace(0, np.nan)

In [32]:
size_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 32474 entries, 86876 to 161524
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   ptdata_size   7822 non-null   float64
 1   feature_size  5558 non-null   float64
dtypes: float64(2)
memory usage: 761.1+ KB


In [33]:
size_data.sample(5)

Unnamed: 0_level_0,ptdata_size,feature_size
index,Unnamed: 1_level_1,Unnamed: 2_level_1
152330,12.0,384.0
70139,,333.0
25553,86.0,
129212,,
40971,,


## Clean affil data

In [34]:
affil_data = affil_df['comm_flag'].copy()

In [35]:
affil_data = affil_data.sort_index()
affil_data.sample(50)

99286     0
12718     0
111161    0
151895    0
15807     1
54714     0
30543     1
91109     0
26826     0
9573      0
150614    0
79763     0
15392     0
97933     0
58710     0
70210     0
114935    0
138934    0
8553      0
97586     0
41530     0
90746     0
43639     0
59003     0
23133     0
153513    0
84426     0
110687    0
101987    0
77100     0
54778     0
34014     0
16547     1
81437     1
79577     0
146058    0
36577     0
68826     0
40233     0
2174      1
155583    0
96871     0
44227     0
94752     0
21167     0
94893     0
17974     0
18902     0
16992     0
55358     0
Name: comm_flag, dtype: int64

## Combine datasets

In [36]:
raw_joined = pd.concat([base_data, ner_data, size_data, affil_data], axis=1)

print(len(raw_joined))

32474


In [37]:
raw_joined.info()

<class 'pandas.core.frame.DataFrame'>
Index: 32474 entries, 0 to al Center
Data columns (total 99 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   pmid                    32472 non-null  float64       
 1   title                   32472 non-null  object        
 2   abstract                32472 non-null  object        
 3   article_type            32472 non-null  object        
 4   journal_short           32472 non-null  object        
 5   mature_flag             32472 non-null  float64       
 6   affil_first_country     32470 non-null  object        
 7   lmic_author_flag        32472 non-null  object        
 8   lmic_author_lower_flag  32472 non-null  object        
 9   lmic_china_flag         32472 non-null  object        
 10  year                    32472 non-null  float64       
 11  date                    32472 non-null  datetime64[ns]
 12  affil_country_masked    32470 non-null  object 

In [38]:
raw_joined.sample(5)

Unnamed: 0,pmid,title,abstract,article_type,journal_short,mature_flag,affil_first_country,lmic_author_flag,lmic_author_lower_flag,lmic_china_flag,year,date,affil_country_masked,algo_neural_net,algo_support_vector,algo_regression,algo_decision_tree,algo_discriminant,algo_naive_bayes,algo_transfer,algo_federated,algo_k_nearest,algo_unsupervised,feat_imaging,feat_xr,feat_ct,feat_mri,feat_eeg,feat_ecg,feat_us,feat_echo,feat_histo,feat_oct,feat_mamm,feat_endoscop,feat_derm,feat_gene,feat_bio,feat_nlp,feat_ehr,feat_sensor,feat_phone,subspec_icu,subspec_ed,spec_id,subspec_sepsis,subspec_hiv,subspec_cov19,subspec_tb,subspec_malaria,spec_derm,subspec_dermca,spec_onc,subspec_rx,subspec_gynonc,subspec_lungca,subspec_brainca,subspec_gica,subspec_hepca,subspec_prosca,subspec_renalca,subspec_haemonc,subspec_breast,spec_psych,subspec_suicide,spec_msk,subspec_frac,spec_rheum,spec_gi,spec_hep,spec_resp,subspec_pneum,spec_neuro,subspec_epilep,subspec_cva,subspec_alzh,spec_cvs,subspec_ihd,subspec_hf,spec_endo,subspec_dm,spec_eye,subspec_retina,spec_haem,spec_obs,spec_renal,subspec_ackd,spec_paeds,spec_dent,spec_audio,spec_pubh,subspec_bci,subspec_prosth,subspec_assist,subspec_activity,subspec_arrhyt,ptdata_size,feature_size,comm_flag
19128,33360973.0,GAMER MRI: Gated-attention mechanism ranking of multi-contrast MRI in brain pathology.,"During the last decade, a multitude of novel quantitative and semiquantitative MRI techniques have provided new information about the pathophysiology of neurological diseases. Yet, selection of the most relevant contrasts for a given pathology remains challenging. In this work, we developed and validated a method, Gated-Attention MEchanism Ranking of multi-contrast MRI in brain pathology (GAMER MRI), to rank the relative importance of MR measures in the classification of well understood ischemic stroke lesions. Subsequently, we applied this method to the classification of multiple sclerosis (MS) lesions, where the relative importance of MR measures is less understood.",Journal Article,Neuroimage Clin,0.0,Switzerland,0,0,0,2020.0,2020-12-29,Other,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,1.0
99647,25929619.0,Using within-subject pattern classification to understand lifespan age differences in oscillatory mechanisms of working memory selection and maintenance.,"In lifespan studies, large within-group heterogeneity with regard to behavioral and neuronal data is observed. This casts doubt on the validity of group-statistics-based approaches to understand age-related changes on cognitive and neural levels. Recent progress in brain-computer interface research demonstrates the potential of machine learning techniques to derive reliable person-specific models, representing brain behavior mappings. The present study now proposes a supervised learning approach to derive person-specific models for the identification and quantification of interindividual differences in oscillatory EEG responses related to working memory selection and maintenance mechanisms in a heterogeneous lifespan sample. EEG data were used to discriminate different levels of working memory load and the focus of visual attention. We demonstrate that our approach leads to person-specific models with better discrimination performance compared to classical person-nonspecific models. We show how these models can be interpreted both on an individual as well as on a group level. One of the key findings is that, with regard to the time dimension, the between-person variance of the obtained person-specific models is smaller in older than in younger adults. This is contrary to what we expected because of increased behavioral and neuronal heterogeneity in older adults.",Journal Article,Neuroimage,0.0,Germany,0,0,0,2015.0,2015-05-02,Germany,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,0.0
8078,33925256.0,A Machine Learning Decision Support System (DSS) for Neuroendocrine Tumor Patients Treated with Somatostatin Analog (SSA) Therapy.,"The application of machine learning (ML) techniques could facilitate the identification of predictive biomarkers of somatostatin analog (SSA) efficacy in patients with neuroendocrine tumors (NETs). We collected data from 74 patients with a pancreatic or gastrointestinal NET who received SSA as first-line therapy. We developed three classification models to predict whether the patient would experience a progressive disease (PD) after 12 or 18 months based on clinic-pathological factors at the baseline. The dataset included 70 samples and 15 features. We initially developed three classification models with accuracy ranging from 55% to 70%. We then compared ten different ML algorithms. In all but one case, the performance of the Multinomial NaÃƒÂ¯ve Bayes algorithm (80%) was the highest. The support vector machine classifier (SVC) had a higher performance for the recall metric of the progression-free outcome (97% vs. 94%). Overall, for the first time, we documented that the factors that mainly influenced progression-free survival (PFS) included age, the number of metastatic sites and the primary site. In addition, the following factors were also isolated as important: adverse events G3-G4, sex, Ki67, metastatic site (liver), functioning NET, the primary site and the stage. In patients with advanced NETs, ML provides a predictive model that could potentially be used to differentiate prognostic groups and to identify patients for whom SSA therapy as a single agent may not be sufficient to achieve a long-lasting PFS.",Journal Article,Diagnostics (Basel),0.0,Bosnia and Herzegovina,1,0,0,2021.0,2021-05-01,Other,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,74.0,70.0,0.0
32091,32717848.0,Classification of Neurological Patients to Identify Fallers Based on Spatial-Temporal Gait Characteristics Measured by a Wearable Device.,"Neurological patients can have severe gait impairments that contribute to fall risks. Predicting falls from gait abnormalities could aid clinicians and patients mitigate fall risk. The aim of this study was to predict fall status from spatial-temporal gait characteristics measured by a wearable device in a heterogeneous population of neurological patients. Participants (<i>n</i> = 384, age 49-80 s) were recruited from a neurology ward of a University hospital. They walked 20 m at a comfortable speed (single task: ST) and while performing a dual task with a motor component (DT1) and a dual task with a cognitive component (DT2). Twenty-seven spatial-temporal gait variables were measured with wearable sensors placed at the lower back and both ankles. Partial least square discriminant analysis (PLS-DA) was then applied to classify fallers and non-fallers. The PLS-DA classification model performed well for all three gait tasks (ST, DT1, and DT2) with an evaluation of classification performance Area under the receiver operating characteristic Curve (AUC) of 0.7, 0.6 and 0.7, respectively. Fallers differed from non-fallers in their specific gait patterns. Results from this study improve our understanding of how falls risk-related gait impairments in neurological patients could aid the design of tailored fall-prevention interventions.",Journal Article,Sensors (Basel),0.0,Netherlands,0,0,0,2020.0,2020-07-29,Netherlands,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,384.0,,0.0
41322,32166491.0,Artificial intelligence and radiomics enhance the positive predictive value of digital chest tomosynthesis for lung cancer detection within SOS clinical trial.,To enhance the positive predictive value (PPV) of chest digital tomosynthesis (DTS) in the lung cancer detection with the analysis of radiomics features.,Clinical Trial,Eur Radiol,0.0,Germany,0,0,0,2020.0,2020-03-14,Germany,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,0.0


In [39]:
raw_joined = raw_joined.dropna(how='all')

In [40]:
all_ner = ['algo_neural_net', 'algo_support_vector', 'algo_regression', 'algo_decision_tree', 
            'algo_discriminant', 'algo_naive_bayes', 'algo_transfer', 'algo_federated', 'algo_k_nearest',
            'algo_unsupervised',
           #
            'feat_imaging', 'feat_xr', 'feat_ct', 'feat_mri', 'feat_eeg', 'feat_ecg',
            'feat_us', 'feat_echo', 'feat_histo', 'feat_oct', 'feat_mamm', 'feat_endoscop', 'feat_derm',
            'feat_gene', 'feat_bio', 'feat_nlp', 'feat_ehr', 'feat_sensor', 'feat_phone', 
           # 
           'subspec_icu', 'subspec_ed', 'spec_id', 'subspec_sepsis', 'subspec_hiv', 'subspec_cov19', 'subspec_tb',
            'subspec_malaria', 'spec_derm', 'subspec_dermca', 'spec_onc', 'subspec_rx', 'subspec_gynonc', 
            'subspec_lungca', 'subspec_brainca', 'subspec_gica', 'subspec_hepca', 'subspec_prosca',
            'subspec_renalca', 'subspec_haemonc', 'subspec_breast', 'spec_psych','subspec_suicide', 'spec_msk', 
            'subspec_frac', 'spec_rheum', 'spec_gi', 'spec_hep', 'spec_resp', 'subspec_pneum',
            'spec_neuro', 'subspec_epilep', 'subspec_cva', 'subspec_alzh', 'spec_cvs', 'subspec_ihd', 'subspec_hf', 
            'spec_endo', 'subspec_dm', 'spec_eye', 'subspec_retina', 'spec_haem', 'spec_obs', 'spec_renal', 
            'subspec_ackd', 'spec_paeds', 'spec_dent',  'spec_audio', 'spec_pubh', 'subspec_bci',
            'subspec_prosth', 'subspec_arrhyt', 'subspec_assist','subspec_activity']

for x in all_ner:
    raw_joined[x] = raw_joined[x].astype(int)

In [98]:
raw_joined.to_csv('for_export/clean/raw_joined.csv')

## Add comparative flags for all

In [53]:
ner_cats = raw_joined[['algo_neural_net', 'algo_support_vector', 'algo_regression', 'algo_decision_tree', 
            'algo_discriminant', 'algo_naive_bayes', 'algo_transfer', 'algo_federated', 'algo_k_nearest',
            'algo_unsupervised',
           #
            'feat_imaging', 'feat_xr', 'feat_ct', 'feat_mri', 'feat_eeg', 'feat_ecg',
            'feat_us', 'feat_echo', 'feat_histo', 'feat_oct', 'feat_mamm', 'feat_endoscop', 'feat_derm',
            'feat_gene', 'feat_bio', 'feat_nlp', 'feat_ehr', 'feat_sensor', 'feat_phone', 
           # 
           'subspec_icu', 'subspec_ed', 'spec_id', 'subspec_sepsis', 'subspec_hiv', 'subspec_cov19', 'subspec_tb',
            'subspec_malaria', 'spec_derm', 'subspec_dermca', 'spec_onc', 'subspec_rx', 'subspec_gynonc', 
            'subspec_lungca', 'subspec_brainca', 'subspec_gica', 'subspec_hepca', 'subspec_prosca',
            'subspec_renalca', 'subspec_haemonc', 'subspec_breast', 'spec_psych','subspec_suicide', 'spec_msk', 
            'subspec_frac', 'spec_rheum', 'spec_gi', 'spec_hep', 'spec_resp', 'subspec_pneum',
            'spec_neuro', 'subspec_epilep', 'subspec_cva', 'subspec_alzh', 'spec_cvs', 'subspec_ihd', 'subspec_hf', 
            'spec_endo', 'subspec_dm', 'spec_eye', 'subspec_retina', 'spec_haem', 'spec_obs', 'spec_renal', 
            'subspec_ackd', 'spec_paeds', 'spec_dent',  'spec_audio', 'spec_pubh', 'subspec_bci',
            'subspec_prosth', 'subspec_arrhyt', 'subspec_assist','subspec_activity', 
            #
            'mature_flag']].copy()


In [54]:
## subset of mature studies
ner_comparative = ner_cats.loc[ner_cats['mature_flag'] == 1].drop(['mature_flag'], axis=1).astype(int)

In [55]:
## rename columns
ner_comparative = ner_comparative.rename(columns=lambda x: 'mature_' + x)

In [56]:
ner_comparative.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1167 entries, 58 to 161506
Data columns (total 83 columns):
 #   Column                      Non-Null Count  Dtype
---  ------                      --------------  -----
 0   mature_algo_neural_net      1167 non-null   int32
 1   mature_algo_support_vector  1167 non-null   int32
 2   mature_algo_regression      1167 non-null   int32
 3   mature_algo_decision_tree   1167 non-null   int32
 4   mature_algo_discriminant    1167 non-null   int32
 5   mature_algo_naive_bayes     1167 non-null   int32
 6   mature_algo_transfer        1167 non-null   int32
 7   mature_algo_federated       1167 non-null   int32
 8   mature_algo_k_nearest       1167 non-null   int32
 9   mature_algo_unsupervised    1167 non-null   int32
 10  mature_feat_imaging         1167 non-null   int32
 11  mature_feat_xr              1167 non-null   int32
 12  mature_feat_ct              1167 non-null   int32
 13  mature_feat_mri             1167 non-null   int32
 14  mature_fea

## Add oncology flags for all

In [57]:
## subset of oncology studies
ner_oncology = ner_cats.loc[ner_cats['spec_onc'] == 1].drop(['mature_flag', 'spec_onc'], axis=1).astype(int)

In [58]:
## rename columns
ner_oncology = ner_oncology.rename(columns=lambda x: 'onc_' + x)

In [59]:
ner_oncology.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9237 entries, 0 to 161513
Data columns (total 82 columns):
 #   Column                   Non-Null Count  Dtype
---  ------                   --------------  -----
 0   onc_algo_neural_net      9237 non-null   int32
 1   onc_algo_support_vector  9237 non-null   int32
 2   onc_algo_regression      9237 non-null   int32
 3   onc_algo_decision_tree   9237 non-null   int32
 4   onc_algo_discriminant    9237 non-null   int32
 5   onc_algo_naive_bayes     9237 non-null   int32
 6   onc_algo_transfer        9237 non-null   int32
 7   onc_algo_federated       9237 non-null   int32
 8   onc_algo_k_nearest       9237 non-null   int32
 9   onc_algo_unsupervised    9237 non-null   int32
 10  onc_feat_imaging         9237 non-null   int32
 11  onc_feat_xr              9237 non-null   int32
 12  onc_feat_ct              9237 non-null   int32
 13  onc_feat_mri             9237 non-null   int32
 14  onc_feat_eeg             9237 non-null   int32
 15  onc_fea

## Join sub ner dfs 

In [60]:
temp = pd.concat([raw_joined, ner_comparative, ner_oncology], axis=1).astype(int, errors='ignore')

In [61]:
len(temp)

32472

In [62]:
joined = temp.fillna(0).astype(int, errors='ignore')

In [63]:
joined['total'] = 1

In [64]:
joined.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32472 entries, 0 to 161524
Data columns (total 265 columns):
 #    Column                      Dtype         
---   ------                      -----         
 0    pmid                        int32         
 1    title                       object        
 2    abstract                    object        
 3    article_type                object        
 4    journal_short               object        
 5    mature_flag                 int32         
 6    affil_first_country         object        
 7    lmic_author_flag            int32         
 8    lmic_author_lower_flag      int32         
 9    lmic_china_flag             int32         
 10   year                        int32         
 11   date                        datetime64[ns]
 12   affil_country_masked        object        
 13   algo_neural_net             int32         
 14   algo_support_vector         int32         
 15   algo_regression             int32         
 16   a

In [65]:
#temp = joined_by_date.drop(['pmid', 'abstract', 'title', 'article_type', 'journal_short', 'affil_countries_unique', 'affil_first_country',
#                          'countries_lc', 'date', 'affil_country_masked', 'feature_size', 'ptdata_size'], axis=1)

In [66]:
#temp = temp[temp['year'].notna()]

In [67]:
joined_by_year = joined.groupby(['year']).sum().astype(int).drop(['pmid'], axis=1)

In [68]:
joined_by_year

Unnamed: 0_level_0,mature_flag,lmic_author_flag,lmic_author_lower_flag,lmic_china_flag,algo_neural_net,algo_support_vector,algo_regression,algo_decision_tree,algo_discriminant,algo_naive_bayes,algo_transfer,algo_federated,algo_k_nearest,algo_unsupervised,feat_imaging,feat_xr,feat_ct,feat_mri,feat_eeg,feat_ecg,feat_us,feat_echo,feat_histo,feat_oct,feat_mamm,feat_endoscop,feat_derm,feat_gene,feat_bio,feat_nlp,feat_ehr,feat_sensor,feat_phone,subspec_icu,subspec_ed,spec_id,subspec_sepsis,subspec_hiv,subspec_cov19,subspec_tb,subspec_malaria,spec_derm,subspec_dermca,spec_onc,subspec_rx,subspec_gynonc,subspec_lungca,subspec_brainca,subspec_gica,subspec_hepca,subspec_prosca,subspec_renalca,subspec_haemonc,subspec_breast,spec_psych,subspec_suicide,spec_msk,subspec_frac,spec_rheum,spec_gi,spec_hep,spec_resp,subspec_pneum,spec_neuro,subspec_epilep,subspec_cva,subspec_alzh,spec_cvs,subspec_ihd,subspec_hf,spec_endo,subspec_dm,spec_eye,subspec_retina,spec_haem,spec_obs,spec_renal,subspec_ackd,spec_paeds,spec_dent,spec_audio,spec_pubh,subspec_bci,subspec_prosth,subspec_assist,subspec_activity,subspec_arrhyt,ptdata_size,feature_size,comm_flag,mature_algo_neural_net,mature_algo_support_vector,mature_algo_regression,mature_algo_decision_tree,mature_algo_discriminant,mature_algo_naive_bayes,mature_algo_transfer,mature_algo_federated,mature_algo_k_nearest,mature_algo_unsupervised,mature_feat_imaging,mature_feat_xr,mature_feat_ct,mature_feat_mri,mature_feat_eeg,mature_feat_ecg,mature_feat_us,mature_feat_echo,mature_feat_histo,mature_feat_oct,mature_feat_mamm,mature_feat_endoscop,mature_feat_derm,mature_feat_gene,mature_feat_bio,mature_feat_nlp,mature_feat_ehr,mature_feat_sensor,mature_feat_phone,mature_subspec_icu,mature_subspec_ed,mature_spec_id,mature_subspec_sepsis,mature_subspec_hiv,mature_subspec_cov19,mature_subspec_tb,mature_subspec_malaria,mature_spec_derm,mature_subspec_dermca,mature_spec_onc,mature_subspec_rx,mature_subspec_gynonc,mature_subspec_lungca,mature_subspec_brainca,mature_subspec_gica,mature_subspec_hepca,mature_subspec_prosca,mature_subspec_renalca,mature_subspec_haemonc,mature_subspec_breast,mature_spec_psych,mature_subspec_suicide,mature_spec_msk,mature_subspec_frac,mature_spec_rheum,mature_spec_gi,mature_spec_hep,mature_spec_resp,mature_subspec_pneum,mature_spec_neuro,mature_subspec_epilep,mature_subspec_cva,mature_subspec_alzh,mature_spec_cvs,mature_subspec_ihd,mature_subspec_hf,mature_spec_endo,mature_subspec_dm,mature_spec_eye,mature_subspec_retina,mature_spec_haem,mature_spec_obs,mature_spec_renal,mature_subspec_ackd,mature_spec_paeds,mature_spec_dent,mature_spec_audio,mature_spec_pubh,mature_subspec_bci,mature_subspec_prosth,mature_subspec_arrhyt,mature_subspec_assist,mature_subspec_activity,onc_algo_neural_net,onc_algo_support_vector,onc_algo_regression,onc_algo_decision_tree,onc_algo_discriminant,onc_algo_naive_bayes,onc_algo_transfer,onc_algo_federated,onc_algo_k_nearest,onc_algo_unsupervised,onc_feat_imaging,onc_feat_xr,onc_feat_ct,onc_feat_mri,onc_feat_eeg,onc_feat_ecg,onc_feat_us,onc_feat_echo,onc_feat_histo,onc_feat_oct,onc_feat_mamm,onc_feat_endoscop,onc_feat_derm,onc_feat_gene,onc_feat_bio,onc_feat_nlp,onc_feat_ehr,onc_feat_sensor,onc_feat_phone,onc_subspec_icu,onc_subspec_ed,onc_spec_id,onc_subspec_sepsis,onc_subspec_hiv,onc_subspec_cov19,onc_subspec_tb,onc_subspec_malaria,onc_spec_derm,onc_subspec_dermca,onc_subspec_rx,onc_subspec_gynonc,onc_subspec_lungca,onc_subspec_brainca,onc_subspec_gica,onc_subspec_hepca,onc_subspec_prosca,onc_subspec_renalca,onc_subspec_haemonc,onc_subspec_breast,onc_spec_psych,onc_subspec_suicide,onc_spec_msk,onc_subspec_frac,onc_spec_rheum,onc_spec_gi,onc_spec_hep,onc_spec_resp,onc_subspec_pneum,onc_spec_neuro,onc_subspec_epilep,onc_subspec_cva,onc_subspec_alzh,onc_spec_cvs,onc_subspec_ihd,onc_subspec_hf,onc_spec_endo,onc_subspec_dm,onc_spec_eye,onc_subspec_retina,onc_spec_haem,onc_spec_obs,onc_spec_renal,onc_subspec_ackd,onc_spec_paeds,onc_spec_dent,onc_spec_audio,onc_spec_pubh,onc_subspec_bci,onc_subspec_prosth,onc_subspec_arrhyt,onc_subspec_assist,onc_subspec_activity,total
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1,Unnamed: 134_level_1,Unnamed: 135_level_1,Unnamed: 136_level_1,Unnamed: 137_level_1,Unnamed: 138_level_1,Unnamed: 139_level_1,Unnamed: 140_level_1,Unnamed: 141_level_1,Unnamed: 142_level_1,Unnamed: 143_level_1,Unnamed: 144_level_1,Unnamed: 145_level_1,Unnamed: 146_level_1,Unnamed: 147_level_1,Unnamed: 148_level_1,Unnamed: 149_level_1,Unnamed: 150_level_1,Unnamed: 151_level_1,Unnamed: 152_level_1,Unnamed: 153_level_1,Unnamed: 154_level_1,Unnamed: 155_level_1,Unnamed: 156_level_1,Unnamed: 157_level_1,Unnamed: 158_level_1,Unnamed: 159_level_1,Unnamed: 160_level_1,Unnamed: 161_level_1,Unnamed: 162_level_1,Unnamed: 163_level_1,Unnamed: 164_level_1,Unnamed: 165_level_1,Unnamed: 166_level_1,Unnamed: 167_level_1,Unnamed: 168_level_1,Unnamed: 169_level_1,Unnamed: 170_level_1,Unnamed: 171_level_1,Unnamed: 172_level_1,Unnamed: 173_level_1,Unnamed: 174_level_1,Unnamed: 175_level_1,Unnamed: 176_level_1,Unnamed: 177_level_1,Unnamed: 178_level_1,Unnamed: 179_level_1,Unnamed: 180_level_1,Unnamed: 181_level_1,Unnamed: 182_level_1,Unnamed: 183_level_1,Unnamed: 184_level_1,Unnamed: 185_level_1,Unnamed: 186_level_1,Unnamed: 187_level_1,Unnamed: 188_level_1,Unnamed: 189_level_1,Unnamed: 190_level_1,Unnamed: 191_level_1,Unnamed: 192_level_1,Unnamed: 193_level_1,Unnamed: 194_level_1,Unnamed: 195_level_1,Unnamed: 196_level_1,Unnamed: 197_level_1,Unnamed: 198_level_1,Unnamed: 199_level_1,Unnamed: 200_level_1,Unnamed: 201_level_1,Unnamed: 202_level_1,Unnamed: 203_level_1,Unnamed: 204_level_1,Unnamed: 205_level_1,Unnamed: 206_level_1,Unnamed: 207_level_1,Unnamed: 208_level_1,Unnamed: 209_level_1,Unnamed: 210_level_1,Unnamed: 211_level_1,Unnamed: 212_level_1,Unnamed: 213_level_1,Unnamed: 214_level_1,Unnamed: 215_level_1,Unnamed: 216_level_1,Unnamed: 217_level_1,Unnamed: 218_level_1,Unnamed: 219_level_1,Unnamed: 220_level_1,Unnamed: 221_level_1,Unnamed: 222_level_1,Unnamed: 223_level_1,Unnamed: 224_level_1,Unnamed: 225_level_1,Unnamed: 226_level_1,Unnamed: 227_level_1,Unnamed: 228_level_1,Unnamed: 229_level_1,Unnamed: 230_level_1,Unnamed: 231_level_1,Unnamed: 232_level_1,Unnamed: 233_level_1,Unnamed: 234_level_1,Unnamed: 235_level_1,Unnamed: 236_level_1,Unnamed: 237_level_1,Unnamed: 238_level_1,Unnamed: 239_level_1,Unnamed: 240_level_1,Unnamed: 241_level_1,Unnamed: 242_level_1,Unnamed: 243_level_1,Unnamed: 244_level_1,Unnamed: 245_level_1,Unnamed: 246_level_1,Unnamed: 247_level_1,Unnamed: 248_level_1,Unnamed: 249_level_1,Unnamed: 250_level_1,Unnamed: 251_level_1,Unnamed: 252_level_1,Unnamed: 253_level_1,Unnamed: 254_level_1,Unnamed: 255_level_1,Unnamed: 256_level_1
1998,9,0,0,0,108,0,13,4,5,1,0,0,1,1,34,4,3,10,6,11,1,1,20,0,4,3,0,6,6,0,7,0,0,4,0,5,1,1,0,0,0,4,2,42,4,4,4,7,3,2,2,0,3,10,6,2,3,0,2,4,6,10,0,16,4,3,0,17,7,0,3,3,4,0,4,7,5,1,7,1,0,1,0,1,3,0,10,192079,5763,0,8,0,0,0,0,0,0,0,0,0,3,0,0,3,0,0,0,0,3,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,1,0,0,1,1,0,0,0,0,1,0,0,0,0,0,1,0,1,0,2,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,27,0,3,3,1,0,0,0,1,1,11,3,1,7,0,0,0,0,14,0,4,2,0,4,4,0,2,0,0,1,0,0,0,0,0,0,0,2,2,4,4,4,7,3,2,2,0,3,9,0,0,1,0,0,3,2,5,0,7,0,0,0,1,1,0,0,0,0,0,3,4,0,0,2,1,0,1,0,0,0,0,0,137
1999,18,1,1,1,107,0,18,3,9,3,0,0,2,5,45,9,0,5,5,5,9,3,13,0,7,1,3,3,8,2,15,0,0,5,3,11,1,3,0,1,0,6,3,52,4,5,1,3,2,1,10,0,1,16,5,2,2,0,1,7,3,19,4,19,7,2,4,21,11,1,5,3,3,0,1,9,4,1,7,3,1,0,1,0,0,0,6,71767,35889,0,7,0,0,0,0,0,0,0,0,1,11,4,0,0,0,1,2,2,4,0,2,0,0,0,1,1,0,0,0,0,0,4,0,0,0,0,0,0,0,7,0,1,0,0,0,0,2,0,0,4,1,1,0,0,0,1,0,4,3,1,0,0,1,4,3,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,34,0,9,1,3,0,0,0,1,1,12,3,0,2,0,0,5,1,11,0,7,1,3,2,7,0,5,0,0,0,0,2,0,1,0,0,0,3,3,4,5,1,3,2,1,10,0,1,16,0,0,0,0,0,6,1,2,0,4,0,1,0,2,1,0,2,0,0,0,1,5,0,0,0,0,0,0,0,0,0,0,0,157
2000,15,0,0,0,114,1,19,6,15,0,0,0,0,4,33,4,5,3,8,8,3,1,14,0,7,0,1,1,6,1,10,2,0,5,1,7,0,0,0,0,0,10,1,47,1,4,2,1,1,1,8,2,2,18,5,1,6,2,1,7,3,12,2,20,8,0,0,22,10,2,5,4,1,0,3,9,1,1,8,0,2,0,1,3,0,1,8,66259,4850,0,10,0,0,0,0,0,0,0,0,0,3,0,0,0,2,0,1,1,2,0,1,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,5,0,1,0,0,0,0,0,1,0,2,0,0,0,0,0,0,0,2,0,2,3,0,0,2,1,0,1,1,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,34,1,3,1,6,0,0,0,0,2,8,1,1,1,0,0,1,1,13,0,7,0,1,1,4,0,2,0,0,0,0,1,0,0,0,0,0,1,1,1,4,2,1,1,1,8,2,2,17,0,0,0,0,0,2,1,2,0,4,0,0,0,1,0,0,0,0,0,0,2,5,0,0,0,0,1,0,0,0,0,0,0,159
2001,15,1,1,1,126,2,17,9,4,1,0,0,1,2,33,5,3,8,8,13,4,1,12,0,9,0,1,2,3,2,16,0,0,10,1,10,0,0,0,0,0,5,3,44,0,2,0,1,2,3,7,1,3,15,5,1,5,1,1,6,6,9,2,26,9,1,3,27,8,0,7,7,2,0,4,6,3,0,11,3,3,0,1,3,1,0,12,20877,7409,0,11,0,1,1,0,0,0,0,0,0,7,2,1,2,1,1,2,0,6,0,3,0,0,0,0,1,1,0,0,0,1,3,0,0,0,0,0,0,0,7,0,0,0,0,0,1,0,0,1,5,0,0,0,0,0,1,1,2,1,2,1,0,0,1,1,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,1,0,0,29,1,5,3,0,0,0,0,1,0,7,1,0,5,0,0,2,0,9,0,9,0,1,2,1,0,5,0,0,1,0,2,0,0,0,0,0,3,3,0,2,0,1,2,3,7,1,3,14,0,0,0,0,0,4,4,0,0,2,0,0,0,0,0,0,0,0,0,0,3,2,1,0,0,0,0,0,0,0,0,0,0,180
2002,13,0,0,0,136,3,17,7,6,1,0,0,1,6,37,7,5,5,9,7,10,2,16,0,13,0,1,12,12,2,16,0,0,7,5,11,0,4,0,1,0,3,1,63,4,4,5,2,3,2,12,1,4,21,2,1,4,0,3,8,5,27,2,16,8,1,2,22,6,0,3,1,8,0,5,7,1,0,13,2,1,2,0,1,0,0,7,10957,21832,0,8,0,0,0,0,0,0,0,0,0,6,2,1,1,1,1,0,0,2,0,0,0,0,0,0,1,2,0,0,0,1,1,0,0,0,0,0,0,0,4,0,1,1,0,0,0,0,0,1,2,1,1,1,0,1,0,0,5,1,1,2,0,0,1,1,0,0,0,1,0,1,1,0,0,2,1,0,1,0,0,1,0,0,49,2,5,5,1,0,0,0,1,5,12,2,2,2,0,0,2,2,15,0,13,0,1,10,9,1,6,0,0,0,0,3,0,1,0,0,0,1,1,4,4,5,2,3,2,12,1,4,20,0,0,1,0,0,5,2,7,1,3,0,0,0,2,0,0,0,0,0,0,4,4,0,0,1,0,0,1,0,0,0,0,0,187
2003,8,0,0,0,125,9,24,7,5,3,0,0,3,8,40,3,10,7,10,4,12,3,20,0,9,2,2,13,12,0,8,2,0,8,2,11,1,2,0,1,1,6,6,77,1,4,10,2,6,3,9,8,5,21,2,1,0,0,2,13,4,20,2,21,7,0,2,16,5,0,3,3,5,0,7,5,12,4,5,2,1,0,1,3,1,0,5,10917,32657,1,6,0,0,0,0,0,0,0,0,0,3,0,2,1,0,0,0,0,2,0,3,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,4,0,0,1,0,0,0,0,0,0,3,0,0,0,0,0,0,0,2,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,43,6,11,5,3,2,0,0,3,5,17,0,8,3,0,0,5,2,19,0,9,1,2,12,10,0,4,0,0,0,0,3,0,0,0,0,0,5,6,1,4,10,2,6,3,9,8,5,21,0,0,0,0,0,9,3,10,0,5,0,0,0,2,0,0,0,0,1,0,5,5,2,0,1,1,0,0,0,0,0,0,0,187
2004,20,3,0,0,154,22,26,10,10,1,0,0,1,6,60,5,13,12,13,13,17,2,26,0,17,3,3,17,18,2,14,1,0,8,4,24,0,0,0,2,1,9,8,99,4,3,11,5,10,4,15,0,4,28,10,2,6,1,1,15,9,26,2,31,15,4,2,32,12,2,9,6,11,0,5,8,4,1,19,3,3,0,1,2,1,1,11,54505,11431,0,9,0,1,0,2,0,0,0,0,0,8,0,6,2,0,0,3,0,3,0,4,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,11,1,0,3,0,0,0,1,0,0,6,1,0,0,0,0,1,0,5,0,1,1,0,0,0,0,0,0,0,0,0,0,1,2,0,1,0,1,0,0,0,0,0,0,49,11,9,4,5,1,0,0,1,1,26,3,9,8,0,0,6,2,24,0,17,0,3,14,14,0,4,0,0,0,0,8,0,0,0,1,0,8,8,4,3,11,5,10,4,15,0,4,27,0,0,1,0,0,12,5,12,1,7,0,0,0,3,0,1,1,0,2,0,4,4,0,0,4,2,0,0,0,0,0,1,0,261
2005,10,3,0,1,161,34,26,8,16,3,0,0,6,9,79,6,14,18,21,9,22,6,23,2,12,1,6,15,15,2,16,1,0,5,3,17,1,0,0,0,0,18,13,114,3,5,10,6,7,8,7,4,2,32,4,0,2,0,0,13,14,23,3,49,19,7,2,38,12,1,7,5,22,1,3,10,11,2,12,4,1,0,1,4,3,1,11,23614,52094,1,4,0,0,0,0,0,0,0,1,1,3,1,3,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,2,0,0,0,0,0,2,2,4,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,1,0,0,0,1,1,0,1,1,2,1,0,0,0,0,1,0,0,0,0,0,1,0,0,43,19,3,6,5,2,0,0,2,6,31,3,9,10,0,0,7,3,19,0,12,1,6,15,13,0,7,0,0,0,0,5,0,0,0,0,0,15,13,3,5,10,6,6,8,7,4,2,31,0,0,1,0,0,9,9,12,0,6,0,0,0,4,0,0,3,2,1,0,2,5,1,0,0,1,0,0,0,0,0,0,0,318
2006,22,2,0,0,169,32,23,8,26,3,0,0,3,12,118,10,31,27,29,17,19,5,31,1,15,6,2,10,19,3,25,3,0,5,4,23,0,2,0,0,0,9,7,133,5,4,17,8,16,6,12,3,3,40,13,4,10,2,0,26,17,37,3,73,33,4,7,52,18,1,12,6,11,2,7,12,7,0,14,2,3,2,6,3,1,2,16,27250,145353,1,11,0,0,0,0,0,0,0,0,0,9,0,5,4,0,2,0,1,2,0,0,0,1,0,0,1,1,0,0,0,1,0,0,0,0,0,0,1,1,7,0,0,0,1,3,2,0,0,0,2,0,0,0,0,0,3,3,0,0,5,1,0,0,4,2,0,1,1,2,1,0,0,1,0,0,0,0,0,0,0,2,0,0,49,13,10,4,16,1,0,0,2,6,46,2,19,9,0,0,8,2,23,1,15,1,2,9,16,0,7,0,0,0,0,10,0,0,0,0,0,7,7,5,4,17,8,16,6,12,3,3,38,1,0,1,0,0,18,6,17,0,8,0,0,0,4,0,0,4,1,0,0,3,5,2,0,1,1,0,0,0,0,0,0,0,397
2007,18,9,0,1,190,71,37,26,28,8,0,0,13,21,180,10,40,45,55,44,37,10,35,5,28,8,5,22,22,6,32,12,0,11,8,31,1,3,0,1,0,14,6,180,7,13,25,8,11,4,20,3,8,53,18,1,22,8,3,23,12,68,1,101,47,12,16,81,16,4,20,13,21,5,12,23,10,3,30,1,2,2,10,7,5,7,43,21825,235880,2,3,3,0,1,0,0,0,0,0,0,9,2,3,2,0,0,2,1,2,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,9,1,1,2,0,0,0,2,0,1,2,0,0,3,1,0,0,0,5,0,0,0,0,0,2,1,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,47,22,10,9,7,4,0,0,4,9,59,1,23,16,0,0,11,3,31,1,28,0,5,19,16,1,11,0,0,0,0,11,0,1,0,0,0,7,6,7,13,25,8,11,4,18,3,8,51,1,0,3,0,0,12,6,26,0,11,0,0,1,4,0,0,4,1,0,0,9,13,2,0,3,0,0,0,0,0,0,0,0,616


In [69]:
joined_by_year.to_csv('for_export/clean/joined_by_year.csv')

In [70]:
joined.to_csv('for_export/clean/joined_by_date.csv')

## Dot matrix counts

In [71]:
for_transpose = joined.drop(['pmid', 'abstract', 'title', 'article_type', 'journal_short', 'affil_first_country', 
                             'year', 'date', 'affil_country_masked', 
                             #'feature_size', 'ptdata_size'
                            ], axis=1)

In [72]:
#temp = pd.concat([spec_by_year, feat_by_year], axis=1).astype(int)
#for_transpose = pd.concat([temp, subspec_by_year], axis=1).astype(int)
#
#for_transpose['mature_flag'] = joined['mature_flag'].copy()
#for_transpose['lmic_flag'] = joined['lmic_author_flag'].copy()
#for_transpose['lmic_lower_flag'] = joined['lmic_author_lower_flag'].copy()
#for_transpose['lmic_china_flag'] = joined['lmic_china_flag'].copy()

for_transpose.head(50)

Unnamed: 0,mature_flag,lmic_author_flag,lmic_author_lower_flag,lmic_china_flag,algo_neural_net,algo_support_vector,algo_regression,algo_decision_tree,algo_discriminant,algo_naive_bayes,algo_transfer,algo_federated,algo_k_nearest,algo_unsupervised,feat_imaging,feat_xr,feat_ct,feat_mri,feat_eeg,feat_ecg,feat_us,feat_echo,feat_histo,feat_oct,feat_mamm,feat_endoscop,feat_derm,feat_gene,feat_bio,feat_nlp,feat_ehr,feat_sensor,feat_phone,subspec_icu,subspec_ed,spec_id,subspec_sepsis,subspec_hiv,subspec_cov19,subspec_tb,subspec_malaria,spec_derm,subspec_dermca,spec_onc,subspec_rx,subspec_gynonc,subspec_lungca,subspec_brainca,subspec_gica,subspec_hepca,subspec_prosca,subspec_renalca,subspec_haemonc,subspec_breast,spec_psych,subspec_suicide,spec_msk,subspec_frac,spec_rheum,spec_gi,spec_hep,spec_resp,subspec_pneum,spec_neuro,subspec_epilep,subspec_cva,subspec_alzh,spec_cvs,subspec_ihd,subspec_hf,spec_endo,subspec_dm,spec_eye,subspec_retina,spec_haem,spec_obs,spec_renal,subspec_ackd,spec_paeds,spec_dent,spec_audio,spec_pubh,subspec_bci,subspec_prosth,subspec_assist,subspec_activity,subspec_arrhyt,ptdata_size,feature_size,comm_flag,mature_algo_neural_net,mature_algo_support_vector,mature_algo_regression,mature_algo_decision_tree,mature_algo_discriminant,mature_algo_naive_bayes,mature_algo_transfer,mature_algo_federated,mature_algo_k_nearest,mature_algo_unsupervised,mature_feat_imaging,mature_feat_xr,mature_feat_ct,mature_feat_mri,mature_feat_eeg,mature_feat_ecg,mature_feat_us,mature_feat_echo,mature_feat_histo,mature_feat_oct,mature_feat_mamm,mature_feat_endoscop,mature_feat_derm,mature_feat_gene,mature_feat_bio,mature_feat_nlp,mature_feat_ehr,mature_feat_sensor,mature_feat_phone,mature_subspec_icu,mature_subspec_ed,mature_spec_id,mature_subspec_sepsis,mature_subspec_hiv,mature_subspec_cov19,mature_subspec_tb,mature_subspec_malaria,mature_spec_derm,mature_subspec_dermca,mature_spec_onc,mature_subspec_rx,mature_subspec_gynonc,mature_subspec_lungca,mature_subspec_brainca,mature_subspec_gica,mature_subspec_hepca,mature_subspec_prosca,mature_subspec_renalca,mature_subspec_haemonc,mature_subspec_breast,mature_spec_psych,mature_subspec_suicide,mature_spec_msk,mature_subspec_frac,mature_spec_rheum,mature_spec_gi,mature_spec_hep,mature_spec_resp,mature_subspec_pneum,mature_spec_neuro,mature_subspec_epilep,mature_subspec_cva,mature_subspec_alzh,mature_spec_cvs,mature_subspec_ihd,mature_subspec_hf,mature_spec_endo,mature_subspec_dm,mature_spec_eye,mature_subspec_retina,mature_spec_haem,mature_spec_obs,mature_spec_renal,mature_subspec_ackd,mature_spec_paeds,mature_spec_dent,mature_spec_audio,mature_spec_pubh,mature_subspec_bci,mature_subspec_prosth,mature_subspec_arrhyt,mature_subspec_assist,mature_subspec_activity,onc_algo_neural_net,onc_algo_support_vector,onc_algo_regression,onc_algo_decision_tree,onc_algo_discriminant,onc_algo_naive_bayes,onc_algo_transfer,onc_algo_federated,onc_algo_k_nearest,onc_algo_unsupervised,onc_feat_imaging,onc_feat_xr,onc_feat_ct,onc_feat_mri,onc_feat_eeg,onc_feat_ecg,onc_feat_us,onc_feat_echo,onc_feat_histo,onc_feat_oct,onc_feat_mamm,onc_feat_endoscop,onc_feat_derm,onc_feat_gene,onc_feat_bio,onc_feat_nlp,onc_feat_ehr,onc_feat_sensor,onc_feat_phone,onc_subspec_icu,onc_subspec_ed,onc_spec_id,onc_subspec_sepsis,onc_subspec_hiv,onc_subspec_cov19,onc_subspec_tb,onc_subspec_malaria,onc_spec_derm,onc_subspec_dermca,onc_subspec_rx,onc_subspec_gynonc,onc_subspec_lungca,onc_subspec_brainca,onc_subspec_gica,onc_subspec_hepca,onc_subspec_prosca,onc_subspec_renalca,onc_subspec_haemonc,onc_subspec_breast,onc_spec_psych,onc_subspec_suicide,onc_spec_msk,onc_subspec_frac,onc_spec_rheum,onc_spec_gi,onc_spec_hep,onc_spec_resp,onc_subspec_pneum,onc_spec_neuro,onc_subspec_epilep,onc_subspec_cva,onc_subspec_alzh,onc_spec_cvs,onc_subspec_ihd,onc_subspec_hf,onc_spec_endo,onc_subspec_dm,onc_spec_eye,onc_subspec_retina,onc_spec_haem,onc_spec_obs,onc_spec_renal,onc_subspec_ackd,onc_spec_paeds,onc_spec_dent,onc_spec_audio,onc_spec_pubh,onc_subspec_bci,onc_subspec_prosth,onc_subspec_arrhyt,onc_subspec_assist,onc_subspec_activity,total
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,72,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
9,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
15,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
20,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
24,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
37,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
39,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
43,0,1,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2224,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [73]:
for_transpose = for_transpose.astype(int)

In [74]:
transposed = for_transpose.T

In [75]:
transposed.info()

<class 'pandas.core.frame.DataFrame'>
Index: 256 entries, mature_flag to total
Columns: 32472 entries, 0 to 161524
dtypes: int32(32472)
memory usage: 31.7+ MB


In [76]:
matrix = transposed.dot(for_transpose).astype(int)

In [77]:
matrix['total'] = (pd.Series(np.diag(matrix), index=[matrix.index, matrix.columns])).tolist()

In [78]:
matrix

Unnamed: 0,mature_flag,lmic_author_flag,lmic_author_lower_flag,lmic_china_flag,algo_neural_net,algo_support_vector,algo_regression,algo_decision_tree,algo_discriminant,algo_naive_bayes,algo_transfer,algo_federated,algo_k_nearest,algo_unsupervised,feat_imaging,feat_xr,feat_ct,feat_mri,feat_eeg,feat_ecg,feat_us,feat_echo,feat_histo,feat_oct,feat_mamm,feat_endoscop,feat_derm,feat_gene,feat_bio,feat_nlp,feat_ehr,feat_sensor,feat_phone,subspec_icu,subspec_ed,spec_id,subspec_sepsis,subspec_hiv,subspec_cov19,subspec_tb,subspec_malaria,spec_derm,subspec_dermca,spec_onc,subspec_rx,subspec_gynonc,subspec_lungca,subspec_brainca,subspec_gica,subspec_hepca,subspec_prosca,subspec_renalca,subspec_haemonc,subspec_breast,spec_psych,subspec_suicide,spec_msk,subspec_frac,spec_rheum,spec_gi,spec_hep,spec_resp,subspec_pneum,spec_neuro,subspec_epilep,subspec_cva,subspec_alzh,spec_cvs,subspec_ihd,subspec_hf,spec_endo,subspec_dm,spec_eye,subspec_retina,spec_haem,spec_obs,spec_renal,subspec_ackd,spec_paeds,spec_dent,spec_audio,spec_pubh,subspec_bci,subspec_prosth,subspec_assist,subspec_activity,subspec_arrhyt,ptdata_size,feature_size,comm_flag,mature_algo_neural_net,mature_algo_support_vector,mature_algo_regression,mature_algo_decision_tree,mature_algo_discriminant,mature_algo_naive_bayes,mature_algo_transfer,mature_algo_federated,mature_algo_k_nearest,mature_algo_unsupervised,mature_feat_imaging,mature_feat_xr,mature_feat_ct,mature_feat_mri,mature_feat_eeg,mature_feat_ecg,mature_feat_us,mature_feat_echo,mature_feat_histo,mature_feat_oct,mature_feat_mamm,mature_feat_endoscop,mature_feat_derm,mature_feat_gene,mature_feat_bio,mature_feat_nlp,mature_feat_ehr,mature_feat_sensor,mature_feat_phone,mature_subspec_icu,mature_subspec_ed,mature_spec_id,mature_subspec_sepsis,mature_subspec_hiv,mature_subspec_cov19,mature_subspec_tb,mature_subspec_malaria,mature_spec_derm,mature_subspec_dermca,mature_spec_onc,mature_subspec_rx,mature_subspec_gynonc,mature_subspec_lungca,mature_subspec_brainca,mature_subspec_gica,mature_subspec_hepca,mature_subspec_prosca,mature_subspec_renalca,mature_subspec_haemonc,mature_subspec_breast,mature_spec_psych,mature_subspec_suicide,mature_spec_msk,mature_subspec_frac,mature_spec_rheum,mature_spec_gi,mature_spec_hep,mature_spec_resp,mature_subspec_pneum,mature_spec_neuro,mature_subspec_epilep,mature_subspec_cva,mature_subspec_alzh,mature_spec_cvs,mature_subspec_ihd,mature_subspec_hf,mature_spec_endo,mature_subspec_dm,mature_spec_eye,mature_subspec_retina,mature_spec_haem,mature_spec_obs,mature_spec_renal,mature_subspec_ackd,mature_spec_paeds,mature_spec_dent,mature_spec_audio,mature_spec_pubh,mature_subspec_bci,mature_subspec_prosth,mature_subspec_arrhyt,mature_subspec_assist,mature_subspec_activity,onc_algo_neural_net,onc_algo_support_vector,onc_algo_regression,onc_algo_decision_tree,onc_algo_discriminant,onc_algo_naive_bayes,onc_algo_transfer,onc_algo_federated,onc_algo_k_nearest,onc_algo_unsupervised,onc_feat_imaging,onc_feat_xr,onc_feat_ct,onc_feat_mri,onc_feat_eeg,onc_feat_ecg,onc_feat_us,onc_feat_echo,onc_feat_histo,onc_feat_oct,onc_feat_mamm,onc_feat_endoscop,onc_feat_derm,onc_feat_gene,onc_feat_bio,onc_feat_nlp,onc_feat_ehr,onc_feat_sensor,onc_feat_phone,onc_subspec_icu,onc_subspec_ed,onc_spec_id,onc_subspec_sepsis,onc_subspec_hiv,onc_subspec_cov19,onc_subspec_tb,onc_subspec_malaria,onc_spec_derm,onc_subspec_dermca,onc_subspec_rx,onc_subspec_gynonc,onc_subspec_lungca,onc_subspec_brainca,onc_subspec_gica,onc_subspec_hepca,onc_subspec_prosca,onc_subspec_renalca,onc_subspec_haemonc,onc_subspec_breast,onc_spec_psych,onc_subspec_suicide,onc_spec_msk,onc_subspec_frac,onc_spec_rheum,onc_spec_gi,onc_spec_hep,onc_spec_resp,onc_subspec_pneum,onc_spec_neuro,onc_subspec_epilep,onc_subspec_cva,onc_subspec_alzh,onc_spec_cvs,onc_subspec_ihd,onc_subspec_hf,onc_spec_endo,onc_subspec_dm,onc_spec_eye,onc_subspec_retina,onc_spec_haem,onc_spec_obs,onc_spec_renal,onc_subspec_ackd,onc_spec_paeds,onc_spec_dent,onc_spec_audio,onc_spec_pubh,onc_subspec_bci,onc_subspec_prosth,onc_subspec_arrhyt,onc_subspec_assist,onc_subspec_activity,total
mature_flag,1167,208,8,141,687,31,31,24,5,2,24,0,3,4,505,145,202,124,12,26,104,26,150,23,59,58,33,6,25,34,66,6,15,14,35,88,5,4,32,8,1,64,52,478,15,18,80,17,61,14,33,11,11,134,26,7,74,30,4,79,25,175,35,107,17,33,9,101,32,10,75,51,103,43,19,34,22,4,55,24,6,4,0,2,5,1,27,2907911,17402668,217,687,31,31,24,5,2,24,0,3,4,505,145,202,124,12,26,104,26,150,23,59,58,33,6,25,34,66,6,15,14,35,88,5,4,32,8,1,64,52,478,15,18,80,17,61,14,33,11,11,134,26,7,74,30,4,79,25,175,35,107,17,33,9,101,32,10,75,51,103,43,19,34,22,4,55,24,6,4,0,2,27,5,1,269,14,13,13,3,0,8,0,0,1,189,24,95,59,0,0,55,4,122,2,59,44,33,5,13,10,18,0,4,1,1,17,0,0,2,3,0,56,52,15,18,80,17,61,14,33,11,11,127,1,0,7,2,1,73,15,84,4,20,0,2,0,10,3,1,18,3,8,1,11,19,4,0,3,5,0,2,0,0,0,4,0,1167
lmic_author_flag,208,8666,497,5145,3650,1394,477,739,147,87,298,6,199,150,3051,377,1005,1055,483,285,376,59,576,121,124,124,54,611,846,46,547,162,68,159,45,1038,62,25,509,105,29,189,115,3007,177,168,456,338,359,256,127,97,108,555,638,175,198,38,33,471,398,1049,247,1766,450,213,313,882,239,71,547,375,409,138,180,314,237,80,416,86,39,107,69,45,45,65,303,42793801,12266541,1088,143,4,2,3,0,0,6,0,1,0,87,29,41,13,0,2,21,2,24,5,4,15,2,0,3,2,8,1,8,3,1,21,1,2,14,3,1,5,5,93,4,4,23,0,16,2,6,1,1,18,1,0,14,8,0,21,3,42,13,6,0,3,0,10,4,1,26,17,30,12,4,7,1,0,11,6,0,1,0,0,2,3,0,1279,426,171,230,42,23,102,0,47,57,1057,42,485,483,3,4,167,3,481,7,124,74,52,475,408,14,164,7,6,6,0,154,4,2,18,6,1,122,115,177,167,449,338,354,255,122,97,106,532,15,2,51,4,3,379,274,466,10,379,3,17,8,40,8,1,138,22,13,0,115,184,77,10,60,12,2,6,0,0,4,9,0,8666
lmic_author_lower_flag,8,497,497,107,212,79,27,46,9,7,17,0,15,13,162,21,45,61,40,17,11,5,14,3,3,2,3,30,30,4,29,9,3,8,4,77,5,3,43,5,1,12,5,129,16,5,10,22,15,5,8,3,11,32,37,7,18,1,3,20,8,53,18,107,34,6,21,64,22,1,32,27,13,5,16,14,17,6,28,4,2,8,4,3,4,4,19,313274,455741,20,6,0,1,0,0,0,1,0,0,0,5,0,2,2,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,2,0,0,2,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,2,0,0,0,0,2,2,0,0,0,0,1,0,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,54,14,7,11,1,3,4,0,2,6,41,0,10,30,0,0,2,0,12,0,3,1,3,24,15,1,7,0,1,0,0,9,0,0,1,0,0,7,5,16,5,10,22,15,5,8,3,9,30,1,0,4,0,0,17,6,13,0,25,0,0,2,1,0,0,5,1,0,0,9,6,4,1,2,0,1,0,0,0,0,1,0,497
lmic_china_flag,141,5145,107,5145,2137,745,329,382,62,28,139,4,55,78,1887,136,706,731,240,146,254,34,341,76,61,96,19,428,559,22,313,73,24,93,25,483,35,9,205,30,4,81,55,2077,135,113,349,214,292,219,81,73,59,323,342,101,127,19,16,359,317,630,102,1001,220,134,185,462,128,38,290,171,228,58,91,190,151,49,219,34,24,59,44,19,24,24,151,39386585,5089753,753,98,4,0,2,0,0,4,0,1,0,63,17,34,9,0,1,16,1,17,2,3,12,1,0,3,1,5,0,4,0,0,8,0,0,7,0,0,4,4,70,3,3,19,0,15,2,1,1,0,13,0,0,11,5,0,18,3,30,7,4,0,3,0,6,4,0,15,7,18,4,3,4,1,0,7,3,0,1,0,0,1,2,0,859,250,142,148,21,10,53,0,15,30,788,26,386,356,2,3,134,2,291,4,61,58,19,344,301,7,102,2,1,5,0,91,3,0,10,1,0,56,55,135,113,349,214,287,218,79,73,59,309,10,0,43,3,2,304,228,357,2,231,2,11,4,17,4,0,84,8,10,0,64,121,52,5,41,7,1,2,0,0,3,5,0,5145
algo_neural_net,687,3650,212,2137,12740,937,723,610,193,75,561,7,189,159,4771,851,1583,1336,725,546,677,98,958,278,283,277,155,337,698,72,784,353,84,282,116,1088,50,28,495,93,25,355,243,3851,343,152,544,347,401,258,281,95,109,840,561,121,417,109,43,644,446,1428,291,2298,663,339,389,1371,324,70,658,467,764,209,201,359,244,61,570,175,65,89,84,116,78,127,540,57443935,41382167,1168,687,5,17,5,1,1,23,0,0,1,345,111,133,82,8,18,57,18,89,15,24,28,24,4,15,5,29,3,7,4,18,43,1,2,20,6,1,42,35,269,11,13,56,8,37,9,16,7,6,61,9,1,59,25,4,48,17,110,16,59,9,21,2,63,18,5,39,24,66,23,10,23,11,3,28,20,2,4,0,2,17,2,0,3851,260,192,167,58,20,193,2,61,56,1438,117,667,529,2,7,243,18,800,12,283,159,151,242,302,12,211,5,10,10,3,165,1,4,18,9,2,253,243,343,151,540,347,394,256,274,95,108,804,14,5,52,4,3,455,280,577,9,402,5,21,8,74,14,2,165,32,29,1,113,170,55,6,51,18,4,10,0,0,7,5,1,12740
algo_support_vector,31,1394,79,745,937,4362,516,810,265,136,59,0,342,112,1706,82,239,684,455,182,182,23,283,39,78,43,27,274,492,23,281,198,35,75,21,303,17,16,58,22,5,93,56,1130,55,72,151,131,110,72,81,22,50,276,498,95,97,23,11,177,139,372,25,1432,415,113,298,433,102,37,181,139,113,37,89,167,63,26,246,26,17,39,72,48,39,105,178,5200106,1281463,163,5,31,3,4,2,0,1,0,1,0,16,2,4,8,0,0,3,0,6,3,0,0,1,0,1,0,0,1,0,0,0,1,0,0,0,0,0,2,2,14,0,1,3,1,0,0,0,1,0,3,1,0,2,0,0,1,0,5,0,9,0,0,4,1,0,0,1,0,2,0,1,1,1,0,2,0,0,0,0,0,0,0,0,260,1130,120,209,63,38,20,0,86,43,379,14,115,154,2,0,71,5,241,4,78,20,26,194,171,4,93,5,0,2,1,63,1,1,3,1,0,58,55,55,72,149,131,110,72,81,22,50,269,5,3,12,0,0,129,81,163,4,145,3,3,4,17,4,2,34,8,0,0,54,78,21,2,30,2,1,5,0,0,0,1,0,4362
algo_regression,31,477,27,329,723,516,1922,573,76,50,8,1,85,44,391,29,86,180,64,33,52,17,119,6,20,8,4,153,248,15,306,44,10,96,51,179,21,13,43,9,3,36,11,488,32,30,64,41,53,39,47,16,20,107,180,63,60,19,21,85,82,168,18,378,59,68,75,269,99,45,135,112,30,8,53,89,91,39,127,21,9,31,8,9,4,14,39,18848872,2254395,152,17,3,31,1,1,0,0,0,0,0,15,5,6,6,0,1,4,0,3,0,4,0,0,0,1,0,3,0,0,2,1,4,0,0,2,0,0,0,0,13,0,0,2,2,0,0,0,0,2,8,2,0,0,0,0,0,1,7,3,7,0,0,0,5,2,1,1,0,0,0,2,0,0,0,1,0,1,0,0,0,1,0,0,192,120,488,112,20,15,3,0,22,15,120,6,35,61,1,1,22,1,98,0,20,5,3,100,106,2,53,0,0,3,3,26,1,0,4,0,0,11,11,32,30,64,41,53,39,45,16,18,103,4,1,8,1,0,61,41,68,2,48,2,4,3,19,4,2,23,8,3,0,20,33,13,3,9,2,1,4,0,0,1,0,0,1922
algo_decision_tree,24,739,46,382,610,810,573,2440,116,83,14,0,211,53,558,49,144,207,95,70,83,11,142,30,38,14,9,215,316,19,309,107,19,95,35,270,23,16,78,14,2,48,24,639,33,35,80,53,70,54,48,22,44,144,253,63,83,10,21,113,99,241,25,504,95,86,114,294,98,35,155,117,69,17,77,106,72,27,181,19,12,36,8,14,11,48,80,19547220,3923974,214,5,4,1,24,0,0,0,0,1,0,7,1,4,1,0,1,3,0,7,0,1,1,0,1,2,2,0,1,0,0,1,2,0,0,0,0,0,0,0,13,1,2,0,0,1,0,4,1,1,2,0,0,1,0,0,2,0,2,2,2,0,0,0,1,1,0,0,0,1,0,1,2,2,0,1,0,0,0,0,0,1,1,0,167,209,112,639,30,23,3,0,59,16,169,9,74,62,1,1,30,0,123,1,38,5,9,130,142,2,63,2,0,3,0,50,0,1,3,1,0,22,24,33,35,79,53,70,54,46,22,43,142,4,0,19,0,0,78,61,88,1,69,1,7,5,16,3,2,37,12,2,0,46,39,22,4,16,5,3,5,0,0,1,1,0,2440
algo_discriminant,5,147,9,62,193,265,76,116,600,23,6,0,77,31,260,8,25,61,100,17,16,3,45,1,18,4,4,18,49,1,31,26,2,8,2,46,0,3,6,3,1,18,10,172,9,17,20,27,10,3,8,3,8,48,50,9,16,1,3,19,7,56,3,191,81,6,28,39,8,1,17,11,14,3,16,31,10,1,25,3,0,1,25,25,9,8,17,141321,39250,14,1,2,1,0,5,0,0,0,0,0,2,0,0,1,0,0,2,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,3,1,0,0,0,0,0,0,0,0,2,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,58,63,20,30,172,5,0,0,15,12,70,2,14,25,0,0,7,1,42,0,18,3,4,15,20,0,8,0,0,1,0,16,0,1,0,1,0,11,10,9,17,20,27,10,3,8,3,8,48,1,0,3,0,0,13,4,25,0,28,0,0,0,1,0,0,5,0,0,0,8,18,3,0,1,1,0,1,0,1,0,0,0,600
algo_naive_bayes,2,87,7,28,75,136,50,83,23,263,4,0,44,9,71,14,21,19,17,3,8,0,12,0,5,1,4,19,18,7,35,11,3,9,2,26,2,0,8,0,3,6,4,81,6,6,9,6,6,6,1,3,6,21,19,5,8,4,1,7,8,22,3,54,16,8,12,29,8,4,9,8,7,3,8,7,8,2,7,1,2,7,1,4,6,5,3,242075,471501,12,1,0,0,0,0,2,0,0,0,0,2,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,20,38,15,23,5,81,0,0,16,4,18,1,10,6,0,0,5,0,11,0,5,1,4,14,6,1,12,1,0,0,0,7,0,0,0,0,0,5,4,6,6,9,6,6,6,1,3,6,20,0,0,1,0,0,6,7,9,0,7,0,0,0,2,1,0,2,1,0,0,6,6,4,0,0,0,0,1,0,0,0,0,0,263


In [79]:
matrix.to_csv('for_export/clean/matrix.csv')