In [75]:
import pickle
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# to create deep copies of objects
import copy

In [2]:
### Read datasets

# read labels.pkl
with open("source_datasets/labels.pkl", "rb") as f:
    labels_dataset = pickle.load(f)
    
# read terms.pkl
with open("source_datasets/terms.pkl", "rb") as f:
    terms_dataset = pickle.load(f)

# read texts.pkl
with open("source_datasets/texts.pkl", "rb") as f:
    texts_dataset = pickle.load(f)

In [3]:
# get the size of dataset
len(labels_dataset)

11987

In [4]:
# make sure consistent sizes for other dataset parts

assert len(labels_dataset) == len(terms_dataset)
assert len(labels_dataset) == len(texts_dataset)

In [5]:
# show a random sample to get a feeling of data
print(terms_dataset[100])
print(labels_dataset[100])
print(texts_dataset[100])

Solithromycin
antibiotic
["Solithromycin (trade name Solithera) is a ketolide antibiotic undergoing clinical development for the treatment of community-acquired pneumonia and other infections.Solithromycin exhibits excellent in vitro activity against a broad spectrum of Gram-positive respiratory tract pathogens, including macrolide-resistant strains.  Solithromycin has activity against most common respiratory Gram-positive and fastidious Gram-negative pathogens, and is being evaluated for its utility in treating gonorrhea.\n\n\n== Pre-clinical studies ==\nAn in vivo pre-clinical study performed by Jeffrey Keelan done in sheep may provide a prophylactic approach for intrauterine infections during pregnancy. This study was carried out by administering solithromycin to pregnant sheep, resulting in effective concentrations greater than 30 ng/ml in the fetal plasma, maternal plasma and amniotic fluid. A single maternal dose maintained these concentrations for over 12 hours.\n\n\n== Clinical

## Exploratory analysis and Dataset Preparation

In [6]:
# check elements type in labels_dataset
for id_index in range(len(labels_dataset)):
    if not isinstance(labels_dataset[id_index], str):
        print("In labels_dataset elements type is not consistent (not always str): ", id_index)

# check elements type in terms_dataset 
for id_index in range(len(terms_dataset)):
    if not isinstance(terms_dataset[id_index], str):
        print("In terms_dataset elements type is not consistent (not always str): ", id_index)
        
# check elements type in texts_dataset 
for id_index in range(len(texts_dataset)):
    if not isinstance(texts_dataset[id_index], list):
        print("In texts_dataset elements type is not consistent (not always list): ", id_index)

In [7]:
# problem - text_dataset is of form list of lists

# check that there is just one string in one sample
for id_index in range(len(texts_dataset)):
    
    if len(texts_dataset[id_index]) != 1:
        print("There are more than 1 element in texts_dataset, index: ", id_index)
        
# indeed each sample in texts_dataset has just one element - string(text)

In [8]:
# transform text_dataset from list of lists(of size 1) --> list of strings
texts_dataset = [texts_dataset[id_index][0] for id_index in range(len(texts_dataset))]

In [9]:
### lower case samples in every source dataset (e.g. easier for duplicates detection)

texts_dataset = [texts_dataset[id_index].lower() for id_index in range(len(texts_dataset))]
terms_dataset = [terms_dataset[id_index].lower() for id_index in range(len(terms_dataset))]
labels_dataset = [labels_dataset[id_index].lower() for id_index in range(len(labels_dataset))]

In [10]:
### Create pandas DF for data exploration

# create a dict for df
tmp_df = {
            'term':terms_dataset,
            'label':labels_dataset,
            'text':texts_dataset
           }


# create DataFrame
general_dataset_df = pd.DataFrame(tmp_df)

In [11]:
# check the shape of the dataset
general_dataset_df.shape

(11987, 3)

In [12]:
# check that dimensions align with source dataset
assert general_dataset_df.shape[0] == len(labels_dataset)

In [13]:
# show 10 data samples
general_dataset_df.head(10)

Unnamed: 0,term,label,text
0,celesticetin,antibiotic,no results
1,aminocoumarin resistant gyrb,antibiotic,"dna gyrase, or simply gyrase, is an enzyme wit..."
2,bpi,antibiotic,disambiguationerror
3,axyxy-oprz,antibiotic,no results
4,ant(2''),antibiotic,ant-man and the wasp: quantumania is an upcomi...
5,netilmicin,antibiotic,netilmicin is a member of the aminoglycoside f...
6,carbapenem,antibiotic,carbapenems are a class of highly effective an...
7,g418,antibiotic,g418 (geneticin) is an aminoglycoside antibiot...
8,daptomycin resistant liar,antibiotic,no results
9,tobramycin,antibiotic,tobramycin is an aminoglycoside antibiotic der...


**Observation:** already some incorrect data points are displayed (e.g. "No Results", "DisambiguationError") - have to be removed

### Check for duplicates with DF

In [14]:
# count distinct observations row-wise
general_dataset_df.nunique(axis=0)

term     11229
label        6
text      8481
dtype: int64

In [15]:
# num of distinct values in a column ('term')
general_dataset_df['term'].nunique()

11229

In [16]:
# num of distinct values in a column ('text')
general_dataset_df['text'].nunique()

8481

In [17]:
# Show rows that are duplicates (Mark duplicates as True except for the first occurrence.)
general_dataset_df[general_dataset_df.duplicated()]

Unnamed: 0,term,label,text
102,antibiotic sensitive peptidyl transferase (23s...,antibiotic,a transfer rna (abbreviated trna and formerly ...
119,antibiotic sensitive dihydropteroate synthase,antibiotic,trimethoprim (tmp) is an antibiotic used mainl...
120,cfr 23s ribosomal rna methyltransferase,antibiotic,methyltransferases are a large group of enzyme...
131,abc-f atp-binding cassette ribosomal protectio...,antibiotic,no results
132,abc-f atp-binding cassette ribosomal protectio...,antibiotic,no results
...,...,...,...
9482,windowbase,non_science,windowbase was a database management system cr...
10081,st. moritz library,non_science,bibliothek st. moritz (in english st. moritz l...
11384,hyalospectra grisea,non_science,hyalospectra grisea is a moth in the family dr...
11486,carum,non_science,carum is a genus of about 20 species of flower...


**Observation:** There are duplicates in the dataset. Need to remove them from dataset to make sure that there are only unique elements in train-val-test for ML model.

### Remove duplicates in the source datasets

In [18]:
print("Num of duplicates in terms_dataset: ", len(terms_dataset)-len(set(terms_dataset)))

Num of duplicates in terms_dataset:  758


In [19]:
### Create complement data structure for processed dataset

# key - term, values = {'label', 'text'}
general_dataset_proc = dict()

In [20]:
### Remove duplicates in terms_dataset

# keep track of num of duplicates in terms_dataset
COUNT_TERM_DUPLICATES = 0

# keep only uniques terms in general_dataset_proc
for id_index in range(len(terms_dataset)):
    
    curr_term = terms_dataset[id_index]
    
    if curr_term not in general_dataset_proc: 
        general_dataset_proc[curr_term] = {
            'label': labels_dataset[id_index],
            'text': texts_dataset[id_index]
        }
    else:
        # case when curr_term is a duplicate (there is already existing term)
        COUNT_TERM_DUPLICATES += 1

print("Num.of duplicates in terms_dataset: ", COUNT_TERM_DUPLICATES)

Num.of duplicates in terms_dataset:  758


In [21]:
### Remove duplicates in texts_dataset

# create deep copy, remove elements directly in general_dataset_proc
general_dataset_proc_copy = copy.deepcopy(general_dataset_proc)

# keep track of duplicates in texts
COUNT_TEXT_DUPLICATES = 0

# save unique texts to a set 
text_unique = set()

# remove duplicates in texts
for term_key in general_dataset_proc_copy:
    
    curr_text = general_dataset_proc[term_key]['text']
    
    if curr_text not in text_unique: 
        text_unique.add(curr_text)
    else:
        # case when curr_text is a duplicate (there is the same text already in the dataset)
        COUNT_TEXT_DUPLICATES += 1
        
        # remove entry if text is duplicate
        del general_dataset_proc[term_key]
        
print("Num. of duplicates among texts: ", COUNT_TEXT_DUPLICATES)

Num. of duplicates among texts:  2749


In [22]:
# print number of sample in dataset after duplicates were removed
len(general_dataset_proc)

8480

### Remove outliers with Z-Score method

Get the length of each text. Remove samples with Text length outside of 3 standard deviations.

In [23]:
# get the length of each text and keep in one array
text_lengths = np.array([len(general_dataset_proc[term_key]['text']) for term_key in general_dataset_proc])

In [24]:
# lengths of text that are not outliers
valid_text_lengths = text_lengths[abs(text_lengths - np.mean(text_lengths)) < 3 * np.std(text_lengths)]

print("The size of valid_text_lengths: ", len(valid_text_lengths))

The size of valid_text_lengths:  8276


In [25]:
# text lengths that are outliers
outliers_text_lengths = text_lengths[abs(text_lengths - np.mean(text_lengths)) > 3 * np.std(text_lengths)]

print("The size of outliers_text_lengths: ", len(outliers_text_lengths))
print("The min size is ", min(outliers_text_lengths))

The size of outliers_text_lengths:  204
The min size is  35851


In [26]:
### Remove outliers

# create a deep copy, modify general_dataset_proc directly
general_dataset_proc_copy = copy.deepcopy(general_dataset_proc)

# keep samples with valid text length
for term_key in general_dataset_proc_copy:
    
    curr_text = general_dataset_proc[term_key]['text']
    
    if len(curr_text) in outliers_text_lengths:
        del general_dataset_proc[term_key]

In [27]:
### Remove samples with empty text (same procedure as with ourliers removal)

general_dataset_proc_copy = copy.deepcopy(general_dataset_proc)

for term_key in general_dataset_proc_copy:
    
    curr_text = general_dataset_proc[term_key]['text']
    
    if len(curr_text) < 1:
        del general_dataset_proc[term_key]

In [28]:
### Remove certain samples after manually checking the datasetc (same procedure as with ourliers removal)

general_dataset_proc_copy = copy.deepcopy(general_dataset_proc)

for term_key in general_dataset_proc_copy:
    
    curr_text = general_dataset_proc[term_key]['text']
    
    if curr_text in ['no results', 'disambiguationerror', 'disambiguation error', '\n== references ==']:
        del general_dataset_proc[term_key]

In [29]:

print("The size of the dataset after outlier removal step: ", len(general_dataset_proc))

The size of the dataset after outlier removal step:  8272


### Display a processed dataset in DF format

In [30]:
### Create pandas DF for data exploration

terms_list = []
labels_list = []
texts_list = []

for term_key in general_dataset_proc:
    terms_list.append(term_key)
    labels_list.append(general_dataset_proc[term_key]['label'])
    texts_list.append(general_dataset_proc[term_key]['text'])

# create a dict for df
tmp_df = {
            'term':terms_list,
            'label':labels_list,
            'text':texts_list
           }


# create DataFrame
general_dataset_df = pd.DataFrame(tmp_df)

In [31]:
general_dataset_df.shape

(8272, 3)

In [32]:
general_dataset_df.head(10)

Unnamed: 0,term,label,text
0,aminocoumarin resistant gyrb,antibiotic,"dna gyrase, or simply gyrase, is an enzyme wit..."
1,ant(2''),antibiotic,ant-man and the wasp: quantumania is an upcomi...
2,netilmicin,antibiotic,netilmicin is a member of the aminoglycoside f...
3,carbapenem,antibiotic,carbapenems are a class of highly effective an...
4,g418,antibiotic,g418 (geneticin) is an aminoglycoside antibiot...
5,tobramycin,antibiotic,tobramycin is an aminoglycoside antibiotic der...
6,azidamfenicol,antibiotic,"azidamfenicol is an amphenicol antibiotic, whi..."
7,fluoroquinolone resistant parc,antibiotic,a list of antibiotic resistant bacteria is pro...
8,pikromycin,antibiotic,pikromycin was studied by brokmann and hekel i...
9,kanamycin a,antibiotic,"kanamycin a, often referred to simply as kanam..."


In [33]:
# Count distinct observations row-wise
general_dataset_df.nunique(axis=0)

term     8272
label       6
text     8272
dtype: int64

In [34]:
# Show rows that are duplicates (Mark duplicates as True except for the first occurrence.)
general_dataset_df[general_dataset_df.duplicated()]

Unnamed: 0,term,label,text


**Observations**: No Duplicates Present in DF

In [35]:
### Check for class imbalance
general_dataset_df['label'].value_counts(sort=False)

non_science    5908
protein         659
cell            514
reagent         437
drug            470
antibiotic      284
Name: label, dtype: int64

In [38]:
print("Percentage of *non_science* labels: ", 5908 / len(general_dataset_df))
print("Percentage of *protein* labels: ", 659 / len(general_dataset_df))
print("Percentage of *cell* labels: ", 514 / len(general_dataset_df))
print("Percentage of *reagent* labels: ", 437 / len(general_dataset_df))
print("Percentage of *drug* labels: ", 470 / len(general_dataset_df))
print("Percentage of *antibiotic* labels: ", 284 / len(general_dataset_df))

Percentage of *non_science* labels:  0.7142166344294004
Percentage of *protein* labels:  0.07966634429400386
Percentage of *cell* labels:  0.06213733075435203
Percentage of *reagent* labels:  0.05282882011605416
Percentage of *drug* labels:  0.056818181818181816
Percentage of *antibiotic* labels:  0.03433268858800774


In [39]:
print("*non_science* labels: ", 5908 / len(general_dataset_df), " vc *science* labels: ", 
      (len(general_dataset_df)-5908) / len(general_dataset_df))

*non_science* labels:  0.7142166344294004  vc *science* labels:  0.28578336557059963


**Observation**: Clear Labels Imbalance in the dataset. "science" labels are underrepresented compared to "non-science", especially class "antibiotic" is underrepresented. Suggested solution to mitigate this issue is to oversample the underrepresented class or undersample the overrepresented class.

In [61]:
### Fixing Labels Imbalance - Problem

general_dataset_df['label'].value_counts(sort=False)

non_science    5908
protein         659
cell            514
reagent         437
drug            470
antibiotic      284
Name: label, dtype: int64

### Undersampling overrepresented class

In [62]:
non_science_df = general_dataset_df[general_dataset_df['label'] == 'non_science']

In [63]:
# random 50% undersampling of the DataFrame without replacement
non_science_df = non_science_df.sample(frac=0.5, replace=False, random_state=1)

In [64]:
non_science_df['label'].value_counts(sort=False)

non_science    2954
Name: label, dtype: int64

### Oversampling underrepresented class

In [65]:
antibiotic_df = general_dataset_df[general_dataset_df['label'] == 'antibiotic']

In [66]:
antibiotic_df = antibiotic_df.sample(frac=2, replace=True, random_state=1)

In [67]:
antibiotic_df['label'].value_counts(sort=False)

antibiotic    568
Name: label, dtype: int64

### Concatanate together

In [68]:
bal_general_dataset_df = pd.concat([non_science_df, 
                                   antibiotic_df,
                                   general_dataset_df[general_dataset_df['label'] == 'drug'],
                                   general_dataset_df[general_dataset_df['label'] == 'reagent'], 
                                   general_dataset_df[general_dataset_df['label'] == 'cell'], 
                                   general_dataset_df[general_dataset_df['label'] == 'protein']
                                   ])

In [69]:
bal_general_dataset_df['label'].value_counts(sort=False)

non_science    2954
protein         659
cell            514
reagent         437
drug            470
antibiotic      568
Name: label, dtype: int64

In [70]:
print("Percentage of *non_science* labels: ", 2954 / len(bal_general_dataset_df))
print("Percentage of *protein* labels: ", 659 / len(bal_general_dataset_df))
print("Percentage of *cell* labels: ", 514 / len(bal_general_dataset_df))
print("Percentage of *reagent* labels: ", 437 / len(bal_general_dataset_df))
print("Percentage of *drug* labels: ", 470 / len(bal_general_dataset_df))
print("Percentage of *antibiotic* labels: ", 568 / len(bal_general_dataset_df))

Percentage of *non_science* labels:  0.5273116744019993
Percentage of *protein* labels:  0.11763655837201
Percentage of *cell* labels:  0.0917529453766512
Percentage of *reagent* labels:  0.07800785433773652
Percentage of *drug* labels:  0.08389860764012852
Percentage of *antibiotic* labels:  0.10139235987147448


In [71]:
print("*non_science* labels: ", 2954 / len(bal_general_dataset_df), " vc *science* labels: ", 
      (len(bal_general_dataset_df)-2954) / len(bal_general_dataset_df))

*non_science* labels:  0.5273116744019993  vc *science* labels:  0.4726883255980007


**Observation**: 50% of samples belonging to class "non_science" were removed. This has a shortcoming of data loss. Still after performing undersampling and oversampling the label distributions is not ideal (50% - "non_science").

**Future Improvement**: To perfrom well in current multi-classification task it might be better to laverage 2 classifiers, 1st - to separate "non_science" labels from "science", 2nd - if the label is "science", classify text further into scientific labels (num. 5) 

In [72]:
# shuffle rows in the dataset multiple times
bal_general_dataset_df = bal_general_dataset_df.sample(frac=1).reset_index(drop=True)
bal_general_dataset_df = bal_general_dataset_df.sample(frac=1).reset_index(drop=True)

bal_general_dataset_df.head()

Unnamed: 0,term,label,text
0,pe(18:1/0:0),reagent,windows preinstallation environment (also know...
1,fibrinogen / thrombin topical powder,drug,"thrombin (ec 3.4.21.5, fibrinogenase, thrombas..."
2,pancreatic goblet cell,cell,neuroendocrine tumors (nets) are neoplasms tha...
3,farjad nabi,non_science,farjad nabi (urdu: فرجاد نبی‎) is a pakistani ...
4,lance rocks,non_science,the lance rocks (82°52′s 48°19′w) are two rock...


In [73]:
bal_general_dataset_df['label'].value_counts(sort=False)

non_science    2954
protein         659
cell            514
reagent         437
drug            470
antibiotic      568
Name: label, dtype: int64

In [79]:
len(bal_general_dataset_df)

5602

In [80]:
602 / 5602

0.10746162084969654

In [122]:
# split to train (+val) and test (~10%) datasets
train_df, test_df = train_test_split(bal_general_dataset_df, test_size=602)

In [123]:
train_df['label'].value_counts(sort=False)

reagent         385
non_science    2665
protein         569
cell            449
drug            417
antibiotic      515
Name: label, dtype: int64

In [124]:
test_df['label'].value_counts(sort=False)

cell            65
non_science    289
protein         90
antibiotic      53
drug            53
reagent         52
Name: label, dtype: int64

In [125]:
# save processed balanced dataset to file
bal_general_dataset_df.to_csv('balanced_general_dataset.csv', index=False)

In [127]:
# save train test datasets
train_df.to_csv('training_dataset.csv', index=False)
test_df.to_csv('test_dataset.csv', index=False)