# Adalet Veyis Turgut
# Ufuk Arslan
# Zuhal Didem Aytaç

In [None]:
import string
from typing import List
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import multilabel_confusion_matrix, label_ranking_average_precision_score
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# case-fold, punctuation removal, stop-word removal
def preprocess_text(text: str):
    text = str(text)
    text = text.casefold()
    text = " ".join(text.split())
    text = text.translate(str.maketrans('', '', string.punctuation))
    return text

def clean_and_tokenize(text: str, stop_words: List[str]):
    tokens = [word for word in text.split() if word not in stop_words]
    return tokens

# Lemmatization is converting the word to its base form or lemma by removing affixes from the inflected words.
def lemmatization(text: str):
    result = []
    wordnet = WordNetLemmatizer()
    for token, tag in pos_tag(text):
        pos = tag[0].lower()
        if pos not in ['a', 'r', 'n', 'v']:
            pos = 'n'
        result.append(wordnet.lemmatize(token, pos))
    return result

# Stemming also reduces the words to their root forms but unlike lemmatization, the stem itself may not a valid word
def stemming(text):
    porter = nltk.PorterStemmer()
    return " ".join([porter.stem(word) for word in text.split()])


def find_frequent_words(col):
    words = []
    for word_list in col.values:
        words.extend(word_list)
    fdist = nltk.FreqDist(words)
    res = fdist.most_common(10)
    return [word[0] for word in res]


def remove_frequent_words(tokens: List[str], frequent_words: List[str]):
    tokens = [word for word in tokens if word not in frequent_words]
    return tokens

In [None]:
# obtain train & test datasets
!gdown --id 1ddjOCKOpVNe55Otrz4p1lWQMU7fPCbO4
!gdown --id 1P9aEeivdMockjdGfH4dwhJoFQgLS4JhY 

# download stopwords to the system
!python -m nltk.downloader stopwords

df_dev = pd.read_csv('BC7-LitCovid-Dev.csv', index_col='pmid')
df_train = pd.read_csv('BC7-LitCovid-Train.csv', index_col='pmid')

# df_train.describe(include='all')
df_dev.describe(include = 'all')

Downloading...
From: https://drive.google.com/uc?id=1ddjOCKOpVNe55Otrz4p1lWQMU7fPCbO4
To: /content/BC7-LitCovid-Dev.csv
100% 11.1M/11.1M [00:00<00:00, 41.8MB/s]
Downloading...
From: https://drive.google.com/uc?id=1P9aEeivdMockjdGfH4dwhJoFQgLS4JhY
To: /content/BC7-LitCovid-Train.csv
100% 44.1M/44.1M [00:00<00:00, 94.9MB/s]
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Unnamed: 0,journal,title,abstract,keywords,pub_type,authors,doi,label
count,6239,6239,6239,4754,6239,6212,6100,6239
unique,2132,6239,6234,4742,170,6194,6100,103
top,J Med Virol,Infectious Diseases as Socio-Spatial Processes...,Abstract:,infectious diseases;respiratory medicine,Journal Article,"Siddiqui, Ruqaiyyah;Khan, Naveed Ahmed",10.3390/jcm9092943,Prevention
freq,103,1,3,2,3056,3,1,2256


In [None]:
# PREPROCESS TRAIN DATA
df_train.dropna(subset=['title', 'abstract', 'keywords'],inplace=True)  
stop_words = stopwords.words('english')
df_train['keywords'] = df_train['keywords'].apply(lambda x: x.split(';'))
df_train['label'] = df_train['label'].apply(lambda x: x.split(';'))
for col in ['title', 'abstract', 'keywords']:
    df_train[col] = df_train[col].apply(preprocess_text)
    df_train[col] = df_train[col].apply(stemming)  # todo either lemmatization or stemming
    df_train[col] = df_train[col].apply(lambda x: clean_and_tokenize(x, stop_words))
    frequent_words = find_frequent_words(df_train[col])
    df_train[col] = df_train[col].apply(lambda x: remove_frequent_words(x, frequent_words))
    df_train[col] = df_train[col].apply(lambda x: ' '.join(x))
df_train.describe(include='all')

Unnamed: 0,journal,title,abstract,keywords,pub_type,authors,doi,label
count,18968,18968,18968,18968.0,18968,18912,18759,18968
unique,3014,18895,18932,18595.0,270,18704,18749,165
top,J Med Virol,pregnanc,2019 respiratori tract caus newli emerg first ...,,Journal Article,"Suwanwongse, Kulachanya;Shabarek, Nehad",10.1016/j.jstrokecerebrovasdis.2020.104949,[Prevention]
freq,298,4,3,38.0,9372,5,2,6756


In [None]:
# STATISTICS
df_train.info()
classes = {'Treatment': 0, 'Diagnosis':0, 'Prevention':0, 'Mechanism':0, 'Transmission':0, 'Epidemic Forecasting':0,  'Case Report':0}
classes_words = {'Treatment': 0, 'Diagnosis':0, 'Prevention':0, 'Mechanism':0, 'Transmission':0, 'Epidemic Forecasting':0,  'Case Report':0}
i = 0
for index, row in df_train.iterrows():
  labels = row['label']
  for label in labels:
    classes[label] += 1
    classes_words[label] += (str(row['title']) + " " + str(row['abstract']) +" " + str(row['keywords'])).count(" ")

for c, count in classes.items():
  print(c, count)
print()
for c, count in classes_words.items():
  print(c, count) 

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18968 entries, 32519164 to 32389144
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   journal   18968 non-null  object
 1   title     18968 non-null  object
 2   abstract  18968 non-null  object
 3   keywords  18968 non-null  object
 4   pub_type  18968 non-null  object
 5   authors   18912 non-null  object
 6   doi       18759 non-null  object
 7   label     18968 non-null  object
dtypes: object(8)
memory usage: 1.3+ MB
Treatment 6710
Diagnosis 4695
Prevention 8300
Mechanism 3518
Transmission 841
Epidemic Forecasting 519
Case Report 1571

Treatment 933809
Diagnosis 679674
Prevention 1110330
Mechanism 458034
Transmission 109585
Epidemic Forecasting 68173
Case Report 145631


In [None]:
df_new_train = pd.DataFrame({'data' : [], 'label':[]})
df_new_train["data"] = df_train["title"].astype(str) +" "+ df_train["abstract"].astype(str) +" "+ df_train["keywords"]
df_new_train["label"] = df_train["label"]
df_new_train.info()
del [df_train] # free RAM

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18968 entries, 32519164 to 32389144
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   data    18968 non-null  object
 1   label   18968 non-null  object
dtypes: object(2)
memory usage: 444.6+ KB


In [None]:
# PREPROCESS DEV DATA
df_dev.dropna(subset=['title', 'abstract', 'keywords'],inplace=True)  
stop_words = stopwords.words('english')
text_columns = df_dev.columns
df_dev['keywords'] = df_dev['keywords'].apply(lambda x: x.split(';'))
df_dev['label'] = df_dev['label'].apply(lambda x: x.split(';'))
for col in ['title', 'abstract', 'keywords']:
    df_dev[col] = df_dev[col].apply(preprocess_text)
    df_dev[col] = df_dev[col].apply(stemming)  # todo either lemmatization or stemming
    df_dev[col] = df_dev[col].apply(lambda x: clean_and_tokenize(x, stop_words))
    frequent_words = find_frequent_words(df_dev[col])
    df_dev[col] = df_dev[col].apply(lambda x: remove_frequent_words(x, frequent_words))# why?
    df_dev[col] = df_dev[col].apply(lambda x: ' '.join(x))
df_dev.describe(include='all')


Unnamed: 0,journal,title,abstract,keywords,pub_type,authors,doi,label
count,4754,4754,4754,4754.0,4754,4737,4707,4754
unique,1644,4751,4751,4714.0,141,4723,4707,88
top,J Med Virol,triag consider refer structur heart intervent ...,ha markedli chang practic articl analys risk c...,,Journal Article,"Siddiqui, Ruqaiyyah;Khan, Naveed Ahmed",10.3390/jcm9092943,[Prevention]
freq,103,2,2,7.0,2378,3,1,1713


In [None]:
df_new_dev = pd.DataFrame({'data' : [], 'label':[]})
df_new_dev["data"] = df_dev["title"].astype(str) +" "+ df_dev["abstract"].astype(str) +" "+ df_dev["keywords"]
df_new_dev["label"] = df_dev["label"]
df_new_dev.info()
del [df_dev] # free RAM

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4754 entries, 32653511 to 32781167
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   data    4754 non-null   object
 1   label   4754 non-null   object
dtypes: object(2)
memory usage: 111.4+ KB


In [None]:
count_vector = CountVectorizer()
X_train_counts =  count_vector.fit_transform(df_new_train['data'])
X_dev_counts =  count_vector.transform(df_new_dev['data'])
print(X_train_counts.toarray().shape)
print(X_dev_counts.toarray().shape)

(18968, 64807)
(4754, 64807)


In [None]:
tfidf_transformer = TfidfTransformer()
X_train_tf_idf = tfidf_transformer.fit_transform(X_train_counts)
X_dev_tf_idf = tfidf_transformer.transform(X_dev_counts)
print(X_train_tf_idf.toarray().shape)
print(X_dev_tf_idf.toarray().shape)

(18968, 64807)
(4754, 64807)


In [None]:
type(X_train_tf_idf)


scipy.sparse.csr.csr_matrix

In [None]:
mlb = MultiLabelBinarizer() 
Y_train = mlb.fit_transform(df_new_train['label'])
Y_dev = mlb.fit_transform(df_new_dev['label'])
print(Y_train.shape)
print(Y_dev.shape)

(18968, 7)
(4754, 7)


In [None]:
model = MultiOutputClassifier(KNeighborsClassifier(n_neighbors=5))
KNN = model.fit(X_train_tf_idf,Y_train)

In [None]:
Y_dev_pred = KNN.predict(X_dev_tf_idf)
Y_dev_true = Y_dev

In [None]:
cf_mtx = multilabel_confusion_matrix(Y_dev_true, Y_dev_pred)

In [None]:
# CONFUSION MATRIXES
for i,submatrix in enumerate(cf_mtx):
  print(mlb.classes_[i])
  print(submatrix)

Case Report
[[4332   58]
 [ 239  125]]
Diagnosis
[[3276  288]
 [ 332  858]]
Epidemic Forecasting
[[4559   38]
 [  56  101]]
Mechanism
[[3797  129]
 [ 226  602]]
Prevention
[[2432  237]
 [ 294 1791]]
Transmission
[[4508   61]
 [ 112   73]]
Treatment
[[2740  333]
 [ 329 1352]]


In [None]:
classes = {'Treatment': 0, 'Diagnosis':0, 'Prevention':0, 'Mechanism':0, 'Transmission':0, 'Epidemic Forecasting':0,  'Case Report':0}
eval = {}
for metric in ['precision', 'recall', 'f1-score' , 'support']:
  eval[metric] = classes.copy()


In [None]:
# ACCURACY
for i,submatrix in enumerate(cf_mtx):
    print(mlb.classes_[i])
    print((submatrix[0][0] + submatrix[1][1]) / (submatrix[0][0]+submatrix[1][0]+submatrix[0][1]+submatrix[1][1]))


Case Report
0.9375262936474548
Diagnosis
0.8695835086243163
Epidemic Forecasting
0.9802271771140093
Mechanism
0.9253260412284392
Prevention
0.8883045856121161
Transmission
0.9636095919225915
Treatment
0.8607488430795119


In [None]:
# PRECISION 
precisions = {mlb.classes_[i]:submatrix[0][0]/(submatrix[0][0]+submatrix[0][1]) for i,submatrix in  enumerate(cf_mtx)}
sumx=0
for key,val in precisions.items():
    sumx +=val
    print(key)
    print(val)
    eval['precision'][key] = float("{:.4f}".format(val))
print("Mean")
print(sumx/len(cf_mtx))
print(eval)

Case Report
0.9867881548974943
Diagnosis
0.9191919191919192
Epidemic Forecasting
0.9917337393952578
Mechanism
0.9671421293937851
Prevention
0.9112026976395654
Transmission
0.98664915736485
Treatment
0.8916368369671331
Mean
0.9506206621214293
{'precision': {'Treatment': 0.8916, 'Diagnosis': 0.9192, 'Prevention': 0.9112, 'Mechanism': 0.9671, 'Transmission': 0.9866, 'Epidemic Forecasting': 0.9917, 'Case Report': 0.9868}, 'recall': {'Treatment': 0, 'Diagnosis': 0, 'Prevention': 0, 'Mechanism': 0, 'Transmission': 0, 'Epidemic Forecasting': 0, 'Case Report': 0}, 'f1-score': {'Treatment': 0, 'Diagnosis': 0, 'Prevention': 0, 'Mechanism': 0, 'Transmission': 0, 'Epidemic Forecasting': 0, 'Case Report': 0}, 'support': {'Treatment': 0, 'Diagnosis': 0, 'Prevention': 0, 'Mechanism': 0, 'Transmission': 0, 'Epidemic Forecasting': 0, 'Case Report': 0}}


In [None]:
# RECALL 
recalls = {mlb.classes_[i]:submatrix[0][0]/(submatrix[0][0]+submatrix[1][0]) for i,submatrix in  enumerate(cf_mtx)}
sumx = 0
for key,val in recalls.items():
    sumx +=val
    print(key)
    print(val)
    eval['recall'][key] = float("{:.4f}".format(val))

print("Mean")
print(sumx/len(cf_mtx))

Case Report
0.9477138481732662
Diagnosis
0.9079822616407982
Epidemic Forecasting
0.9878656554712892
Mechanism
0.9438230176485211
Prevention
0.8921496698459281
Transmission
0.9757575757575757
Treatment
0.8927989573150863
Mean
0.935441569407495


In [None]:
# F1
f_1s = {label:2*(recalls[label]*precisions[label])/(recalls[label]+precisions[label]) for label in mlb.classes_}
for key,val in f_1s.items():
    print(key)
    print(val)
    eval['f1-score'][key] = float("{:.4f}".format(val))


Case Report
0.9668563776364244
Diagnosis
0.9135527049637479
Epidemic Forecasting
0.989795918367347
Mechanism
0.9553402943766511
Prevention
0.9015755329008341
Transmission
0.9811731417999783
Treatment
0.8922175187235428


In [None]:
# AVERAGES
print("MACRO_AVERAGE: ")
print(sum(precisions.values())/len(precisions))
print("MICRO_AVERAGE: ")
print(sum(x[0][0] for x in cf_mtx) / sum(x[0][0]+x[1][0] for x in cf_mtx))

MACRO_AVERAGE: 
0.9506206621214293
MICRO_AVERAGE: 
0.9416862514688602


In [None]:
pd.DataFrame(Y_dev_pred).to_csv("test_predictions.csv", header = mlb.classes_)
pd.DataFrame(Y_dev_true).to_csv("test_true.csv", header = mlb.classes_)

In [None]:
import csv
with open('test_predictions.csv', 'r') as infile, open('reordered_test_predictions.csv', 'a') as outfile:
    # output dict needs a list for new column ordering
    fieldnames = ["","Treatment","Diagnosis","Prevention","Mechanism","Transmission","Epidemic Forecasting","Case Report"]
    writer = csv.DictWriter(outfile, fieldnames=fieldnames)
    # reorder the header first
    writer.writeheader()
    for row in csv.DictReader(infile):
        # writes the reordered rows to the new file
        writer.writerow(row)

In [None]:
with open('test_true.csv', 'r') as infile, open('reordered_test_true.csv', 'a') as outfile:
    # output dict needs a list for new column ordering
    fieldnames = ["","Treatment","Diagnosis","Prevention","Mechanism","Transmission","Epidemic Forecasting","Case Report"]
    writer = csv.DictWriter(outfile, fieldnames=fieldnames)
    # reorder the header first
    writer.writeheader()
    for row in csv.DictReader(infile):
        # writes the reordered rows to the new file
        writer.writerow(row)