# SETUP

In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
import pickle

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import chi2
from sklearn.metrics import precision_score, recall_score, accuracy_score


# Data: load / explore

In [7]:
# Read the labelled trial descriptions into a DataFrame and show (rows, columns)
csv_path = 'trials.csv'
df = pd.read_csv(csv_path)
df.shape

(1759, 3)

In [8]:
# First five rows, transposed so each record reads top-to-bottom
df.head(5).transpose()

Unnamed: 0,0,1,2,3,4
nctid,NCT02464748,NCT00362362,NCT03154450,NCT03487263,NCT04454892
description,MND is often referred to as amyotrophic latera...,OBJECTIVE:||The causes of sporadic motor neuro...,Non-invasive ventilation (NIV) use in patients...,The objectives of this study are to determine:...,"With the development of supportive measures, t..."
label,ALS,ALS,ALS,ALS,ALS


In [9]:
# Non-null counts of the remaining columns for every label value
df.groupby(by='label').count()

Unnamed: 0_level_0,nctid,description
label,Unnamed: 1_level_1,Unnamed: 2_level_1
ALS,368,368
Dementia,368,368
Obsessive Compulsive Disorder,358,358
Parkinson’s Disease,330,330
Scoliosis,335,335


## Features: TF-IDF n-grams

In [30]:

# Fit a TF-IDF vectorizer on every description: 1- to 3-grams, English stop
# words removed, terms must appear in at least 2 documents, sublinear TF scaling.
# NOTE(review): the vectorizer is fitted on the whole dataset before the
# train/test split, so vocabulary and IDF statistics also see the held-out rows.
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=2, ngram_range=(1, 3), stop_words='english').fit(df.description)

# Persist the fitted vectorizer. The context manager guarantees the file handle
# is flushed and closed; a bare open() left that to the garbage collector.
with open("tfidf-descriptions.pkl", "wb") as tfidf_file:
    pickle.dump(tfidf, tfidf_file)

# Dense document-term matrix and the matching array of label strings.
# .toarray() materialises every cell; later cells consume the dense form.
features = tfidf.transform(df.description).toarray()
labels = df.label.to_numpy()
print(f'feature of size {features.shape[1]}')

feature of size 55397


In [11]:

N = 3  # how many top-scoring n-grams to list per n-gram size and condition

# The vocabulary is the same for every condition, so fetch it once instead of
# on each loop iteration. get_feature_names() no longer exists in current
# scikit-learn releases (replaced by get_feature_names_out()); fall back to the
# old method name only when the new one is unavailable.
if hasattr(tfidf, "get_feature_names_out"):
  vocabulary_terms = np.asarray(tfidf.get_feature_names_out())
else:
  vocabulary_terms = np.array(tfidf.get_feature_names())

for condition in np.unique(labels): 
  # One-vs-rest chi-squared test: rows of this condition against all others
  features_chi2 = chi2(features, labels == condition)
  # Element 0 holds the chi2 statistics; ascending argsort leaves the
  # highest-scoring terms at the end of the array
  indices = np.argsort(features_chi2[0])
  feature_names = vocabulary_terms[indices]
  # Bucket the ranked terms by how many space-separated tokens they contain
  unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
  bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
  trigrams = [v for v in feature_names if len(v.split(' ')) == 3]
  print(condition)
  print("  * Most Correlated Unigrams are: %s" %(', '.join(unigrams[-N:])))
  print("  * Most Correlated Bigrams are: %s" %(', '.join(bigrams[-N:])))
  print("  * Most Correlated Trigrams are: %s" %(', '.join(trigrams[-N:])))

ALS
  * Most Correlated Unigrams are: amyotrophic, sclerosis, als
  * Most Correlated Bigrams are: als patients, amyotrophic lateral, lateral sclerosis
  * Most Correlated Trigrams are: functional rating scale, lateral sclerosis als, amyotrophic lateral sclerosis
Dementia
  * Most Correlated Unigrams are: caregiver, caregivers, dementia
  * Most Correlated Bigrams are: patients dementia, alzheimer disease, people dementia
  * Most Correlated Trigrams are: alzheimer disease related, alzheimer disease ad, long term care
Obsessive Compulsive Disorder
  * Most Correlated Unigrams are: obsessive, compulsive, ocd
  * Most Correlated Bigrams are: disorder ocd, compulsive disorder, obsessive compulsive
  * Most Correlated Trigrams are: obsessive compulsive scale, compulsive disorder ocd, obsessive compulsive disorder
Parkinson’s Disease
  * Most Correlated Unigrams are: levodopa, parkinson, pd
  * Most Correlated Bigrams are: pd patients, disease pd, parkinson disease
  * Most Correlated Trigr

# Train / test model

In [12]:
# Two-way lookup between integer class ids and the unique condition names
# (ids follow the order returned by np.unique)
unique_conditions = np.unique(labels)
idxToCondition = {idx: name for idx, name in enumerate(unique_conditions)}
conditionToIdx = {name: idx for idx, name in enumerate(unique_conditions)}

In [13]:
# Encode every row's condition name as its integer class id
labelsIdx = list(map(conditionToIdx.__getitem__, labels))


In [14]:

# Hold out 20% of the rows for evaluation. A fixed random_state makes the split
# (and therefore the reported metrics) reproducible across kernel restarts;
# stratifying on the labels keeps class proportions the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    features, labelsIdx, test_size=0.2, random_state=42, stratify=labelsIdx)

In [15]:

# Wrap each partition (features plus integer labels) in an XGBoost DMatrix
D_train, D_test = (
    xgb.DMatrix(X_part, label=y_part)
    for X_part, y_part in ((X_train, y_train), (X_test, y_test))
)

In [16]:
# Booster hyper-parameters for a multi-class classifier
param = {
    'eta': 0.3,  # learning rate
    'max_depth': 3,  # maximum depth of each tree
    'objective': 'multi:softprob',  # predict() returns one probability per class
    # Derived from the label mapping rather than a hard-coded 5, so the cell
    # keeps working if the set of conditions in the data changes.
    'num_class': len(idxToCondition)}

steps = 20  # The number of training iterations
model = xgb.train(param, D_train, steps)


In [17]:

# Per-class probabilities for the held-out rows; the predicted class is the
# column with the highest probability in each row
preds = model.predict(D_test)
best_preds = np.argmax(preds, axis=1)

# Macro-averaged precision and recall, followed by plain accuracy
for template, macro_metric in (("Precision = {}", precision_score),
                               ("Recall = {}", recall_score)):
    print(template.format(macro_metric(y_test, best_preds, average='macro')))
print("Accuracy = {}".format(accuracy_score(y_test, best_preds)))

Precision = 0.8897249304788375
Recall = 0.8898784487388527
Accuracy = 0.8892045454545454


In [31]:
# Persist the trained booster with XGBoost's own serializer
model_path = 'v_0_1.model'
model.save_model(model_path)
# Disabled alternative: pickle.dump(model, open('v_0_1.pkl', "wb"))


## Predict on new data

In [20]:
# Restore the saved booster. Booster.load_model() loads the file into the
# existing object and returns None, so assigning its return value left
# `model2` bound to None. Load in place first, then alias the loaded booster.
bst = xgb.Booster()
bst.load_model('v_0_1.model')
model2 = bst
# model2 = pickle.load(open('v_0_1.pkl', "rb"))


# Restore the fitted TF-IDF vectorizer; the context manager closes the file.
# NOTE: only unpickle files produced by this notebook - pickle can execute
# arbitrary code when loading untrusted data.
with open("tfidf-descriptions.pkl", 'rb') as tfidf_file:
    tf2 = pickle.load(tfidf_file)

# A fresh vectorizer that reuses the saved vocabulary. It carries no fitted IDF
# weights of its own; `tf2` is the object that holds the weights learned from
# the training corpus.
tfidf2 = TfidfVectorizer(sublinear_tf=True, min_df=2, ngram_range=(1, 3), stop_words='english',
                        vocabulary = tf2.vocabulary_)


In [28]:
text = "Patients were randomly administered either real (n = 12) or sham (n = 10) rTMS, once a day, 5 days a week, for 2 weeks. Randomization was performed according to a computer-generated schedule. Subjects and scale-rater physician were blind to treatment status of individuals. Only the rTMS administrator was aware of group allocations. Treatment response was assessed by self-and clinician-rated scales before treatment, immediately after treatment and 3 months thereafter, with the same examiner following a subject throughout the study. All patients included in the study had failed adequate pharmacological treatment for at least 2 antiobsessional drugs. Their prescription drugs were continued without change in dosage regimens throughout the study."
# Vectorize with the vectorizer that was fitted on the training corpus (`tf2`).
# Calling fit_transform() on the vocabulary-only `tfidf2` re-estimated the IDF
# weights from this single document, so the resulting features did not match
# the weighting the model was trained on.
f1 = tf2.transform([text]).toarray()
# Score with the booster reloaded from disk (`bst`) so this section exercises
# the saved artifacts rather than the in-memory training object.
ypred = bst.predict(xgb.DMatrix(f1))
# ypred holds one row of class probabilities; map the arg-max back to its name
print(idxToCondition[np.argmax(ypred)])

ALS
Obsessive Compulsive Disorder
