# About the task:
This notebook represents the first part of the classification task. Given a medical note, the goal is to predict wether the source of infection is explicit or implicit. 

# Imports 

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, f1_score, roc_auc_score
from sklearn.ensemble import GradientBoostingClassifier

# Preprocessing

In [2]:
df = pd.read_csv('KINBIOTICS_CLASSIFICATION.csv')

In [3]:
# filling nan values 

df['other_organs'] =  df['other_organs'].fillna('None of the above')
df['explicit_source'] =  df['explicit_source'].fillna('None of the above')
df['implicit_source'] =  df['implicit_source'].fillna('None of the above')

Model for explicit/implicit classification

In [4]:
# keep only relevant columns + target variable binarization
# 0 --> implicit , 1 --> implicit 
df_binary =  pd.DataFrame({
    'text': df['text'],
    'explicit': (df['explicit_source']!= 'None of the above').astype(int)
                          })

df_binary

Unnamed: 0,text,explicit
0,Chief Complaint:\n s/p falls\n\nHistory of Pre...,0
1,Chief Complaint:\n febrile and neutropenia\n\n...,0
2,Chief Complaint:\n septic shock\n\nHistory of ...,0
3,Chief Complaint:\n hepatic failure\n\nHistory ...,0
4,Chief Complaint:\n ruq pain\n\nHistory of Pres...,1
...,...,...
331,"Chief Complaint:\n fever, hypotension\n\nHisto...",1
332,Chief Complaint:\n abdominal pain\n\nHistory o...,1
333,Chief Complaint:\n 1. ischemic colitis versus ...,0
334,"Chief Complaint:\n hypoxemia, confusion\n\nHis...",0


In [5]:
# tf-idf measures the originaluty of a word, comparing the times a word appears in a doc with the number of docs the word appears in

vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(df['text']) # X is now a sparse matrix

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, df_binary['explicit'], test_size=0.2, random_state=44)


We also tried Logistic regression and Random forest, but Gradient Boosting was consistently the best one from an accuracy and f1 micro avg point of view. 

In [6]:
gb = GradientBoostingClassifier()

param_dist = {
    'n_estimators': [50, 100, 150, 200],
    'max_depth': [3, 5, 7, 10],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'subsample': [0.8, 0.9, 1.0],
    'min_weight_fraction_leaf': [0.1, 0.2, 0.3, 0.4],
}

# RandomizedSearchCV object
random_search = RandomizedSearchCV(gb, param_distributions=param_dist, n_iter=10, scoring='accuracy', cv=5, verbose=1, n_jobs=-1, random_state=42)

# Fit the RandomizedSearchCV object to the data
random_search.fit(X_train, y_train)



Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [8]:
# best parameters and best score
print("Best Parameters: ", random_search.best_params_)
print("Best Accuracy Score: ", random_search.best_score_)
print()

# Use the best model to make predictions on the test set
y_pred = random_search.best_estimator_.predict(X_test)
auc =  roc_auc_score(y_pred, y_test)
# Evaluate the best model
print("Accuracy:", accuracy_score(y_test, y_pred))
print(f"AUC: {auc}")
print("Classification Report:\n", classification_report(y_test, y_pred))

Best Parameters:  {'subsample': 0.8, 'n_estimators': 50, 'min_weight_fraction_leaf': 0.1, 'max_depth': 10, 'learning_rate': 0.05}
Best Accuracy Score:  0.7386443046820406

Accuracy: 0.7058823529411765
AUC: 0.6982456140350877
Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.72      0.74        40
           1       0.63      0.68      0.66        28

    accuracy                           0.71        68
   macro avg       0.70      0.70      0.70        68
weighted avg       0.71      0.71      0.71        68

