In [6]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import MultiLabelBinarizer

# List of class columns
class_columns = [
    'CARD_ARRHYTHMIA', 'CELL_SKIN_INF', 'CELL_NO_MCC', 
    'CVA_INFARCT', 'PANCREAS_DIS', 'DIGEST_DIS_NO_MCC', 
    'HEART_FAILURE', 'HEART_SHOCK_MCC', 'KIDNEY_UTI', 
    'JOINT_REPLACE_NO_MCC', 'DIGEST_DIS_OTHER', 'GASTRO_NAUSEA', 
    'PNEUMONIA_OTHER', 'PCI_NO_AMI', 'PSYCHOSES', 
    'SEPTICEMIA', 'SYNCOPE'
]

# Load the training data
training_df = pd.read_csv('data/training_text2.csv')

# Prepare the data
X = training_df['training_text']
y = training_df[class_columns]
y

Unnamed: 0,CARD_ARRHYTHMIA,CELL_SKIN_INF,CELL_NO_MCC,CVA_INFARCT,PANCREAS_DIS,DIGEST_DIS_NO_MCC,HEART_FAILURE,HEART_SHOCK_MCC,KIDNEY_UTI,JOINT_REPLACE_NO_MCC,DIGEST_DIS_OTHER,GASTRO_NAUSEA,PNEUMONIA_OTHER,PCI_NO_AMI,PSYCHOSES,SEPTICEMIA,SYNCOPE
0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19579,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
19580,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
19581,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
19582,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0


In [8]:
# Remove rows where all class columns are 0 (no diagnosis)
y = y.loc[~(y == 0).all(axis=1)]
X = X[y.index]


In [10]:

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorize the text data using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)  # Adjust max_features as needed
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Initialize the multi-label binarizer
mlb = MultiLabelBinarizer(classes=class_columns)

# Transform the training and test labels
y_train_bin = y_train.values
y_test_bin = y_test.values

# Initialize and train the Logistic Regression model with OneVsRestClassifier
model = OneVsRestClassifier(LogisticRegression(max_iter=5000))  # Increase max_iter if needed
model.fit(X_train_vec, y_train_bin)

# Make predictions on the test set
y_pred_bin = model.predict(X_test_vec)

# Evaluate the model's performance
accuracy = accuracy_score(y_test_bin, y_pred_bin)
print(f"Accuracy: {accuracy}")

# Generate the classification report for each class
report = classification_report(y_test_bin, y_pred_bin, target_names=class_columns, zero_division=0)
print("Classification Report:\n", report)

Accuracy: 0.5693132499361756
Classification Report:
                       precision    recall  f1-score   support

     CARD_ARRHYTHMIA       0.90      0.48      0.63       230
       CELL_SKIN_INF       0.97      0.87      0.92       415
         CELL_NO_MCC       0.89      0.83      0.86       361
         CVA_INFARCT       0.94      0.85      0.89       222
        PANCREAS_DIS       0.89      0.75      0.81       249
   DIGEST_DIS_NO_MCC       0.82      0.61      0.70       436
       HEART_FAILURE       0.88      0.78      0.83       457
     HEART_SHOCK_MCC       0.67      0.34      0.45       313
          KIDNEY_UTI       0.84      0.47      0.60       251
JOINT_REPLACE_NO_MCC       1.00      0.94      0.97       237
    DIGEST_DIS_OTHER       0.78      0.35      0.48       250
       GASTRO_NAUSEA       0.85      0.49      0.62       286
     PNEUMONIA_OTHER       0.83      0.51      0.63       250
          PCI_NO_AMI       0.98      0.73      0.84       179
           PSYCH

In [11]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report, accuracy_score, make_scorer, f1_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MultiLabelBinarizer

# Define the pipeline with a TF-IDF vectorizer and a logistic regression classifier
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', OneVsRestClassifier(LogisticRegression(solver='liblinear')))
])

# Define the parameter grid
param_grid = {
    'tfidf__max_features': [5000, 10000, 20000],
    'clf__estimator__max_iter': [500, 1000, 1500],
    'clf__estimator__C': [0.1, 1, 10]  # Regularization parameter
}

# Define the scorer
scorer = make_scorer(f1_score, average='micro')

# Initialize GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, scoring=scorer, cv=3, n_jobs=-1, verbose=2)

# Fit the model
grid_search.fit(X_train, y_train_bin)

# Get the best parameters
best_params = grid_search.best_params_
print("Best parameters found: ", best_params)

# Make predictions on the test set
y_pred_bin = grid_search.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test_bin, y_pred_bin)
print(f"Accuracy: {accuracy}")

# Generate the classification report for each class
report = classification_report(y_test_bin, y_pred_bin, target_names=class_columns, zero_division=0)
print("Classification Report:\n", report)

Fitting 3 folds for each of 27 candidates, totalling 81 fits
[CV] END clf__estimator__C=0.1, clf__estimator__max_iter=500, tfidf__max_features=5000; total time=  18.1s
[CV] END clf__estimator__C=0.1, clf__estimator__max_iter=500, tfidf__max_features=5000; total time=  18.4s
[CV] END clf__estimator__C=0.1, clf__estimator__max_iter=500, tfidf__max_features=5000; total time=  18.3s
[CV] END clf__estimator__C=0.1, clf__estimator__max_iter=500, tfidf__max_features=10000; total time=  19.0s
[CV] END clf__estimator__C=0.1, clf__estimator__max_iter=500, tfidf__max_features=10000; total time=  18.6s
[CV] END clf__estimator__C=0.1, clf__estimator__max_iter=500, tfidf__max_features=10000; total time=  18.9s
[CV] END clf__estimator__C=0.1, clf__estimator__max_iter=500, tfidf__max_features=20000; total time=  19.0s
[CV] END clf__estimator__C=0.1, clf__estimator__max_iter=1000, tfidf__max_features=5000; total time=  17.4s
[CV] END clf__estimator__C=0.1, clf__estimator__max_iter=500, tfidf__max_featu