In [5]:
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score
# Preprocess data
def preprocess_text(text):
    # Remove non-alphanumeric characters
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Convert to lowercase
    text = text.lower()
    return text

# Load the data
df = pd.read_csv('new_train.csv')

# Preprocess the data
df['transcription'] = df['transcription'].apply(preprocess_text)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(df['transcription'], df['medical_specialty'], test_size=0.2, random_state=42)


In [6]:
# Convert the text data into numerical vectors
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# Train the Multinomial Naive Bayes model
clf = MultinomialNB()
clf.fit(X_train, y_train)

MultinomialNB()

In [7]:

# Make predictions on the test set
y_pred = clf.predict(X_test)

# Evaluate the performance of the model using F1-score with macro averaging
f1_macro = f1_score(y_test, y_pred, average='macro')
print('F1-score (macro):', f1_macro)

F1-score (macro): 0.06364396310620246


In [8]:
from sklearn.model_selection import GridSearchCV

# Define the range of hyperparameters to search over
param_grid = {
    'alpha': [0.1, 0.5, 1.0, 2.0, 5.0, 10.0],
    'fit_prior': [True, False],
}

# Create an instance of the Multinomial Naive Bayes model
nb = MultinomialNB()

# Create an instance of GridSearchCV and fit the training data
grid_search = GridSearchCV(nb, param_grid, cv=5, scoring='f1_macro', verbose=2)
grid_search.fit(X_train, y_train)

# Print the best set of hyperparameters and the corresponding F1-score on the test set
print("Best hyperparameters: ", grid_search.best_params_)
print("Best F1-score on test set: {:.4f}".format(grid_search.best_score_))


Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] END ..........................alpha=0.1, fit_prior=True; total time=   0.1s
[CV] END ..........................alpha=0.1, fit_prior=True; total time=   0.1s
[CV] END ..........................alpha=0.1, fit_prior=True; total time=   0.0s




[CV] END ..........................alpha=0.1, fit_prior=True; total time=   0.0s
[CV] END ..........................alpha=0.1, fit_prior=True; total time=   0.0s
[CV] END .........................alpha=0.1, fit_prior=False; total time=   0.1s
[CV] END .........................alpha=0.1, fit_prior=False; total time=   0.0s
[CV] END .........................alpha=0.1, fit_prior=False; total time=   0.1s
[CV] END .........................alpha=0.1, fit_prior=False; total time=   0.1s
[CV] END .........................alpha=0.1, fit_prior=False; total time=   0.0s
[CV] END ..........................alpha=0.5, fit_prior=True; total time=   0.0s
[CV] END ..........................alpha=0.5, fit_prior=True; total time=   0.0s
[CV] END ..........................alpha=0.5, fit_prior=True; total time=   0.0s
[CV] END ..........................alpha=0.5, fit_prior=True; total time=   0.1s
[CV] END ..........................alpha=0.5, fit_prior=True; total time=   0.0s
[CV] END ...................

In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score, classification_report

# Load data
df = pd.read_csv('new_train.csv')

# Split into train and test sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Preprocess data
vectorizer = TfidfVectorizer(stop_words="english")
X_train = vectorizer.fit_transform(train_df["transcription"])
X_test = vectorizer.transform(test_df["transcription"])
y_train = train_df["medical_specialty"]
y_test = test_df["medical_specialty"]

# Hyperparameter tuning
param_grid = {"alpha": [0.01, 0.1, 1.0],
              "fit_prior": [True, False]}
clf = MultinomialNB()
grid_search = GridSearchCV(clf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)
print("Best hyperparameters: ", grid_search.best_params_)

# Train and evaluate model
clf = MultinomialNB(alpha=grid_search.best_params_["alpha"], fit_prior=grid_search.best_params_["fit_prior"])
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Best F1-score on test set: {:.4f}".format(f1_score(y_test, y_pred, average="macro")))
print(classification_report(y_test, y_pred))


Fitting 5 folds for each of 6 candidates, totalling 30 fits




Best hyperparameters:  {'alpha': 1.0, 'fit_prior': False}
Best F1-score on test set: 0.0454
                                precision    recall  f1-score   support

          Allergy / Immunology       0.00      0.00      0.00         1
                       Autopsy       0.00      0.00      0.00         2
                    Bariatrics       0.00      0.00      0.00         1
    Cardiovascular / Pulmonary       0.39      0.16      0.23        57
                  Chiropractic       0.00      0.00      0.00         4
    Consult - History and Phy.       0.24      0.98      0.39        83
    Cosmetic / Plastic Surgery       0.00      0.00      0.00         3
                     Dentistry       0.00      0.00      0.00         1
                   Dermatology       0.00      0.00      0.00         2
             Discharge Summary       0.00      0.00      0.00        17
          ENT - Otolaryngology       0.00      0.00      0.00        23
        Emergency Room Reports       0.00  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
