In [20]:
import pandas as pd


file_path = 'datasets/mtsamples.csv'
data = pd.read_csv(file_path)

data.head()

Unnamed: 0.1,Unnamed: 0,description,medical_specialty,sample_name,transcription,keywords
0,0,A 23-year-old white female presents with comp...,Allergy / Immunology,Allergic Rhinitis,"SUBJECTIVE:, This 23-year-old white female pr...","allergy / immunology, allergic rhinitis, aller..."
1,1,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 2,"PAST MEDICAL HISTORY:, He has difficulty climb...","bariatrics, laparoscopic gastric bypass, weigh..."
2,2,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 1,"HISTORY OF PRESENT ILLNESS: , I have seen ABC ...","bariatrics, laparoscopic gastric bypass, heart..."
3,3,2-D M-Mode. Doppler.,Cardiovascular / Pulmonary,2-D Echocardiogram - 1,"2-D M-MODE: , ,1. Left atrial enlargement wit...","cardiovascular / pulmonary, 2-d m-mode, dopple..."
4,4,2-D Echocardiogram,Cardiovascular / Pulmonary,2-D Echocardiogram - 2,1. The left ventricular cavity size and wall ...,"cardiovascular / pulmonary, 2-d, doppler, echo..."


In [23]:
specialty_counts = data.value_counts("medical_specialty")

In [24]:
specialties_to_keep = specialty_counts[specialty_counts > 100].index

In [25]:
data_filtered = data[data['medical_specialty'].isin(specialties_to_keep)]

In [26]:
data_filtered.head()

Unnamed: 0.1,Unnamed: 0,description,medical_specialty,sample_name,transcription,keywords
3,3,2-D M-Mode. Doppler.,Cardiovascular / Pulmonary,2-D Echocardiogram - 1,"2-D M-MODE: , ,1. Left atrial enlargement wit...","cardiovascular / pulmonary, 2-d m-mode, dopple..."
4,4,2-D Echocardiogram,Cardiovascular / Pulmonary,2-D Echocardiogram - 2,1. The left ventricular cavity size and wall ...,"cardiovascular / pulmonary, 2-d, doppler, echo..."
7,7,2-D Echocardiogram,Cardiovascular / Pulmonary,2-D Echocardiogram - 3,"2-D ECHOCARDIOGRAM,Multiple views of the heart...","cardiovascular / pulmonary, 2-d echocardiogram..."
9,9,Echocardiogram and Doppler,Cardiovascular / Pulmonary,2-D Echocardiogram - 4,"DESCRIPTION:,1. Normal cardiac chambers size....","cardiovascular / pulmonary, ejection fraction,..."
11,11,"Normal left ventricle, moderate biatrial enla...",Cardiovascular / Pulmonary,2-D Doppler,"2-D STUDY,1. Mild aortic stenosis, widely calc...","cardiovascular / pulmonary, 2-d study, doppler..."


In [27]:
data_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3923 entries, 3 to 4976
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Unnamed: 0         3923 non-null   int64 
 1   description        3923 non-null   object
 2   medical_specialty  3923 non-null   object
 3   sample_name        3923 non-null   object
 4   transcription      3894 non-null   object
 5   keywords           3116 non-null   object
dtypes: int64(1), object(5)
memory usage: 214.5+ KB


## Preprocessing

In [28]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

nltk.download('punkt')
nltk.download('stopwords')

def preprocess_text(text):
    tokens = word_tokenize(text)
    tokens = [token.lower() for token in tokens]
    tokens = [token for token in tokens if token.isalpha()]
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    return tokens

data_filtered['transcription'] = data_filtered['transcription'].fillna('')

data_filtered['processed_text'] = data_filtered['transcription'].apply(preprocess_text)

data_filtered.head()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\helit\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\helit\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_filtered['transcription'] = data_filtered['transcription'].fillna('')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_filtered['processed_text'] = data_filtered['transcription'].apply(preprocess_t

Unnamed: 0.1,Unnamed: 0,description,medical_specialty,sample_name,transcription,keywords,processed_text
3,3,2-D M-Mode. Doppler.,Cardiovascular / Pulmonary,2-D Echocardiogram - 1,"2-D M-MODE: , ,1. Left atrial enlargement wit...","cardiovascular / pulmonary, 2-d m-mode, dopple...","[left, atrial, enlargement, left, atrial, diam..."
4,4,2-D Echocardiogram,Cardiovascular / Pulmonary,2-D Echocardiogram - 2,1. The left ventricular cavity size and wall ...,"cardiovascular / pulmonary, 2-d, doppler, echo...","[left, ventricular, cavity, size, wall, thickn..."
7,7,2-D Echocardiogram,Cardiovascular / Pulmonary,2-D Echocardiogram - 3,"2-D ECHOCARDIOGRAM,Multiple views of the heart...","cardiovascular / pulmonary, 2-d echocardiogram...","[echocardiogram, multiple, views, heart, great..."
9,9,Echocardiogram and Doppler,Cardiovascular / Pulmonary,2-D Echocardiogram - 4,"DESCRIPTION:,1. Normal cardiac chambers size....","cardiovascular / pulmonary, ejection fraction,...","[description, normal, cardiac, chambers, norma..."
11,11,"Normal left ventricle, moderate biatrial enla...",Cardiovascular / Pulmonary,2-D Doppler,"2-D STUDY,1. Mild aortic stenosis, widely calc...","cardiovascular / pulmonary, 2-d study, doppler...","[mild, aortic, stenosis, widely, calcified, mi..."


## Sentiment analysis

In [29]:
from textblob import TextBlob

def get_sentiment(text):
    analysis = TextBlob(text)
    if analysis.sentiment.polarity > 0:
        return 'Positive'
    elif analysis.sentiment.polarity == 0:
        return 'Neutral'
    else:
        return 'Negative'

data_filtered['transcription'] = data_filtered['transcription'].fillna('')

# Apply the function to the 'transcription' column
data_filtered['sentiment'] = data_filtered['transcription'].apply(lambda x: get_sentiment(str(x)))

# Display the sentiment distribution
sentiment_distribution = data_filtered['sentiment'].value_counts()
print(sentiment_distribution)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_filtered['transcription'] = data_filtered['transcription'].fillna('')


sentiment
Positive    3199
Negative     651
Neutral       73
Name: count, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_filtered['sentiment'] = data_filtered['transcription'].apply(lambda x: get_sentiment(str(x)))


## Topic classification

In [30]:
topics = data_filtered['medical_specialty'].unique()
topics = [topic.strip() for topic in topics]

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data_filtered['processed_text'].apply(lambda x: ' '.join(x)))

y = data_filtered['medical_specialty']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

clf = MultinomialNB()
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred, target_names=topics))

                               precision    recall  f1-score   support

   Cardiovascular / Pulmonary       0.53      0.11      0.18        81
                    Neurology       0.35      0.96      0.52       108
                      Urology       0.00      0.00      0.00        15
             General Medicine       0.00      0.00      0.00        45
                      Surgery       0.00      0.00      0.00        48
SOAP / Chart / Progress Notes       0.33      0.02      0.03        56
                    Radiology       0.00      0.00      0.00        24
                   Orthopedic       0.33      0.03      0.05        68
      Obstetrics / Gynecology       0.00      0.00      0.00        54
             Gastroenterology       0.00      0.00      0.00        27
            Discharge Summary       0.48      0.99      0.65       225
   Consult - History and Phy.       0.00      0.00      0.00        34

                     accuracy                           0.43       785
    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
