# Introduction

In [1]:
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
import nltk
nltk.download('punkt')
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
#Loading the final cleaned datasets

merged_file_df = pd.read_TYPE('Merged_File')

## Tokenization

In [4]:
print(merged_file_df['Exam_Notes'].dtype)
merged_file_df['Image_Info'] = merged_file_df['Image_Info'].astype(str)

object


In [5]:
# Tokenization
merged_file_df['Exam_Notes_tokens'] = merged_file_df['Exam_Notes'].apply(lambda x: word_tokenize(x))
merged_file_df['Image_Info_tokens'] = merged_file_df['Image_Info'].apply(lambda x: word_tokenize(x))

In [None]:
merged_file_df

# PCA with TFIDF

In [7]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA

# Assuming you have your dataset loaded into a DataFrame called 'data'
# with columns 'Exam_ID' and 'Exam_Notes'

# Filter the dataset for the 'Active' class
active_data = merged_file_df[merged_file_df['Disease_Severity'] == 'Active']

# Vectorize the text using TF-IDF
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(active_data['Exam_Notes'])

# Apply PCA to reduce the dimensionality
pca = PCA(n_components=5)
pca_result = pca.fit_transform(tfidf_matrix.toarray())

# Get the list of important words (features) based on PCA components
feature_names = vectorizer.get_feature_names_out()
component_weights = pca.components_

# Find the most important words for the 'Active' class
top_words = []
for component in component_weights:
    top_word_indices = component.argsort()[-5:]  # Select top 5 words per component
    top_words.extend([feature_names[idx] for idx in top_word_indices])

# Remove duplicates
top_words = list(set(top_words))

# Print the important words
print("Important words for the 'Active' class:")
for word in top_words:
    print(word)


Important words for the 'Active' class:
present
mesalamine
office
discharge
activities
discussed
monitored
continue
per
scheduled
tomorrow
care
propofol
entocort
return
physician
olympus
budesonide
gi
referring
found
op
clinic
home


In [8]:
top_words + ['ileitis', 'ulcer', 'ulceration', 'erosion', 'aphtha', 'aphthae', 'aphthous']

['present',
 'mesalamine',
 'office',
 'discharge',
 'activities',
 'discussed',
 'monitored',
 'continue',
 'per',
 'scheduled',
 'tomorrow',
 'care',
 'propofol',
 'entocort',
 'return',
 'physician',
 'olympus',
 'budesonide',
 'gi',
 'referring',
 'found',
 'op',
 'clinic',
 'home',
 'ileitis',
 'ulcer',
 'ulceration',
 'erosion',
 'aphtha',
 'aphthae',
 'aphthous']

In [9]:
# Create a TF-IDF vectorizer
vectorizer = TfidfVectorizer(vocabulary=top_words)  # Pass the important words as the vocabulary

# Vectorize the Exam Notes data for the 'Active' class using the important words
X = vectorizer.fit_transform(merged_file_df['Exam_Notes'])

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, merged_file_df['Disease_Severity'], test_size=0.3, random_state=40)

# Train a machine learning model (e.g., Logistic Regression)
model = LogisticRegression()
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model performance
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

      Active       0.56      0.43      0.49       104
    Inactive       0.58      0.70      0.63       115

    accuracy                           0.57       219
   macro avg       0.57      0.56      0.56       219
weighted avg       0.57      0.57      0.56       219



In [37]:
from sklearn.ensemble import RandomForestClassifier

# Create a TF-IDF vectorizer
vectorizer = TfidfVectorizer(vocabulary=set(top_words))  # Convert the top_words list to a set

# Vectorize the Exam Notes data using the important words
X = vectorizer.fit_transform(merged_file_df['Exam_Notes'])

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, merged_file_df['Disease_Severity'], test_size=0.3, random_state=42)

# Define the parameter grid for grid search
param_grid = {
    'n_estimators': [100, 200, 400],
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10]
}

# Initialize the Random Forest classifier
model = RandomForestClassifier()

# Perform grid search
grid_search = GridSearchCV(model, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

# Train the model with the best parameters
best_model = RandomForestClassifier(**best_params)
best_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = best_model.predict(X_test)

# Evaluate the model performance
print(classification_report(y_test, y_pred))

Best Parameters: {'criterion': 'entropy', 'max_depth': 5, 'min_samples_split': 5, 'n_estimators': 100}
              precision    recall  f1-score   support

      Active       0.58      0.66      0.62       102
    Inactive       0.66      0.59      0.62       117

    accuracy                           0.62       219
   macro avg       0.62      0.62      0.62       219
weighted avg       0.63      0.62      0.62       219



RNN model

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
import tensorflow as tf
import numpy as np

# Preprocess the data
X = merged_file_df['Exam_Notes'].values
y = merged_file_df['Disease_Severity'].values

# Encode the target labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a TF-IDF vectorizer
vectorizer = TfidfVectorizer(vocabulary=top_words)  # Pass the important words as the vocabulary

# Vectorize the Exam Notes data for the 'Active' class using the important words
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Convert the dense matrix to a SparseTensor
X_train_sparse = tf.sparse.SparseTensor(
    indices=np.vstack(X_train_vectorized.nonzero()).T,
    values=X_train_vectorized.data,
    dense_shape=X_train_vectorized.shape
)

# Reorder the sparse matrix indices
X_train_vectorized_reordered = tf.sparse.reorder(X_train_sparse)

# Convert SparseTensor to dense matrix
X_train_dense = tf.sparse.to_dense(X_train_vectorized_reordered).numpy()

all_notes = merged_file_df['Exam_Notes'].values
vectorizer = TfidfVectorizer(vocabulary=top_words)
vectorizer.fit(all_notes)
vocab_size = len(vectorizer.get_feature_names_out())

# Determine the maximum sequence length
max_seq_length = X_train_vectorized.shape[1]

# Define the RNN model architecture
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=100, input_length=max_seq_length))
model.add(LSTM(units=64, dropout=0.1))
model.add(Dense(units=1, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train_dense, y_train, epochs=100, batch_size=24, validation_split=0.2)

# Convert the test data to a SparseTensor
X_test_sparse = tf.sparse.SparseTensor(
    indices=np.vstack(X_test_vectorized.nonzero()).T,
    values=X_test_vectorized.data,
    dense_shape=X_test_vectorized.shape
)

# Reorder the sparse matrix indices
X_test_vectorized_reordered = tf.sparse.reorder(X_test_sparse)

# Convert SparseTensor to dense matrix
X_test_dense = tf.sparse.to_dense(X_test_vectorized_reordered).numpy()

# Evaluate the model on the test set
test_loss, test_accuracy = model.evaluate(X_test_dense, y_test)
print("Test Loss:", test_loss)
print("Test Accuracy:", test_accuracy)


Epoch 1/100

Exception ignored in: <function _xla_gc_callback at 0x7fdf01c711b0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/jax/_src/lib/__init__.py", line 103, in _xla_gc_callback
    def _xla_gc_callback(*args):
KeyboardInterrupt: 


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100

## Vectorization

In [None]:
def BagOfWordsVectorization(columns,newColumn):
    # Get the tokenized text from the DataFrame
    tokenized_text = merged_file_df[columns]

    # Convert tokenized text back to strings
    preprocessed_text = [' '.join(tokens) for tokens in tokenized_text]

    # Create an instance of CountVectorizer
    vectorizer = CountVectorizer()
    # Fit the vectorizer on the preprocessed text and transform it
    vectorized_data = vectorizer.fit_transform(preprocessed_text).toarray()

    # Create a new column in the DataFrame with the vectorized data
    merged_file_df[newColumn] = pd.Series(vectorized_data.tolist())

In [None]:
BagOfWordsVectorization('Exam_Notes_tokens','Exam_Notes_vectorized')
BagOfWordsVectorization('Image_Info_tokens','ImageInfo_vectorized')

In [None]:
merged_file_df = merged_file_df.drop(['Exam_ID','Exam_Notes','Exam_Notes_tokens','Image_Info_tokens','Image_Info'],axis=1)

## Encoding categeorical variables

In [None]:
merged_file_df["Disease_Severity"] = merged_file_df["Disease_Severity"].apply(lambda x: 1 if x == "Active" else 0)
merged_file_df["Ulcer"] = merged_file_df["Ulcer"].apply(lambda x: 1 if x == "Yes" else 0)

In [None]:
merged_file_df.head()

## Splitting of Training and test data

In [None]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(merged_file_df.drop('Disease_Severity', axis=1), merged_file_df['Disease_Severity'], test_size=0.2, random_state=42)

## Model Building SVM

In [None]:
# Convert the word vectors to a numeric format
X_train_exam = pd.DataFrame(X_train['Exam_Notes_vectorized'].apply(pd.Series))
X_train_image = pd.DataFrame(X_train['ImageInfo_vectorized'].apply(pd.Series))
X_train = pd.concat([X_train_exam, X_train_image], axis=1)

X_test_exam = pd.DataFrame(X_test['Exam_Notes_vectorized'].apply(pd.Series))
X_test_image = pd.DataFrame(X_test['ImageInfo_vectorized'].apply(pd.Series))
X_test = pd.concat([X_test_exam, X_test_image], axis=1)

In [None]:
# Create an SVM classifier object
svm_model = SVC(C=1, kernel='poly',degree=1)

# Fit the SVM classifier on the training data
svm_model.fit(X_train, y_train)

# Make predictions on the test data
y_pred_train = svm_model.predict(X_train)

# Make predictions on the test data
y_pred = svm_model.predict(X_test)

# Evaluate the performance of the SVM model
accuracy_train = accuracy_score(y_train, y_pred_train)

print("Training accuracy:", accuracy_train)

# Evaluate the performance of the SVM model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Testing Accuracy:", accuracy)
print("Classification Report:")
print(report)

Training accuracy: 0.7243150684931506
Testing Accuracy: 0.6438356164383562
Classification Report:
              precision    recall  f1-score   support

           0       0.62      0.62      0.62        69
           1       0.66      0.66      0.66        77

    accuracy                           0.64       146
   macro avg       0.64      0.64      0.64       146
weighted avg       0.64      0.64      0.64       146

