# Text Model

In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder



In [13]:
# Load your dataset
data = pd.read_csv("../data/scraped_data.csv",encoding = 'unicode_escape')
data.head()

Unnamed: 0.1,Unnamed: 0,Disease,Keyword,Website,Symptoms,Unnamed: 5,Unnamed: 6
0,0,Urticaria Hives,Urticaria,https://www.mayoclinic.org/diseases-conditions...,Batches of welts (wheals) that can arise anywh...,,
1,0,Urticaria Hives,Dermatographism,,"Raised, inflamed lines where you scratched.\nW...",,
2,0,Urticaria Hives,Angiodema,https://www.mayoclinic.org/diseases-conditions...,"Welts that form in minutes to hours\nSwelling,...",,
3,0,Benign Tumors,Seborrheic Keratosis,https://www.mayoclinic.org/diseases-conditions...,"A round or oval-shaped waxy or rough bump, typ...",,
4,0,Benign Tumors,Epidermal Cyst,https://www.mayoclinic.org/diseases-conditions...,"\nA small, round bump under the skin, usually ...",,


In [14]:
df = data[['Disease','Symptoms']]
df.dropna()

Unnamed: 0,Disease,Symptoms
0,Urticaria Hives,Batches of welts (wheals) that can arise anywh...
1,Urticaria Hives,"Raised, inflamed lines where you scratched.\nW..."
2,Urticaria Hives,"Welts that form in minutes to hours\nSwelling,..."
3,Benign Tumors,"A round or oval-shaped waxy or rough bump, typ..."
4,Benign Tumors,"\nA small, round bump under the skin, usually ..."
...,...,...
2347,Vasculitis,rash of raised red or purple spots. The spots...
2348,Vasculitis,"Granuloma annulare mainly affects children, te..."
2349,Vasculitis,"red or blue coloured blotches on white skin, a..."
2350,Vasculitis,Pyoderma gangrenosum often appears suddenly as...


In [15]:
df['Disease'].unique()
cat = ['Atopic Dermatitis','Lupus and other Connective Tissue diseases','Scabies Lyme Diease and other Infestations and Bites','Benign Tumors']
df= df[df['Disease'].isin(cat)]


In [26]:
df.groupby(df['Disease']).count()

Unnamed: 0_level_0,Symptoms
Disease,Unnamed: 1_level_1
Atopic Dermatitis,160
Benign Tumors,41
Lupus and other Connective Tissue diseases,85
Scabies Lyme Diease and other Infestations and Bites,127


In [16]:
df['Symptoms'] = df['Symptoms'].str.lower()  # Convert to lower case
df['Symptoms'] = df['Symptoms'].str.replace('[^\w\s]', '')  # Remove punctuation
df['Symptoms'] = df['Symptoms'].str.replace('\n', ' ')  # Remove punctuation

# Split the dataset into features and labels
X = df['Symptoms']  # Features (text data)
y = df['Disease']     # Labels (class for each text entry)
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [17]:
df['Disease'].unique()

array(['Benign Tumors', 'Lupus and other Connective Tissue diseases',
       'Scabies Lyme Diease and other Infestations and Bites',
       'Atopic Dermatitis'], dtype=object)

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)


In [19]:
X_test.shape

(90,)

In [20]:
X_train.shape

(356,)

In [52]:
vectorizer = TfidfVectorizer(max_features=810)
X_train_vectors = vectorizer.fit_transform(X_train.values.astype('U'))
X_test_vectors = vectorizer.transform(X_test.values.astype('U'))

In [53]:
X_test_vectors

<90x810 sparse matrix of type '<class 'numpy.float64'>'
	with 4943 stored elements in Compressed Sparse Row format>

In [54]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn import svm

In [55]:
svmmodel = svm.SVC(kernel='linear', probability=True)  # SVC is often a good starting point

In [56]:
svmmodel.fit(X_train_vectors, y_train)


In [57]:
from sklearn.metrics import classification_report, accuracy_score

# Predict the labels for the test set
y_pred = svmmodel.predict(X_test_vectors)

# Evaluate the model
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))


Accuracy: 0.9333333333333333
                                                       precision    recall  f1-score   support

                             Atopic Dermatitis Photos       1.00      0.92      0.96        38
           Lupus and other Connective Tissue diseases       0.80      0.80      0.80         5
Scabies Lyme Disease and other Infestations and Bites       0.94      0.88      0.91        17
         Seborrheic Keratoses and other Benign Tumors       0.88      1.00      0.94        30

                                             accuracy                           0.93        90
                                            macro avg       0.90      0.90      0.90        90
                                         weighted avg       0.94      0.93      0.93        90



In [29]:
text_probabilities = svmmodel.predict_proba(X_test_vectors)
text_probabilities

array([[0.16569665, 0.39957295, 0.16356871, 0.27116169],
       [0.89218119, 0.02286193, 0.03327834, 0.05167855],
       [0.0464954 , 0.00928367, 0.03354228, 0.91067865],
       [0.10272886, 0.0905345 , 0.07850627, 0.72823037],
       [0.02191483, 0.0085422 , 0.04559478, 0.92394819],
       [0.91133876, 0.01544534, 0.02382494, 0.04939096],
       [0.07612868, 0.01135808, 0.06984406, 0.84266918],
       [0.91743287, 0.03241195, 0.01620599, 0.03394919],
       [0.92588936, 0.01350514, 0.0117407 , 0.04886479],
       [0.13336709, 0.01438436, 0.78626175, 0.06598681],
       [0.02418568, 0.01290849, 0.94482714, 0.01807869],
       [0.79727914, 0.05550757, 0.07667814, 0.07053514],
       [0.8811127 , 0.05647625, 0.02882919, 0.03358186],
       [0.02370131, 0.01454114, 0.94265373, 0.01910383],
       [0.70861054, 0.03947763, 0.12835789, 0.12355395],
       [0.84537281, 0.08507208, 0.01817848, 0.05137664],
       [0.73068549, 0.06489209, 0.11675375, 0.08766867],
       [0.90239632, 0.01612098,

# CNN Model

In [1]:
import os
import cv2
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# CNN libraries
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.preprocessing.image import ImageDataGenerator

2023-11-04 23:06:57.956743: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
train_data = "../data/images/dermnet/train1"
test_data = "../data/images/dermnet/test1/"

In [3]:
# Preprocess
input_shape = (224, 224, 3) 
num_classes = 4

train_datagen = ImageDataGenerator(rescale=1./255)
validation_datagen = ImageDataGenerator(rescale=1./255)

train_generator = train_datagen.flow_from_directory(
        train_data,
        target_size=input_shape[:2],
        batch_size=32,
        class_mode='categorical')

validation_generator = validation_datagen.flow_from_directory(
        test_data,
        target_size=input_shape[:2],
        batch_size=32,
        class_mode='categorical')

Found 2711 images belonging to 4 classes.
Found 679 images belonging to 4 classes.


In [4]:
def build_model(input_shape, num_classes):
    model = models.Sequential()
    model.add(layers.Conv2D(32, (3, 3), activation='relu', input_shape=input_shape))
    model.add(layers.MaxPooling2D((2, 2)))
    model.add(layers.Conv2D(64, (3, 3), activation='relu'))
    model.add(layers.MaxPooling2D((2, 2)))
    model.add(layers.Conv2D(64, (3, 3), activation='relu'))
    model.add(layers.Flatten())
    model.add(layers.Dense(64, activation='relu'))
    model.add(layers.Dense(num_classes, activation='softmax'))
    return model

In [5]:
model = build_model(input_shape, num_classes)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [7]:
history = model.fit(
      train_generator,
      steps_per_epoch=50, 
      epochs=10,
      validation_data=validation_generator,
      validation_steps=25)

Epoch 1/10


2023-11-04 23:08:30.636705: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int32
	 [[{{node Placeholder/_0}}]]




2023-11-04 23:09:35.577110: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int32
	 [[{{node Placeholder/_0}}]]


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [8]:
test_loss, test_acc = model.evaluate(validation_generator, steps=25)
print('\nTest accuracy:', test_acc)

2023-11-04 23:40:56.760053: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int32
	 [[{{node Placeholder/_0}}]]



Test accuracy: 0.5463917255401611


In [10]:
image_probabilities = model.predict(validation_generator)


2023-11-04 23:44:10.495731: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int32
	 [[{{node Placeholder/_0}}]]




In [11]:
image_probabilities

array([[0.17795499, 0.14731216, 0.29345915, 0.38127375],
       [0.0707379 , 0.0477184 , 0.29970348, 0.5818402 ],
       [0.03478829, 0.13660417, 0.45125857, 0.37734893],
       ...,
       [0.02061159, 0.00706999, 0.11229225, 0.8600261 ],
       [0.65240234, 0.05708424, 0.21682031, 0.07369312],
       [0.725041  , 0.13644284, 0.11727757, 0.02123863]], dtype=float32)

# Combining models

In [30]:
testdf = pd.read_csv("../data/test_symptoms_labelled.csv")

In [36]:
test_image_paths = testdf['Full Path']
test_symptoms = testdf['Symptoms']

In [37]:
# preprocess test text
test_symptoms = test_symptoms.str.lower()  # Convert to lower case
test_symptoms = test_symptoms.str.replace('[^\w\s]', '')  # Remove punctuation
test_symptoms = test_symptoms.str.replace('\n', ' ')  # Remove break

In [32]:
validation_generator = validation_datagen.flow_from_directory(
        test_data,
        target_size=input_shape[:2],
        batch_size=32,
        class_mode='categorical')

Found 679 images belonging to 4 classes.


In [38]:
y = testdf['Label']     # Labels (class for each text entry)
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [58]:
test_vector = vectorizer.fit_transform(test_symptoms.values.astype('U'))


In [50]:
test_vector

<679x810 sparse matrix of type '<class 'numpy.float64'>'
	with 6245 stored elements in Compressed Sparse Row format>

In [60]:
# Predict the labels for the test set
y_pred = svmmodel.predict(test_vector)

# Evaluate the model
print(f'Accuracy: {accuracy_score(y_encoded, y_pred)}')
print(classification_report(y_encoded, y_pred, target_names=label_encoder.classes_))


Accuracy: 0.4374079528718704
                                                       precision    recall  f1-score   support

                             Atopic Dermatitis Photos       0.17      0.15      0.16       123
           Lupus and other Connective Tissue diseases       0.75      0.03      0.06       105
Scabies Lyme Disease and other Infestations and Bites       0.10      0.06      0.07       108
         Seborrheic Keratoses and other Benign Tumors       0.53      0.78      0.63       343

                                             accuracy                           0.44       679
                                            macro avg       0.39      0.26      0.23       679
                                         weighted avg       0.43      0.44      0.37       679



In [64]:
text_probabilities = svmmodel.predict_proba(test_vector)
len(text_probabilities)

679

In [63]:
len(image_probabilities)

679

In [69]:
# average
combined_probabilities = (image_probabilities + text_probabilities) / 2
y_pred = np.argmax(combined_probabilities, axis=1)
print(f'Accuracy: {accuracy_score(y_encoded, y_pred)}')
print(classification_report(y_encoded, y_pred, target_names=label_encoder.classes_))


Accuracy: 0.3946980854197349
                                                       precision    recall  f1-score   support

                             Atopic Dermatitis Photos       0.17      0.14      0.15       123
           Lupus and other Connective Tissue diseases       0.14      0.07      0.09       105
Scabies Lyme Disease and other Infestations and Bites       0.07      0.04      0.05       108
         Seborrheic Keratoses and other Benign Tumors       0.52      0.70      0.59       343

                                             accuracy                           0.39       679
                                            macro avg       0.22      0.24      0.22       679
                                         weighted avg       0.32      0.39      0.35       679



In [74]:
# weighted
combined_probabilities = (0.8* image_probabilities + 0.2*text_probabilities)
y_pred = np.argmax(combined_probabilities, axis=1)
print(f'Accuracy: {accuracy_score(y_encoded, y_pred)}')
print(classification_report(y_encoded, y_pred, target_names=label_encoder.classes_))


Accuracy: 0.3799705449189985
                                                       precision    recall  f1-score   support

                             Atopic Dermatitis Photos       0.18      0.16      0.17       123
           Lupus and other Connective Tissue diseases       0.15      0.10      0.12       105
Scabies Lyme Disease and other Infestations and Bites       0.10      0.06      0.08       108
         Seborrheic Keratoses and other Benign Tumors       0.52      0.64      0.57       343

                                             accuracy                           0.38       679
                                            macro avg       0.24      0.24      0.23       679
                                         weighted avg       0.33      0.38      0.35       679



In [75]:
from sklearn.linear_model import LogisticRegression

stacked_probabilities = np.hstack((image_probabilities, text_probabilities))

fusion_model = LogisticRegression().fit(stacked_probabilities, y_encoded)