<a href="https://colab.research.google.com/github/thesis17/Afaan-Oromoo-chatGPT/blob/main/Medical_Symptoms_Text_Classification_%2B_Web_App.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Run this cell first: it imports the Kaggle data sources used by the notebook.
# This notebook environment differs from Kaggle's Python environment, so some
# libraries used further down may be missing here.
import kagglehub

# Keep the value returned by `dataset_download` for this dataset handle.
paultimothymooney_medical_speech_transcription_and_intent_path = kagglehub.dataset_download(
    'paultimothymooney/medical-speech-transcription-and-intent'
)

print('Data source import complete.')


# Medical Symptoms Text Classification

> The adoption of natural language processing in healthcare is rising because of its recognized potential to search, analyze and interpret mammoth amounts of patient datasets. Using advanced medical algorithms, machine learning in healthcare and NLP technology services have the potential to harness relevant insights and concepts from data that was previously considered buried in text form. NLP in healthcare media can accurately give voice to the unstructured data of the healthcare universe, giving incredible insight into understanding quality, improving methods, and better results for patients.

![](https://i.imgur.com/SJPzebD.png)

The web app is available at: https://medical-symptoms-classifier.herokuapp.com/

Source Code: https://github.com/gabbygab1233/Medical-Symptoms-Classifier

In [None]:
#!pip install pyspellchecker
#!pip install neattext

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import warnings
import altair as alt
import pickle
import string
import spacy
import nltk
import re

from sklearn.naive_bayes import *
from sklearn.ensemble import *
from sklearn.neighbors import *
from sklearn.tree import *
from sklearn.calibration import *
from sklearn.linear_model import *
from sklearn.multiclass import *
from sklearn.svm import *
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from nltk.stem import WordNetLemmatizer
from collections import Counter

from spacy.lang.en.stop_words import STOP_WORDS
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, auc, roc_curve
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer, HashingVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.pipeline import Pipeline, make_pipeline

#nltk.download('stopwords')
sns.set(style='whitegrid')
%matplotlib inline
warnings.filterwarnings('ignore')

# Load data
df = pd.read_csv('../input/medical-speech-transcription-and-intent/Medical Speech, Transcription, and Intent/overview-of-recordings.csv')
df.head()

# [**Exploratory Data Analysis**](http://)

In [None]:
#Analyze Data
def explore_data(df):
    """Print a quick structural overview of a DataFrame.

    Prints the number of rows and columns, the column index, and the report
    produced by ``DataFrame.info()``.

    Args:
        df: pandas DataFrame to summarise.

    Returns:
        None. Everything is written to standard output.
    """
    n_rows, n_cols = df.shape
    print(f"The data contains {n_rows} rows and {n_cols} columns.")
    print('\n')
    print('Dataset columns:',df.columns)
    print('\n')
    # `DataFrame.info()` prints its report itself and returns None, so wrapping
    # it in print() would append a stray "None" line to the output.
    df.info()

# Print the overview for the frame loaded from overview-of-recordings.csv
explore_data(df)

# [**Checking for NaN Values and Duplicates**](http://)

In [None]:
# Number of missing values in each column of `df`
df.isna().sum()

In [None]:
def checking_removing_duplicates(df):
    """Report how many fully duplicated rows ``df`` has and drop them in place.

    Args:
        df: pandas DataFrame; it is modified in place when duplicates exist.

    Returns:
        None. The count and the action taken are printed.
    """
    n_dups = df.duplicated().sum()
    print("Number of Duplicates: ", n_dups)
    # Guard clause: nothing to remove.
    if n_dups < 1:
        print('No Duplicate values')
        return
    df.drop_duplicates(inplace=True)
    print('Duplicate values removed!')
# Drops duplicated rows from `df` in place (the function uses inplace=True)
checking_removing_duplicates(df)

# [**Different types of analysis require different data format**](http://)
* **Corpus** - a collection of text
* **Document-Term Matrix** - word counts in matrix format

# **Corpus**

In [None]:
# Keep only the text and its label. Take an explicit copy: new columns are
# assigned to `df_text` in later cells, and writing to a column subset of `df`
# is the pattern pandas flags with SettingWithCopyWarning.
df_text = df[['phrase', 'prompt']].copy()
df_text

# **Document-Term Matrix**

In [None]:
# Build the document-term matrix: one row per phrase, one column per token.
cv = CountVectorizer()
df_cv = cv.fit_transform(df_text.phrase)
# `get_feature_names()` was removed in scikit-learn 1.2; use its replacement
# `get_feature_names_out()` when available and fall back for older releases.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()
data_dtm = pd.DataFrame(df_cv.toarray(), columns=feature_names)
# Align the matrix rows with the rows of the corpus frame.
data_dtm.index = df_text.index
data_dtm


In [None]:
# Add features
# Number of characters in the text
df_text['phrase_length'] = df_text['phrase'].apply(len)
# Number of words in the text
df_text['phrase_num_words'] = df_text['phrase'].apply(lambda x: len(x.split()))
# Average length of the words in the text
df_text["mean_word_len"] = df_text["phrase"].apply(lambda x: np.mean([len(w) for w in str(x).split()]))
# Number of non-stopwords in the text.
# Tokens are lower-cased before the lookup, as the text-cleaning function of
# this notebook does, so capitalised stop words ("I", "The") are not miscounted.
df_text['phrase_non_stopwords'] = df_text['phrase'].apply(
    lambda x: len([t for t in x.split() if t.lower() not in STOP_WORDS])
)
# Summary statistics of the numeric feature columns, one feature per row
df_text.describe().T

In [None]:
# Per-class counts as a two-column (Label, Count) table.
# The columns are named explicitly instead of renaming whatever reset_index()
# produces: pandas 1.x yields ('index', 'prompt') and pandas 2.x yields
# ('prompt', 'count'), so a fixed rename mapping breaks on one of them.
cat_dist = df_text['prompt'].value_counts().rename_axis('Label').reset_index(name='Count')
# NOTE(review): the original dropped row 0 here, i.e. the most frequent class,
# which hid it from the "Class Distribution" chart; all classes are kept now —
# confirm the drop was not intentional.
alt.Chart(cat_dist).mark_bar(opacity=0.7).encode(
    x=alt.X('Count', title='Count'),
    y=alt.Y('Label', sort='-x',title='Category'),
    tooltip=['Label','Count']
).properties(height=800,width=700,title="Class Distribution")

In [None]:
# Print every class with its absolute count and its share of all rows
target = df_text['prompt'].values
counter = Counter(target)
total = len(target)
for label, count in counter.items():
    share = count / total * 100
    print('Class=%s, Count=%d, Percentage=%.3f%%' % (label, count, share))

In [None]:
# Turn off Altair's maximum-rows check before charting the full frame
alt.data_transformers.disable_max_rows()

# Binned phrase length on x, number of phrases per bin on y (tick labels hidden)
length_axis = alt.X("phrase_length:Q",  bin=alt.Bin(maxbins=100), title='Phrase Length Class')
frequency_axis = alt.Y('count()', axis=alt.Axis(labels=False), title='Frequency')

alt.Chart(df_text).mark_bar(
    color="violet", opacity=0.7, interpolate='step'
).encode(
    length_axis,
    frequency_axis,
    tooltip=['phrase_length'],
).properties(height=400, width=700, title="Length Distribution")

# [**Text Cleaning**](http://)
You cannot go straight from raw text to fitting a machine learning or deep learning model. You
must clean your text first, which means splitting it into words and handling punctuation and
case.
* Load the raw text

* Split into tokens

* Convert to lowercase

* Remove punctuation from each token.

* Filter out remaining tokens that are not alphabetic.

* Filter out tokens that are stop words

* Filter out short tokens by checking their length

## **Additional Text Cleaning Considerations**
* Extracting text from markup like HTML, PDF, or other structured document formats.

* Transliteration of characters from other languages into English.

* Decoding Unicode characters into a normalized form, such as UTF8.

* Handling of domain specific words, phrases, and acronyms.

* Handling or removing numbers, such as dates and amounts.

* Locating and correcting common typos and misspellings.

In [None]:
def clean_txt(docs):
    """Normalise one raw phrase into a space-joined string of lemmatised tokens.

    Steps, in order: tokenise with NLTK, lower-case, strip punctuation
    characters, keep purely alphabetic tokens, drop spaCy stop words, drop
    tokens of two characters or fewer, lemmatise with WordNet.

    Args:
        docs: the raw phrase. Non-string values (e.g. NaN) are treated as
            empty text instead of raising inside the tokenizer.

    Returns:
        str: the cleaned tokens joined by single spaces ('' if nothing is left).
    """
    if not isinstance(docs, str):
        return ''
    lemmatizer = WordNetLemmatizer()
    # prepare regex for char filtering
    re_punc = re.compile('[%s]' % re.escape(string.punctuation))
    # split into words, lower-case them and remove punctuation from each word
    stripped = (re_punc.sub('', w.lower()) for w in nltk.word_tokenize(docs))
    # keep alphabetic, non-stop-word tokens longer than two characters.
    # Membership is tested on STOP_WORDS directly rather than on a list copy
    # rebuilt for every token.
    words = [
        w for w in stripped
        if w.isalpha() and w not in STOP_WORDS and len(w) > 2
    ]
    # lemmatize (not stem) every remaining word
    return ' '.join(lemmatizer.lemmatize(w) for w in words)

# Cleaning the text data
df_text['cleaned_phrase'] = df_text['phrase'].apply(clean_txt)
df_text

In [None]:
# `FreqDist` is never imported by name in this notebook, so the bare call
# raised NameError; reach it through the already-imported `nltk` package.
# Each whole phrase is one sample, so this counts repeated phrases.
freq_splits = nltk.FreqDist(df_text['phrase'])
print(f"***** 10 most common strings ***** \n{freq_splits.most_common(10)}", "\n")

# [**Wordcloud for each class**](http://)

In [None]:
from wordcloud import WordCloud

plt.rcParams['figure.figsize'] = [10, 6]

# One word cloud per prompt class, built from that class's cleaned phrases
for prompt_label in df_text.prompt.unique():
    in_class = (df_text.cleaned_phrase.notnull()) & (df_text.prompt == prompt_label)
    class_text = df_text.cleaned_phrase[in_class].to_string()
    wc = WordCloud(background_color="black", colormap="Dark2",
                   max_font_size=150, random_state=42)
    wc.generate(class_text)
    plt.imshow(wc, interpolation="bilinear")
    plt.title(prompt_label, fontdict={'size':16,'weight':'bold'})
    plt.axis("off")
    plt.show()

# [Text Data Preparation and Model Training](http://)
Text data requires special preparation before you can start using it for predictive modeling. The
text must first be parsed to extract words, a step called tokenization. Then the words need to be
encoded as integers or floating point values for use as input to a machine learning algorithm, a
step called feature extraction (or vectorization).


In [None]:
# Spot-Check Normalized Text Models
def NormalizedTextModel(nameOfvect):
    """Build the list of candidate pipelines for one text vectorizer.

    Args:
        nameOfvect: 'countvect' (CountVectorizer), 'tfvect' (TfidfVectorizer)
            or 'hashvect' (HashingVectorizer).

    Returns:
        list of ``(label, Pipeline)`` tuples. Each label is ``nameOfvect``
        followed by a short classifier tag; each pipeline is a 'Vectorizer'
        step followed by one classifier with default settings.

    Raises:
        ValueError: if ``nameOfvect`` is not one of the supported names
            (previously this surfaced as an unbound-variable error).
    """
    vectorizer_classes = {
        'countvect': CountVectorizer,
        'tfvect': TfidfVectorizer,
        # NOTE(review): HashingVectorizer output may contain negative values,
        # which MultinomialNB does not accept — possibly why the 'hashvect'
        # spot-check cell is commented out; confirm.
        'hashvect': HashingVectorizer,
    }
    if nameOfvect not in vectorizer_classes:
        raise ValueError(
            "Unknown vectorizer %r; expected one of %s"
            % (nameOfvect, sorted(vectorizer_classes))
        )
    make_vectorizer = vectorizer_classes[nameOfvect]

    # (label suffix, pipeline step name, classifier)
    candidates = [
        ('MultinomialNB', 'NB', MultinomialNB()),
        ('CCCV', 'CCCV', CalibratedClassifierCV()),
        ('KNN', 'KNN', KNeighborsClassifier()),
        ('CART', 'CART', DecisionTreeClassifier()),
        ('PAC', 'PAC', PassiveAggressiveClassifier()),
        # This entry wraps RidgeClassifier; it was previously labelled 'SVM',
        # which misreported the model in the printed results.
        ('RC', 'RC', RidgeClassifier()),
        ('AB', 'AB', AdaBoostClassifier()),
        ('GBM', 'GMB', GradientBoostingClassifier()),
        ('RF', 'RF', RandomForestClassifier()),
        ('ET', 'ET', ExtraTreesClassifier()),
        ('SGD', 'SGD', SGDClassifier()),
        ('OVRC', 'OVRC', OneVsRestClassifier(LogisticRegression())),
        ('Bagging', 'Bagging', BaggingClassifier()),
        ('NN', 'NN', MLPClassifier()),
        #('xgboost', 'xgboost', XGBClassifier()),
    ]

    # Every pipeline gets its own vectorizer instance so that fitting one
    # pipeline directly cannot alter the state seen by another.
    return [
        (nameOfvect + suffix,
         Pipeline([('Vectorizer', make_vectorizer()), (step_name, classifier)]))
        for suffix, step_name, classifier in candidates
    ]

# Training model
def fit_model(X_train, y_train,models):
    """Cross-validate every candidate model and print its accuracy.

    Args:
        X_train: training inputs passed unchanged to ``cross_val_score``.
        y_train: training labels.
        models: iterable of ``(name, estimator)`` pairs.

    Returns:
        None. For each model one line ``"<name>: <mean> (<std>)"`` of the
        10-fold accuracy scores is printed.
    """
    # Test options and evaluation metric
    num_folds = 10
    scoring = 'accuracy'
    # The splitter has no shuffle/random state, so a single instance can be
    # reused for every model instead of being rebuilt inside the loop.
    kfold = KFold(n_splits=num_folds)

    for name, model in models:
        cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
        print("%s: %f (%f)" % (name, cv_results.mean(), cv_results.std()))

# Split data to training and validation set
def read_in_and_split_data(data, features,target):
    """Select the feature and target columns and split them 80/20.

    Args:
        data: DataFrame holding both columns.
        features: column label(s) used as model input.
        target: column label of the class to predict.

    Returns:
        tuple ``(X_train, X_test, y_train, y_test)`` from ``train_test_split``
        with ``test_size=0.2`` and ``random_state=0``.
    """
    inputs, labels = data[features], data[target]
    return tuple(train_test_split(inputs, labels, test_size=0.2, random_state=0))

# Model input is the cleaned text; the label is the prompt column
X = 'cleaned_phrase'
target_class = 'prompt'
X_train, X_test, y_train, y_test = read_in_and_split_data(df_text, X, target_class)

# [**Bag of Words Model**](http://)
We cannot work with text directly when using machine learning algorithms. Instead, we need
to convert the text to numbers. We may want to perform classification of documents, so each
document is an input and a class label is the output for our predictive algorithm. Algorithms
take vectors of numbers as input, therefore we need to convert documents to fixed-length vectors
of numbers.

## [**Word Counts with countvectorizer** ](http://)
- The CountVectorizer provides a simple way to tokenize a collection of text documents, build a vocabulary of known words, and encode new documents using that vocabulary.

In [None]:
# first ten training phrases as a small demo corpus
sample_text_count = X_train[:10]
# build the vocabulary and encode the same documents in one step
vectorizer = CountVectorizer()
vector = vectorizer.fit_transform(sample_text_count)
# learned token -> column index mapping
print(vectorizer.vocabulary_)
# shape, container type and dense view of the encoded documents
print(vector.shape)
print(type(vector))
print(vector.toarray())

# [**Spot-Check Algorithms with Countvectorizer**](http://)

In [None]:
# CountVectorizer: cross-validate every candidate pipeline on word counts
models = NormalizedTextModel('countvect')
fit_model(X_train, y_train, models)

## [**Word Frequencies with TfidfVectorizer** ](http://)
- Word counts are a good starting point, but are very basic. One issue with simple counts is that some words, like "the", will appear many times, and their large counts will not be very meaningful in the encoded vectors. An alternative is to calculate word frequencies, and by far the most popular method is called TF-IDF.
	* **Term Frequency**: This summarizes how often a given word appears within a document.
	* **Inverse Document Frequency**: This downscales words that appear a lot across documents.

In [None]:
# first ten training phrases as a small demo corpus
sample_text_Tfid = X_train[:10]
# learn vocabulary and IDF weights, and encode the same documents, in one step
vectorizer = TfidfVectorizer()
vector = vectorizer.fit_transform(sample_text_Tfid)
# learned token -> column index mapping and the per-token IDF weights
print(vectorizer.vocabulary_)
print(vectorizer.idf_)
# shape and dense view of the encoded documents
print(vector.shape)
print(vector.toarray())

# [**Spot-Check Algorithms with TfidfVectorizer**](http://)

In [None]:
# TfidfVectorizer: cross-validate every candidate pipeline on TF-IDF features
models = NormalizedTextModel('tfvect')
fit_model(X_train, y_train, models)

## [**Hashing with HashingVectorizer**](http://)
- Counts and frequencies can be very useful, but one limitation of these methods is that the
vocabulary can become very large. This, in turn, requires large vectors for encoding
documents, imposes large memory requirements, and slows down algorithms. A clever
workaround is to use a one-way hash of words to convert them to integers. The clever part is that
no vocabulary is required and you can choose a fixed-length vector of arbitrary size.

In [None]:
# sample text
sample_text_hash = X_train[:10]
# create the transform
vectorizer = HashingVectorizer(n_features=20)
# encode document
vector = vectorizer.transform(sample_text_hash)
# summarize encoded vector
print(vector.shape)
print(vector.toarray())

# [**Spot-Check Algorithms  with HashingVectorizer**](http://)

In [None]:
# HashingVectorizer spot-check (currently disabled: both lines are commented out)
#models = NormalizedTextModel('hashvect')
#fit_model(X_train, y_train, models)

# [**Fine tuning**](http://)

In [None]:
# Tune TF-IDF + bagging as ONE pipeline. Fitting the vectorizer on all of
# X_train before cross-validation lets every validation fold influence the
# vocabulary and IDF weights (data leakage); inside a Pipeline the vectorizer
# is re-fitted on the training part of each fold only.
model = Pipeline([('vect', TfidfVectorizer()), ('bagging', BaggingClassifier())])
n_estimators = [10, 100, 1000]

#define grid search
# Parameters of a pipeline step are addressed as '<step name>__<parameter>'.
grid = {'bagging__n_estimators': n_estimators}
cv = KFold(n_splits=10)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy')
grid_result = grid_search.fit(X_train, y_train)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

# [**Predict unseen data**](http://)

In [None]:
def classification_metrics(model, y_test, y_pred, title='Confusion matrix'):
    """Print accuracy scores, plot the confusion matrix and print the report.

    Note: the accuracy lines score ``model`` on the notebook-level variables
    ``X_train``, ``y_train`` and ``X_test``; they are not parameters.

    Args:
        model: fitted estimator exposing ``score(X, y)``.
        y_test: true labels of the validation set.
        y_pred: labels predicted for the validation set.
        title: heading of the confusion-matrix figure. The previous hard-coded
            title named a (misspelled) logistic regression model regardless of
            the estimator actually passed in.

    Returns:
        None. Output is printed and shown as a matplotlib figure.
    """
    print(f"Training Accuracy Score: {model.score(X_train, y_train) * 100:.1f}%")
    print(f"Validation Accuracy Score: {model.score(X_test, y_test) * 100:.1f}%")

    # Sorted union of the true and predicted labels, passed explicitly so the
    # same values can label the heatmap rows/columns instead of bare positions.
    labels = np.unique(np.concatenate([np.asarray(y_test), np.asarray(y_pred)]))
    conf_matrix = confusion_matrix(y_test, y_pred, labels=labels)
    fig, ax = plt.subplots(figsize=(8,6))
    sns.heatmap(pd.DataFrame(conf_matrix, index=labels, columns=labels),
                annot=True, cmap='YlGnBu', fmt='g', ax=ax)
    ax.xaxis.set_label_position('top')
    plt.tight_layout()
    plt.title(title, fontsize=20, y=1.1)
    plt.ylabel('Actual label', fontsize=15)
    plt.xlabel('Predicted label', fontsize=15)
    plt.show()
    print(classification_report(y_test, y_pred))

# TF-IDF features followed by a 10-estimator bagging classifier
pipeline_steps = [
    ('vect', TfidfVectorizer()),
    ('bagging', BaggingClassifier(n_estimators=10)),
]
text_clf = Pipeline(pipeline_steps)
# fit on the training split, predict the held-out split, then report
model = text_clf.fit(X_train, y_train)
y_pred = model.predict(X_test)
classification_metrics(model, y_test, y_pred)

# References
* https://www.foreseemed.com/natural-language-processing-in-healthcare
* https://appen.com/datasets/audio-recording-and-transcription-for-medical-scenarios/
* https://www.kaggle.com/paultimothymooney/medical-speech-transcription-and-intent