In [45]:
# !pip install -r ../requirements.txt
# !pip install mlxtend
# !pip install imblearn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from nltk.corpus import wordnet
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.manifold import TSNE
from typing import Union
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer 
import warnings
warnings.filterwarnings('ignore')
from wordcloud import WordCloud, STOPWORDS
from mlxtend.feature_selection import SequentialFeatureSelector as sfs
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from sklearn.feature_extraction.text import CountVectorizer
from math import * 
from collections import *
from catboost import CatBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.metrics import accuracy_score
import numpy as np
import itertools


# Data Extraction module

In [46]:
# def _load_dataframe(
#     df: Union[str, pd.DataFrame],
#     storage_options: Union[str, dict] = "",
#     delimiter= ','
# ):
#     """
#     Load all the required files for creating the output schema.
#     Args:
#         :param df: Path to the file for reading
#         :param storage_options: storage account details
#     Returns: 
#         a dataframe for the required file/ table
#     """

#     if isinstance(df, pd.DataFrame):
#         return df

#     else:
#         df_ = str(df)
#         if storage_options == "":
#             if df_.endswith(".csv"):
#                 df = pd.read_csv(df_,delimiter=delimiter)
#             elif df_.endswith(".parquet"):
#                 df = pd.read_parquet(df_, engine="pyarrow")
#         else:
#             if df_.endswith(".csv"):
#                 df = pd.read_csv(df_, storage_options=storage_options)
#             elif df_.endswith(".parquet"):
#                 df = pd.read_parquet(
#                     df_, engine="pyarrow", storage_options=storage_options
#                 )
#         return df

# EDA module

In [47]:
def percent_null_values(df: Union[str, pd.DataFrame]):
    """
    Report the percentage of null values in every column of a dataframe.
    Args:
        :param df: dataframe to inspect
                   NOTE(review): the annotation also admits ``str``, but the body
                   calls DataFrame methods directly and never loads a path.
    Returns:
        a dataframe with the columns ``column_name`` and ``percent_missing``,
        sorted by ascending ``percent_missing`` and re-indexed from 0
    """
    # Null count per column, expressed as a share of the total row count.
    null_share = df.isnull().sum() * 100 / len(df)
    summary = pd.DataFrame({'column_name': df.columns,
                            'percent_missing': null_share})
    return summary.sort_values('percent_missing').reset_index(drop=True)

In [48]:
def plot_top_ten_categories(df):
    """
    Plot a bar chart of the ten most frequent values of ``medical_specialty``.

    The most frequent specialties dominate the training data, so they are the
    ones a model trained on it is most likely to predict.
    Args:
        :param df: dataframe with a ``medical_specialty`` column
    Returns:
        None; the figure is displayed with plt.show()
    """
    fig, ax = plt.subplots(figsize=(18, 8))  # set size of figure
    # value_counts() is already sorted by descending frequency.
    top_specialties = df['medical_specialty'].value_counts().head(10)
    # seaborn >= 0.12 only accepts x / y as keyword arguments.
    sns.barplot(x=top_specialties.index, y=top_specialties.values, alpha=0.8, ax=ax)

    ax.set_title("Top 10 Specialties in Medical Transcriptions", fontsize=15)
    ax.set_ylabel("Frequency", fontsize=15)
    # NOTE(review): the "40" in this label is hard-coded, presumably the number
    # of specialties in the original dataset - confirm before reusing elsewhere.
    ax.set_xlabel("Top 10 Specialties out of 40 total", fontsize=15)

    plt.setp(ax.get_xticklabels(), rotation=45, fontsize=17)
    plt.show()




# Text Preprocessing  

In [49]:
# Only categories with more than `threshold` transcriptions are selected
def filter_categories_by_threshold(data_categories, threshold):
    """
    Drop every group that does not have more than ``threshold`` rows and print
    the size of each remaining category.
    Args:
        :param data_categories: grouped data exposing ``.filter(func)``; presumably
                                the result of ``df.groupby('medical_specialty')``
        :param threshold: a group is kept only when its row count is strictly
                          greater than this value
    Returns:
        the filtered (ungrouped) data containing only rows of the kept groups
    """
    # The threshold argument is honoured here (it used to be ignored in favour of 50).
    filtered_data_categories = data_categories.filter(lambda group: group.shape[0] > threshold)
    final_data_categories = filtered_data_categories.groupby(filtered_data_categories['medical_specialty'])

    print('============Reduced Categories ======================')
    for position, (cat_name, cat_rows) in enumerate(final_data_categories, start=1):
        print('Cat:' + str(position) + ' ' + cat_name + ' : ' + str(len(cat_rows)))
    print('============ Reduced Categories ======================')
    return filtered_data_categories

In [50]:
# Stopwords and punctuation removal
_CLEAN_TEXT_STOP_WORDS = [
    'abc', 'abcd', 'cm', 'ml', 'mmhg', 'wa', 'patient', 'procedure', 'history', 'room',
    'ourselves', 'hers', 'between', 'yourself', 'but', 'again', 'there', 'about', 'once',
    'during', 'my', 'on', 'out', 'very', 'having', 'with', 'they', 'own', 'an', 'be',
    'some', 'for', 'do', 'its', 'yours', 'such', 'into', 'of', 'most', 'itself', 'other',
    'off', 'is', 's', 'am', 'or', 'who', 'as', 'from', 'him', 'each', 'the', 'themselves',
    'until', 'below', 'are', 'we', 'these', 'your', 'his', 'through', 'don', 'me', 'were',
    'her', 'more', 'himself', 'this', 'down', 'should', 'our', 'their', 'while', 'above',
    'both', 'up', 'to', 'ours', 'had', 'she', 'all', 'when', 'at', 'any', 'before', 'them',
    'same', 'and', 'been', 'have', 'in', 'will', 'on', 'does', 'yourselves', 'then', 'that',
    'because', 'what', 'over', 'why', 'so', 'can', 'did', 'now', 'under', 'he', 'you',
    'herself', 'has', 'just', 'where', 'too', 'only', 'myself', 'which', 'those', 'i',
    'after', 'few', 'whom', 't', 'being', 'if', 'theirs', 'my', 'against', 'a', 'by',
    'doing', 'it', 'how', 'further', 'was', 'here', 'than', 'abcd', 'abcd general',
    'abcd general hospital', 'abcd hospital', 'abdomen abdomen', 'social history',
    'hospital', 'history history', 'patient',
]
# Longest entries first so a multi-word phrase ('abcd general hospital') is
# tried before any shorter stop word it starts with ('abcd').
_CLEAN_TEXT_STOP_RE = re.compile(
    r'\b(?:{})\b'.format('|'.join(
        re.escape(entry)
        for entry in sorted(set(_CLEAN_TEXT_STOP_WORDS), key=lambda entry: (-len(entry), entry))
    ))
)
_CLEAN_TEXT_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text):
    """
    Lower-case a text and strip stop words, punctuation and digits from it.

    The stop-word pattern is applied with ``re.sub``; the previous
    ``str.replace(pattern, '')`` treated the pattern as a literal string, so no
    stop word was ever removed. Matching runs on the lower-cased text so
    capitalised stop words ("The") are removed as well.
    Args:
        :param text: string to clean
    Returns:
        the cleaned, lower-cased string; whitespace is left untouched, so
        removed tokens may leave consecutive spaces behind
    """
    lowered = text.lower()
    without_stop_words = _CLEAN_TEXT_STOP_RE.sub('', lowered)
    # Punctuation is deleted (not replaced by a space), so "heart/lung" becomes "heartlung".
    without_punctuation = without_stop_words.translate(_CLEAN_TEXT_PUNCT_TABLE)
    return ''.join(ch for ch in without_punctuation if not ch.isdigit())

In [51]:
def lemmatize_text(text):
    """
    Lemmatize every token of a text.

    The text is split into sentences, each sentence into word tokens, and each
    token is passed through a WordNetLemmatizer.
    Args:
        :param text: string to lemmatize
    Returns:
        the lemmatized tokens joined by single spaces
    """
    lemmatizer = WordNetLemmatizer()
    return ' '.join(
        lemmatizer.lemmatize(token)
        for sentence in sent_tokenize(text)
        for token in word_tokenize(sentence)
    )

In [69]:
def vectorization(data, final_list=None):
    """
    Fit a TF-IDF vectorizer on ``data['transcription']`` and return a
    stop-word-filtered list of feature names.
    Args:
        :param data: object whose ``'transcription'`` entry supports ``.tolist()``
                     (e.g. a dataframe column of strings)
        :param final_list: optional list of names; when truthy it is filtered and
                           returned instead of the vectorizer's own vocabulary
    Returns:
        (feature_names, tfIdfMat, vectorizer) - the filtered name list, the
        matrix produced by ``fit_transform`` and the fitted vectorizer.
        Only the name list is filtered: the matrix keeps every vocabulary
        column, so ``feature_names`` does not index ``tfIdfMat`` columns
        one-to-one.
    """
    vectorizer = TfidfVectorizer(analyzer='word', stop_words='english', ngram_range=(1, 1),
                                 max_df=0.75, use_idf=True, smooth_idf=True, max_features=1000)
    tfIdfMat = vectorizer.fit_transform(data['transcription'].tolist())

    # Newer scikit-learn exposes get_feature_names_out(); older releases only
    # have get_feature_names(). Pick whichever exists instead of asking the
    # user to edit the code.
    if hasattr(vectorizer, 'get_feature_names_out'):
        vocabulary = vectorizer.get_feature_names_out()
    else:
        vocabulary = vectorizer.get_feature_names()

    # A set gives constant-time membership tests for the filter below.
    stop_words = {
        'abc', 'abcd', 'cm', 'ml', 'mmhg', 'wa', 'patient', 'procedure', 'history', 'room',
        'ourselves', 'hers', 'between', 'yourself', 'but', 'again', 'there', 'about', 'once',
        'during', 'my', 'on', 'out', 'very', 'having', 'with', 'they', 'own', 'an', 'be',
        'some', 'for', 'do', 'its', 'yours', 'such', 'into', 'of', 'most', 'itself', 'other',
        'off', 'is', 's', 'am', 'or', 'who', 'as', 'from', 'him', 'each', 'the', 'themselves',
        'until', 'below', 'are', 'we', 'these', 'your', 'his', 'through', 'don', 'me', 'were',
        'her', 'more', 'himself', 'this', 'down', 'should', 'our', 'their', 'while', 'above',
        'both', 'up', 'to', 'ours', 'had', 'she', 'all', 'when', 'at', 'any', 'before', 'them',
        'same', 'and', 'been', 'have', 'in', 'will', 'does', 'yourselves', 'then', 'that',
        'because', 'what', 'over', 'why', 'so', 'can', 'did', 'now', 'under', 'he', 'you',
        'herself', 'has', 'just', 'where', 'too', 'only', 'myself', 'which', 'those', 'i',
        'after', 'few', 'whom', 't', 'being', 'if', 'theirs', 'against', 'a', 'by',
        'doing', 'it', 'how', 'further', 'was', 'here', 'than', 'abcd general',
        'abcd general hospital', 'abcd hospital', 'abdomen abdomen', 'social history',
        'hospital', 'history history',
    }
    candidates = final_list if final_list else vocabulary
    feature_names = [name for name in candidates if name not in stop_words]
    return feature_names, tfIdfMat, vectorizer

In [70]:
def plot_confusion_matrix(cm, classes):
    """
    Draw a confusion matrix as an annotated heatmap and display it.
    Args:
        :param cm: matrix handed directly to seaborn's heatmap
        :param classes: tick labels applied to both the x and the y axis
    Returns:
        None; the figure is displayed with plt.show()
    """
    fig, ax = plt.subplots(figsize=(20, 20))
    # annot=True writes the value inside every cell.
    sns.heatmap(cm, annot=True, cmap="Greens", ax=ax, fmt='g')

    # labels, title and ticks
    ax.set_xlabel('Predicted labels', fontsize=18)
    ax.set_ylabel('True labels', fontsize=18)
    ax.set_title('Confusion Matrix', fontsize=18)
    for axis in (ax.xaxis, ax.yaxis):
        axis.set_ticklabels(classes)
    for tick_labels in (ax.get_yticklabels(), ax.get_xticklabels()):
        plt.setp(tick_labels, rotation=30, horizontalalignment='right', fontsize=18)
    plt.show()

# Lemmatizing the List 

In [54]:
def lemmatize_words(words):
    """
    Map each word to the name of the first lemma of its first WordNet synset.
    Args:
        :param words: iterable of words
    Returns:
        a list with one entry per input word; a word without any synset is
        returned unchanged
    """
    def _first_lemma(word):
        synsets = wordnet.synsets(word)
        return synsets[0].lemmas()[0].name() if synsets else word

    return [_first_lemma(word) for word in words]

# Removing Duplicates 

In [55]:
def remove_duplicates(lst):
    """
    Remove duplicate items while keeping the first occurrence of each.

    Going through a ``set`` returned the items in an arbitrary order (which for
    strings can change between interpreter runs); ``dict.fromkeys`` keeps the
    input order, so the result is reproducible.
    Args:
        :param lst: iterable of hashable items
    Returns:
        a list of the unique items in order of first appearance
    """
    return list(dict.fromkeys(lst))

# word2vec (character-count vectors)

In [56]:
def word2vec(word):
    """
    Describe a word by its character counts (this is not the neural word2vec model).
    Args:
        :param word: the word to describe
    Returns:
        (character counter, set of distinct characters, Euclidean length of the
        count vector, the original word)
    """
    char_counts = Counter(word)
    distinct_chars = set(char_counts.keys())
    # Euclidean norm of the per-character counts.
    squared_total = sum(count * count for count in char_counts.values())
    return char_counts, distinct_chars, sqrt(squared_total), word

# Cosine Similarity 

In [57]:
def cosine_similarity(vector1, vector2, ndigits):
    """
    Cosine similarity between two vectors shaped like the output of word2vec.
    Args:
        :param vector1: sequence whose items 0, 1, 2 are the character counts,
                        the character set and the vector length
        :param vector2: second vector, same layout
        :param ndigits: number of decimal places the score is rounded to
    Returns:
        the rounded similarity, or 0 when the product of the two lengths is 0
    """
    counts_a, counts_b = vector1[0], vector2[0]
    # Only characters present in both words contribute to the dot product.
    shared = vector1[1].intersection(vector2[1])
    dot_product = sum(counts_a[ch] * counts_b[ch] for ch in shared)
    norm_product = vector1[2] * vector2[2]
    if norm_product == 0:
        # At least one word is empty.
        return 0
    return round(dot_product / norm_product, ndigits)

In [58]:
def find_similar(full_names_list, similarity_threshold, ndigits):
    """
    Compare every pair of names once and keep the pairs that are similar enough.
    Args:
        :param full_names_list: iterable of names; each one is converted with str()
        :param similarity_threshold: lowest similarity score that is kept
        :param ndigits: number of decimal places the score is rounded to
    Returns:
        a dataframe with the columns ``full_name``, ``comparison_name`` and
        ``similarity_score``; the columns are present even when no pair
        qualifies, so callers can rely on a single schema
    """
    vector_list = [word2vec(str(name)) for name in full_names_list]

    results_list = []
    # combinations() yields each unordered pair exactly once, in list order.
    for vector1, vector2 in itertools.combinations(vector_list, 2):
        similarity_score = cosine_similarity(vector1, vector2, ndigits)
        # Keep scores between the threshold and 1 (inclusive); identical names
        # score 1 and are therefore kept.
        if similarity_threshold <= similarity_score <= 1:
            results_list.append([vector1[3], vector2[3], similarity_score])

    return pd.DataFrame(
        results_list,
        columns=['full_name', 'comparison_name', 'similarity_score'],
    )

# Jaccard Similarity

In [59]:

def jaccard_similarity(list_, threshold=0):
    """
    Jaccard similarity for every pair of items in a list.

    Each item is turned into a set of its elements (for a string: its
    characters) and the score is |intersection| / |union|.
    Args:
        :param list_: iterable of iterables (e.g. words)
        :param threshold: lowest score that is kept (default 0 keeps every pair)
    Returns:
        a list of ``(x, y, score)`` tuples, one per pair reaching the threshold.
        Two empty items have an empty union; their score is defined as 0.0
        (matching cosine_similarity's handling of empty words) instead of
        raising ZeroDivisionError.
    """
    result = []
    for x, y in itertools.combinations(list_, 2):
        set_x, set_y = set(x), set(y)
        union_cardinality = len(set_x | set_y)
        if union_cardinality == 0:
            score = 0.0
        else:
            score = len(set_x & set_y) / float(union_cardinality)
        if score >= threshold:
            result.append((x, y, score))
    return result

# Logistic Regression Modelling 

In [60]:
def fit_logistic_regression(X_train, y_train, X_test, logistic_regression_params):
    """
    Fit a LogisticRegression model and predict on the test features.
    Args:
        :param X_train: training features
        :param y_train: training labels
        :param X_test: features to predict on
        :param logistic_regression_params: keyword arguments for LogisticRegression
    Returns:
        (predictions for X_test, fitted classifier)
    """
    estimator = LogisticRegression(**logistic_regression_params)
    fitted = estimator.fit(X_train, y_train)
    predictions = fitted.predict(X_test)
    return predictions, fitted


# SVM Modelling 

In [61]:


def svm_classification(X_train, y_train, X_test, y_test, svm_params):
    """
    Fit an SVC model and predict the labels of the test features.
    Args:
        :param X_train: training features
        :param y_train: training labels
        :param X_test: features to predict on
        :param y_test: accepted but not used by this function
        :param svm_params: keyword arguments for svm.SVC
    Returns:
        (predictions for X_test, fitted classifier)
    """
    estimator = svm.SVC(**svm_params)
    fitted = estimator.fit(X_train, y_train)
    return fitted.predict(X_test), fitted

# Random Forest

In [62]:

def train_random_forest(X_train, y_train, X_test, y_test, rf_params):
    """
    Fit a RandomForestClassifier and predict on the test features.
    Args:
        :param X_train: training features
        :param y_train: training labels
        :param X_test: features to predict on
        :param y_test: accepted but not used by this function
        :param rf_params: keyword arguments for RandomForestClassifier
    Returns:
        (predictions for X_test, fitted classifier)
    """
    forest = RandomForestClassifier(**rf_params)
    fitted = forest.fit(X_train, y_train)
    return fitted.predict(X_test), fitted

# Gradient Boosting

In [63]:

def train_gbdt(X_train, y_train, X_test, y_test, gbdt_params):
    """
    Fit a GradientBoostingClassifier and predict on the test features.
    Args:
        :param X_train: training features
        :param y_train: training labels
        :param X_test: features to predict on
        :param y_test: accepted but not used by this function
        :param gbdt_params: keyword arguments for GradientBoostingClassifier
    Returns:
        (predictions for X_test, fitted classifier)
    """
    booster = GradientBoostingClassifier(**gbdt_params)
    fitted = booster.fit(X_train, y_train)
    return fitted.predict(X_test), fitted

# Categorical Boosting

In [64]:

def train_catboost(X_train, y_train, X_test, y_test, catboost_params):
    """
    Fit a CatBoostClassifier and predict on the test features.
    Args:
        :param X_train: training features
        :param y_train: training labels
        :param X_test: features to predict on; also the first half of eval_set
        :param y_test: labels forming the second half of eval_set
        :param catboost_params: keyword arguments for CatBoostClassifier
    Returns:
        (predictions for X_test, fitted classifier)
    """
    booster = CatBoostClassifier(**catboost_params)
    # NOTE(review): the test split is handed to CatBoost as its eval_set -
    # confirm this is intended, since any eval-set-driven model selection
    # would then be tuned on the data used for the final evaluation.
    evaluation_pair = (X_test, y_test)
    booster.fit(X_train, y_train, eval_set=evaluation_pair, verbose=100)
    predictions = booster.predict(X_test)
    return predictions, booster