In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.manifold import TSNE

from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer 

In [15]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Utkarsh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Utkarsh\AppData\Roaming\nltk_data...


True

In [5]:
df = pd.read_csv("medical_Data.csv", index_col= False)

In [6]:
df.head()

Unnamed: 0,class,description
0,4,Catheterization laboratory events and hospital...
1,5,Renal abscess in children. Three cases of rena...
2,2,Hyperplastic polyps seen at sigmoidoscopy are ...
3,5,Subclavian artery to innominate vein fistula a...
4,4,Effect of local inhibition of gamma-aminobutyr...


In [8]:
df["class"].value_counts()

class
5    4805
1    3163
4    3051
3    1925
2    1494
Name: count, dtype: int64

In [9]:
def get_sentence_word_count(text_list):
    sent_count = 0
    word_count = 0
    vocab = {}
    for text in text_list:
        sentences=sent_tokenize(str(text).lower())
        sent_count = sent_count + len(sentences)
        for sentence in sentences:
            words=word_tokenize(sentence)
            for word in words:
                if(word in vocab.keys()):
                    vocab[word] = vocab[word] +1
                else:
                    vocab[word] =1 
    word_count = len(vocab.keys())
    return sent_count,word_count

In [11]:
df = df[df['description'].notna()]
sent_count,word_count= get_sentence_word_count(df['description'].tolist())
print("Number of sentences in description column: "+ str(sent_count))
print("Number of unique words in description column: "+str(word_count))



data_categories  = df.groupby(df['class'])
i = 1
print('===========Different health condition =======================')
for catName,dataCategory in data_categories:
    print('Cat:'+str(i)+' '+str(catName) + ' : '+ str(len(dataCategory)) )
    i = i+1
print('==================================')

Number of sentences in description column: 122404
Number of unique words in description column: 54889
Cat:1 1 : 3163
Cat:2 2 : 1494
Cat:3 3 : 1925
Cat:4 4 : 3051
Cat:5 5 : 4805


In [13]:
def clean_text(text ): 
    text = text.translate(str.maketrans('', '', string.punctuation))
    text1 = ''.join([w for w in text if not w.isdigit()]) 
    REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
    #BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
    
    text2 = text1.lower()
    text2 = REPLACE_BY_SPACE_RE.sub('', text2) # replace REPLACE_BY_SPACE_RE symbols by space in text
    #text2 = BAD_SYMBOLS_RE.sub('', text2)
    return text2

def lemmatize_text(text):
    wordlist=[]
    lemmatizer = WordNetLemmatizer() 
    sentences=sent_tokenize(text)
    
    intial_sentences= sentences[0:1]
    final_sentences = sentences[len(sentences)-2: len(sentences)-1]
    
    for sentence in intial_sentences:
        words=word_tokenize(sentence)
        for word in words:
            wordlist.append(lemmatizer.lemmatize(word))
    for sentence in final_sentences:
        words=word_tokenize(sentence)
        for word in words:
            wordlist.append(lemmatizer.lemmatize(word))       
    return ' '.join(wordlist) 

In [18]:
print('Sample description 1:'+df.iloc[5]['description']+'\n')
print('Sample description 2:'+df.iloc[125]['description']+'\n')
print('Sample description 3:'+df.iloc[1000]['description'])

Sample description 1:infection during chronic epidural catheterization  diagnosis and treatment  it wa concluded that use of longterm epidural catheterization is associated with a definable epidural infection rate 

Sample description 2:comparison of collagen propeptides a growth marker in child with inflammatory bowel disease  these observation indicate that the serum concentration of both collagen propeptides reflect growth activity in child with inflammatory bowel disease and suggest that routine measurement of collagen propeptides may have clinical value in monitoring normal and abnormal growth 

Sample description 3:histologic abnormality of large and small coronary artery  neural structure  and the conduction system of the heart found in postmortem study of individual dying from the toxic oil syndrome  based upon observation by others with experimental feeding of rapeseed oil containing either high or low erucic acid  we suggest that this oil must remain a major suspected cause o

In [16]:
df['description'] = df['description'].apply(lemmatize_text)
df['description'] = df['description'].apply(clean_text)

In [19]:
print('Sample description 1:'+df.iloc[5]['description']+'\n')
print('Sample description 2:'+df.iloc[125]['description']+'\n')
print('Sample description 3:'+df.iloc[1000]['description'])

Sample description 1:infection during chronic epidural catheterization  diagnosis and treatment  it wa concluded that use of longterm epidural catheterization is associated with a definable epidural infection rate 

Sample description 2:comparison of collagen propeptides a growth marker in child with inflammatory bowel disease  these observation indicate that the serum concentration of both collagen propeptides reflect growth activity in child with inflammatory bowel disease and suggest that routine measurement of collagen propeptides may have clinical value in monitoring normal and abnormal growth 

Sample description 3:histologic abnormality of large and small coronary artery  neural structure  and the conduction system of the heart found in postmortem study of individual dying from the toxic oil syndrome  based upon observation by others with experimental feeding of rapeseed oil containing either high or low erucic acid  we suggest that this oil must remain a major suspected cause o

In [22]:
vectorizer = TfidfVectorizer(analyzer='word', stop_words='english',ngram_range=(1,3), max_df=0.75, use_idf=True, smooth_idf=True, max_features=1000)
tfIdfMat  = vectorizer.fit_transform(df['description'].tolist() )
feature_names = sorted(vectorizer.vocabulary_.keys())
print(feature_names)

['abdominal', 'ability', 'abnormal', 'abnormality', 'abscess', 'absence', 'accuracy', 'achieved', 'acid', 'acquired', 'action', 'activation', 'active', 'activity', 'acute', 'acute myocardial', 'acute myocardial infarction', 'addition', 'additional', 'adenocarcinoma', 'adenoma', 'adjuvant', 'administration', 'adult', 'advanced', 'adverse', 'affect', 'affected', 'age', 'agent', 'aggressive', 'aids', 'airway', 'alcohol', 'alpha', 'alteration', 'altered', 'analysis', 'anastomosis', 'andor', 'aneurysm', 'angina', 'angiographic', 'angiography', 'angioplasty', 'animal', 'antagonist', 'anterior', 'antibiotic', 'antibody', 'antigen', 'aorta', 'aortic', 'appear', 'appearance', 'appeared', 'appears', 'approach', 'appropriate', 'area', 'arrest', 'arrhythmia', 'arterial', 'artery', 'artery bypass', 'artery disease', 'arthritis', 'arthroplasty', 'ascites', 'aspect', 'aspiration', 'assessment', 'associated', 'association', 'asymptomatic', 'atrial', 'atrial fibrillation', 'atrophy', 'author', 'availab

In [25]:
import gc
gc.collect()
tfIdfMatrix = tfIdfMat.todense()
labels = df['description'].tolist()

tsne_results = TSNE(n_components=2,init='random',random_state=0, perplexity=40).fit_transform(np.asarray(tfIdfMatrix))
plt.figure(figsize=(16,10))
palette = sns.hls_palette(21, l=.6, s=.9)
sns.scatterplot(
    x=tsne_results[:,0], y=tsne_results[:,1],
    hue=labels,
    palette= palette,
    legend="full",
    alpha=0.3
)
plt.show()

The palette list has fewer values (21) than needed (11226) and will cycle, which may produce an uninterpretable plot.
  sns.scatterplot(


ValueError: Image size of 7929x235153 pixels is too large. It must be less than 2^16 in each direction.

<Figure size 1600x1000 with 1 Axes>