In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
from pandas import DataFrame as df

In [3]:
import warnings
# Ignore FutureWarning messages from here on so they do not clutter the cell outputs.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [4]:
import ipytest
import pytest

# NOTE(review): presumably the installed ipytest release needs `__file__` to
# know which notebook it is collecting tests from (the pytest reports further
# down list "NatureDataset.py") — confirm against the ipytest version in use.
__file__ = "NatureDataset.ipynb"

In [5]:
# Load the symptom/disease table. The file is tab-separated, so only the tab
# separator is passed: `sep` and `delimiter` are aliases of each other and
# recent pandas releases raise ValueError when both are supplied.
dataf = pd.read_csv("../datasets/disease.csv", sep='\t')

### Unit tests

In [6]:
def test_import_data():
    assert os.path.exists("../datasets/disease.csv") == 1
    assert pd.read_csv("../datasets/disease.csv", sep=' ', delimiter='\t').size != 0
# Run the notebook's tests through ipytest; the pytest report is printed below.
ipytest.run()

platform linux -- Python 3.6.7, pytest-4.3.1, py-1.8.0, pluggy-0.9.0
rootdir: /home/tgey/Medicision/IA/Notebooks, inifile:
collected 1 item

NatureDataset.py .                                                          [100%]



# Data Exploration

## Dataset discovery

In [7]:
# Overview of the frame: number of rows, column names, non-null counts and dtypes.
dataf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 147978 entries, 0 to 147977
Data columns (total 4 columns):
MeSH Symptom Term    147978 non-null object
MeSH Disease Term    147978 non-null object
PubMed occurrence    147978 non-null int64
TFIDF score          147978 non-null float64
dtypes: float64(1), int64(1), object(2)
memory usage: 4.5+ MB


In [8]:
# Show the column labels exactly as they were read from the file.
dataf.columns

Index(['MeSH Symptom Term', 'MeSH Disease Term', 'PubMed occurrence',
       'TFIDF score'],
      dtype='object')

In [9]:
# Swap the verbose headers for short names. The assignment is positional, so
# the list must follow the column order of the file.
short_names = ['Symptoms', 'Diseases', 'occurrence', 'score']
dataf.columns = short_names
dataf.head(15)

Unnamed: 0,Symptoms,Diseases,occurrence,score
0,Aging - Premature,Respiratory Syncytial Virus Infections,1,3.464551
1,Aging - Premature,Orthomyxoviridae Infections,1,3.464551
2,Aging - Premature,HIV Infections,3,10.393654
3,Aging - Premature,Acquired Immunodeficiency Syndrome,3,10.393654
4,Aging - Premature,Breast Neoplasms,1,3.464551
5,Aging - Premature,Colonic Neoplasms,1,3.464551
6,Aging - Premature,Mammary Neoplasms - Animal,1,3.464551
7,Aging - Premature,Skin Neoplasms,3,10.393654
8,Aging - Premature,Neoplasms - Experimental,1,3.464551
9,Aging - Premature,Cell Transformation - Neoplastic,3,10.393654


### Missing Values

In [10]:
# For each column, report whether it contains at least one missing value.
dataf.isnull().any()

Symptoms      False
Diseases      False
occurrence    False
score         False
dtype: bool

In [11]:
# Per-column missing-value summary.
# 'Total' is the number of null cells; 'Percent' is that number divided by the
# row count (a fraction between 0 and 1, not multiplied by 100).
null_mask = dataf.isnull()
null_counts = null_mask.sum()
total = null_counts.sort_values(ascending=False)
percent = (null_counts / null_mask.count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(20)

Unnamed: 0,Total,Percent
score,0,0.0
occurrence,0,0.0
Diseases,0,0.0
Symptoms,0,0.0


In [12]:
def test_missing_values():
    assert missing_data['Total'].max() == 0

# Run the notebook's tests through ipytest; the pytest report is printed below.
ipytest.run()

platform linux -- Python 3.6.7, pytest-4.3.1, py-1.8.0, pluggy-0.9.0
rootdir: /home/tgey/Medicision/IA/Notebooks, inifile:
collected 2 items

NatureDataset.py ..                                                         [100%]



In [13]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
dataf.describe()

Unnamed: 0,occurrence,score
count,147978.0,147978.0
mean,11.886753,16.672997
std,228.031517,222.439257
min,1.0,0.069941
25%,1.0,1.639269
50%,2.0,2.817924
75%,4.0,6.576661
max,50200.0,41794.193312


In [14]:
# Count, for every disease, how many (non-null) symptom entries are listed for it.
dfGroup = dataf.groupby("Diseases")[["Symptoms"]]
nbSymptoms = dfGroup.count()
# The former `nbSymptoms.sort_values("Symptoms").reset_index()` line was dropped:
# both calls return new objects and the result was never kept, so it only cost
# time. The table stays indexed by disease, in the order produced by groupby.
nbSymptoms.columns = ['Number of Symptoms for a disease']
nbSymptoms.head(15)

Unnamed: 0_level_0,Number of Symptoms for a disease
Diseases,Unnamed: 1_level_1
22q11 Deletion Syndrome,4
46 - XX Disorders of Sex Development,1
46 - XY Disorders of Sex Development,1
ACTH Syndrome - Ectopic,12
ACTH-Secreting Pituitary Adenoma,4
AIDS Arteritis - Central Nervous System,3
AIDS Dementia Complex,54
AIDS-Associated Nephropathy,9
AIDS-Related Complex,30
AIDS-Related Opportunistic Infections,88


In [15]:
def test_symptoms_agregation():
    assert ('Number of Symptoms for a disease' in nbSymptoms.columns) == True
    assert nbSymptoms.size != 0

In [16]:
# Count, for every symptom, how many (non-null) disease entries are listed for it.
nbDiseases = dataf.groupby("Symptoms")[["Diseases"]].count()
# The former `nbDiseases.sort_values("Diseases").reset_index()` line was dropped:
# both calls return new objects and the result was never kept, so it only cost
# time. The table stays indexed by symptom, in the order produced by groupby.
nbDiseases.columns = ['Number of Diseases']
nbDiseases.head(15)

Unnamed: 0_level_0,Number of Diseases
Symptoms,Unnamed: 1_level_1
Abdomen - Acute,1002
Abdominal Pain,1599
Acute Coronary Syndrome,314
Aerophagy,72
Ageusia,117
Aging - Premature,132
Agnosia,303
Agraphia,148
Akathisia - Drug-Induced,188
Albuminuria,535


In [17]:
def test_diseases_agregation():
    assert ('Number of Diseases' in nbDiseases.columns) == True
    assert nbDiseases.size != 0
