In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind

In [None]:
# Let pandas display up to 111 rows and 111 columns before truncating output.
# The full option names are spelled out instead of relying on set_option's
# pattern matching of abbreviated keys ('display.max_row' / 'display.max_column').
pd.set_option('display.max_rows', 111)
pd.set_option('display.max_columns', 111)

Getting the dataset and determining the target column

In [None]:
# Read the spreadsheet, keep the exam-result column aside as the target, and
# leave every other column in `dataset`.
target_column = "SARS-Cov-2 exam result"
dataset = pd.read_excel("./dataset.xlsx")
target = dataset[target_column]
dataset = dataset.drop(columns=target_column)

Feature types

In [None]:
# Number of columns per dtype; the bare last expression is the cell output.
dtype_counts = dataset.dtypes.value_counts()
dtype_counts

Missing values

In [None]:
# Boolean mask of missing cells, reused for both the summary and the heatmap.
missing_mask = dataset.isna()

# Despite its name, `feature_presence` holds the fraction of MISSING values
# per column, sorted in ascending order.
feature_presence = missing_mask.mean().sort_values()
print(feature_presence)

# One row per sample, one column per feature; missing cells stand out.
sns.heatmap(missing_mask, cbar=False)

Target Visualization

In [None]:
# Pie chart of how often each target value occurs.
class_counts = target.value_counts()
class_counts.plot(kind='pie')

Removing features with more than 90% missing values (and the patient ID)

In [None]:
# Columns whose missing-value fraction is above 0.9, in the order they appear
# in `feature_presence`, followed by the identifier column.
mostly_missing = feature_presence[feature_presence > 0.9]
delete_feature = list(mostly_missing.index)
delete_feature.append('Patient ID')
dataset = dataset.drop(columns=delete_feature)

Visualizing the continuous features

In [None]:
# Names of all columns stored as float, in column order.
float_features = [name for name, dtype in dataset.dtypes.items() if dtype == 'float']

# One histogram with a KDE overlay per float column, coloured by target value.
for name in float_features:
    sns.displot(data=dataset, x=name, hue=target, kde=True)

Visualizing the categorical features

In [None]:
# Print the distinct values of every object-typed column.
for col in dataset.select_dtypes('object'):
    print(f'{col :-<40} {dataset[col].unique()}')

# Remove 'Parainfluenza 2' by reassignment instead of inplace=True: an in-place
# drop on the frame returned by select_dtypes can trigger pandas'
# SettingWithCopyWarning, and errors='ignore' lets this cell be re-run once the
# column is already gone instead of failing with a KeyError.
dataset = dataset.drop(columns='Parainfluenza 2', errors='ignore')
df_object_features = dataset.select_dtypes('object')

# Box plots pairing the i-th object column with the i-th float column, split by
# target, for the first four pairs. zip() stops early instead of raising an
# IndexError when fewer than four columns of either kind exist.
for cat_col, num_col in zip(df_object_features.columns[:4], float_features[:4]):
    plt.figure()
    sns.boxplot(x=cat_col, y=num_col, data=dataset, hue=target)

# Pie chart of the value frequencies of each object column.
for col in df_object_features:
    plt.figure()
    dataset[col].value_counts().plot.pie()

Divide our features into groups

In [None]:
# Fraction of missing values per column, in column order (unsorted here so it
# lines up positionally with dataset.columns).
feature_presence = dataset.isna().mean()

# Group columns by how much of them is missing: strictly between 0.88 and 0.9
# for the first group, strictly between 0.7 and 0.8 for the second.
in_continuous_band = feature_presence.gt(0.88) & feature_presence.lt(0.9)
in_categorical_band = feature_presence.gt(0.7) & feature_presence.lt(0.8)
feature_continuos_category = dataset.columns[in_continuous_band]
feature_categorial_category = dataset.columns[in_categorical_band]

Relationship between the target and the features

In [None]:
# Density estimate of each column in the first feature group, one curve per
# target value.
for density_col in feature_continuos_category:
    sns.displot(data=dataset, x=density_col, hue=target, kind='kde')

# Annotated contingency table (target value vs. feature value) for each column
# in the second feature group, each on its own figure.
for table_col in feature_categorial_category:
    contingency = pd.crosstab(target, dataset[table_col])
    plt.figure()
    sns.heatmap(contingency, annot=True, fmt='d')

Relationship between the continuous features

In [None]:
# Clustered heatmap of the pairwise correlations within the first feature group.
continuous_corr = dataset[feature_continuos_category].corr()
sns.clustermap(continuous_corr)

Relation between numerical_features

In [None]:
# Correlate only the numeric and boolean columns. `dataset` still contains
# object columns, and since pandas 2.0 DataFrame.corr() no longer drops them
# silently (numeric_only defaults to False) but raises instead. Selecting the
# columns up front behaves the same on old and new pandas versions.
correlation_dataset = dataset.select_dtypes(['number', 'bool']).corr()

# For every integer column, list its correlation with all other columns,
# strongest positive correlation first.
for col in dataset.select_dtypes('int'):
    print(f'Feature: {col}')
    print(correlation_dataset[col].sort_values(ascending=False))
    print('-' * 50)

How many maladies does a patient have?

In [None]:
# Per row, count how many columns of the second feature group hold the value
# 'detected', store it as a new column, and plot how often each count occurs.
detected_flags = dataset[feature_categorial_category].eq('detected')
dataset['maladies_count'] = detected_flags.sum(axis=1)
sns.countplot(data=dataset, x='maladies_count')

Exploring NaN values in depth

In [None]:
# Build each subset with assign(), which returns a new frame. Adding the column
# by item assignment to the result of dataset[...] triggers pandas'
# SettingWithCopyWarning on versions without copy-on-write.
continuos_dataset = dataset[feature_continuos_category].assign(covid=target)

# Target proportions among the rows that have no missing value in this subset.
print('Target distribution in continuos_dataset')
print(continuos_dataset.dropna()['covid'].value_counts(normalize=True))
print('-' * 50)

categorial_dataset = dataset[feature_categorial_category].assign(covid=target)
print('Target distribution in categorial_dataset')
print(categorial_dataset.dropna()['covid'].value_counts(normalize=True))

T-Test

In [None]:
# Split the rows by target value.
positive_dataset = dataset[target == 'positive']
negative_dataset = dataset[target == 'negative']

# Down-sample the negative rows to the number of positive rows. The fixed
# random_state makes the draw, and therefore the t-test results below,
# identical on every run.
negative_dataset = negative_dataset.sample(positive_dataset.shape[0], random_state=0)

def t_test(col, limit=0.02):
    """Run a two-sample t-test on ``col`` between the positive and negative rows.

    Reads the module-level ``positive_dataset`` and ``negative_dataset`` frames,
    drops the missing values of ``col`` in each, and passes both samples to
    ``ttest_ind``.

    Parameters
    ----------
    col : hashable
        Label of a column present in both frames.
    limit : float, default 0.02
        Threshold the p-value is compared against.

    Returns
    -------
    str
        ``"HO à rejeter"`` when the p-value is strictly below ``limit``,
        otherwise ``"0"``. A NaN p-value compares as False and therefore also
        yields ``"0"``.
    """
    positive_values = positive_dataset[col].dropna()
    negative_values = negative_dataset[col].dropna()
    _, p = ttest_ind(positive_values, negative_values)
    return "HO à rejeter" if p < limit else "0"

# Print one line per column of the first feature group: the column name padded
# with dashes to 50 characters, followed by the t-test verdict.
for col in feature_continuos_category:
    verdict = t_test(col)
    print(f'{col :-<50}{verdict}')