# Imports

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Import Data

In [None]:
# Show every column when a DataFrame is displayed (no column limit).
pd.options.display.max_columns = None

In [None]:
# Load the raw dataset; 'data.csv' is resolved relative to the working directory.
df = pd.read_csv('data.csv')

In [None]:
# Discard the 'id' and 'Unnamed: 32' columns before the analysis.
df = df.drop(columns=['id', 'Unnamed: 32'])

In [None]:
# Encode the target labels as integers: 'M' -> 1, 'B' -> 0
# (presumably malignant / benign). Any other label becomes NaN.
diagnosis_codes = {'M': 1, 'B': 0}
df['diagnosis'] = df['diagnosis'].map(diagnosis_codes)

In [None]:
# The target is stored as a pandas category here because that dtype is used
# for the EDA visualizations. For model training, convert the target to int64
# instead (the commented-out line below is that alternative).
df['diagnosis'] = df['diagnosis'].astype('category')
# df['diagnosis'] = df['diagnosis'].astype('int64')

In [None]:
# Display the prepared DataFrame.
df

# Dataset Summary

In [None]:
# Column names, dtypes and non-null counts.
df.info()

In [None]:
# Dataset dimensions as (rows, columns).
df.shape

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Summary statistics for every column, not only the numeric ones.
df.describe(include = 'all')

In [None]:
# Number of rows for each value of the 'diagnosis' column.
df.value_counts('diagnosis')

In [None]:
# Name of the target column, plus the column names grouped by dtype.
# These three names are reused by the plotting cells that follow.
target = 'diagnosis'
categorical_features = df.select_dtypes(include='category').columns
numerical_features = df.select_dtypes(include='number').columns

# Dataset Distributions

In [None]:
# Wide-form box plot of the DataFrame in a single 10x8 figure.
fig, ax = plt.subplots(figsize=(10, 8))
sns.boxplot(df, ax=ax)
plt.show()

In [None]:
# One box plot per numerical feature, each shown as its own figure.
for column in numerical_features:
    ax = sns.boxplot(df[column])
    ax.set_title(f'{column} Boxplot')
    plt.show()

In [None]:
# Box plot of every numerical feature, split and coloured by the target.
for column in numerical_features:
    plt.figure()
    ax = sns.boxplot(data=df, x=column, y=target, hue=target)
    ax.set_title(f'Boxplot: {column} x {target}')
    plt.show()

In [None]:
# Histogram of each numerical feature, one figure per feature.
for column in numerical_features:
    df.hist(column=column)
    plt.show()

In [None]:
# Kernel density estimate of each numerical feature over the whole dataset.
for column in numerical_features:
    ax = df[column].plot.kde()
    ax.set_title(f'Kernel Density Estimate: {column}')
    ax.set_xlabel(column)
    plt.show()

# Correlations

In [None]:
plt.figure(figsize=(20,10))

# Annotated correlation heatmap of the numeric columns; the target is cast to
# int64 first so that it is included in the correlation matrix.
# NOTE(review): `df_aux = df` binds a second name to the SAME DataFrame, it is
# not a copy. The cast below therefore also replaces df['diagnosis'], and every
# cell executed after this one sees an int64 target instead of a category.
# Using df.copy() would confine the cast to the heatmap, but first confirm
# whether later cells (e.g. the pair plots that pass df['diagnosis'] to
# regplot/kdeplot) rely on the numeric target.
df_aux = df
df_aux['diagnosis'] = df_aux['diagnosis'].astype('int64')
sns.heatmap(df_aux.corr(numeric_only = True), annot=True, cmap='coolwarm', fmt='.2f')
plt.show()

In [None]:
from scipy.stats import f_oneway

# One-way ANOVA: for every numerical feature, test whether its mean differs
# between the groups defined by the target values.
class_labels = df[target].unique()  # loop-invariant, so computed only once
for column in numerical_features:
    groups = [df.loc[df[target] == label, column] for label in class_labels]
    f_stat, p_value = f_oneway(*groups)
    # The p-value is printed in scientific notation: a fixed-point format with
    # 10 decimals shows every p-value below 5e-11 as 0.0000000000, which hides
    # how strong the evidence actually is.
    print(f"{column}: \nF-statistic: {f_stat:.10f}, P-value: {p_value:.3e}")

In [None]:
# With this many variables a single pair grid is unreadable, so each ordered
# pair of distinct columns gets its own regression scatter plot.
for column1 in df.columns:
    for column2 in df.columns:
        if column1 != column2:
            ax = sns.regplot(x=df[column1], y=df[column2], scatter_kws={'alpha':0.3})
            ax.set_title(f'{column1} vs {column2}')
            ax.set_xlabel(column1)
            ax.set_ylabel(column2)
            plt.tight_layout()
            plt.show()

In [None]:
# With this many variables a single pair grid is unreadable, so each ordered
# pair of distinct columns gets its own filled bivariate density plot.
for column1 in df.columns:
    for column2 in df.columns:
        if column1 != column2:
            ax = sns.kdeplot(x=df[column1], y=df[column2], fill=True, thresh=0.05)
            ax.set_title(f'{column1} vs {column2}')
            ax.set_xlabel(column1)
            ax.set_ylabel(column2)
            plt.tight_layout()
            plt.show()

# Categorical Features

In [None]:
# Print how often each value occurs in every categorical feature.
for column in categorical_features:
    counts = df[column].value_counts()
    print(counts)

In [None]:
# Bar chart of the value counts of each categorical feature, with the bars
# coloured by the target variable.
for column in categorical_features:
    ax = sns.countplot(data=df, x=column, hue=target)
    ax.set_title(f'{column} Countplot with respect with the target')
    plt.show()

# Feature Relations

In [None]:
# Scatter plot to check whether a numerical column separates the target values.
# The feature value goes on the x-axis, the row index on the y-axis, and the
# colour encodes the target.
for column in numerical_features:
    sns.scatterplot(x=df[column], y=df.index, hue=df[target])
    plt.title(f"Scatter plot of {column} colored by target")
    plt.xlabel(column)
    # The y values are df.index, so the axis is labelled as the row index;
    # labelling it with the target name would misdescribe the plot.
    plt.ylabel('row index')
    plt.show()

In [None]:
# Filled density curve of each numerical feature, one curve per target value.
for column in numerical_features:
    ax = sns.kdeplot(data=df, x=column, hue=target, fill=True)
    ax.set_title(f'Kernel Density Estimate: {column}')
    ax.set_xlabel(column)
    plt.show()

In [None]:
# Violin plot of each numerical feature, with one violin per target value.
for column in numerical_features:
    ax = sns.violinplot(data=df, y=column, hue=target)
    ax.set_title(f'Violin Plot of {column} by Target')
    plt.show()

In [None]:
# Layered histogram (with a KDE curve) of each numerical feature per target value.
for column in numerical_features:
    ax = sns.histplot(df, x=column, hue=target, kde=True, multiple="layer")
    ax.set_title(f'Histogram of {column} by Target')
    plt.show()

# Dataset Balance

In [None]:
# Display the target column.
df[target]

In [None]:
# Class balance: one bar per target value.
# The column is passed explicitly as `x`; a Series given positionally depends
# on which parameter comes first in the installed seaborn and may be read as
# `data` rather than `x`, which would not produce one bar per class.
sns.countplot(data=df, x=target)
plt.show()

# Notes

1 - The dataset doesn't require balancing. Even though there are more benign cancers, we have enough malignant cancers to train the model. Instead of balancing, class weights that reflect the disproportion can be used

2 - Feature scaling should be performed, features are not in the same scale and that can impact training

3 - No null values in any column

4 - Outliers are not a major concern. The outlying values seem to belong to malignant cancers, and we need those values for training

5 - There are a lot of features with high correlation
    
    5.1 - Example: area and radius, where one is obviously calculated from the other. I will test the models using all the features and then check the feature importances and remove the features accordingly.
    
    5.2 - Additionally, there are features that are correlated across the different types of measurements, e.g., area_mean and area_worst. This is expected, but I will evaluate how the models react when using both measurements

6 - As expected, the largest tumours seem to be malignant. There are strong correlations between tumour size and the target variable, mainly in the mean and worst measurements.