In [None]:
from feature_selector import *

import pandas as pd
import numpy as np
import seaborn as sns

import matplotlib.pyplot as plt

In [None]:
# Load the train/test CSV exports of every dataset, tag each row with its split
# in a "TYPE" column, concatenate both splits and wrap the result in a
# FeatureSelector that is told to ignore that bookkeeping column.

# --- Ionizable dataset ---
df_ionizable_train: pd.DataFrame = pd.read_csv(
    "../../Data/ionizable_dataset_72_train_divprio.csv", sep=";"
).assign(TYPE="TRAIN")
df_ionizable_test: pd.DataFrame = pd.read_csv(
    "../../Data/ionizable_dataset_72_test_divprio.csv", sep=";"
).assign(TYPE="TEST")

df_ionizable = pd.concat([df_ionizable_train, df_ionizable_test])
FS_ionizable: FeatureSelector = FeatureSelector(df_ionizable, cols_to_ignore=["TYPE"])

# --- Neutral dataset ---
df_neutral_train: pd.DataFrame = pd.read_csv(
    "../../Data/neutral_dataset_111_train_divprio.csv", sep=";"
).assign(TYPE="TRAIN")
df_neutral_test: pd.DataFrame = pd.read_csv(
    "../../Data/neutral_dataset_111_test_divprio.csv", sep=";"
).assign(TYPE="TEST")

df_neutral = pd.concat([df_neutral_train, df_neutral_test])
FS_neutral: FeatureSelector = FeatureSelector(df_neutral, cols_to_ignore=["TYPE"])

# --- Full dataset ---
df_full_train: pd.DataFrame = pd.read_csv(
    "../../Data/full_dataset_train_divprio.csv", sep=";"
).assign(TYPE="TRAIN")
df_full_test: pd.DataFrame = pd.read_csv(
    "../../Data/full_dataset_test_divprio.csv", sep=";"
).assign(TYPE="TEST")

df_full = pd.concat([df_full_train, df_full_test])
FS_full: FeatureSelector = FeatureSelector(df_full, cols_to_ignore=["TYPE"])

# Scaling through scale_data(inplace=True) is currently disabled, and no
# separate per-split FeatureSelector objects are built: the split is only
# tracked through the "TYPE" column.

In [None]:
# FS_full.df[["vsurf_W8", "vsurf_HB8"]]
# dropped = FS_full.remove_highly_correlated(verbose=True)

In [None]:
# Normality test on a single column of the full test split
from scipy.stats import normaltest
from matplotlib import pyplot

# Column at integer position 14 of the full test split.
data = df_full_test.iloc[:, 14]

stat, p = normaltest(data)
print("Stats= ", stat, "\np= ", p)

# Significance level: the sample is reported as Gaussian when p exceeds it.
alpha = 0.05
print("Gaussian" if p > alpha else "Not gaussian")

# Histogram of the tested column for a visual check.
pyplot.hist(data)
pyplot.show()

# Low variance features
We detect every feature whose variance falls below the threshold and remove it from the dataset.

## IONIZABLE

In [None]:
# Low-variance filtering of the ionizable dataset; the "TYPE" split marker is
# excluded from the check.
# NOTE(review): variance_threshold=0 presumably targets constant features —
# confirm against FeatureSelector.remove_low_variance.
print("Before low variance removal: ", df_ionizable.shape)

df_ionizable_lv, col_ionizable_lv = FS_ionizable.remove_low_variance(
    variance_threshold=0,
    cols_to_ignore=["TYPE"],
    inplace=True,
)
print(col_ionizable_lv)
print("length of the feature with low variance that are common for train and test: ", len(col_ionizable_lv))

# Mirror the removal on the notebook-level dataframe.
df_ionizable = df_ionizable.drop(columns=list(col_ionizable_lv))
print("After low variance removal: ", df_ionizable.shape)

## NEUTRAL

In [None]:
# Low-variance filtering of the neutral dataset; the "TYPE" split marker is
# excluded from the check.
# NOTE(review): variance_threshold=0 presumably targets constant features —
# confirm against FeatureSelector.remove_low_variance.
print("Before low variance removal: ", df_neutral.shape)

df_neutral_lv, col_neutral_lv = FS_neutral.remove_low_variance(
    variance_threshold=0,
    cols_to_ignore=["TYPE"],
    inplace=True,
)
print(col_neutral_lv)
print("length of the feature with low variance that are common for train and test: ", len(col_neutral_lv))

# Mirror the removal on the notebook-level dataframe.
df_neutral = df_neutral.drop(columns=list(col_neutral_lv))
print("After low variance removal: ", df_neutral.shape)

## FULL

In [None]:
# Low-variance filtering of the full dataset; the "TYPE" split marker is
# excluded from the check.
# NOTE(review): variance_threshold=0 presumably targets constant features —
# confirm against FeatureSelector.remove_low_variance.
print("Before low variance removal: ", df_full.shape)

df_full_lv, col_full_lv = FS_full.remove_low_variance(
    variance_threshold=0,
    cols_to_ignore=["TYPE"],
    inplace=True,
)
print(col_full_lv)
print("length of the feature with low variance that are common for train and test: ", len(col_full_lv))

# Mirror the removal on the notebook-level dataframe.
df_full = df_full.drop(columns=list(col_full_lv))
print("After low variance removal: ", df_full.shape)

# Detect binary data

In [None]:
from sklearn.feature_selection import VarianceThreshold

# For each dataset keep only the columns whose values are all 0 or 1.
dfs: dict = {
    name: frame.loc[:, frame.isin([0, 1]).all()]
    for name, frame in (
        ("ionizable", df_ionizable),
        ("neutral", df_neutral),
        ("full", df_full),
    )
}

# Flag binary columns whose rarest value covers less than 1 % of the rows and
# print, per dataset, the value counts of every flagged column.
for key, value in dfs.items():
    suspect_cols = []
    for column in value.columns:
        counts = value[column].value_counts()
        rarest_share = counts.min() / counts.sum() * 100
        if (rarest_share < 1).any():
            suspect_cols.append(counts)
    print("===== ", key, " =====")
    print(suspect_cols)


# Highly correlated features

## IONIZABLE

In [None]:
df_io_corr = FS_ionizable.get_correlation()
corr_feat_mtx = df_io_corr.to_numpy()

# Determine the optimum number of clusters for k-means (elbow method).
# The tested cluster counts run from 1 to max_num_clusters - 1.
max_num_clusters = 15
cluster_counts = range(1, max_num_clusters)

wcss = []
for i in cluster_counts:
    # fit() returns the estimator itself, so the fitted model is kept in `kmeans`.
    kmeans = KMeans(
        n_clusters=i,
        init='k-means++',
        max_iter=300,
        n_init=10,
        random_state=0,
    ).fit(corr_feat_mtx)
    # inertia_ is the within-cluster sum of squares of the fitted model.
    wcss.append(kmeans.inertia_)

plt.plot(cluster_counts, wcss)
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

In [None]:
# Show the ionizable correlation matrix grouped into 5 clusters.
# NOTE(review): display_data_cluster is not defined in the visible cells —
# presumably it comes from the feature_selector star import; verify.
display_data_cluster(df_corr=df_io_corr, n_clusters=5)

In [None]:
# Remove the highly correlated ionizable features and report the shape change.
print("Before high correlation removal: ", df_ionizable.shape)
df_ionizable_correlation_removed = FS_ionizable.remove_highly_correlated(graph=True)
print("After high correlation removal: ", df_ionizable_correlation_removed.shape)

# Columns present before the filter but absent from its result; the last
# expression is left bare so the notebook displays it.
dropped_ionizable_cols = df_ionizable.columns.difference(
    df_ionizable_correlation_removed.columns
)
df_ionizable[dropped_ionizable_cols].columns

In [None]:
# Recompute the correlation matrix on the dataframe returned by
# remove_highly_correlated, then redraw the elbow curve and the 5-cluster view
# on that reduced matrix.
df_io_corr = FS_ionizable.get_correlation(df_ionizable_correlation_removed)

display_elbow(df_io_corr)
display_data_cluster(df_corr=df_io_corr, n_clusters=5)

## Neutral

In [None]:
# Correlation matrix of the neutral dataset, followed by its elbow curve and
# a 4-cluster view of the same matrix.
df_ne_corr = FS_neutral.get_correlation()
display_elbow(df_ne_corr)
display_data_cluster(df_corr=df_ne_corr, n_clusters=4)

In [None]:
# Remove the highly correlated neutral features and report the shape change.
print("Before high correlation removal: ", df_neutral.shape)
df_neutral_correlation_removed = FS_neutral.remove_highly_correlated(graph=True)
print("After high correlation removal: ", df_neutral_correlation_removed.shape)

# Columns present before the filter but absent from its result.
dropped_neutral_cols = df_neutral.columns.difference(
    df_neutral_correlation_removed.columns
)

# Lift pandas' display limits temporarily so the whole list is shown.
with pd.option_context(
    'display.max_rows', None,
    'display.max_columns', None,
    'display.max_seq_items', None,
):
    display(df_neutral[dropped_neutral_cols].columns)

In [None]:
# Recompute the neutral correlation matrix on the dataframe returned by
# remove_highly_correlated and redraw the elbow curve on it.
df_ne_corr = FS_neutral.get_correlation(df_neutral_correlation_removed)

display_elbow(df_ne_corr)

In [None]:
# Cluster view of the *neutral* correlation matrix with 3 clusters.
# The call previously received df_io_corr (the ionizable matrix), a copy-paste
# slip: this cell belongs to the Neutral section and follows the elbow plot of
# df_ne_corr.
display_data_cluster(df_corr=df_ne_corr, n_clusters=3)

## Full

In [None]:
# Correlation matrix of the full dataset and the matching elbow curve.
df_full_corr = FS_full.get_correlation()
display_elbow(df_full_corr)

In [None]:
# Show the full-dataset correlation matrix grouped into 4 clusters.
display_data_cluster(df_corr=df_full_corr, n_clusters=4)

In [None]:
# Remove the highly correlated features of the full dataset and print the
# dataframe shape before and after.
# NOTE(review): graph=True presumably makes the selector draw a figure — confirm in FeatureSelector.
print("Before high correlation removal: ", df_full.shape)
df_full_correlation_removed = FS_full.remove_highly_correlated(graph=True)
print("After high correlation removal: ", df_full_correlation_removed.shape)

In [None]:
# Columns of df_full that are absent from the correlation-filtered dataframe.
df_full[df_full.columns.difference(df_full_correlation_removed.columns)].columns

In [None]:
# Recompute the full-dataset correlation matrix on the dataframe returned by
# remove_highly_correlated, then redraw the elbow curve and the 3-cluster view.
df_full_corr = FS_full.get_correlation(df_full_correlation_removed)
display_elbow(df_full_corr)
# The cluster view previously received df_io_corr (the ionizable matrix), a
# copy-paste slip; this section analyses the full dataset.
display_data_cluster(df_corr=df_full_corr, n_clusters=3)

# Proof of concept of the **transform** method

In [None]:
# Reload the full dataset from disk, tag every row with its split and wrap the
# concatenated frame in a new FeatureSelector (the "TYPE" marker is ignored).
df_full_train: pd.DataFrame = pd.read_csv(
    "../../Data/full_dataset_train_divprio.csv", sep=";"
).assign(TYPE="TRAIN")
df_full_test: pd.DataFrame = pd.read_csv(
    "../../Data/full_dataset_test_divprio.csv", sep=";"
).assign(TYPE="TEST")

df_full = pd.concat([df_full_train, df_full_test])
FS_full: FeatureSelector = FeatureSelector(df_full, cols_to_ignore=["TYPE"])

In [None]:
# Run the automatic selection pipeline in one call.
# NOTE(review): transform() presumably chains the low-variance and
# high-correlation filters applied by hand earlier — confirm in FeatureSelector.
final_df: pd.DataFrame = FS_full.transform()

In [None]:
# Compare the output of transform() with the dataframe produced by the manual
# remove_highly_correlated step; DataFrame.equals returns a single boolean.
final_df.equals(df_full_correlation_removed)

# Creating the CSV files of the cleaned features

In [None]:
# Reload every dataset from disk so the export starts from unfiltered data:
# each split is tagged in a "TYPE" column, both splits are concatenated and
# the result is wrapped in a new FeatureSelector that ignores "TYPE".

# --- Ionizable dataset ---
df_ionizable_train: pd.DataFrame = pd.read_csv(
    "../../Data/ionizable_dataset_72_train_divprio.csv", sep=";"
).assign(TYPE="TRAIN")
df_ionizable_test: pd.DataFrame = pd.read_csv(
    "../../Data/ionizable_dataset_72_test_divprio.csv", sep=";"
).assign(TYPE="TEST")

df_ionizable = pd.concat([df_ionizable_train, df_ionizable_test])
FS_ionizable: FeatureSelector = FeatureSelector(df_ionizable, cols_to_ignore=["TYPE"])

# --- Neutral dataset ---
df_neutral_train: pd.DataFrame = pd.read_csv(
    "../../Data/neutral_dataset_111_train_divprio.csv", sep=";"
).assign(TYPE="TRAIN")
df_neutral_test: pd.DataFrame = pd.read_csv(
    "../../Data/neutral_dataset_111_test_divprio.csv", sep=";"
).assign(TYPE="TEST")

df_neutral = pd.concat([df_neutral_train, df_neutral_test])
FS_neutral: FeatureSelector = FeatureSelector(df_neutral, cols_to_ignore=["TYPE"])

# --- Full dataset ---
df_full_train: pd.DataFrame = pd.read_csv(
    "../../Data/full_dataset_train_divprio.csv", sep=";"
).assign(TYPE="TRAIN")
df_full_test: pd.DataFrame = pd.read_csv(
    "../../Data/full_dataset_test_divprio.csv", sep=";"
).assign(TYPE="TEST")

df_full = pd.concat([df_full_train, df_full_test])
FS_full: FeatureSelector = FeatureSelector(df_full, cols_to_ignore=["TYPE"])

# Directory prefix used when writing the filtered CSV files.
save_path: str = "../../Data/Filtered/"

In [None]:
# Apply the automatic feature selection to each of the three datasets.
ionizable_df: pd.DataFrame = FS_ionizable.transform()
neutral_df: pd.DataFrame = FS_neutral.transform()
full_df: pd.DataFrame = FS_full.transform()

In [None]:
def split_train_test(df: pd.DataFrame, separator: str = "TYPE", y: str = "Log_MP_RATIO") -> tuple[pd.DataFrame, pd.DataFrame]:
    """Split a labelled dataframe back into its train and test parts.

    Rows are routed by the value stored in the ``separator`` column: "TRAIN"
    rows form the first frame and "TEST" rows the second. Rows carrying any
    other value end up in neither frame. In both results the ``separator``
    column is removed and the target column ``y`` is moved to position 0.

    Args:
        df: Dataframe holding both splits, the marker column and the target.
        separator: Name of the column containing "TRAIN" / "TEST".
        y: Name of the target column to place first.

    Returns:
        A ``(train, test)`` tuple of dataframes.

    Raises:
        KeyError: If ``separator`` or ``y`` is not a column of ``df``.
    """

    def _select(label: str) -> pd.DataFrame:
        """Return the rows tagged ``label`` with the target moved to the front."""
        subset = df.loc[df[separator] == label]
        # Read the target through `y`: the former hard-coded "Log_MP_RATIO"
        # made every non-default `y` fail or insert the wrong column.
        target = subset[y]
        subset = subset.drop(columns=[separator, y])
        subset.insert(0, y, target)
        return subset

    df_train = _select("TRAIN")
    df_test = _select("TEST")

    print("Train: ", df_train.shape, "\nTest: ", df_test.shape)
    return df_train, df_test

In [None]:
# Split the filtered ionizable data and write each part as a ';'-separated
# UTF-8 CSV without the index column.
ionizable_train, ionizable_test = split_train_test(ionizable_df)

csv_options = {"index": False, "encoding": 'utf-8', "sep": ";"}
ionizable_train.to_csv(save_path + "ionizable_train.csv", **csv_options)
ionizable_test.to_csv(save_path + "ionizable_test.csv", **csv_options)

In [None]:
# Split the filtered neutral data and write each part as a ';'-separated
# UTF-8 CSV without the index column.
neutral_train, neutral_test = split_train_test(neutral_df)

csv_options = {"index": False, "encoding": "utf-8", "sep": ";"}
neutral_train.to_csv(save_path + "neutral_train.csv", **csv_options)
neutral_test.to_csv(save_path + "neutral_test.csv", **csv_options)

In [None]:
# Split the filtered full data and write each part as a ';'-separated
# UTF-8 CSV without the index column.
full_train, full_test = split_train_test(full_df)

csv_options = {"index": False, "encoding": "utf-8", "sep": ";"}
full_train.to_csv(save_path + "full_train.csv", **csv_options)
full_test.to_csv(save_path + "full_test.csv", **csv_options)

# GRAPHS

In [None]:
# Fresh selector on the unfiltered full dataset ("TYPE" is ignored).
FS_full = FeatureSelector(df_full, cols_to_ignore=["TYPE"])

# NOTE(review): get_correlation_to_y presumably returns one Pearson coefficient
# per feature against the target — confirm in FeatureSelector.
y_corr = FS_full.get_correlation_to_y(method="pearson")
# Keep entries whose absolute value exceeds 0.5; the sign itself is preserved.
y_corr = y_corr[y_corr.abs() > 0.5]

# df_full carries the string "TYPE" column. Since pandas 2.0 DataFrame.corr no
# longer drops non-numeric columns by default and raises on them, so the
# correlation is computed on the numeric/bool columns only (this matches what
# older pandas versions did implicitly).
full_corr = df_full.select_dtypes(include=["number", "bool"]).corr(method="pearson")

In [None]:
# Restrict the full dataset to the columns listed in y_corr's index, then
# display the resulting frame.
df_full_correlated_y = df_full[y_corr.index]
df_full_correlated_y

In [None]:
# Hierarchically clustered heatmap of the correlations between the columns
# selected through y_corr. The commented-out lines are an unfinished attempt to
# build the linkage by hand and to mask one triangle of the heatmap.
# df_display = 1 - full_corr.loc[y_corr.index, y_corr.index]
# linkage = hierarchy.linkage(distance.squareform(df_display), method="average")
# print(linkage)
g = sns.clustermap(df_full_correlated_y.corr())

# mask = np.tril(np.ones_like(df_display))
# values = g.ax_heatmap.collections[0].get_array().reshape(df_display.shape)
# new_values = np.ma.array(values, mask=mask)
# g.ax_heatmap.collections[0].set_array(new_values)
# display(g)




def corr_matrix_plot(df):
    f, ax = plt.subplots(figsize=(12, 10))
    corr = df.corr()
    sns.heatmap(corr, mask=np.zeros_like(corr),
                cmap=sns.diverging_palette(220, 10, as_cmap=True),
                square=True, ax=ax)