In [None]:
import pandas as pd
import warnings
from itertools import permutations

In [None]:
# Read the first three worksheets of the workbook in one call; the result is
# indexed below by the sheet positions that were requested.
excel_sheet = pd.read_excel(
    "../../Data/milk_plasma_all_data_mrg_020821_v3.xlsx",
    sheet_name=[0, 1, 2],
)

# Work on independent copies so that later in-place edits leave `excel_sheet` intact.
full_sheet, neutral_sheet, ionizable_sheet = (
    excel_sheet[position].copy() for position in (0, 1, 2)
)

# Rebuild the charged / neutral subsets from the full sheet using its flag columns.
is_charged = full_sheet["CHARGED"] == 1
is_neutral = full_sheet["neutral"] == 1
ionizable_full_sheet: pd.DataFrame = full_sheet.loc[is_charged].copy()
neutral_full_sheet: pd.DataFrame = full_sheet.loc[is_neutral].copy()


# df['your_column_name'].isin(["Log_MP_RATIO"]).value_counts()


# Every frame that goes through the same clean-up, keyed by a display label.
df_dict: dict = {
    "full": full_sheet,
    "neutral": neutral_sheet,
    "ionizable": ionizable_sheet,
    "charged verif": ionizable_full_sheet,
    "neutral verif": neutral_full_sheet,
}

# Columns considered non informative; each is removed only where it is present.
unwanted_columns = (
    "index", "smiles", "conc", "name", "CLASS", "ENV_CHEM",
    "MP_RATIO", "FP:MACCS", "neutral", "TEST", "CHARGED", "group",
)

# The frames are cleaned IN PLACE, so the objects referenced by `df_dict`
# (and the variables they came from) are modified by this loop.
for label, frame in df_dict.items():
    print("====== ", label, " =======")
    print("\t>>> Before: ", frame.shape)

    # Step 1: discard every row holding at least one NaN.
    frame.dropna(axis=0, inplace=True)

    # Step 2: discard non-numeric columns, keeping "set" (it carries the
    # TRAINING / TEST label used further down).
    non_numeric = frame.select_dtypes(exclude="number").columns
    frame.drop(columns=[col for col in non_numeric if col != "set"], inplace=True)

    # Step 3: discard the estimated columns, recognised by a '$' in their name.
    estimated = [col for col in frame.columns if '$' in col]
    frame.drop(columns=estimated, inplace=True)

    # Step 4: discard whichever non informative columns are still there.
    leftovers = [col for col in frame.columns if col in unwanted_columns]
    frame.drop(columns=leftovers, inplace=True)

    print("\t>>> After: ", frame.shape)

# Checking we have the right amount of features
expected_feature_count = 480
for key, value in df_dict.items():
    if value.shape[1] != expected_feature_count:
        # warnings.warn() is not print(): it accepts a single message followed by
        # (category, stacklevel, source). Passing the message fragments as separate
        # positional arguments raises TypeError, so build one string instead.
        warnings.warn(
            f"{key} has not the right feature count"
            f"\n\texpected: {expected_feature_count}"
            f"\n\tactual: {value.shape[1]}"
        )

In [None]:
# Every ordered pair of frame labels: (a, b) and (b, a) are both visited.
comb_list = permutations(df_dict, 2)

for candidate, reference in comb_list:
    # For each Log_MP_RATIO value of `candidate`, is it also found in `reference`?
    found = df_dict[candidate]["Log_MP_RATIO"].isin(df_dict[reference]["Log_MP_RATIO"])
    # Distinct outcomes: [True], [False], both, or [] when `candidate` has no rows.
    outcomes = found.unique().tolist()
    if len(outcomes) > 1:
        print(candidate, "is partially in", reference)
    elif True in outcomes:
        print(candidate, " is fully in ", reference)

In [None]:
from feature_selector import *

# Run the same feature selection on the three main frames (full, then ionizable,
# then neutral), always leaving the "set" column out of the selection.
FS_full, FS_ionizable, FS_neutral = (
    FeatureSelector(df_dict[subset], cols_to_ignore=["set"]).transform()
    for subset in ("full", "ionizable", "neutral")
)

print(
    "Shape of:\n\tfull\t\t:\t", FS_full.shape,
    "\n\tionizable\t:\t", FS_ionizable.shape,
    "\n\tneutral\t\t:\t", FS_neutral.shape,
)

In [None]:
# Separating test and train
def _split_on_set(frame: pd.DataFrame) -> tuple:
    """Split `frame` on its "set" column.

    Returns a ``(train, test)`` pair holding the rows labelled "TRAINING" and
    "TEST" respectively, each without the "set" column. Both are new frames
    (`drop` returns a new object), so `frame` itself is left untouched.
    """
    train = frame.loc[frame["set"] == "TRAINING"].drop(columns=["set"])
    test = frame.loc[frame["set"] == "TEST"].drop(columns=["set"])
    return train, test


def _share_found_in(test: pd.DataFrame, train: pd.DataFrame) -> pd.Series:
    """Proportion of `test` rows whose Log_MP_RATIO value does / does not occur in `train`.

    The first positional parameter of `Series.value_counts` is `normalize`, so the
    previous call `value_counts("Log_MP_RATIO")` was in effect `normalize=True`;
    that behaviour is kept here but spelled out explicitly.
    """
    return test["Log_MP_RATIO"].isin(train["Log_MP_RATIO"]).value_counts(normalize=True)


full_train, full_test = _split_on_set(FS_full)

# Ionized
ionizable_train, ionizable_test = _split_on_set(FS_ionizable)

# Neutral
neutral_train, neutral_test = _split_on_set(FS_neutral)


print("===== FULL =====\n\tTrain:\t", full_train.shape, "\n\tTest:\t", full_test.shape)
print("===== IONIZABLE =====\n\tTrain:\t", ionizable_train.shape, "\n\tTest:\t", ionizable_test.shape)
print("===== NEUTRAL =====\n\tTrain:\t", neutral_train.shape, "\n\tTest:\t", neutral_test.shape)

print("===== CHECK IF TESTS ARE IN OTHER DATAFRAMES =====")
print("\t>>>ionizable test in full train\n\t\t", _share_found_in(ionizable_test, full_train))
print("\t>>>neutral test in full train\n\t\t", _share_found_in(neutral_test, full_train))
print("\t>>>full test in ionizable train\n\t\t", _share_found_in(full_test, ionizable_train))
# Label fixed: this check runs against the neutral training set, not the ionizable one.
print("\t>>>full test in neutral train\n\t\t", _share_found_in(full_test, neutral_train))

In [None]:
# Same TRAINING / TEST split as for the feature-selected frames, but taken from the
# cleaned frames in `df_dict` (i.e. before feature selection). The "set" column is
# only needed for the split and is dropped from every result.
unfiltered_full_train = df_dict["full"].loc[df_dict["full"]["set"] == "TRAINING"].copy().drop(columns=["set"])
unfiltered_full_test = df_dict["full"].loc[df_dict["full"]["set"] == "TEST"].copy().drop(columns=["set"])

unfiltered_ionizable_train = df_dict["ionizable"].loc[df_dict["ionizable"]["set"] == "TRAINING"].copy().drop(columns=["set"])
unfiltered_ionizable_test = df_dict["ionizable"].loc[df_dict["ionizable"]["set"] == "TEST"].copy().drop(columns=["set"])

unfiltered_neutral_train = df_dict["neutral"].loc[df_dict["neutral"]["set"] == "TRAINING"].copy().drop(columns=["set"])
unfiltered_neutral_test = df_dict["neutral"].loc[df_dict["neutral"]["set"] == "TEST"].copy().drop(columns=["set"])


# Every shape reported here must come from an `unfiltered_*` frame; the ionizable
# test and both neutral entries previously printed the feature-selected frames.
print("Shape of unfiltered:"
      "\n\tfull\n\t\tTrain\t:\t", unfiltered_full_train.shape,"\n\t\tTest\t:\t", unfiltered_full_test.shape,
      "\n\tionizable\n\t\tTrain\t:\t", unfiltered_ionizable_train.shape,"\n\t\tTest\t:\t", unfiltered_ionizable_test.shape,
      "\n\tneutral\n\t\tTrain\t:\t", unfiltered_neutral_train.shape,"\n\t\tTest\t:\t", unfiltered_neutral_test.shape,
      )

# Cross-set leakage checks: each test set is looked up in a training set of a
# DIFFERENT frame, matching what each label announces. `normalize=True` makes
# explicit what the former positional string argument to value_counts() did.
print("===== CHECK IF UNFILTERED TESTS ARE IN OTHER DATAFRAMES =====")
print("\t>>>ionizable test in full train\n\t\t", unfiltered_ionizable_test["Log_MP_RATIO"].isin(unfiltered_full_train["Log_MP_RATIO"]).value_counts(normalize=True))
print("\t>>>neutral test in full train\n\t\t", unfiltered_neutral_test["Log_MP_RATIO"].isin(unfiltered_full_train["Log_MP_RATIO"]).value_counts(normalize=True))
print("\t>>>full test in ionizable train\n\t\t", unfiltered_full_test["Log_MP_RATIO"].isin(unfiltered_ionizable_train["Log_MP_RATIO"]).value_counts(normalize=True))
print("\t>>>full test in neutral train\n\t\t", unfiltered_full_test["Log_MP_RATIO"].isin(unfiltered_neutral_train["Log_MP_RATIO"]).value_counts(normalize=True))

# Worksheet name -> frame, in the order the sheets are written to the workbook.
unfiltered_sheets = {
    "full_train": unfiltered_full_train,
    "full_test": unfiltered_full_test,
    "ionizable_train": unfiltered_ionizable_train,
    "ionizable_test": unfiltered_ionizable_test,
    "neutral_train": unfiltered_neutral_train,
    "neutral_test": unfiltered_neutral_test,
}

# Export the pre-feature-selection splits, one worksheet each, without the index.
with pd.ExcelWriter("unfiltered_data.xlsx") as writer:
    for sheet_label, sheet_frame in unfiltered_sheets.items():
        sheet_frame.to_excel(writer, sheet_name=sheet_label, index=False)


# Worksheet name -> frame, in the order the sheets are written to the workbook.
filtered_sheets = {
    "full_train": full_train,
    "full_test": full_test,
    "ionizable_train": ionizable_train,
    "ionizable_test": ionizable_test,
    "neutral_train": neutral_train,
    "neutral_test": neutral_test,
}

# Export the feature-selected splits, one worksheet each, without the index.
with pd.ExcelWriter("filtered_data.xlsx") as writer:
    for sheet_label, sheet_frame in filtered_sheets.items():
        sheet_frame.to_excel(writer, sheet_name=sheet_label, index=False)



In [None]:
from sklearn.preprocessing import StandardScaler


def Corr_list_with_output(df, target=0.5, target_col="Log_MP_RATIO"):
    """Return the columns of `df` that are strongly correlated with `target_col`.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing `target_col` and the candidate feature columns.
    target : float, default 0.5
        Threshold on the absolute Pearson correlation; only columns strictly
        above it are kept.
    target_col : str, default "Log_MP_RATIO"
        Column against which every other column is correlated.

    Returns
    -------
    pd.Series
        Absolute correlation with `target_col`, indexed by column name and
        restricted to the entries above `target`. `target_col` correlates
        perfectly with itself, so it is part of the result whenever
        `target` is below 1.

    Raises
    ------
    KeyError
        If `target_col` is not a column of the correlation matrix.
    """
    cor = df.corr()      # Using Pearson Correlation
    # Sign is irrelevant here: strong negative correlation is as informative as positive.
    cor_target = cor[target_col].abs()
    # Selecting highly correlated features
    features = cor_target[cor_target > target]
    return features

# Disabled experiment, kept for reference: standardise the unfiltered training
# frame with StandardScaler, then list the features whose absolute correlation
# with Log_MP_RATIO exceeds 0.5 (see Corr_list_with_output).
# unfiltered_full_train = pd.DataFrame(StandardScaler().fit_transform(unfiltered_full_train), columns = unfiltered_full_train.columns)
# The bare expression below is the only live statement in this cell, so the cell
# simply displays the unfiltered training frame.
unfiltered_full_train
# # y = full_train.corrwith(full_train["Log_MP_RATIO"], method="pearson").abs()
# y = Corr_list_with_output(unfiltered_full_train, target=0.5)
# # correlated_to_y = full_train[y.index]
# # correlated_to_y
# y