In [33]:
import numpy as np
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import csv
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
import sys
import os

## Data loading

In [34]:
data_tr = pd.read_csv('../data/train.csv', nrows=1000)
data_te = pd.read_csv('../data/test.csv', nrows=1000)

In [35]:
data_tr.head()
print(data_tr.shape)
data_tr.info()
data_te.info()

(1000, 325)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Columns: 325 entries, _STATE to TARGET
dtypes: bool(1), float64(318), int64(6)
memory usage: 2.5 MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Columns: 324 entries, _STATE to ID
dtypes: float64(318), int64(6)
memory usage: 2.5 MB


# Data cleaning


In [36]:
all_columns = data_tr.columns.values
# print(all_columns)
column_of_interest = ['SEXVAR', 'GENHLTH', 'PHYSHLTH', 'MENTHLTH', 'POORHLTH', 'PRIMINSR', 'PERSDOC3', 'MEDCOST1', 'CHECKUP1', 'EXERANY2', 'SLEPTIM1', 'LASTDEN4', 'CVDSTRK3', 'ASTHMA3', 'ASTHNOW', 'CHCSCNC1', 'CHCOCNC1', 'CHCCOPD3', 'ADDEPEV3', 'CHCKDNY2', 'HAVARTH4', 'DIABETE4', 'WEIGHT2', 'HEIGHT3', 'DECIDE', 'DIFFWALK', 'DIFFDRES', 'DIFFALON', 'SMOKE100', 'SMOKDAY2', 'USENOW3', 'ECIGNOW2', 'LCSFIRST', 'LCSLAST', 'LCSNUMCG', 'ALCDAY4', 'AVEDRNK3', 'DRNK3GE5', 'MAXDRNKS', 'FLUSHOT7', 'HIVRISK5', 'COVIDPOS', 'COVIDSMP', 'COVIDPRM', 'PDIABTS1', 'PREDIAB2', 'DIABTYPE', 'INSULIN1', 'CHKHEMO3', 'EYEEXAM1', 'FEETSORE', 'TOLDCFS', 'HAVECFS', 'WORKCFS', 'COVIDVA1', 'SHINGLE2', 'COPDCOGH', 'COPDFLEM', 'COPDBRTH', 'COPDBTST', 'COPDSMOK', 'CNCRDIFF', 'CNCRAGE', 'CNCRTYP2', 'CSRVDOC1', 'CIMEMLOS', 'CDHOUSE', 'CDASSIST', 'CDHELP', 'CDSOCIAL', 'ACEDEPRS', 'ACEDRINK', 'ACEDRUGS', 'ACEPRISN', 'ACEPUNCH', 'LSATISFY', 'EMTSUPRT', 'SDHISOLT', 'SDHEMPLY', 'FOODSTMP', 'SDHFOOD1', 'SDHSTRE1', 'MARIJAN1', 'MARJSMOK']

columns_to_delete = ['_STATE', 'FMONTH', 'IDATE', 'IMONTH', 'IDAY', 'IYEAR', 'SEQNO', 'Num', 'PVTRESD1', 'COLGHOUS', 'STATERE1', 'CELPHON1', 'NUMADULT', 'RESPSLCT', 'SAFETIME', 'CELLFON5', 'PVTRESD3', 'CCLGHOUS', 'CSTATE1', 'LANDLINE', 'CHILDREN']

target_column = ['TARGET']
id_column = ['ID']

k = 0
for c in column_of_interest :
    if c not in all_columns :
        print(f"column '{c}' is not in dataframe")
    else :
        k+=1
        #print("good")
        pass
print(k)

for c in columns_to_delete :
    if c not in all_columns :
        print(f"column '{c}' is not in dataframe")
    else :
        #print("good")
        pass

X_tr = data_tr[column_of_interest]
X_te = data_te[column_of_interest]

id_te = data_te[id_column]
y_tr = data_tr[target_column].to_numpy().ravel()   # Note, target column is not in test data
# y_te = data_te[target_column].to_numpy().ravel()    # Only, launch if use extra.csv

print(y_tr.shape)
# print(y_te.shape)

84
column 'Num' is not in dataframe
(1000,)


# Data Treatment

For correcting imbalances

In [37]:
print(f"Shape of datatraining : {X_tr.shape}")
print(f"Shape of datatraining : {X_tr.dtypes}\n\n")

print("\nFor dftr")
for c in X_tr.columns.values :
    check_nan = X_tr[c].isnull().values.any()
    print(f"In column {c}\t NaN : {check_nan}".expandtabs(40))

Shape of datatraining : (1000, 84)
Shape of datatraining : SEXVAR      float64
GENHLTH     float64
PHYSHLTH    float64
MENTHLTH    float64
POORHLTH    float64
             ...   
FOODSTMP    float64
SDHFOOD1    float64
SDHSTRE1    float64
MARIJAN1    float64
MARJSMOK    float64
Length: 84, dtype: object



For dftr
In column SEXVAR                         NaN : False
In column GENHLTH                        NaN : False
In column PHYSHLTH                       NaN : False
In column MENTHLTH                       NaN : False
In column POORHLTH                       NaN : True
In column PRIMINSR                       NaN : False
In column PERSDOC3                       NaN : False
In column MEDCOST1                       NaN : False
In column CHECKUP1                       NaN : False
In column EXERANY2                       NaN : False
In column SLEPTIM1                       NaN : False
In column LASTDEN4                       NaN : True
In column CVDSTRK3                       NaN : Fa

# Feature engineering

In [38]:
# Utils

def plot_nan_heatmap(matrix, xlabels, ylabels, width=800, height=500, plot=True, full_plotly_offline=False):
    """
    Heatmap NaN visualization using Plotly.
    Colors will be:
        NaN : black
        else : white
    Args:
        matrix: 2D array containing the data values
        xlabels: Labels for x-axis (columns)
        ylabels: Labels for y-axis (rows)
        plot: Whether to display the plot
        full_plotly_offline: Whether to include full Plotly JS in HTML output
    Returns:
        HTML string of the plot
    """
    # Convert to DataFrame
    df = pd.DataFrame(matrix, columns=xlabels, index=ylabels)

    # Create a NaN indicator matrix: 1 for NaN, 0 for non-NaN
    nan_indicator = df.isna().astype(int)

    # Plot the NaN indicator matrix
    fig = px.imshow(
        nan_indicator,
        width=width,
        height=height,
        template="plotly_white",
        color_continuous_scale="Greys",  # Use a grayscale gradient
        labels={"color": "NaN Proportion"},
        aspect="auto",
        zmin=0,
        zmax=1,
    )
    fig.update_xaxes(side="top", tickangle=45, tickfont=dict(size=5))
    fig.update_layout(
        margin=dict(pad=1),
        # title="Proportion of NaN Values (1 = NaN, 0 = Non-NaN)",

    )

    # Return or show
    if plot:
        fig.show()
    if full_plotly_offline:
        return fig.to_html(full_html=True)
    else:
        return fig.to_html(full_html=False, include_plotlyjs=False)
    
def plot_nan_bar(matrix, columns):
    """
    Bar plot of NaN proportion for each columns
    Args:
        matrix: 2D array containing the values
        columns: columns to visualize
    """
    nans_prop = X_tr.isna().mean()
    plt.bar(columns, height=nans_prop)
    print(nans_prop)
    plt.show()



In [39]:
plot_nan_heatmap(X_tr.head(100), xlabels=column_of_interest, ylabels=range(100))

'<div>                            <div id="069c3ba4-3c79-4767-888f-acd99d61413c" class="plotly-graph-div" style="height:500px; width:800px;"></div>            <script type="text/javascript">                window.PLOTLYENV=window.PLOTLYENV || {};                                if (document.getElementById("069c3ba4-3c79-4767-888f-acd99d61413c")) {                    Plotly.newPlot(                        "069c3ba4-3c79-4767-888f-acd99d61413c",                        [{"coloraxis":"coloraxis","name":"0","x":["SEXVAR","GENHLTH","PHYSHLTH","MENTHLTH","POORHLTH","PRIMINSR","PERSDOC3","MEDCOST1","CHECKUP1","EXERANY2","SLEPTIM1","LASTDEN4","CVDSTRK3","ASTHMA3","ASTHNOW","CHCSCNC1","CHCOCNC1","CHCCOPD3","ADDEPEV3","CHCKDNY2","HAVARTH4","DIABETE4","WEIGHT2","HEIGHT3","DECIDE","DIFFWALK","DIFFDRES","DIFFALON","SMOKE100","SMOKDAY2","USENOW3","ECIGNOW2","LCSFIRST","LCSLAST","LCSNUMCG","ALCDAY4","AVEDRNK3","DRNK3GE5","MAXDRNKS","FLUSHOT7","HIVRISK5","COVIDPOS","COVIDSMP","COVIDPRM","PDIABTS1","PRED

In [40]:
# plot_nan_bar(X_tr, column_of_interest)

In [41]:
def rm_nan_columns(matrix, threshold=0.6):
    """
    Removes columns containing more than `threshold` fraction of NaNs.

    Parameters:
    matrix (pd.DataFrame): Input DataFrame
    threshold (float): Fraction of NaNs above which a column is removed (default: 0.5)

    Returns:
    pd.DataFrame: DataFrame with columns removed
    """
    print(f"We remove columns containing more than {threshold*100}% of NaNs")
    rem_cols = matrix.columns[matrix.isna().mean() > threshold]
    print("Columns to remove:", rem_cols)
    print(f"Number of columns to remove: {len(rem_cols)}")

    # Drop the columns and return the cleaned DataFrame
    cleaned_matrix = matrix.drop(columns=rem_cols)
    return cleaned_matrix

X_tr = rm_nan_columns(X_tr)
print(X_tr.columns)
X_te = X_te[X_tr.columns]   # Make the train and test data columns match

print(X_tr.shape)
print(X_te.shape)

We remove columns containing more than 60.0% of NaNs
Columns to remove: Index(['ASTHNOW', 'SMOKDAY2', 'LCSFIRST', 'LCSLAST', 'LCSNUMCG', 'COVIDSMP',
       'COVIDPRM', 'PDIABTS1', 'PREDIAB2', 'DIABTYPE', 'INSULIN1', 'CHKHEMO3',
       'EYEEXAM1', 'FEETSORE', 'TOLDCFS', 'HAVECFS', 'WORKCFS', 'COVIDVA1',
       'SHINGLE2', 'COPDCOGH', 'COPDFLEM', 'COPDBRTH', 'COPDBTST', 'COPDSMOK',
       'CNCRDIFF', 'CNCRAGE', 'CNCRTYP2', 'CSRVDOC1', 'CIMEMLOS', 'CDHOUSE',
       'CDASSIST', 'CDHELP', 'CDSOCIAL', 'ACEDEPRS', 'ACEDRINK', 'ACEDRUGS',
       'ACEPRISN', 'ACEPUNCH', 'MARIJAN1', 'MARJSMOK'],
      dtype='object')
Number of columns to remove: 40
Index(['SEXVAR', 'GENHLTH', 'PHYSHLTH', 'MENTHLTH', 'POORHLTH', 'PRIMINSR',
       'PERSDOC3', 'MEDCOST1', 'CHECKUP1', 'EXERANY2', 'SLEPTIM1', 'LASTDEN4',
       'CVDSTRK3', 'ASTHMA3', 'CHCSCNC1', 'CHCOCNC1', 'CHCCOPD3', 'ADDEPEV3',
       'CHCKDNY2', 'HAVARTH4', 'DIABETE4', 'WEIGHT2', 'HEIGHT3', 'DECIDE',
       'DIFFWALK', 'DIFFDRES', 'DIFFALON', 'S

## Correct imbalances

In [42]:
def increase_positive(df, target_column, v_target=1.0, increase_by=4):
    dfp = df[df[target_column]==v_target]
    return pd.concat([df]+[dfp]*increase_by, axis=0)

increase_positive(X_tr, y_tr)
print(X_tr.shape)
print(X_te.shape)

(1000, 44)
(1000, 44)


Instead of removing all lines with NaNs, which will result on not enough data, we will replace NaNs values that are representative of the columns

In first instance, for categories with NaNs, we will replace those NaNs with the catagoriy that is the most present

In [43]:

def replace_nan(df, replace_by='mode'):
    """
    Replaces NaN values in each column of a DataFrame with the most frequent value (mode),
    mean, or median of that column.

    Parameters:
    df (pd.DataFrame): Input DataFrame
    replace_by (str): Method to replace NaN values. Options: 'mode', 'mean', 'median'

    Returns:
    pd.DataFrame: DataFrame with NaN values replaced by the specified method for each column

    Raises:
    ValueError: If replace_by is not one of 'mode', 'mean', or 'median'
    """
    clean_df = df.copy()

    for column in clean_df.columns:
        if replace_by not in ['mode', 'mean', 'median']:
            clean_df[column] = clean_df[column].fillna(-1)
        if replace_by == 'mode':
            mode = clean_df[column].mode()
            if not mode.empty:
                clean_df[column] = clean_df[column].fillna(mode[0])
            else:
                print(f"Column {column}: Could not calculate mode.")
        elif replace_by == 'mean':
            mean = clean_df[column].mean()
            clean_df[column] = clean_df[column].fillna(mean)
        elif replace_by == 'median':
            median = clean_df[column].median()
            clean_df[column] = clean_df[column].fillna(median)

    return clean_df


X_tr_mode = replace_nan(X_tr)

# plot_nan_proportion_heatmap(X_tr_mode.head(), xlabels=X_tr_mode.columns, ylabels=range(10))


# Prediction tah jai la vision

In [44]:
# rf = RandomForestClassifier(n_estimators=101, criterion='entropy', max_depth=6, random_state=42)
# rf.fit(X_tr_mode, y_tr)
# y_pred = rf.predict(X_te)

In [45]:
from xgboost import XGBClassifier
xgb = XGBClassifier()
xgb.fit(X_tr_mode, y_tr)
# y_pred = xgb.predict(X_te)

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [46]:
print(X_tr.shape)
print(X_tr_mode.shape)
print(X_te.shape)

from sklearn.metrics import f1_score, recall_score, precision_score, confusion_matrix


# print(f"precision:{precision_score(y_te, y_pred)}")
# print(f"f1:{f1_score(y_te, y_pred)}")
# print(f"Recall:{recall_score(y_te, y_pred)}")

# tn, fp, fn, tp = confusion_matrix(y_te, y_pred).ravel()
# print(tn)
# print(fp)

# print(confusion_matrix(y_te, y_pred))

(1000, 44)
(1000, 44)
(1000, 44)


# Si je veux soumettre mais mon PC est guez donc ignore cette cellule

In [47]:

sys.path.append(os.path.abspath("../soumission"))

def soumission_csv(model,df_data,indexes,filename="../data/prediction.csv"):
    """
    Génère un fichier csv de predictions reponse à soumettre à partir d'un model entrainé et d'un data_frame.
        
    Paramètres :
    - model, model entrainé 
    - df_data : data frame des données à predir, dois contenir la colone "ID" et les meme features que le model entrainé
    - filename : nom du fichier créé par la fonction
        
    Retour :
    - X_train_pca, X_test_pca : listes de matrices transformées
    - pca_list : liste des objets PCA par état
    - top_features_dict : dict avec top features par état
    """
    features = df_data.drop(columns=["ID"], errors='ignore')

    prediction = model.predict(features)

    with open(filename, mode="w", newline="") as file:
        writer = csv.writer(file)
        writer.writerow(["ID", "TARGET"])  # En-têtes
        for index,id in enumerate(indexes) :
            writer.writerow([id, prediction[index]])
    print(f"Fichier '{filename}' créé avec succès !")
    return

soumission_csv(xgb, X_te, id_te,"../data/prediction_colon_of_interest.csv")

Fichier '../data/prediction_colon_of_interest.csv' créé avec succès !
