In [26]:
import pandas as pd
import numpy as np


from scipy.stats import pearsonr
from scipy import stats

from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.metrics import silhouette_samples, silhouette_score

from sklearn.linear_model import LinearRegression
from sklearn.metrics import f1_score

import matplotlib.pyplot as plt


import seaborn as sns



# Exploration of Data

## my_exploration()

In [27]:
def my_exploration(data):
    """Show the summary statistics of ``data`` as rich notebook output.

    Parameters
    ----------
    data : object exposing ``describe()``
        Typically a pandas DataFrame or Series.

    Returns
    -------
    None
        The summary is only displayed, not returned.
    """
    summary = data.describe()
    display(summary)

## my_data_visualisation()

In [28]:
def my_data_visualisation(df, variable):
    """Draw a histogram of one column, titled with the column name.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to plot.
    variable : str
        Label of the column; also used as the plot title.
    """
    column = df[variable]
    column.hist()
    plt.title(variable)
    plt.show()

# Clean Data

## my_missing_values()

In [29]:
def _coerce_constant(raw, column):
    """Turn text typed by the user into a number when ``column`` is numeric."""
    if not pd.api.types.is_numeric_dtype(column):
        return raw
    for cast in (int, float):
        try:
            return cast(raw)
        except ValueError:
            continue
    # Not parseable as a number: fall back to the raw text.
    return raw


def _fill_with_mode(df, col):
    """Replace NaN in ``df[col]`` with its most frequent value, if it has one."""
    modes = df[col].mode()
    # An all-NaN column has no mode; leave it untouched instead of crashing.
    if not modes.empty:
        df[col] = df[col].fillna(modes.iloc[0])


def my_missing_values(df, apply_all=False, strategie=None):
    """Handle missing values, either with one strategy or column by column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. The fill strategies write into this frame; the
        'delete' strategy returns a new frame without the dropped rows.
    apply_all : bool, default False
        If True, apply ``strategie`` to every column that contains NaN.
        If False, prompt (via ``input``) for a strategy per such column.
    strategie : str, optional
        Used only when ``apply_all`` is True. One of 'most' (mode),
        'constant' (value asked via ``input``), 'delete' (drop rows with any
        NaN), 'mean' or 'median' (numeric columns only). Any other value
        leaves the frame unchanged.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame. Always use the return value: 'delete' produces a
        new object.

    Notes
    -----
    In interactive mode the accepted answers are 'delete', 'replace' (mean;
    'mean' is accepted as an alias), 'median', 'most' and 'constant'.
    Unrecognised answers leave that column unchanged.
    """
    if apply_all:
        incomplete = df.columns[df.isna().any()].tolist()
        if strategie == 'most':
            for col in incomplete:
                _fill_with_mode(df, col)
        elif strategie == 'constant':
            constante = input("give constant value to replace : ")
            for col in incomplete:
                df[col] = df[col].fillna(_coerce_constant(constante, df[col]))
        elif strategie == 'delete':
            df = df.dropna()
        elif strategie == 'mean':
            # numeric_only avoids a TypeError on text columns (pandas >= 2.0).
            df.fillna(df.mean(numeric_only=True), inplace=True)
        elif strategie == 'median':
            df.fillna(df.median(numeric_only=True), inplace=True)
        return df

    # Ask for every column first so the user is not interrupted mid-processing.
    strategies = {}
    for col in df.columns[df.isna().any()].tolist():
        strategies[col] = input(f"chose strategy '{col}':\n - 'delete' : delete nan values\n - 'replace' : replace nana value by mean\n - 'median' : by median\n - 'most' : mode\n - 'constant' : by constante\n")

    for col, strat in strategies.items():
        # Assign the filled column back instead of `df[col].fillna(inplace=True)`:
        # the chained form acts on a temporary and is a no-op under Copy-on-Write.
        if strat == 'delete':
            df = df.dropna(subset=[col])
        elif strat in ('replace', 'mean'):
            df[col] = df[col].fillna(df[col].mean())
        elif strat == 'median':
            df[col] = df[col].fillna(df[col].median())
        elif strat == 'most':
            _fill_with_mode(df, col)
        elif strat == 'constant':
            valeur_constante = input(f"give constant value for replace '{col}': ")
            df[col] = df[col].fillna(_coerce_constant(valeur_constante, df[col]))

    return df


## my_fill_missing_data()

In [30]:
def _estimate_lane(known_x, known_y, query_x, method):
    """Estimate values at ``query_x`` from the observed points of one lane."""
    if method == 'zero':
        return np.zeros(query_x.shape)
    if known_x.size == 0:
        # Nothing observed in this lane, so there is nothing to estimate from.
        return np.full(query_x.shape, np.nan)
    if method == 'nearest':
        # Distance from every gap to every observation; argmin keeps the
        # left-hand neighbour when two observations are equally close.
        closest = np.abs(query_x[:, None] - known_x[None, :]).argmin(axis=1)
        return known_y[closest]
    if method == 'slinear':
        # Piecewise-linear between observations, 0 outside the observed range.
        return np.interp(query_x, known_x, known_y, left=0, right=0)
    degree = {'linear': 1, 'quadratic': 2, 'cubic': 3}[method]
    # Never ask for more coefficients than there are observations.
    degree = min(degree, known_x.size - 1)
    return np.polyval(np.polyfit(known_x, known_y, degree), query_x)


def my_fill_missing_data(data, method='linear', axis=0):
    """Fill NaN entries of a 1-D or 2-D array, one lane at a time.

    Each lane (the whole array for 1-D input, each column for ``axis=0``,
    each row for ``axis=1``) is filled independently, using the position
    along the lane as the x coordinate.

    Parameters
    ----------
    data : array-like of float
        Values containing NaN. A NumPy array is modified in place.
    method : str, default 'linear'
        'linear', 'quadratic', 'cubic' : least-squares polynomial of degree
        1, 2 or 3 fitted to the observed points of the lane (the degree is
        lowered when the lane has too few observations).
        'nearest' : value of the closest observed position.
        'slinear' : piecewise-linear interpolation, 0 outside the observed
        range.
        'zero' : fill with 0.
    axis : {0, 1}, default 0
        Direction of the lanes for 2-D input; ignored for 1-D input.

    Returns
    -------
    numpy.ndarray
        The filled array. ``data`` itself is returned unchanged when it has
        no NaN or consists only of NaN. A lane with no observation at all is
        left as NaN (except with 'zero').

    Raises
    ------
    ValueError
        If ``axis`` is not 0 or 1, ``method`` is unknown, or ``data`` has
        more than two dimensions.
    """
    if axis != 0 and axis != 1:
        raise ValueError("axe must 0, soit 1.")
    if method not in ('linear', 'nearest', 'zero', 'slinear', 'quadratic', 'cubic'):
        raise ValueError(
            "method must be one of 'linear', 'nearest', 'zero', 'slinear', "
            "'quadratic', 'cubic'."
        )

    missing = np.isnan(data)
    if np.all(missing) or not np.any(missing):
        return data

    data = np.asarray(data)
    missing = np.asarray(missing)

    # Build a 2-D *view* whose columns are the lanes, so that writing into it
    # updates `data` directly.
    if data.ndim == 1:
        lanes, gaps = data[:, None], missing[:, None]
    elif data.ndim == 2:
        lanes, gaps = (data, missing) if axis == 0 else (data.T, missing.T)
    else:
        raise ValueError("data must be 1-D or 2-D.")

    positions = np.arange(lanes.shape[0])
    for j in range(lanes.shape[1]):
        lane_gaps = gaps[:, j]
        if not lane_gaps.any():
            continue
        observed = ~lane_gaps
        lanes[lane_gaps, j] = _estimate_lane(
            positions[observed], lanes[observed, j], positions[lane_gaps], method
        )

    return data


## my_data_scaler()

In [31]:
import numpy as np

def my_data_scaler(df, columns=None, method="standard", scaler_all=False):
    """Scale selected columns and return a new frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    columns : list of column labels, optional
        Columns to scale. Required unless ``scaler_all`` is True.
    method : {"standard", "minmax", "robust", "log"}, default "standard"
        ``StandardScaler``, ``MinMaxScaler``, ``RobustScaler`` or
        ``np.log1p`` respectively.
    scaler_all : bool, default False
        Scale every column, overriding ``columns``.

    Returns
    -------
    pandas.DataFrame
        The unscaled columns followed by the scaled ones (scaled columns are
        moved to the end).

    Raises
    ------
    ValueError
        If no columns are selected or ``method`` is not recognised.
    """
    # Resolve which columns to work on.
    if scaler_all:
        columns = df.columns.tolist()
    elif columns is None:
        raise ValueError("check the columns to scale.")

    def fit_and_scale(estimator):
        # Fit on the selected columns and rebuild a frame with the same labels.
        values = estimator.fit_transform(df[columns])
        return pd.DataFrame(values, columns=columns, index=df.index)

    # Pick the transformation and apply it.
    if method == "log":
        scaled_part = df[columns].apply(lambda col: np.log1p(col))
    elif method == "standard":
        scaled_part = fit_and_scale(StandardScaler())
    elif method == "minmax":
        scaled_part = fit_and_scale(MinMaxScaler())
    elif method == "robust":
        scaled_part = fit_and_scale(RobustScaler())
    else:
        raise ValueError("Method not valid:'standard', 'minmax', 'robust', 'log'.")

    # Swap the original columns for their scaled versions.
    untouched_part = df.drop(columns=columns)
    return pd.concat([untouched_part, scaled_part], axis=1)




## my_inverse_log()

In [32]:
def my_inverse_log(df, columns=None, inverse_log_all=False):
    """Undo a ``log1p`` transform by applying ``np.expm1`` to selected columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    columns : list of column labels, optional
        Columns to transform back. Required unless ``inverse_log_all`` is True.
    inverse_log_all : bool, default False
        Transform every column, overriding ``columns``.

    Returns
    -------
    pandas.DataFrame
        The untouched columns followed by the transformed ones.

    Raises
    ------
    ValueError
        If no columns are selected.
    """
    if inverse_log_all:
        columns = df.columns.tolist()
    elif columns is None:
        raise ValueError("Specefy column.")

    def undo_log1p(values):
        return np.expm1(values)

    restored = df[columns].apply(undo_log1p)
    untouched = df.drop(columns=columns)
    return pd.concat([untouched, restored], axis=1)


# EDA

## Univariate Analysis

### my_boxplots()

In [33]:
def my_boxplots(df):
    """Draw one box plot per column of ``df`` on a single 15x10 figure.

    The x tick labels are rotated by 90 degrees.
    """
    _, axes = plt.subplots(figsize=(15, 10))
    df.boxplot(ax=axes)
    plt.xticks(rotation=90)
    plt.show()

## Bivariate Analysis

### my_corr_heatmap()

In [34]:
def my_corr_heatmap(data, annot=True):
    """Plot a heatmap of the pairwise correlation between numeric columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to analyse. Non-numeric columns are ignored.
    annot : bool, default True
        Passed to ``sns.heatmap``; whether to write the coefficient in each
        cell.
    """
    # Since pandas 2.0 `corr()` no longer drops non-numeric columns silently
    # and raises on them, so restrict the computation explicitly.
    correlations = data.corr(numeric_only=True)

    plt.figure(figsize=(10, 6))
    plt.title("Heatmap of pearson correlation")
    sns.heatmap(correlations, annot=annot, cmap="coolwarm")
    plt.show()

# Machine Learning

## my_backward_selected()

In [35]:
def my_backward_selected(data, response, threshold=0.05):
    """Backward elimination for an OLS model based on p-values.

    Starting from all numeric columns of ``data`` (except ``response``), the
    model is refitted repeatedly; at each step the predictor with the largest
    p-value is dropped while that p-value exceeds ``threshold``. Progress is
    printed at each step and the final model summary is printed at the end.

    Parameters
    ----------
    data : pandas.DataFrame
        Data holding the response and the candidate predictors. Column names
        are inserted verbatim into the model formula.
    response : str
        Name of the response column.
    threshold : float, default 0.05
        Largest p-value a predictor may have to stay in the model.

    Returns
    -------
    The fitted result of the last model. If every predictor is eliminated
    (or there was none), this is the intercept-only model.
    """
    # NOTE(review): `smf` is not imported in the visible cells; presumably it
    # is `statsmodels.formula.api` — confirm it is imported before this runs.

    # A list (in column order) keeps the formula deterministic between runs.
    remaining = [col for col in data._get_numeric_data().columns if col != response]

    while True:
        if remaining:
            formula = "{} ~ {} + 1".format(response, ' + '.join(remaining))
        else:
            formula = "{} ~ 1".format(response)
        print('_______________________________')
        print(formula)
        model = smf.ols(formula, data).fit()

        if not remaining:
            # Nothing left to test: the intercept-only model is final.
            print('is the final model !')
            print('')
            break

        score = model.pvalues.drop("Intercept", errors="ignore")
        worst_p = score.max()
        # Compare scalars: several predictors may tie for the largest p-value.
        if worst_p > threshold:
            worst = score.idxmax()
            print('remove', worst, '(p-value :', round(worst_p, 3), ')')
            remaining.remove(worst)
            print('')
        else:
            print('is the final model !')
            print('')
            break

    print(model.summary())

    return model

In [36]:
def my_backward_selected_logistic(data, response, threshold=0.05):
    """Backward elimination for a logistic regression based on p-values.

    Starting from all numeric columns of ``data`` (except ``response``), the
    model is refitted repeatedly; at each step the predictor with the largest
    p-value is dropped while that p-value exceeds ``threshold``. Progress is
    printed at each step and the final model summary is printed at the end.

    Parameters
    ----------
    data : pandas.DataFrame
        Data holding the response and the candidate predictors. Column names
        are inserted verbatim into the model formula.
    response : str
        Name of the response column.
    threshold : float, default 0.05
        Largest p-value a predictor may have to stay in the model.

    Returns
    -------
    The fitted result of the last model. If every predictor is eliminated
    (or there was none), this is the intercept-only model.
    """
    # NOTE(review): `smf` is not imported in the visible cells; presumably it
    # is `statsmodels.formula.api` — confirm it is imported before this runs.

    # A list (in column order) keeps the formula deterministic between runs.
    remaining = [col for col in data._get_numeric_data().columns if col != response]

    while True:
        if remaining:
            formula = "{} ~ {} + 1".format(response, ' + '.join(remaining))
        else:
            formula = "{} ~ 1".format(response)
        print('_______________________________')
        print(formula)
        model = smf.logit(formula, data).fit()

        if not remaining:
            # Nothing left to test: the intercept-only model is final.
            print('is the final model!')
            print('')
            break

        score = model.pvalues.drop("Intercept", errors="ignore")
        worst_p = score.max()
        # Compare scalars: several predictors may tie for the largest p-value.
        if worst_p > threshold:
            worst = score.idxmax()
            print('remove', worst, '(p-value:', round(worst_p, 3), ')')
            remaining.remove(worst)
            print('')
        else:
            print('is the final model!')
            print('')
            break

    print(model.summary())

    return model
