In [21]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [22]:
def mean(df_column):
        return(np.sum(df_column)/len(df_column))
                
def median(df_column):
        df_column = df_column.sort_values(ascending=True).reset_index(drop=True)
        if len(df_column) % 2 == 0:
                return ((df_column[len(df_column)//2]+df_column[len(df_column)//2 - 1])/2)
        else: return df_column[len(df_column)//2]

def mode(df_column):
        freq = {}
        for elem in df_column:
                if elem in freq.keys():
                        freq[elem] += 1
                else : freq[elem] = 1
        return max(freq, key=freq.get)

def tendanceCentrale(df_column): #retourne la moyenne, mode et mediane ainsi que la symétrie
        tendances = []
        mean_ = mean(df_column)
        mode_ = mode(df_column)
        median_ = median(df_column)
        tendances.append(mean_)
        tendances.append(mode_)
        tendances.append(median_)
        if round(mean_,1) == round(mode_,1) and round(mean_,1) == round(median_,1) and round(median_,1) == round(mode_,1):
                tendances.append("Symétrique")
        elif round(mean_,1) > round(median_,1) and round(median_,1) > round(mode_,1):
                tendances.append("Positivement")
        elif round(mean_,1) < round(median_,1) and round(median_,1) < round(mode_,1):
                tendances.append("Négativement")
        else: tendances.append("Inindentifiée")
        return pd.Series(np.array(tendances), index=['mean', 'mode', 'median', 'symetrie'])

In [23]:
def ecartType(df_column):
        return(np.sqrt((np.sum(np.power(df_column-mean(df_column),2)))/len(df_column)))

def variance(df_column):
        return(np.power(ecartType(df_column),2))

def getQuartiles(df_column):
        df_column = df_column.sort_values(ascending=True).reset_index(drop=True)
        return (df_column[len(df_column)//4], df_column[(len(df_column)//4)*3])
        
def ecartInterquartile(df_column):
        Q1, Q3 = getQuartiles(df_column)
        return Q3-Q1
        
def dispersion(df_column): #get outlier data and make it into a pandas series
        mesures = []
        standard_deviation = ecartType(df_column)
        variance_ = variance(df_column)
        inter_q = ecartInterquartile(df_column)
        quart = getQuartiles(df_column)
        mesures.append(standard_deviation)
        mesures.append(variance_)
        mesures.append(inter_q)
        mesures.append(min(df_column))
        mesures.append(quart)
        mesures.append(max(df_column))
        outliers = []
        for each in df_column:
                if each > quart[1]+1.5*inter_q or each < quart[0]-1.5*inter_q:
                        outliers.append(each)
        return (pd.Series(np.array(mesures), index=['ecart_type', 'variance', 'IQR', 'min','quartiles', 'max']), set(outliers))

In [24]:

def z_score(dataframe):
        dataframe = pd.DataFrame(dataframe)
        z_scores = []
        means = []
        std_devs = []
        for j in dataframe.columns:
                means.append(mean(dataframe[[j]]).values[0])
                std_devs.append(ecartType(dataframe[[j]]).values[0])
        for i in range(0, dataframe.shape[0]):
                row = []
                c = 0
                for j in dataframe.columns:
                        row.append(((dataframe[[j]].iloc[i]-means[c])/std_devs[c]).values[0])
                        c+=1
                z_scores.append(row)
        return z_scores
        

In [25]:
def correlation(df_column1, df_column2):
        try:
                N = np.int64(len(df_column1))
                xy_sum = np.int64(np.sum(df_column1*df_column2))
                top_sum = np.int64(N*xy_sum)
                sum_x = np.sum(df_column1)
                sum_y = np.sum(df_column2)
                bottom_x = len(df_column1)*np.sum(list(map(lambda x: x*x, df_column1)))
                bottom_y = len(df_column2)*np.sum(list(map(lambda x: x*x, df_column2)))
                return ((top_sum - (sum_x*sum_y))/np.sqrt((bottom_x - np.power(sum_x,2))*(bottom_y - np.power(sum_y,2))))
        except: print("impossible to calculate")

In [26]:
df = pd.read_csv("dataset1.csv")
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [27]:
data_num = df.select_dtypes(exclude=['object'])
data_obj = df.select_dtypes(exclude=['float64', 'int64'])
data_num.head()

Unnamed: 0,Age,DailyRate,DistanceFromHome,Education,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,1102,1,2,1,1,2.0,94,3,2,...,1,80,0,8,0,1,6,4,0,5
1,49,279,8,1,1,2,3.0,61,2,2,...,4,80,1,10,3,3,10,7,1,7
2,37,1373,2,2,1,4,4.0,92,2,1,...,2,80,0,7,3,3,0,0,0,0
3,33,1392,3,4,1,5,4.0,56,3,1,...,3,80,0,8,3,3,8,7,3,0
4,27,591,2,1,1,7,1.0,40,3,1,...,4,80,1,6,3,3,2,2,2,2


In [28]:
data_obj.head()

Unnamed: 0,Attrition,BusinessTravel,Department,EducationField,Gender,JobRole,MaritalStatus,Over18,OverTime
0,Yes,Travel_Rarely,Sales,Life Sciences,Female,Sales Executive,Single,Y,Yes
1,No,Travel_Frequently,Research & Development,Life Sciences,Male,Research Scientist,Married,Y,No
2,Yes,Travel_Rarely,Research & Development,Other,Male,Laboratory Technician,Single,Y,Yes
3,No,Travel_Frequently,Research & Development,Life Sciences,Female,Research Scientist,Married,Y,Yes
4,No,Travel_Rarely,Research & Development,Medical,Male,Laboratory Technician,Married,Y,No


In [29]:
mode(data_obj.Attrition)

'No'

### A. Traitement des valeurs manquantes et aberrantes

In [44]:
def replace_missing(df_column, method = 'mode'):
        print(df_column.dtype)
        if df_column.dtype == 'object':
                if method != 'mode': print('this method cannot be used on non-numerical attributes')
                else: 
                        for row in df_column.items():
                                if pd.isnull(row[1]) : 
                                        if method == 'mode':
                                                df_column[row[0]] = mode(df_column)
        else:
                for row in df_column.items():
                        #print(df_column[0])
                        if pd.isnull(row[1]) : 
                                if method == 'mode':
                                        df_column[row[0]] = mode(df_column)
                                elif method == 'median':
                                        df_column[row[0]] = median(df_column)
                                elif method == 'mean':
                                        df_column[row[0]] = mean(df_column)
                                else: print("an error has occured")
        return df_column

def delete_outliers(df, df_column, method = "delete"):
        outliers = dispersion(df_column)[1]
        #iterate through all rows
        for i in range(len(df_column)):
                if df_column.iloc[i] in outliers : 
                        if method == "delete":
                                df=df.drop([i])
                        elif method == "null":
                                pass
                        elif method == "replace_mode":
                                pass
                        elif method == "replace_mean":
                                pass
                        elif method == "replace_median":
                                pass
                        else: print("la methode choisie est invalide")

        return df


In [None]:
#replace_missing(df.EnvironmentSatisfaction).isnull().sum()
#delete_outliers(df, df.YearsInCurrentRole)

### B. Réduction des données via la discretisation des données continues

In [50]:
# divide the dataset into n quantiles of the same amount of values/rows
# this means each group will have the same amount of values but different interval sizes
# we can reduce a group into a single row by either doing the mean, the median or the mode
def discretisation_effectifs(df_column, Q, method = "mean"):
        step = (len(df_column)//Q)+1
        column = df_column.sort_values(ascending=True).reset_index(drop=True)
        quantiles = []
        for i in range(0,len(df_column),step):
                quantiles.append(column.iloc[i:i+step])
        for i in range(len(quantiles)): 
                if method == "mean":
                        quantiles[i] = mean(quantiles[i])
                elif method == "mode":
                        quantiles[i] = mode(quantiles[i])
                elif method == "median":
                        quantiles[i] = median(quantiles[i])
                else: print("la methode choisie est invalide")

        # we return the reduced column and then we just make another function to loop over the df?
        # but in this case we need to ask the chosen method for each column so idk about that lol 
        # they must also have the same number of quantiles too so
        return quantiles

In [51]:
discretisation_effectifs(df.Age, 20)

[21.0,
 25.027027027027028,
 26.972972972972972,
 28.513513513513512,
 29.594594594594593,
 30.783783783783782,
 31.87837837837838,
 33.24324324324324,
 34.2027027027027,
 35.148648648648646,
 36.21621621621622,
 37.54054054054054,
 38.945945945945944,
 40.41891891891892,
 42.13513513513514,
 44.2027027027027,
 46.25675675675676,
 49.33783783783784,
 52.75675675675676,
 57.015625]

In [33]:
# divide the dataset into n intervals of values
# this means each group will have the same interval sizes but different amount of values
# we can reduce a group into a single row by either doing the mean, the median or the mode
# if k value is not given, use the formula of huntsberger
import math
def discretisation_amplitude(df_column, df, K = -1):
        if K<0:
                K = 1+3*math.log10(len(df_column))
        column = df_column.sort_values(ascending=True).reset_index(drop=True)
        pass

### C. Réduction des données (redondances) horizontales et verticales

In [34]:
def detect_redundant(df_column):
        pass

In [35]:
def del_row(df):
        pass

def del_column(df):
        pass

### D. Normalisation des données

In [36]:
# follow formula
def min_max_normalisation(df):
        pass

In [37]:
#replace with the z score values, already done really but double check if it's the same formula
def z_score_normalisation(df):
        pass