# Imports

In [1]:
import pandas as pd
import numpy as np
import nbimporter
from scipy.io import arff
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [2]:
from algo_implementation import logistic_regression

# Preprocessor class

In [3]:
class Preprocessor:
    """
    Collection of helpers for cleaning a raw dataset file and turning a
    dataframe into a numeric feature table.
    """

    def remove_spaces(self, df_path, output_path):
        """
        Remove unnecessary spaces in dataset file.
        Each line is stripped of leading/trailing whitespace and lines that are
        empty after stripping are dropped.
        params:
        df_path - path to dataset file
        output_path - path where we save processed dataset (may be equal to df_path)
        """
        # Read the whole input before opening the output: opening output_path
        # with 'w' truncates it, which would wipe the data when both paths
        # point to the same file.
        with open(df_path, 'r') as input_file:
            stripped_lines = [line.strip() for line in input_file]
        with open(output_path, 'w') as output_file:
            output_file.writelines(line + '\n' for line in stripped_lines if line)

    def nan_values_percentage(self, df, thresh=0.1):
        """
        Get percentages of NaN values in each column of dataframe.
        params:
        df - dataframe we process
        thresh - threshold for selecting column (if column has less NaN values than 'thresh', it is appended to 'dc' list)
        return:
        d - dictionary containing {name: NaN percentage} key-value pairs (values are fractions in [0, 1]).
        dc - list containing colnames of columns, in which the NaN value percentage is less than thresh
        """
        # The mean of the boolean NaN mask is the fraction of missing values per column.
        d = df.isna().mean().to_dict()
        dc = [name for name, percentage in d.items() if percentage < thresh]
        return d, dc

    def vif(self, df):
        """
        Compute the variance inflation factor (VIF) of every column.
        params:
        df - dataframe containing only numeric columns, without NaN values
        return:
        vif_coefs - dataframe with columns 'variables' (column name) and 'VIF'
        """
        vif_coefs = pd.DataFrame()
        vif_coefs["variables"] = df.columns
        # One VIF per column index; a column that is an exact linear combination
        # of the others yields inf.
        vif_coefs["VIF"] = [variance_inflation_factor(df.values, i) for i in range(df.shape[1])]
        return vif_coefs

    def get_cat_num_colnames(self, df):
        """
        Get column names of categorical and numerical features.
        A column counts as categorical when its dtype is 'object'; every other
        column is reported as numerical.
        params:
        df - dataframe we process
        return:
        categorical_cols, numerical_cols - lists of names of categorical and numerical columns
        """
        is_categorical = df.dtypes == 'object'
        categorical_cols = df.columns[is_categorical].tolist()
        numerical_cols = df.columns[~is_categorical].tolist()
        return categorical_cols, numerical_cols

    def data_preprocess(self, df, categorical_cols, numerical_cols, num_imputer_strategy='mean'):
        """
        Preprocess dataset before fitting machine learning model. This includes:
        1. Dealing with NaN values
        2. One hot encoding of categorical features
        3. Scaling of numerical features
        Categorical columns with exactly two distinct non-NaN values are treated
        as binary: NaN values are replaced with the most frequent value and the
        column is encoded as a single 0/1 column. The remaining categorical
        columns are one hot encoded without imputation.
        params:
        df - dataframe we process
        categorical_cols - list of names of categorical columns
        numerical_cols - list of names of numerical columns
        num_imputer_strategy - strategy of dealing with NaN values in numerical columns (default - replacing with mean value)
        return:
        X - preprocessed dataset, ready for passing to ML model. Columns are ordered as:
            numerical columns, one hot encoded multi-value columns, binary columns.
        """
        binary_cols = [col for col in categorical_cols if df[col].nunique(dropna=True) == 2]
        # Keep the caller's column order. A set difference would make the order
        # of the encoded columns differ from one interpreter run to the next.
        binary_lookup = set(binary_cols)
        multivalue_cols = [col for col in categorical_cols if col not in binary_lookup]
        print(binary_cols, multivalue_cols)

        numerical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy=num_imputer_strategy)),
            ('scaler', MinMaxScaler())
            ])

        multivalue_transformer = OneHotEncoder()

        binary_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('encoder', OneHotEncoder(drop='if_binary'))
            ])

        # sparse_threshold=0 forces a dense result, so building the dataframe
        # below does not depend on how sparse the one hot encoded part is.
        preprocessor = ColumnTransformer(
            transformers=[
                ('num', numerical_transformer, numerical_cols),
                ('mul', multivalue_transformer, multivalue_cols),
                ('bin', binary_transformer, binary_cols)
            ],
            sparse_threshold=0,
        )

        processed_data = preprocessor.fit_transform(df)

        # The 'mul' encoder is only fitted when it was given at least one column.
        if multivalue_cols:
            multivalue_names = list(preprocessor.named_transformers_['mul'].get_feature_names_out())
        else:
            multivalue_names = []
        # NOTE(review): assumes the numerical imputer keeps every column
        # (i.e. no numerical column is entirely NaN) - confirm for new datasets.
        columns = numerical_cols + multivalue_names + binary_cols

        X = pd.DataFrame(processed_data, columns=columns)

        return X

In [4]:
def decode_bytes(x):
    """
    Return `x` decoded to str when it is a bytes object; any other value is
    returned unchanged.
    """
    return x.decode() if isinstance(x, bytes) else x

# Dataset 1

In [5]:
preprocessor = Preprocessor()
# Write a whitespace-cleaned copy of the raw file and parse that copy.
preprocessor.remove_spaces('chronic_kidney_disease.arff', 'chronic_kidney_disease2.arff')

data, meta = arff.loadarff('chronic_kidney_disease2.arff')
dataset1 = pd.DataFrame(data)
# Decode bytes values element-wise, one column at a time. Series.map is used
# because DataFrame.applymap is deprecated in recent pandas releases.
dataset1 = dataset1.apply(lambda column: column.map(decode_bytes))
# '?' marks a missing value in the file; turn it into a real NaN.
dataset1 = dataset1.replace('?', np.nan)

categorical_cols, numerical_cols = preprocessor.get_cat_num_colnames(dataset1)



In [6]:
processed_data = preprocessor.data_preprocess(dataset1, categorical_cols, numerical_cols)
# Select the target by name instead of by position, so the split does not
# silently depend on the column order produced by data_preprocess.
y = processed_data['class']
X = processed_data.drop(columns=['class'])
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.2)

['rbc', 'pc', 'pcc', 'ba', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane', 'class'] ['sg', 'al', 'su']


## Collinearity detection

### VIF

In [7]:
p = Preprocessor()
vif_coefs = p.vif(X_train)
# Show only the features whose VIF is at least 10.
vif_coefs.loc[vif_coefs['VIF'] >= 10]

  vif = 1. / (1. - r_squared_i)
  return 1 - self.ssr/self.centered_tss


Unnamed: 0,variables,VIF
11,sg_1.005,inf
12,sg_1.010,inf
13,sg_1.015,inf
14,sg_1.020,inf
15,sg_1.025,inf
16,sg_nan,inf
17,al_0,inf
18,al_1,inf
19,al_2,inf
20,al_3,inf


### Correlation

In [8]:
# Pairwise Pearson correlation between all training features.
corr_matrix = X_train.corr(method='pearson')
corr_matrix



Unnamed: 0,age,bp,bgr,bu,sc,sod,pot,hemo,pcv,wbcc,...,rbc,pc,pcc,ba,htn,dm,cad,appet,pe,ane
age,1.0,0.152199,0.236323,0.195399,0.140063,-0.069579,0.043421,-0.184362,-0.187031,0.096293,...,-0.037435,-0.13241,0.169893,0.060294,0.395772,0.4073,0.237916,0.143978,0.108481,0.045511
bp,0.152199,1.0,0.206005,0.14652,0.111594,-0.10527,0.071787,-0.263191,-0.302335,0.01016,...,-0.119272,-0.140634,0.064362,0.060926,0.275136,0.233867,0.065825,0.167141,0.028995,0.164663
bgr,0.236323,0.206005,1.0,0.14069,0.102463,-0.148187,0.071314,-0.291373,-0.280311,0.10049,...,-0.189727,-0.277872,0.197297,0.054702,0.364005,0.50544,0.230908,0.22973,0.09188,0.141118
bu,0.195399,0.14652,0.14069,1.0,0.573234,-0.284353,0.371213,-0.517756,-0.498231,0.067299,...,-0.15833,-0.345455,0.202964,0.124102,0.335019,0.303527,0.174674,0.257183,0.319522,0.400471
sc,0.140063,0.111594,0.102463,0.573234,1.0,-0.641833,0.219745,-0.313843,-0.314718,-0.014338,...,-0.085235,-0.148174,0.051734,0.040131,0.263962,0.198009,0.20371,0.156525,0.161721,0.198469
sod,-0.069579,-0.10527,-0.148187,-0.284353,-0.641833,1.0,0.075369,0.321203,0.329839,0.015646,...,0.111963,0.155426,-0.131046,-0.063453,-0.298546,-0.283913,-0.231388,-0.153113,-0.149995,-0.182446
pot,0.043421,0.071787,0.071314,0.371213,0.219745,0.075369,1.0,-0.103645,-0.129761,-0.085081,...,0.020492,-0.168392,-0.026567,-0.004777,0.061357,0.063677,-0.009521,-0.026297,0.062154,0.112296
hemo,-0.184362,-0.263191,-0.291373,-0.517756,-0.313843,0.321203,-0.103645,1.0,0.853551,-0.139558,...,0.263103,0.41801,-0.265093,-0.182825,-0.554404,-0.476939,-0.259377,-0.382935,-0.369874,-0.545979
pcv,-0.187031,-0.302335,-0.280311,-0.498231,-0.314718,0.329839,-0.129761,0.853551,1.0,-0.174392,...,0.265172,0.444125,-0.283935,-0.158721,-0.542449,-0.476929,-0.261583,-0.382634,-0.395165,-0.497155
wbcc,0.096293,0.01016,0.10049,0.067299,-0.014338,0.015646,-0.085081,-0.139558,-0.174392,1.0,...,-0.010013,-0.130245,0.178629,0.124502,0.097674,0.130672,-0.020628,0.141528,0.126905,0.043194


### Feature selection

In [9]:
# Keep the 10 features with the highest chi-squared score; the selector is
# fitted on the training split only and then applied to both splits.
selector = SelectKBest(score_func=chi2, k=10)
selector.fit(X_train, y_train)
X_chi2_train = selector.transform(X_train)
X_chi2_test = selector.transform(X_test)

In [27]:

# Project the features onto 10 principal components. The PCA is fitted on the
# training split only and the same projection is applied to the test split.
pca = PCA(n_components=10)
X_pca_train = pca.fit_transform(X_train)
X_pca_test = pca.transform(X_test)
# Prints the variance captured by each component (absolute values, not ratios).
# NOTE(review): explained_variance_ratio_ may be easier to interpret - confirm intent.
print(pca.explained_variance_)

[0.96874205 0.43055454 0.24752179 0.2029795  0.17968111 0.16080584
 0.14222498 0.12611737 0.12339236 0.11393265]


## Fit IRLS model

In [29]:
# Train the logistic regression on the PCA features and report test accuracy.
model = logistic_regression()
model.fit(X_pca_train, y_train)
accuracy = model.accuracy(X_pca_test, y_test)
print(f'pca_accuracy: {accuracy:.2f}')

pca_accuracy: 0.95


## Export dataset to $.csv$

In [None]:

# Re-attach the target as a "class" column next to the features and save to disk.
target_frame = pd.DataFrame(y, columns=["class"])
kidney_disease = pd.concat([X, target_frame], axis=1)
kidney_disease.to_csv('kidney_disease.csv', index=False)