In [79]:
import machineLearning
import preprocess

import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors

class Resample():
    
    def __init__(self):
        self.df = preprocess.Preprocessing().getDataset()
        # Split dataframe by death event
        # Majority and minority have been pre-determined by examining frequency
        self.majority = self.df[self.df['DEATH_EVENT'] == 0]
        self.minority = self.df[self.df['DEATH_EVENT'] == 1]
    
    def undersample(self):
        ''' Reduces the size of majority class to that of minority class '''
        # replace is set to False such that the same element cannot be selected more than once
        reducedData = self.majority.sample(replace = False, n=len(self.minority), random_state=42)
        
        # Concatanate the two classes back together
        return pd.concat([reducedData,self.minority])
    
    def oversample(self):
        ''' Increases the size of the minority class to that of the majority class '''
        # replace is set to True as elements must be selected multiple times
        increasedData = self.minority.sample(replace = True, n=len(self.majority), random_state=42)
        
        # Concatanate the two classes back together
        return pd.concat([increasedData,self.majority])
    
class SyntheticSampling(Resample):
    
    def __init__(self):
        super().__init__()
    
    def smote(self):
        ''' Synthetic Minority Oversampling Technique '''
        
        #TODO: Ensure ages don't get rediculous
        #TODO: correct 
        
        model = NearestNeighbors(n_neighbors=3)
        model = model.fit(self.minority.drop('DEATH_EVENT',axis=1))

        # Establish synthetic dataset
        synthDataset = []

        for i in range(len(self.majority)-len(self.minority)):
            # Select random sample
            sample = (self.minority.sample().drop('DEATH_EVENT',axis=1))
            # Get k-nearest neighbor 
            dist, index = model.kneighbors(sample)
            # Select random index
            randN = index[0][np.random.randint(0,2)]
            # Produce synthetic sample by calculating difference between sample and neighbour
            synthSample = self.minority.drop('DEATH_EVENT',axis=1) + self.df.drop('DEATH_EVENT',axis=1).iloc[randN] - self.minority.drop('DEATH_EVENT',axis=1)*np.random.random()# Everything is converted to float
            # Append to dataset
            synthDataset.append(synthSample.iloc[[np.random.randint(0,len(synthSample))]])
        # Concatanate all the dataframes into a one      
        newList = pd.concat(synthDataset)
        return pd.concat([self.minority,newList]).fillna(1)
        
    def adasyn(self):
        ''' Adaptive Synthetic Sampling '''
        #TODO: This requires Feature selection, hence, come back to it!
        pass

s = SyntheticSampling()
s.smote()


Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.000000,0.000000,582.000000,0.000000,20.000000,1.000000,265000.000000,1.900000,130.000000,1.000000,0.000000,4.000000,1.0
1,55.000000,0.000000,7861.000000,0.000000,38.000000,0.000000,263358.030000,1.100000,136.000000,1.000000,0.000000,6.000000,1.0
2,65.000000,0.000000,146.000000,0.000000,20.000000,0.000000,162000.000000,1.300000,129.000000,1.000000,1.000000,7.000000,1.0
3,50.000000,1.000000,111.000000,0.000000,20.000000,0.000000,210000.000000,1.900000,137.000000,1.000000,0.000000,7.000000,1.0
4,65.000000,1.000000,160.000000,1.000000,20.000000,0.000000,327000.000000,2.700000,116.000000,0.000000,0.000000,8.000000,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
27,111.804521,1.000000,149.573593,1.668636,90.088620,1.668636,493892.626071,1.669227,230.934497,1.668636,0.668636,96.384536,1.0
163,76.139000,1.322780,829.368500,1.322780,36.297300,0.000000,220208.499355,2.790502,177.834759,0.000000,0.000000,117.670279,1.0
217,103.175530,0.799547,2997.406507,1.000000,85.968280,0.799547,425731.575033,9.495922,246.537919,1.000000,0.000000,186.711183,1.0
39,135.638738,0.000000,365.918392,1.927312,73.237868,0.000000,454085.748582,4.681937,275.678347,1.000000,1.000000,50.819369,1.0
