In [1]:
import numpy as np
import pandas as pd
import os
import math

from pycausal.pycausal import pycausal as pc

import random
import re
import copy

from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import cross_val_score



  from numpy.core.umath_tests import inner1d


In [2]:
foldername = "data"
filename = "german_credit_raw.csv"

sample_number = 5000
protected_attribute = 'personal_status'

In [3]:
data = pd.read_csv(foldername + '/' + filename)

print(data.dtypes)
# drop rows with any NA values


checking_status           object
duration                   int64
credit_history            object
purpose                   object
credit_amount              int64
savings_status            object
employment                object
installment_commitment     int64
personal_status           object
other_parties             object
residence_since            int64
property_magnitude        object
age                        int64
other_payment_plans       object
housing                   object
existing_credits           int64
job                       object
num_dependents             int64
own_telephone             object
foreign_worker            object
class                     object
dtype: object


In [4]:
# Data generator
def find_range_cols(data):
        range_col = [] # columns that fit numerical values into ranges\
        for col in data.select_dtypes(exclude=['int64']).columns:
            if any(item in data[col][0] for item in ['>', '<']): 
                range_col.append(col)
        #print(range_col)
        return range_col
    
class DataGene(object):
    
    def __init__(self, data, sample_num=10, class_col='class'):
        self.data = data
        self.sample_num = sample_num
        self.class_col = 'class'
        
    def get_samples(self):
        features = self.data#.drop([self.class_col], axis=1)
        range_cols = find_range_cols(self.data)
        cat_cols = features.select_dtypes(exclude=['int64']).drop(range_cols, axis=1).columns
        num_cols = features.select_dtypes(include=['int64']).columns
        print(num_cols)
        output = []
        for var in num_cols:
            output.append(var)
        #print(type(output))
        samples = []
        
        for i in range(self.sample_num):
            sample_cat = [
                random.choice( list(set(features[cat_name])) )
                for cat_name in cat_cols
            ]
            sample_num = [
                random.choice( list(set(features[num_name])) )
                for num_name in num_cols
            ]
            sample_range = [
                random.choice( list(set(features[range_name])) )
                for range_name in range_cols
            ]
            sample = sample_cat + sample_num + sample_range
            samples.append(sample)
            
        
        samples = pd.DataFrame(samples, columns=list(cat_cols)+list(num_cols)+range_cols)
        return samples              
    

In [5]:
# generate new samples
dataGene = DataGene(data, sample_num=sample_number)
new_samples = dataGene.get_samples()
new_samples
# encode sample data
#x_samples, y_samples = encoder.transform(new_samples)


Index(['duration', 'credit_amount', 'installment_commitment',
       'residence_since', 'age', 'existing_credits', 'num_dependents'],
      dtype='object')


Unnamed: 0,credit_history,purpose,savings_status,personal_status,other_parties,property_magnitude,other_payment_plans,housing,job,own_telephone,...,class,duration,credit_amount,installment_commitment,residence_since,age,existing_credits,num_dependents,checking_status,employment
0,'delayed previously',education,100<=X<500,'male single',none,car,bank,rent,unemp/unskilled non res,none,...,bad,33,1980,1,1,45,2,2,no checking,'1<=X<4'
1,delayed previously,used car,'100<=X<500',male single,none,no known property,bank,'for free','high qualif/self emp/mgmt',yes,...,good,26,3380,1,2,55,3,1,>=200,'<1'
2,existing paid,domestic appliance,'>=1000',male mar/wid,none,'no known property',bank,own,'high qualif/self emp/mgmt',yes,...,good,11,2483,3,1,57,4,1,'>=200','4<=X<7'
3,'critical/other existing credit',retraining,no known savings,'male mar/wid',none,car,none,own,'unskilled resident',none,...,good,21,1301,1,2,70,3,2,'0<=X<200',unemployed
4,'all paid',new car,'500<=X<1000','male div/sep',guarantor,'life insurance',none,rent,high qualif/self emp/mgmt,none,...,good,21,2969,1,4,53,2,1,0<=X<200,'1<=X<4'
5,'delayed previously',repairs,'500<=X<1000','male single','co applicant','no known property',none,own,'unskilled resident',none,...,bad,28,3878,4,2,42,1,1,'>=200','<1'
6,'critical/other existing credit',retraining,no known savings,'male single',co applicant,'life insurance',none,own,'high qualif/self emp/mgmt',none,...,good,54,454,4,3,56,2,2,no checking,4<=X<7
7,'no credits/all paid',retraining,no known savings,female div/dep/mar,co applicant,real estate,none,rent,'high qualif/self emp/mgmt',none,...,good,4,3357,1,4,58,3,1,'0<=X<200','1<=X<4'
8,'delayed previously',other,500<=X<1000,male mar/wid,co applicant,car,bank,'for free',high qualif/self emp/mgmt,yes,...,bad,33,3275,1,3,28,1,1,'<0',>=7
9,'existing paid','new car','>=1000',male mar/wid,'co applicant',real estate,none,'for free',high qualif/self emp/mgmt,yes,...,bad,48,1893,2,3,38,4,2,>=200,'1<=X<4'


In [6]:
import pandas as pd
import os
from collections import OrderedDict
import math





def findRange(thresholds, v):
    for i, th in enumerate(thresholds):
        if(v <= th):
            if i==0:
                return "x<{}".format(th)
            elif i == len(thresholds)-1:
                return "x>{}".format(thresholds[i-1])
            else:
                return "{}<x<{}".format(thresholds[i-1], thresholds[i])

def convert_cate(arr):
    n = 4 #parts to be divided
    maxValue = max(arr)
    minValue = min(arr)
    thresholds = [ math.floor(i*(maxValue-minValue)/n)+minValue for i in range(n+1)]

    #print([findRange(thresholds, i) for i in arr])
    
    return pd.Series([findRange(thresholds, i) for i in arr])


def num2cate(dataIn):
    dfc = dataIn[:]
#     new_data = pd.DataFrame()
    for k in dfc.columns:
        if(k in dfc.select_dtypes(include=['int64'])):
            values = pd.to_numeric(dfc[k])
            dfc[k] = convert_cate(values.tolist())
        
    return dfc

# num to cate
new_samples = num2cate(new_samples)
new_samples

Unnamed: 0,credit_history,purpose,savings_status,personal_status,other_parties,property_magnitude,other_payment_plans,housing,job,own_telephone,...,class,duration,credit_amount,installment_commitment,residence_since,age,existing_credits,num_dependents,checking_status,employment
0,'delayed previously',education,100<=X<500,'male single',none,car,bank,rent,unemp/unskilled non res,none,...,bad,21<x<38,250<x<4793,x<1,x<1,33<x<47,1<x<2,x>1,no checking,'1<=X<4'
1,delayed previously,used car,'100<=X<500',male single,none,no known property,bank,'for free','high qualif/self emp/mgmt',yes,...,good,21<x<38,250<x<4793,x<1,1<x<2,47<x<61,2<x<3,x<1,>=200,'<1'
2,existing paid,domestic appliance,'>=1000',male mar/wid,none,'no known property',bank,own,'high qualif/self emp/mgmt',yes,...,good,4<x<21,250<x<4793,2<x<3,x<1,47<x<61,x>3,x<1,'>=200','4<=X<7'
3,'critical/other existing credit',retraining,no known savings,'male mar/wid',none,car,none,own,'unskilled resident',none,...,good,4<x<21,250<x<4793,x<1,1<x<2,x>61,2<x<3,x>1,'0<=X<200',unemployed
4,'all paid',new car,'500<=X<1000','male div/sep',guarantor,'life insurance',none,rent,high qualif/self emp/mgmt,none,...,good,4<x<21,250<x<4793,x<1,x>3,47<x<61,1<x<2,x<1,0<=X<200,'1<=X<4'
5,'delayed previously',repairs,'500<=X<1000','male single','co applicant','no known property',none,own,'unskilled resident',none,...,bad,21<x<38,250<x<4793,x>3,1<x<2,33<x<47,x<1,x<1,'>=200','<1'
6,'critical/other existing credit',retraining,no known savings,'male single',co applicant,'life insurance',none,own,'high qualif/self emp/mgmt',none,...,good,38<x<55,250<x<4793,x>3,2<x<3,47<x<61,1<x<2,x>1,no checking,4<=X<7
7,'no credits/all paid',retraining,no known savings,female div/dep/mar,co applicant,real estate,none,rent,'high qualif/self emp/mgmt',none,...,good,x<4,250<x<4793,x<1,x>3,47<x<61,2<x<3,x<1,'0<=X<200','1<=X<4'
8,'delayed previously',other,500<=X<1000,male mar/wid,co applicant,car,bank,'for free',high qualif/self emp/mgmt,yes,...,bad,21<x<38,250<x<4793,x<1,2<x<3,19<x<33,x<1,x<1,'<0',>=7
9,'existing paid','new car','>=1000',male mar/wid,'co applicant',real estate,none,'for free',high qualif/self emp/mgmt,yes,...,bad,38<x<55,250<x<4793,1<x<2,2<x<3,33<x<47,x>3,x>1,>=200,'1<=X<4'


In [7]:

from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

class DataEncoder(object):


    def __init__(self, class_column='class', cat_columns=None):
        self.class_column = class_column
        self.cat_columns = cat_columns

        # these will be trained with fit_encoders()
        self.column_encoders = {} # label encoder
        self.cat_encoder = None # one-hot encoder
        self.label_encoder = None # label encoder

    def fit(self, data):
        """
        Fit one-hot encoders for categorical features and an integer encoder for
        the label. These can be used later to transform raw data into a form
        that ATM can work with.

        data: pd.DataFrame of unprocessed data
        """
        if self.class_column not in data.columns:
            raise KeyError('Class column "%s" not found in dataset!' %
                           self.class_column)
            
        range_col = find_range_cols(data)
                
        self.range_col = range_col
            

        # encode categorical columns, leave ordinal values alone
        if self.cat_columns is None:
            cats = data.drop([self.class_column]+range_col, axis=1).select_dtypes(exclude=['int64'])
            self.cat_columns = cats.columns
        else:
            cats = data[self.cat_columns].drop(range_col, axis=1).select_dtypes(exclude=['int64'])
            
        self.cat_cols = cats.columns
        
        for cat_name in cats.columns:   
        # save the indices of categorical columns for one-hot encoding

            # encode each feature as an integer in range(unique_vals)
            le = LabelEncoder()
            cats[cat_name] = le.fit_transform(cats[cat_name])
            self.column_encoders[cat_name] = le

        # One-hot encode the whole feature matrix.
        # Set sparse to False so that we can test for NaNs in the output
        self.cat_encoder = OneHotEncoder(sparse=False)
        self.cat_encoder.fit(cats)

        # Train an encoder for the label as well
        labels = np.array(data[[self.class_column]])
        self.label_encoder = LabelEncoder()
        self.label_encoder.fit(labels)
        

    def transform(self, data):
        """
        Convert a DataFrame of labeled data to a feature matrix in the form
        that ATM can use.
        """
        y = self.transform_y(data)
        X = self.transform_x(data)

        return X, y
    
    def transform_x(self, data):
        """
        only transform x, for the generated data
        """
        cats = data[self.cat_columns]

        # encode each categorical feature as an integer
        for column, encoder in list(self.column_encoders.items()):
            cats[column] = encoder.transform(cats[column])

        # one-hot encode the categorical features
        X = self.cat_encoder.transform(cats)
        
        if self.class_column in data:
            nums = data.drop([self.class_column], axis=1).select_dtypes(include=['int64']).values
        else:
            nums = data.select_dtypes(include=['int64']).values
        # transform range cols into integrate. e.g., <4 -> 1; 4<x<7 -> 2
        ranges = []
        for col in self.range_col:
            values = data[col]
            ranges.append( self.range2int(values) )
        if(ranges==[]):
            X = np.concatenate((X, nums), axis=1)
        else:
            ranges = np.transpose( np.array(ranges) )
            X = np.concatenate((X, nums, ranges), axis=1)
        
        return X
    
    def transform_y(self, data):
        if self.class_column in data:
            # pull labels into a separate series and transform them to integers
            labels = np.array(data[[self.class_column]])
            y = self.label_encoder.transform(labels)
            # drop the label column and transform the remaining features
        else:
            y = None
            
        return y
    
    def range2int(self, values):
        ranges = []
        for v in values:
            if v not in ranges:
                ranges.append(v)
        
        def sort_key(x):
            num_strings = re.findall('\d+', x)
            # 'undefined' is in the front
            if len(num_strings)==0:
                return -1
            # x> 1, x<7
            elif len(num_strings)==1:
                return int(num_strings[0])*2
            # 1<x<7
            else:
                nums = map(int, num_strings) # string to number
                return sum(nums)
                
        ranges.sort(key=sort_key)
        return list(map(lambda x: ranges.index(x), values))
        
        
    
    def fit_transform(self, data):
        """ Process data into a form that ATM can use. """
        self.fit(data)
        return self.transform(data)
    

knn = KNeighborsClassifier(
                algorithm = "ball_tree",
                leaf_size = 40,
                metric = "manhattan",
                n_neighbors = 17
            )
    
encoder = DataEncoder()
encoder.fit(new_samples)
x_train, y_train = encoder.transform(new_samples)
knn.fit(x_train,y_train)
score = cross_val_score(knn, x_train, y_train, scoring='accuracy', cv=10) 
data

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,...,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,class
0,<0,6,critical/other existing credit,radio/tv,1169,no known savings,>=7,4,male single,none,...,real estate,67,none,own,2,skilled,1,yes,yes,good
1,0<=X<200,48,existing paid,radio/tv,5951,<100,1<=X<4,2,female div/dep/mar,none,...,real estate,22,none,own,1,skilled,1,none,yes,bad
2,no checking,12,critical/other existing credit,education,2096,<100,4<=X<7,2,male single,none,...,real estate,49,none,own,1,unskilled resident,2,none,yes,good
3,<0,42,existing paid,furniture/equipment,7882,<100,4<=X<7,2,male single,guarantor,...,life insurance,45,none,for free,1,skilled,2,none,yes,good
4,<0,24,delayed previously,new car,4870,<100,1<=X<4,3,male single,none,...,no known property,53,none,for free,2,skilled,2,none,yes,bad
5,no checking,36,existing paid,education,9055,no known savings,1<=X<4,2,male single,none,...,no known property,35,none,for free,1,unskilled resident,2,yes,yes,good
6,no checking,24,existing paid,furniture/equipment,2835,500<=X<1000,>=7,3,male single,none,...,life insurance,53,none,own,1,skilled,1,none,yes,good
7,0<=X<200,36,existing paid,used car,6948,<100,1<=X<4,2,male single,none,...,car,35,none,rent,1,high qualif/self emp/mgmt,1,yes,yes,good
8,no checking,12,existing paid,radio/tv,3059,>=1000,4<=X<7,2,male div/sep,none,...,real estate,61,none,own,1,unskilled resident,1,none,yes,good
9,0<=X<200,30,critical/other existing credit,new car,5234,<100,unemployed,4,male mar/wid,none,...,car,28,none,own,2,high qualif/self emp/mgmt,1,none,yes,bad


In [8]:
def generate_model_samples(data, sample_num, model, encoder):
    """
    models behavior on generated sample data:
    Args:
        data(pandas DataFrame): training data
        sample_num(int): 
        model(sklearn model object): already trained model
        encoder(instance of DataGene): instance of DataGene, already fit 
    Return:
        model_samples(pandas DataFrame): generated samples + model prected labels
    """
    dataGene = DataGene(data, sample_num)
    new_samples = dataGene.get_samples()
    samplesInit = new_samples
    print(samplesInit)
    samples = num2cate(samplesInit)
    print(samplesInit)
    # model predict
    #print(samples)
    x_samples, _ = encoder.transform(samples)
    #print(x_samples.size)
    y_samples = knn.predict(x_samples)
    
    #  concate 
    model_samples = samples.copy()
    model_samples['class'] = pd.Series(np.asarray(y_samples), index= samples.index) 

    return model_samples, samplesInit

x,y = generate_model_samples(data,sample_number ,knn, encoder)
y

Index(['duration', 'credit_amount', 'installment_commitment',
       'residence_since', 'age', 'existing_credits', 'num_dependents'],
      dtype='object')
                        credit_history               purpose  \
0                  no credits/all paid               new car   
1                        existing paid              radio/tv   
2                   delayed previously                 other   
3                      'existing paid'             education   
4                'no credits/all paid'               new car   
5                 'delayed previously'            retraining   
6                   delayed previously                 other   
7                 'delayed previously'               repairs   
8                             all paid  'domestic appliance'   
9                 'delayed previously'            retraining   
10                          'all paid'              business   
11                 no credits/all paid              business   
12          

  y = column_or_1d(y, warn=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,credit_history,purpose,savings_status,personal_status,other_parties,property_magnitude,other_payment_plans,housing,job,own_telephone,...,class,duration,credit_amount,installment_commitment,residence_since,age,existing_credits,num_dependents,checking_status,employment
0,no credits/all paid,new car,'<100','male div/sep',guarantor,life insurance,bank,'for free',high qualif/self emp/mgmt,yes,...,good,54,1471,1,4,74,2,2,no checking,'<1'
1,existing paid,radio/tv,>=1000,male single,'co applicant',car,none,'for free',skilled,none,...,bad,26,11590,2,2,49,1,2,'>=200','>=7'
2,delayed previously,other,<100,'male div/sep',guarantor,'life insurance',none,'for free',unskilled resident,yes,...,bad,26,5234,4,4,70,3,2,'<0',<1
3,'existing paid',education,'>=1000',male single,co applicant,no known property,none,for free,skilled,none,...,good,36,2746,3,3,20,1,2,no checking,'4<=X<7'
4,'no credits/all paid',new car,'100<=X<500','male div/sep','co applicant',no known property,stores,for free,unskilled resident,yes,...,good,30,1237,3,2,46,4,1,'0<=X<200',<1
5,'delayed previously',retraining,100<=X<500,female div/dep/mar,none,'life insurance',stores,'for free',unskilled resident,none,...,good,5,3108,3,4,54,3,2,<0,>=7
6,delayed previously,other,no known savings,'male div/sep',none,life insurance,stores,'for free',unskilled resident,yes,...,bad,30,3416,1,4,64,1,2,>=200,'1<=X<4'
7,'delayed previously',repairs,'<100','male single',co applicant,'life insurance',bank,own,high qualif/self emp/mgmt,yes,...,bad,27,3650,1,4,41,3,2,0<=X<200,'>=7'
8,all paid,'domestic appliance','>=1000','male single',co applicant,car,none,'for free',high qualif/self emp/mgmt,none,...,good,28,1980,3,3,41,4,2,'0<=X<200','<1'
9,'delayed previously',retraining,>=1000,'male mar/wid',guarantor,'life insurance',none,rent,'high qualif/self emp/mgmt',none,...,good,21,1288,3,1,29,2,2,no checking,<1
