In [1]:
import numpy as np
import pandas as pd
import os
import math

from pycausal.pycausal import pycausal as pc

import random
import re
import copy

from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import cross_val_score



In [2]:
foldername = "data"
filename = "credit_clean.csv"

sample_number = 5000
protected_attribute = 'sex'

In [3]:
data = pd.read_csv(foldername + '/' + filename)

print(data.dtypes)
# drop rows with any NA values


marriage                  object
gender                    object
checking_status           object
duration                  object
credit_history            object
purpose                   object
credit_amount             object
savings_status            object
employment                object
installment_commitment     int64
other_parties             object
residence_since            int64
property_magnitude        object
age                       object
other_payment_plans       object
housing                   object
existing_credits           int64
job                       object
num_dependents             int64
own_telephone             object
foreign_worker            object
class                     object
dtype: object


In [4]:
# Data generator
def find_range_cols(data):
        range_col = [] # columns that fit numerical values into ranges\
        for col in data.select_dtypes(exclude=['int64']).columns:
            if any(item in data[col][0] for item in ['>', '<']): 
                range_col.append(col)
        #print(range_col)
        return range_col
    
class DataGene(object):
    
    def __init__(self, data, sample_num=10, class_col='class'):
        self.data = data
        self.sample_num = sample_num
        self.class_col = 'class'
        
    def get_samples(self):
        features = self.data#.drop([self.class_col], axis=1)
        range_cols = find_range_cols(self.data)
        cat_cols = features.select_dtypes(exclude=['int64']).drop(range_cols, axis=1).columns
        num_cols = features.select_dtypes(include=['int64']).columns
        print(num_cols)
        output = []
        for var in num_cols:
            output.append(var)
        #print(type(output))
        samples = []
        
        for i in range(self.sample_num):
            sample_cat = [
                random.choice( list(set(features[cat_name])) )
                for cat_name in cat_cols
            ]
            sample_num = [
                random.choice( list(set(features[num_name])) )
                for num_name in num_cols
            ]
            sample_range = [
                random.choice( list(set(features[range_name])) )
                for range_name in range_cols
            ]
            sample = sample_cat + sample_num + sample_range
            samples.append(sample)
            
        
        samples = pd.DataFrame(samples, columns=list(cat_cols)+list(num_cols)+range_cols)
        return samples              
    

In [5]:
# generate new samples
dataGene = DataGene(data, sample_num=sample_number)
new_samples = dataGene.get_samples()
new_samples
# encode sample data
#x_samples, y_samples = encoder.transform(new_samples)


Index(['installment_commitment', 'residence_since', 'existing_credits',
       'num_dependents'],
      dtype='object')


Unnamed: 0,marriage,gender,credit_history,purpose,savings_status,other_parties,property_magnitude,other_payment_plans,housing,job,...,class,installment_commitment,residence_since,existing_credits,num_dependents,checking_status,duration,credit_amount,employment,age
0,mar/wid,male,all paid,new car,>=1000,guarantor,no known property,stores,for free,skilled,...,good,1,1,1,1,>=200,x<4,9337<x<13880,>=7,47<x<61
1,mar/wid,male,all paid,used car,100<=X<500,none,life insurance,bank,for free,high qualif/self emp/mgmt,...,good,1,1,2,1,no checking,21<x<38,4793<x<9337,<1,47<x<61
2,mar/wid,female,critical/other existing credit,radio/tv,500<=X<1000,guarantor,real estate,stores,for free,unskilled resident,...,good,2,1,3,2,>=200,38<x<55,x<250,>=7,47<x<61
3,div/sep,male,existing paid,used car,500<=X<1000,co applicant,car,bank,own,skilled,...,good,4,4,3,1,>=200,x>55,x<250,<1,x>61
4,div/dep/mar,female,critical/other existing credit,used car,500<=X<1000,co applicant,real estate,stores,for free,unskilled resident,...,bad,2,1,2,1,0<=X<200,4<x<21,9337<x<13880,1<=X<4,x<19
5,single,male,critical/other existing credit,domestic appliance,>=1000,co applicant,life insurance,stores,own,unskilled resident,...,good,2,2,4,1,>=200,x>55,9337<x<13880,>=7,33<x<47
6,div/dep/mar,female,critical/other existing credit,retraining,<100,co applicant,no known property,stores,own,high qualif/self emp/mgmt,...,bad,1,2,1,2,>=200,4<x<21,4793<x<9337,4<=X<7,x>61
7,div/dep/mar,female,existing paid,other,no known savings,co applicant,car,none,for free,skilled,...,good,2,4,3,2,no checking,x>55,x>13880,1<=X<4,x>61
8,single,male,critical/other existing credit,used car,>=1000,none,no known property,bank,for free,unskilled resident,...,bad,4,2,2,1,no checking,38<x<55,x<250,1<=X<4,x>61
9,div/sep,female,critical/other existing credit,new car,100<=X<500,none,life insurance,bank,own,unemp/unskilled non res,...,bad,1,1,4,2,0<=X<200,21<x<38,250<x<4793,unemployed,47<x<61


In [6]:
import pandas as pd
import os
from collections import OrderedDict
import math





def findRange(thresholds, v):
    for i, th in enumerate(thresholds):
        if(v <= th):
            if i==0:
                return "x<{}".format(th)
            elif i == len(thresholds)-1:
                return "x>{}".format(thresholds[i-1])
            else:
                return "{}<x<{}".format(thresholds[i-1], thresholds[i])

def convert_cate(arr):
    n = 4 #parts to be divided
    maxValue = max(arr)
    minValue = min(arr)
    thresholds = [ math.floor(i*(maxValue-minValue)/n)+minValue for i in range(n+1)]

    #print([findRange(thresholds, i) for i in arr])
    
    return pd.Series([findRange(thresholds, i) for i in arr])


def num2cate(dataIn):
    dfc = dataIn[:]
#     new_data = pd.DataFrame()
    for k in dfc.columns:
        if(k in dfc.select_dtypes(include=['int64'])):
            values = pd.to_numeric(dfc[k])
            dfc[k] = convert_cate(values.tolist())
        
    return dfc

# num to cate
new_samples = num2cate(new_samples)
new_samples

Unnamed: 0,marriage,gender,credit_history,purpose,savings_status,other_parties,property_magnitude,other_payment_plans,housing,job,...,class,installment_commitment,residence_since,existing_credits,num_dependents,checking_status,duration,credit_amount,employment,age
0,mar/wid,male,all paid,new car,>=1000,guarantor,no known property,stores,for free,skilled,...,good,x<1,x<1,x<1,x<1,>=200,x<4,9337<x<13880,>=7,47<x<61
1,mar/wid,male,all paid,used car,100<=X<500,none,life insurance,bank,for free,high qualif/self emp/mgmt,...,good,x<1,x<1,1<x<2,x<1,no checking,21<x<38,4793<x<9337,<1,47<x<61
2,mar/wid,female,critical/other existing credit,radio/tv,500<=X<1000,guarantor,real estate,stores,for free,unskilled resident,...,good,1<x<2,x<1,2<x<3,x>1,>=200,38<x<55,x<250,>=7,47<x<61
3,div/sep,male,existing paid,used car,500<=X<1000,co applicant,car,bank,own,skilled,...,good,x>3,x>3,2<x<3,x<1,>=200,x>55,x<250,<1,x>61
4,div/dep/mar,female,critical/other existing credit,used car,500<=X<1000,co applicant,real estate,stores,for free,unskilled resident,...,bad,1<x<2,x<1,1<x<2,x<1,0<=X<200,4<x<21,9337<x<13880,1<=X<4,x<19
5,single,male,critical/other existing credit,domestic appliance,>=1000,co applicant,life insurance,stores,own,unskilled resident,...,good,1<x<2,1<x<2,x>3,x<1,>=200,x>55,9337<x<13880,>=7,33<x<47
6,div/dep/mar,female,critical/other existing credit,retraining,<100,co applicant,no known property,stores,own,high qualif/self emp/mgmt,...,bad,x<1,1<x<2,x<1,x>1,>=200,4<x<21,4793<x<9337,4<=X<7,x>61
7,div/dep/mar,female,existing paid,other,no known savings,co applicant,car,none,for free,skilled,...,good,1<x<2,x>3,2<x<3,x>1,no checking,x>55,x>13880,1<=X<4,x>61
8,single,male,critical/other existing credit,used car,>=1000,none,no known property,bank,for free,unskilled resident,...,bad,x>3,1<x<2,1<x<2,x<1,no checking,38<x<55,x<250,1<=X<4,x>61
9,div/sep,female,critical/other existing credit,new car,100<=X<500,none,life insurance,bank,own,unemp/unskilled non res,...,bad,x<1,x<1,x>3,x>1,0<=X<200,21<x<38,250<x<4793,unemployed,47<x<61


In [7]:

from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

class DataEncoder(object):


    def __init__(self, class_column='class', cat_columns=None):
        self.class_column = class_column
        self.cat_columns = cat_columns

        # these will be trained with fit_encoders()
        self.column_encoders = {} # label encoder
        self.cat_encoder = None # one-hot encoder
        self.label_encoder = None # label encoder

    def fit(self, data):
        """
        Fit one-hot encoders for categorical features and an integer encoder for
        the label. These can be used later to transform raw data into a form
        that ATM can work with.

        data: pd.DataFrame of unprocessed data
        """
        if self.class_column not in data.columns:
            raise KeyError('Class column "%s" not found in dataset!' %
                           self.class_column)
            
        range_col = find_range_cols(data)
                
        self.range_col = range_col
            

        # encode categorical columns, leave ordinal values alone
        if self.cat_columns is None:
            cats = data.drop([self.class_column]+range_col, axis=1).select_dtypes(exclude=['int64'])
            self.cat_columns = cats.columns
        else:
            cats = data[self.cat_columns].drop(range_col, axis=1).select_dtypes(exclude=['int64'])
            
        self.cat_cols = cats.columns
        
        for cat_name in cats.columns:   
        # save the indices of categorical columns for one-hot encoding

            # encode each feature as an integer in range(unique_vals)
            le = LabelEncoder()
            cats[cat_name] = le.fit_transform(cats[cat_name])
            self.column_encoders[cat_name] = le

        # One-hot encode the whole feature matrix.
        # Set sparse to False so that we can test for NaNs in the output
        self.cat_encoder = OneHotEncoder(sparse=False)
        self.cat_encoder.fit(cats)

        # Train an encoder for the label as well
        labels = np.array(data[[self.class_column]])
        self.label_encoder = LabelEncoder()
        self.label_encoder.fit(labels)
        

    def transform(self, data):
        """
        Convert a DataFrame of labeled data to a feature matrix in the form
        that ATM can use.
        """
        y = self.transform_y(data)
        X = self.transform_x(data)

        return X, y
    
    def transform_x(self, data):
        """
        only transform x, for the generated data
        """
        cats = data[self.cat_columns]

        # encode each categorical feature as an integer
        for column, encoder in list(self.column_encoders.items()):
            cats[column] = encoder.transform(cats[column])

        # one-hot encode the categorical features
        X = self.cat_encoder.transform(cats)
        
        if self.class_column in data:
            nums = data.drop([self.class_column], axis=1).select_dtypes(include=['int64']).values
        else:
            nums = data.select_dtypes(include=['int64']).values
        # transform range cols into integrate. e.g., <4 -> 1; 4<x<7 -> 2
        ranges = []
        for col in self.range_col:
            values = data[col]
            ranges.append( self.range2int(values) )
        if(ranges==[]):
            X = np.concatenate((X, nums), axis=1)
        else:
            ranges = np.transpose( np.array(ranges) )
            X = np.concatenate((X, nums, ranges), axis=1)
        
        return X
    
    def transform_y(self, data):
        if self.class_column in data:
            # pull labels into a separate series and transform them to integers
            labels = np.array(data[[self.class_column]])
            y = self.label_encoder.transform(labels)
            # drop the label column and transform the remaining features
        else:
            y = None
            
        return y
    
    def range2int(self, values):
        ranges = []
        for v in values:
            if v not in ranges:
                ranges.append(v)
        
        def sort_key(x):
            num_strings = re.findall('\d+', x)
            # 'undefined' is in the front
            if len(num_strings)==0:
                return -1
            # x> 1, x<7
            elif len(num_strings)==1:
                return int(num_strings[0])*2
            # 1<x<7
            else:
                nums = map(int, num_strings) # string to number
                return sum(nums)
                
        ranges.sort(key=sort_key)
        return list(map(lambda x: ranges.index(x), values))
        
        
    
    def fit_transform(self, data):
        """ Process data into a form that ATM can use. """
        self.fit(data)
        return self.transform(data)
    

knn = KNeighborsClassifier(
                algorithm = "ball_tree",
                leaf_size = 40,
                metric = "manhattan",
                n_neighbors = 17
            )
    
encoder = DataEncoder()
encoder.fit(new_samples)
x_train, y_train = encoder.transform(new_samples)
knn.fit(x_train,y_train)
score = cross_val_score(knn, x_train, y_train, scoring='accuracy', cv=10) 
data

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,marriage,gender,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,...,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,class
0,single,male,<0,4<x<21,critical/other existing credit,radio/tv,250<x<4793,no known savings,>=7,4,...,real estate,x>61,none,own,2,skilled,1,yes,yes,good
1,div/dep/mar,female,0<=X<200,38<x<55,existing paid,radio/tv,4793<x<9337,<100,1<=X<4,2,...,real estate,19<x<33,none,own,1,skilled,1,none,yes,bad
2,single,male,no checking,4<x<21,critical/other existing credit,education,250<x<4793,<100,4<=X<7,2,...,real estate,47<x<61,none,own,1,unskilled resident,2,none,yes,good
3,single,male,<0,38<x<55,existing paid,furniture/equipment,4793<x<9337,<100,4<=X<7,2,...,life insurance,33<x<47,none,for free,1,skilled,2,none,yes,good
4,single,male,<0,21<x<38,delayed previously,new car,4793<x<9337,<100,1<=X<4,3,...,no known property,47<x<61,none,for free,2,skilled,2,none,yes,bad
5,single,male,no checking,21<x<38,existing paid,education,4793<x<9337,no known savings,1<=X<4,2,...,no known property,33<x<47,none,for free,1,unskilled resident,2,yes,yes,good
6,single,male,no checking,21<x<38,existing paid,furniture/equipment,250<x<4793,500<=X<1000,>=7,3,...,life insurance,47<x<61,none,own,1,skilled,1,none,yes,good
7,single,male,0<=X<200,21<x<38,existing paid,used car,4793<x<9337,<100,1<=X<4,2,...,car,33<x<47,none,rent,1,high qualif/self emp/mgmt,1,yes,yes,good
8,div/sep,male,no checking,4<x<21,existing paid,radio/tv,250<x<4793,>=1000,4<=X<7,2,...,real estate,47<x<61,none,own,1,unskilled resident,1,none,yes,good
9,mar/wid,male,0<=X<200,21<x<38,critical/other existing credit,new car,4793<x<9337,<100,unemployed,4,...,car,19<x<33,none,own,2,high qualif/self emp/mgmt,1,none,yes,bad


In [8]:
def generate_model_samples(data, sample_num, model, encoder):
    """
    models behavior on generated sample data:
    Args:
        data(pandas DataFrame): training data
        sample_num(int): 
        model(sklearn model object): already trained model
        encoder(instance of DataGene): instance of DataGene, already fit 
    Return:
        model_samples(pandas DataFrame): generated samples + model prected labels
    """
    dataGene = DataGene(data, sample_num)
    new_samples = dataGene.get_samples()
    samplesInit = new_samples
    print(samplesInit)
    samples = num2cate(samplesInit)
    print(samplesInit)
    # model predict
    #print(samples)
    x_samples, _ = encoder.transform(samples)
    #print(x_samples.size)
    y_samples = knn.predict(x_samples)
    
    #  concate 
    model_samples = samples.copy()
    model_samples['class'] = pd.Series(np.asarray(y_samples), index= samples.index) 

    return model_samples, samplesInit

x,y = generate_model_samples(data,sample_number ,knn, encoder)
y

Index(['installment_commitment', 'residence_since', 'existing_credits',
       'num_dependents'],
      dtype='object')
         marriage  gender                  credit_history  \
0         div/sep  female                        all paid   
1         div/sep  female              delayed previously   
2         div/sep    male             no credits/all paid   
3     div/dep/mar    male              delayed previously   
4          single    male                        all paid   
5          single  female  critical/other existing credit   
6          single    male                        all paid   
7         div/sep    male                   existing paid   
8         mar/wid    male                   existing paid   
9          single  female  critical/other existing credit   
10         single    male                   existing paid   
11    div/dep/mar    male                   existing paid   
12    div/dep/mar  female                   existing paid   
13        div/sep  female 

  y = column_or_1d(y, warn=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,marriage,gender,credit_history,purpose,savings_status,other_parties,property_magnitude,other_payment_plans,housing,job,...,class,installment_commitment,residence_since,existing_credits,num_dependents,checking_status,duration,credit_amount,employment,age
0,div/sep,female,all paid,repairs,>=1000,guarantor,car,none,own,unemp/unskilled non res,...,bad,4,2,3,2,no checking,x<4,x<250,1<=X<4,x<19
1,div/sep,female,delayed previously,radio/tv,100<=X<500,guarantor,no known property,none,for free,unemp/unskilled non res,...,bad,2,2,1,2,<0,38<x<55,x>13880,>=7,19<x<33
2,div/sep,male,no credits/all paid,domestic appliance,500<=X<1000,guarantor,no known property,none,for free,unemp/unskilled non res,...,good,1,1,4,1,<0,38<x<55,9337<x<13880,4<=X<7,33<x<47
3,div/dep/mar,male,delayed previously,used car,100<=X<500,none,life insurance,stores,rent,unskilled resident,...,good,3,1,2,1,<0,x>55,250<x<4793,<1,33<x<47
4,single,male,all paid,furniture/equipment,500<=X<1000,none,car,none,own,unskilled resident,...,good,2,1,2,1,0<=X<200,x<4,4793<x<9337,4<=X<7,x>61
5,single,female,critical/other existing credit,retraining,no known savings,guarantor,life insurance,bank,rent,unemp/unskilled non res,...,good,1,3,4,2,<0,21<x<38,250<x<4793,<1,33<x<47
6,single,male,all paid,furniture/equipment,>=1000,co applicant,real estate,stores,rent,unskilled resident,...,bad,1,3,2,2,<0,4<x<21,x<250,1<=X<4,33<x<47
7,div/sep,male,existing paid,radio/tv,100<=X<500,guarantor,no known property,none,rent,unskilled resident,...,good,3,3,1,2,no checking,4<x<21,x>13880,unemployed,33<x<47
8,mar/wid,male,existing paid,used car,no known savings,none,life insurance,none,rent,skilled,...,bad,1,4,1,1,<0,x<4,9337<x<13880,<1,19<x<33
9,single,female,critical/other existing credit,repairs,<100,co applicant,real estate,stores,own,skilled,...,bad,4,1,2,1,0<=X<200,38<x<55,x>13880,4<=X<7,x<19
