In [8]:
import numpy as np
import matplotlib.pyplot as plt

def load_from_csv(file):
    """file: filename
       return: 
           features: np.array[ncases, nfeatures]
           descriptions: list[nfeatures]
           correct_answers: np.array[ncases]
    """
    raw_data = np.genfromtxt(file, delimiter=',', names=True)
    raw_descriptions = list(raw_data.dtype.names)
    raw_data = np.array(list(map(list, raw_data)))
    
    features = raw_data[:, 1:-1]
    descriptions = raw_descriptions[1:-1]
    correct_answers = raw_data[:,-1]  
    return features, descriptions, correct_answers

def save_to_csv(descriptions, features, correct_answers, file):
    """descriptions: list[nfeatures]
       features: np.array[ncases, nfeatures]
       correct_answers: np.array[ncases]
       file: filename
    """
    f = open(file, 'wt')
    print(','.join(descriptions), file=f)
    for i in range(features.shape[0]):
        print("%i,%s,%f" % (i+1, ','.join(map(str, features[i])), correct_answers[i]), file=f)
    f.close()
    
def add_feature(features, descriptions, new_feature, new_description):
    """
        features: np.array[ncases, nfeatures]
        descriptions: list[nfeatures]
        new_feature: np.array[ncases]
        new_description: str
        return:
            new_features: np.array[ncases, nfeatures+1]
            new_descriptions: list[nfeatures+1]
            
    """
    new_features, new_descriptions = add_features(features, descriptions, new_feature[:,np.newaxis], [new_description])
    return new_features, new_descriptions

def add_features(features, descriptions, new_features, new_descriptions):
    """
        features: np.array[ncases, nfeatures]
        descriptions: list[nfeatures]
        new_features: np.array[ncases, nnewfeatures]
        new_descriptions: list[nnewfeatures]
        return:
            new_new_features: np.array[ncases, nfeatures+nnewfeatures]
            new_new_descriptions: list[nfeatures+nnewfeatures]
    """
    new_new_features = np.concatenate([features, new_features],1)
    new_new_descriptions = descriptions + new_descriptions
    return new_new_features, new_new_descriptions
    

def add_one_hot_encoded_feature(features, descriptions, feature, description):
    """
        features: np.array[ncases, nfeatures]
        descriptions: list[nfeatures]
        feature: np.array[ncases]
        description: str
        return:
            new_features: np.array[ncases, nfeatures+1]
            new_descriptions: list[nfeatures+1]
    """
    hot_features, hot_descriptions = one_hot_encoded_feature(feature, description)
    new_features = np.concatenate([features, hot_features], 1)
    new_descriptions = descriptions + hot_descriptions
    return new_features, new_descriptions

def one_hot_encoded_feature(feature, description):
    """
        feature: np.array[ncases]
        description: str
        return:
            new_features: np.array[ncases,n_one_hot_encoded_features]
            new_descriptions: list[n_one_hot_encoded_features]
    """
    unique_values = sorted(set(feature)) #сколько столбцов
    new_descriptions = []
    for i in range(len(unique_values)):
        new_descriptions.append(description + " = " + str(unique_values[i]))
    feature = feature[:,np.newaxis]
    unique_values = np.array(unique_values)
    unique_values = unique_values[np.newaxis,:]      
    return (feature == unique_values).astype(np.float), new_descriptions

def make_uniform_borders(feature, c_diapasones):
    """
    feature: np.array[ncases]
    c_diapasones: int
    return:
        uniform_borders: list[c_diapasones-1]
    """
    max_diap = np.max(feature)
    min_diap = np.min(feature)
    diap = max_diap - min_diap
    step = diap / c_diapasones##20
    uniform_borders=[]
    for i in range(1, c_diapasones):
        uniform_borders.append(min_diap + step * i)   
    return uniform_borders

def make_quantile_borders(feature, c_diapasones):
    """
    feature: np.array[ncases]
    c_diapasones: int
    return:
        quantile_borders: list[c_diapasones-1]
    """
    quantile_numbers = make_uniform_borders([0, 1], c_diapasones)
    feature = sorted(feature)
    quantile_borders = []
    for i in range(0, c_diapasones - 1):
        quantile_borders.append(get_quantile_element(feature, quantile_numbers[i]))
    return quantile_borders

def get_quantile_element(sorted_feature, quantile_number):
    """
    sorted_feature: sorted list[ncases]
    quantile_number: int
    return:
        quantile_element: int
    """
    return sorted_feature[int(quantile_number * len(sorted_feature))]
    
def quantificator(feature, description, borders):
    """
    feature: np.array[ncases]
    description: str
    borders: list[nborders]
    return:
        new_features: np.array[ncases,nborders+1]
        new_descriptions: list[nborders+1]
    """
    borders = sorted(borders)
    f1 = feature < borders[0]
    d1 = description + " < %f" %borders[0]
    
    fi = []
    di = []
    for i in range(0, len(borders)-1):
        fi.append(np.logical_and(feature >= borders[i], feature < borders[i+1]))
        di.append(("%f <="%borders[i]) + description + " < %f" %borders[i+1])
    flast = feature >= borders[-1]
    dlast = ("%f <="%borders[-1]) + description
    
    new_features = np.stack([f1] + fi + [flast], axis=1)
    new_descriptions = [d1] + di + [dlast]
    
    return new_features.astype(np.float32), new_descriptions

def add_qr_feature(features, descriptions, feature, description, c_diapasones):
    """
    features: np.array[ncases, nfeatures]
    descriptions: list[nfeatures]
    feature: np.array[ncases]
    description: str
    c_diapasones: int
    return:
        new_features: np.array[ncases,2*c_diapasones-1]
        new_descriptions: list[2*c_diapasones-1]
    
    """
    uniform_borders = []#make_uniform_borders(feature, c_diapasones)
    quantile_borders = make_quantile_borders(feature, c_diapasones)
    all_borders = uniform_borders + quantile_borders
    new_features, new_descriptions = quantificator(feature, description, all_borders)  
    new_features, new_descriptions = add_features(features, descriptions, new_features, new_descriptions)
    return new_features, new_descriptions
    

LIMIT_BAL = 0
SEX = 1
EDUCATION = 2
MARRIAGE = 3
AGE = 4
PAY_0 = 5 
PAY_2 = 6 
PAY_3 = 7
PAY_4 = 8
PAY_5 = 9
PAY_6 = 10
BILL_AMT1 = 11
BILL_AMT2 = 12
BILL_AMT3 = 13
BILL_AMT4 = 14
BILL_AMT5 = 15
BILL_AMT6 = 16
PAY_AMT1 = 17
PAY_AMT2 = 18
PAY_AMT3 = 19
PAY_AMT4 = 20
PAY_AMT5 = 21
PAY_AMT6 = 22

In [15]:
np.set_printoptions(300000)
features, descriptions, correct_answers = load_from_csv('ccard.csv')

# Feature 0.
nf = np.ones((features.shape[0], 1))
nd = ["1"]

# Feature 1.
f = features[:,LIMIT_BAL]
f = np.log(f)
nf, nd = add_feature(nf, nd, f, "LOG(%s)"%descriptions[LIMIT_BAL])

# Feature 2.
nf, nd = add_one_hot_encoded_feature(nf, nd, features[:,SEX], descriptions[SEX]) 

# Feature 3.
f = features[:, EDUCATION]
f[f>3] = 0
nf, nd = add_one_hot_encoded_feature(nf, nd, f, descriptions[EDUCATION])

# Feature 4.
f = features[:, MARRIAGE]
f[f==3] = 0
nf, nd = add_one_hot_encoded_feature(nf, nd, f, descriptions[MARRIAGE]) 

# Feature 5.
nf, nd = add_qr_feature(nf, nd, features[:, AGE], descriptions[AGE], 5)



# Feature 6.
for i in range(PAY_0, PAY_6+1):
    f = features[:, i]
    f[f>3]=3   
    nf, nd = add_one_hot_encoded_feature(nf, nd, f, descriptions[i]) 
    
# Feature 7.
for i in range(BILL_AMT1, BILL_AMT6+1):
    nf, nd = add_qr_feature(nf, nd, features[:, i], descriptions[i], 5)
    
# Feature 8.
for i in range(BILL_AMT1, BILL_AMT6+1):
    nf, nd = add_qr_feature(nf, nd, features[:, i], descriptions[i], 5)
   

# Debug printing.
print(nf.shape)
print(np.sum(nf==0,axis=0))
print(nd)
print("ok")



(30000, 80)
[    0     0 18112 11888 29532 19415 15970 25083 29623 16341 14036 24873
 24114 23096 24573 23344 27241 24314 15263 26312 27333 29537 26218 23950
 14270 29972 26073 29517 25915 24062 14236 29996 26181 29610 25652 24313
 13545 29998 26841 29651 25454 24461 13053 27374 29658 25105 24260 13714
 27234 29687 24000 24000 24000 24000 24000 24000 24000 24000 24000 24000
 24000 24000 24001 23999 24000 24001 24000 23999 24001 23999 24001 23999
 24000 24000 24000 24005 23995 24000 24000 24000]
['1', 'LOG(LIMIT_BAL)', 'SEX = 1.0', 'SEX = 2.0', 'EDUCATION = 0.0', 'EDUCATION = 1.0', 'EDUCATION = 2.0', 'EDUCATION = 3.0', 'MARRIAGE = 0.0', 'MARRIAGE = 1.0', 'MARRIAGE = 2.0', 'AGE < 27.000000', '27.000000 <=AGE < 31.000000', '31.000000 <=AGE < 37.000000', '37.000000 <=AGE < 43.000000', '43.000000 <=AGE', 'PAY_0 = -2.0', 'PAY_0 = -1.0', 'PAY_0 = 0.0', 'PAY_0 = 1.0', 'PAY_0 = 2.0', 'PAY_0 = 3.0', 'PAY_2 = -2.0', 'PAY_2 = -1.0', 'PAY_2 = 0.0', 'PAY_2 = 1.0', 'PAY_2 = 2.0', 'PAY_2 = 3.0', 'PAY_