In [25]:
import numpy as np
import matplotlib.pyplot as plt

def load_from_csv(file):
    """file: filename
       return: 
           features: np.array[ncases, nfeatures]
           descriptions: list[nfeatures]
           correct_answers: np.array[ncases]
    """
    raw_data = np.genfromtxt(file, delimiter=',', names=True)
    descriptions = list(raw_data.dtype.names)
    raw_data = np.array(list(map(list, raw_data)))

    features = raw_data[:, 1:-1]
    correct_answers = raw_data[:,-1]  
    return features, descriptions, correct_answers

def save_to_csv(descriptions, features, correct_answers, file):
    """descriptions: list[nfeatures]
       features: np.array[ncases, nfeatures]
       correct_answers: np.array[ncases]
       file: filename
    """
    f = open(file, 'wt')
    print(','.join(descriptions), file=f)
    for i in range(features.shape[0]):
        print("%i,%s,%f" % (i+1, ','.join(map(str, features[i])), correct_answers[i]), file=f)
    f.close()
    
def add_feature(features, descriptions, feature, description):
    """
        features: np.array[ncases, nfeatures]
        descriptions: list[nfeatures]
        feature: np.array[ncases]
        description: str
        return:
            new_features: np.array[ncases, nfeatures+1]
            new_descriptions: list[nfeatures+1]
            
    """
    new_features = features.append(feature, 1)
    new_descriptions = descriptions + [description]
    return new_features, new_descriptions

def add_one_hot_encoded_feature(features, descriptions, feature, description):
    """
        features: np.array[ncases, nfeatures]
        descriptions: list[nfeatures]
        feature: np.array[ncases]
        description: str
        return:
            new_features: np.array[ncases, nfeatures+1]
            new_descriptions: list[nfeatures+1]
    """
    hot_features, hot_descriptions = one_hot_encoded_feature(feature, description)
    print("features ", features.shape)
    print("hot features", hot_features.shape)
    new_features = np.concatenate([features, hot_features], 1)
    new_descriptions = descriptions + hot_descriptions
    return new_features, new_descriptions

def one_hot_encoded_feature(feature, description):
    """
        feature: np.array[ncases]
        description: str
        return:
            new_features: np.array[ncases,n_one_hot_encoded_features]
            new_descriptions: list[n_one_hot_encoded_features]
    """
    unique_values = sorted(set(feature)) #сколько столбцов
    new_descriptions = []
    for i in range(len(unique_values)):
        new_descriptions.append(description + " = " + str(unique_values[i]))
    feature = feature[:,np.newaxis]
    unique_values = np.array(unique_values)
    unique_values = unique_values[np.newaxis,:]      
    return (feature == unique_values).astype(np.float), new_descriptions


LIMIT_BAL = 0
SEX = 1
EDUCATION = 2
MARRIAGE = 3
AGE = 4
PAY_0 = 5 
PAY_2 = 6 
PAY_3 = 7
PAY_4 = 8
PAY_5 = 9
PAY_6 = 10
BILL_AMT1 = 11
BILL_AMT2 = 12
BILL_AMT3 = 13
BILL_AMT4 = 14
BILL_AMT5 = 15
BILL_AMT6 = 16
PAY_AMT1 = 17
PAY_AMT2 = 18
PAY_AMT3 = 19
PAY_AMT4 = 20
PAY_AMT5 = 21
PAY_AMT6 = 22

In [26]:
features, descriptions, correct_answers = load_from_csv('ccard.csv')
nf = np.ones((features.shape[0], SEX))
nd = ["1"]
nf, nd = add_one_hot_encoded_feature(nf, nd, features[SEX], descriptions[SEX]) 
print("ok")


features  (30000, 1)
hot features (23, 12)


ValueError: all the input array dimensions except for the concatenation axis must match exactly

In [None]:
a = np.array([[1,2],[3,4]])
a[:,-1:-1] = np.array([5,6])
print(a)