In [1]:
import pandas as pd
import numpy as np

In [2]:
def _read_and_drop_missing(input_file, dtype_dict, use_list):
    """Read one CSV file, print its raw shape, then drop every row with a missing value."""
    frame = pd.read_csv(input_file, dtype=dtype_dict, na_values="?", usecols=use_list)
    print(frame.shape)
    # The frame was created just above and nothing else references it yet,
    # so dropping rows in place cannot affect any caller-visible object.
    frame.dropna(axis=0, how="any", inplace=True)
    return frame


def input2df(train_input_file, test_input_file):
    """
    Load the train and test CSV files into DataFrames without missing values.

    Both files are read with the same settings: the column at position 2 is
    skipped, cells equal to "?" are treated as missing, and rows containing
    any missing value are dropped. The shape of each frame is printed before
    the rows are dropped.

    Args:
        train_input_file: path of the training CSV file
        test_input_file: path of the test CSV file
    Return:
         pd.DataFrame train_df
         pd.DataFrame test_df
    """
    # Keys must match the CSV header exactly, otherwise pandas silently skips
    # the cast. The hyphenated names are the ones data_process uses to index
    # these columns.
    # NOTE(review): an int32 cast fails if one of these columns contains a
    # missing value -- confirm the numeric columns never hold "?".
    dtype_dict = {"age": np.int32,
                  "education-num": np.int32,
                  "capital-gain": np.int32,
                  "capital-loss": np.int32,
                  "hours-per-week": np.int32}
    # Keep columns 0..14 except the one at position 2.
    use_list = list(range(15))
    use_list.remove(2)
    train_df = _read_and_drop_missing(train_input_file, dtype_dict, use_list)
    test_df = _read_and_drop_missing(test_input_file, dtype_dict, use_list)
    return train_df, test_df

def con_split_list(data_series):
    """
    Compute the bucket boundaries of a continuous feature.

    Args:
        data_series: pandas.core.series.Series about con feature
    Return:
        split_list: list holding the 'min', '25%', '50%', '75%' and 'max'
            entries of data_series.describe(), in that order,
            e.g. [17.0, 28.0, 37.0, 47.0, 90.0]
    """
    con_split = ['min', '25%', '50%', '75%', 'max']
    # describe() is evaluated once; every statistic is read from the same summary.
    summary = data_series.describe()
    return [summary[k] for k in con_split]

def dis_split_list(data_series):
    """
    Args:
        data_series: pandas.core.series.Series about dis feature
    Return:
        split_list: sorted list of the distinct values reported by
            data_series.value_counts()
    """
    distinct_values = list(data_series.value_counts().index)
    distinct_values.sort()
    return distinct_values

def dis_feature2one(x, split_list):
    """
    One-hot encode x against split_list as a comma-separated string.

    The position of the first entry of split_list equal to x is set to 1 and
    every other position to 0. When x is not in split_list, "error" is
    printed and all positions stay 0.

    return :0,0,0,1,0
    """
    if x not in split_list:
        print("error")
        hot_position = -1  # matches no position, so every flag stays "0"
    else:
        hot_position = split_list.index(x)
    return ",".join("1" if position == hot_position else "0"
                    for position in range(len(split_list)))

def dis_featur_process(df, dis_feat_list):
    """
    Replace every discrete feature column by its one-hot string encoding.

    Args:
        df: pd.DataFrame holding the columns named in dis_feat_list; it is
            modified in place
        dis_feat_list: names of the discrete feature columns
    Return:
        the same df object, with each listed column overwritten
    """
    for column_name in dis_feat_list:
        # The category list is derived from the column of the frame being processed.
        categories = dis_split_list(df[column_name])
        print("split_list:", categories)
        encoded_column = df[column_name].apply(
            lambda value: dis_feature2one(value, categories))
        df[column_name] = encoded_column
    return df

def label_featur_process(df, label):
    """
    Turn the label column into 0/1 integers.

    Args:
        df: pd.DataFrame containing the label column; it is modified in place
        label: name of the label column
    Return:
        the same df object; the label column holds 1 where the value equals
        '>50K' and 0 everywhere else
    """
    # NOTE(review): the comparison is an exact match, so a value with
    # surrounding whitespace or other decoration maps to 0 -- confirm the
    # input files use exactly this spelling.
    positive_label = '>50K'
    df[label] = df[label].apply(lambda value: 1 if value == positive_label else 0)
    return df

def con_feature2dim(x, split_list):
    """
    Encode a continuous value as a bucket indicator string.

    split_list holds the bucket boundaries; bucket i covers the half-open
    interval [split_list[i], split_list[i+1]). A value equal to the last
    boundary is assigned to the last bucket. A value that falls in no bucket
    (below the first boundary, above the last one, or NaN) yields all zeros.

    Args:
        x: the value to encode
        split_list: list of bucket boundaries
    Return:
        comma-separated string with len(split_list) - 1 flags, e.g. 0,0,0,1,0;
        an empty string when split_list has fewer than two boundaries
    """
    bucket_count = len(split_list) - 1
    if bucket_count < 1:
        # Fewer than two boundaries define no bucket at all; return the empty
        # encoding instead of failing on an out-of-range index below.
        return ""
    flags = [0] * bucket_count
    for i in range(bucket_count):
        if split_list[i] <= x < split_list[i + 1]:
            flags[i] = 1
    # The intervals are open on the right, so the overall maximum has to be
    # placed in the last bucket explicitly.
    if x == split_list[-1]:
        flags[-1] = 1
    return ",".join(str(flag) for flag in flags)
    
def con_feature_process(df, con_feat_list):
    """
    Replace every continuous feature column by its bucket indicator string.

    Args:
        df: pd.DataFrame holding the columns named in con_feat_list; it is
            modified in place
        con_feat_list: names of the continuous feature columns
    Return:
        the same df object, with each listed column overwritten
    """
    for column_name in con_feat_list:
        # The boundaries are derived from the column of the frame being processed.
        boundaries = con_split_list(df[column_name])
        print("split_dic:", boundaries)
        bucketed_column = df[column_name].apply(
            lambda value: con_feature2dim(value, boundaries))
        df[column_name] = bucketed_column
    return df

def add_total_feature(df):
    """
    Add a 'total' column that concatenates the encodings of all feature columns.

    Every column except 'label' is expected to hold comma-separated strings.
    Each cell is split on "," and the resulting token lists are joined, row by
    row, in column order.

    Args:
        df: pd.DataFrame of encoded features; the 'total' column is added in place
    Return:
        the same df object, with the 'total' column holding one token list per row
    """
    feature_names = [name for name in df.columns if name != 'label']
    token_columns = [df[name].apply(lambda cell: cell.split(","))
                     for name in feature_names]
    combined = token_columns[0]
    for tokens in token_columns[1:]:
        # Adding two Series of lists concatenates the lists row by row.
        combined = combined + tokens
    df['total'] = combined
    return df

def write_series(filename, data_series):
    """
    Write one comma-joined line per element of data_series to filename.

    Args:
        filename: path of the output file; existing content is overwritten
        data_series: iterable whose elements are sequences of strings
    """
    with open(filename, "w+") as out_file:
        out_file.writelines("{}\n".format(",".join(tokens)) for tokens in data_series)

In [8]:
def data_process(input_df, out_feature_file, out_label_file):
    """
    Encode a frame and write its features and labels to two text files.

    Steps, in order: binarize the 'label' column, bucket the continuous
    columns, one-hot encode the discrete columns, concatenate all encodings
    into a 'total' column, then write 'total' to out_feature_file and 'label'
    to out_label_file (one row per line). Prints "Done!" at the end.

    NOTE(review): bucket boundaries and category lists are computed from
    input_df itself, so two frames processed by separate calls can end up
    with different encodings -- confirm this is intended.

    Args:
        input_df: pd.DataFrame with the label and feature columns; it is
            modified in place by the processing steps
        out_feature_file: path of the feature output file
        out_label_file: path of the label output file
    """
    continuous_columns = ['age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
    discrete_columns = ['education', 'workclass', 'marital-status', 'occupation', 'relationship',
                        'race', 'sex', 'native-country']
    label = 'label'
    encoded_df = (
        input_df
        .pipe(label_featur_process, label)
        .pipe(con_feature_process, continuous_columns)
        .pipe(dis_featur_process, discrete_columns)
        .pipe(add_total_feature)
    )
    write_series(out_feature_file, encoded_df['total'])
    with open(out_label_file, "w+") as label_file:
        label_file.writelines("{}\n".format(value) for value in encoded_df["label"])
    print("Done!")

In [9]:
# Load both splits; input2df prints each frame's shape before rows with missing values are dropped.
train_df, test_df = input2df("./data/train.txt", "./data/test.txt")
# Encode each split and write its feature and label files.
# data_process works on the frame it receives, so train_df and test_df hold
# the encoded columns after these calls.
data_process(train_df, "./data/train_onehot.txt", "./data/train_label.txt")
data_process(test_df, "./data/test_onehot.txt", "./data/test_label.txt")

(32561, 14)
(16281, 14)
split_dic: [17.0, 28.0, 37.0, 47.0, 90.0]
split_dic: [1.0, 9.0, 10.0, 13.0, 16.0]
split_dic: [0.0, 0.0, 0.0, 0.0, 99999.0]
split_dic: [0.0, 0.0, 0.0, 0.0, 4356.0]
split_dic: [1.0, 40.0, 40.0, 45.0, 99.0]
split_list: ['10th', '11th', '12th', '1st-4th', '5th-6th', '7th-8th', '9th', 'Assoc-acdm', 'Assoc-voc', 'Bachelors', 'Doctorate', 'HS-grad', 'Masters', 'Preschool', 'Prof-school', 'Some-college']
split_list: ['Federal-gov', 'Local-gov', 'Private', 'Self-emp-inc', 'Self-emp-not-inc', 'State-gov', 'Without-pay']
split_list: ['Divorced', 'Married-AF-spouse', 'Married-civ-spouse', 'Married-spouse-absent', 'Never-married', 'Separated', 'Widowed']
split_list: ['Adm-clerical', 'Armed-Forces', 'Craft-repair', 'Exec-managerial', 'Farming-fishing', 'Handlers-cleaners', 'Machine-op-inspct', 'Other-service', 'Priv-house-serv', 'Prof-specialty', 'Protective-serv', 'Sales', 'Tech-support', 'Transport-moving']
split_list: ['Husband', 'Not-in-family', 'Other-relative', 'Own-chi