In [1]:
import pandas as pd
import numpy as np
import joblib
import os
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import KFold
from tqdm import tqdm

pd.options.mode.chained_assignment = None

In [3]:
train_base = pd.read_csv('data/train_base.csv')
test_a_base = pd.read_csv('data/test_a_base.csv')

train_op = pd.read_csv('data/train_op.csv')
test_a_op = pd.read_csv('data/test_a_op.csv')

train_trans = pd.read_csv('data/train_trans.csv')
test_a_trans = pd.read_csv('data/test_a_trans.csv')

train_label = pd.read_csv('data/train_label.csv')

train_base = pd.read_csv('data/train_base.csv')
train_base = pd.merge(train_base, train_label, on='user', how='left')

In [4]:
train_base_no_nan = train_base.drop(columns=['service3_level'],axis=1)
# 离散值填充众数
train_base_no_nan['sex'].fillna('category 0',inplace=True)
train_base_no_nan['balance_avg'].fillna('level 1',inplace=True)
train_base_no_nan['balance1_avg'].fillna('level 1',inplace=True)
train_base_no_nan['balance2_avg'].fillna('level 1',inplace=True)

user = train_base_no_nan['user'].values
train_base_no_user = train_base_no_nan.drop(columns=['user'],axis=1)

In [5]:
def mean_coding(data, feature, target='label'):
    n_folds = 20
    n_inner_folds = 10
    mean_coded = pd.Series()

    # 所有数据的label均值
    default_mean = data[target].mean()
    kf = KFold(n_splits=n_folds, shuffle=True)

    out_mean_cv = pd.DataFrame()
    split = 0
    # 对所有数据做CV
    for in_fold, out_fold in tqdm(kf.split(data[feature])):

        impact_coded_cv = pd.Series()
        kf_inner = KFold(n_splits=n_inner_folds, shuffle=True)
        inner_split = 0
        inner_mean_cv = pd.DataFrame()

        # in_fold数据的label均值
        default_inner_mean = data.iloc[in_fold][target].mean()

        # 对in_fold数据做CV
        for in_fold_inner, out_fold_inner in kf_inner.split(data.iloc[in_fold]):
            # The mean to apply to the inner oof split (a 1/n_folds % based on the rest)

            # 对in_fold_inner做group_by求出mean:feature-mean
            in_fold_inner_mean = data.iloc[in_fold_inner].groupby(by=feature)[target].mean()

            # in_fold的mean使用cv后的in_fold_inner的mean值，如果没有在里面，就用in_fold的label均值
            impact_coded_cv = impact_coded_cv.append(data.iloc[in_fold].apply(
                lambda x: in_fold_inner_mean[x[feature]]
                if x[feature] in in_fold_inner_mean.index
                else default_inner_mean
                , axis=1))

            # Also populate mapping (this has all group -> mean for all inner CV folds)
            inner_mean_cv = inner_mean_cv.join(pd.DataFrame(in_fold_inner_mean), rsuffix=inner_split, how='outer')
            inner_mean_cv.fillna(value=default_inner_mean, inplace=True)
            inner_split += 1

        # Also populate mapping
        out_mean_cv = out_mean_cv.join(pd.DataFrame(inner_mean_cv), rsuffix=split, how='outer')
        out_mean_cv.fillna(value=default_mean, inplace=True)
        split += 1

        mean_coded = mean_coded.append(data.iloc[out_fold].apply(
            lambda x: inner_mean_cv.loc[x[feature]].mean()
            if x[feature] in inner_mean_cv.index
            else default_mean
            , axis=1))

    return mean_coded, out_mean_cv.mean(axis=1), default_mean

In [6]:
city_mean, city_mapping, city_default_means = mean_coding(train_base_no_user, 'city')
province_mean, province_mapping, province_default_means = mean_coding(train_base_no_user, 'province')
regist_type_mean, regist_type_mapping, regist_type_default_means = mean_coding(train_base_no_user, 'regist_type')
balance_mean, balance_mapping, balance_default_means = mean_coding(train_base_no_user, 'balance')
balance_avg_mean, balance_avg_mapping, balance_avg_default_means = mean_coding(train_base_no_user, 'balance_avg')
balance1_mean, balance1_mapping, balance1_default_means = mean_coding(train_base_no_user, 'balance1')
balance1_avg_mean, balance1_avg_mapping, balance1_avg_default_means = mean_coding(train_base_no_user, 'balance1_avg')
balance2_mean, balance2_mapping, balance2_default_means = mean_coding(train_base_no_user, 'balance2')
balance2_avg_mean, balance2_avg_mapping, balance2_avg_default_means = mean_coding(train_base_no_user, 'balance2_avg')
product1_amount_mean, product1_amount_mapping, product1_amount_default_means = mean_coding(train_base_no_user, 'product1_amount')
product2_amount_mean, product2_amount_mapping, product2_amount_default_means = mean_coding(train_base_no_user, 'product2_amount')
product6_amount_mean, product6_amount_mapping, product6_amount_default_means = mean_coding(train_base_no_user, 'product6_amount')

20it [03:39, 10.96s/it]
20it [03:38, 10.95s/it]
20it [03:38, 10.90s/it]
20it [03:36, 10.83s/it]
20it [03:39, 10.98s/it]
20it [03:36, 10.85s/it]
20it [03:37, 10.90s/it]
20it [03:39, 10.95s/it]
20it [03:38, 10.94s/it]
20it [03:38, 10.94s/it]
20it [03:46, 11.35s/it]
20it [03:40, 11.00s/it]


In [7]:
joblib.dump([city_mean, city_mapping, city_default_means],'city_mean')
joblib.dump([province_mean, province_mapping, province_default_means],'province_mean')
joblib.dump([regist_type_mean, regist_type_mapping, regist_type_default_means],'regist_type_mean')
joblib.dump([balance_mean, balance_mapping, balance_default_means],'balance_mean')
joblib.dump([balance_avg_mean, balance_avg_mapping, balance_avg_default_means],'balance_avg_mean')
joblib.dump([balance1_mean, balance1_mapping, balance1_default_means],'balance1_mean')
joblib.dump([balance1_avg_mean, balance1_avg_mapping, balance1_avg_default_means],'balance1_avg_mean')
joblib.dump([balance2_mean, balance2_mapping, balance2_default_means],'balance2_mean')
joblib.dump([balance2_avg_mean, balance2_avg_mapping, balance2_avg_default_means],'balance2_avg_mean')
joblib.dump([product1_amount_mean, product1_amount_mapping, product1_amount_default_means],'product1_amount_mean')
joblib.dump([product2_amount_mean, product2_amount_mapping, product2_amount_default_means],'product2_amount_mean')
joblib.dump([product6_amount_mean, product6_amount_mapping, product6_amount_default_means],'product6_amount_mean')

['product6_amount_mean']