In [43]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Normalizer
from sklearn import cluster
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,roc_auc_score,mutual_info_score
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
import time
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.tree import DecisionTreeClassifier
import inspect, re
from sklearn.preprocessing import PolynomialFeatures
from itertools import product
from sklearn.preprocessing import OrdinalEncoder
from tqdm import tqdm
import gc
from sklearn.feature_selection import VarianceThreshold,r_regression,SelectKBest,f_regression,SelectPercentile,chi2,f_classif,mutual_info_regression,mutual_info_classif,RFE,RFECV,SequentialFeatureSelector,SelectFromModel
from scipy import stats
import scipy
import random
from scipy.special import digamma
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV,RepeatedKFold,KFold

In [2]:
def cate_colName(Transformer, category_cols, drop='if_binary'):
    """
    离散字段独热编码后字段名创建函数
    
    :param Transformer: 独热编码转化器
    :param category_cols: 输入转化器的离散变量
    :param drop: 独热编码转化器的drop参数
    """
    
    cate_cols_new = []
    col_value = Transformer.categories_
    
    for i, j in enumerate(category_cols):
        if (drop == 'if_binary') & (len(col_value[i]) == 2):
            cate_cols_new.append(j)
        else:
            for f in col_value[i]:
                feature_name = str(j) + '_' + str(f)
                cate_cols_new.append(feature_name)
    return(cate_cols_new)

def result_df(model, X_train, y_train, X_test, y_test, metrics=
              [accuracy_score, recall_score, precision_score, f1_score, roc_auc_score]):
    res_train = []
    res_test = []
    col_name = []
    for fun in metrics:
        res_train.append(fun(model.predict(X_train), y_train))
        res_test.append(fun(model.predict(X_test), y_test)) 
        col_name.append(fun.__name__)
    idx_name = ['train_eval', 'test_eval']
    res = pd.DataFrame([res_train, res_test], columns=col_name, index=idx_name)
    return res

def colName(ColumnTransformer, numeric_cols, category_cols):
    col_name = []
    col_value = ColumnTransformer.named_transformers_['cat'].categories_
    
    for i, j in enumerate(category_cols):
        if len(col_value[i]) == 2:
            col_name.append(j)
        else:
            for f in col_value[i]:
                feature_name = j + '_' + f
                col_name.append(feature_name)
    col_name.extend(numeric_cols)
    return(col_name)
def Cross_Combination(colSet, df):
    newDf_l = []
    col_name_l = []
    
    for col in colSet:
        for col_sub in colSet:
            if col == col_sub:
                continue
            else:
                col_name = col+'&'+col_sub
                newDf_l.append(pd.Series(df[col].astype('str')+'&'+df[col_sub].astype('str'), name=col_name))
                col_name_l.append(col_name)
    
    newDF = pd.concat(newDf_l, axis=1)
    return newDF, col_name_l

def IV(new_features, DataFrame, target):
    """new_features:新特征
       DataFrame:数据集
       target：标签的名称
    """
    count_result = DataFrame[target].value_counts().values
    
    def IV_cal(features_name, target, df_temp):
        IV_l = []
        for i in features_name:
            IV_temp_l = []
            for values in df_temp[i].unique():
                data_temp = df_temp[df_temp[i] == values][target]
                PB, PG = data_temp.value_counts().values / count_result
                IV_temp = (PG-PB) * np.log(PG/PB)
                IV_temp_l.append(IV_temp)
            IV_l.append(np.array(IV_temp_l).sum())
        return(IV_l)
            
    if type(new_features) == np.ndarray:
        features_name = ['new_features']
        new_features = pd.Series(new_features, name=features_name[0])
    elif type(new_features) == pd.Series:
        features_name = [new_features.name]
    else:
        features_name = new_features.columns

    df_temp = pd.concat([new_features, DataFrame], axis=1)
    df_temp = df_temp.loc[:, ~df_temp.columns.duplicated()]
    IV_l = IV_cal(features_name=features_name, target=target, df_temp=df_temp)

    res = pd.DataFrame(IV_l, columns=['IV'], index=features_name)
    return(res)

def Binary_Cross_Combination(colNames, features, OneHot=True):
    """
    分类变量两两交叉组合衍生函数
    
    :param colNames: 参与交叉组合衍生的列名称
    :param features: 原始数据集的特征矩阵
    :param OneHot: 是否进行独热编码
    
    :return：交叉衍生后的新特征和新列名称
    """
    
    # 创建空列表存储器
    colNames_new_l = []
    features_new_l = []
    
    # 提取需要进行交叉组合的特征
    features = features[colNames]
    
    # 逐个创造新特征名称、新特征
    for col_index, col_name in enumerate(colNames):
        for col_sub_index in range(col_index+1, len(colNames)):
            
            newNames = col_name + '&' + colNames[col_sub_index]
            colNames_new_l.append(newNames)
            
            newDF = pd.Series(features[col_name].astype('str')  
                              + '&'
                              + features[colNames[col_sub_index]].astype('str'), 
                              name=col_name)
            features_new_l.append(newDF)
    
    # 拼接新特征矩阵
    features_new = pd.concat(features_new_l, axis=1)
    features_new.columns = colNames_new_l
    colNames_new = colNames_new_l
    
    # 对新特征矩阵进行独热编码
    if OneHot == True:
        enc = preprocessing.OneHotEncoder()
        enc.fit_transform(features_new)
        colNames_new = cate_colName(enc, colNames_new_l, drop=None)
        features_new = pd.DataFrame(enc.fit_transform(features_new).toarray(), columns=colNames_new)
        
    return features_new, colNames_new


def Binary_Group_Statistics(keyCol, 
                            features, 
                            col_num=None, 
                            col_cat=None, 
                            num_stat=['mean', 'var', 'max', 'min', 'skew', 'median'], 
                            cat_stat=['mean', 'var', 'max', 'min', 'median', 'count', 'nunique'], 
                            quant=True):
    """
    双变量分组统计特征衍生函数
    
    :param keyCol: 分组参考的关键变量
    :param features: 原始数据集
    :param col_num: 参与衍生的连续型变量
    :param col_cat: 参与衍生的离散型变量
    :param num_stat: 连续变量分组统计量
    :param cat_num: 离散变量分组统计量  
    :param quant: 是否计算分位数  

    :return：交叉衍生后的新特征和新特征的名称
    """
    
    # 当输入的特征有连续型特征时
    if col_num != None:
        aggs_num = {}
        colNames = col_num
        
        # 创建agg方法所需字典
        for col in col_num:
            aggs_num[col] = num_stat 
            
        # 创建衍生特征名称列表
        cols_num = [keyCol]
        for key in aggs_num.keys():
            cols_num.extend([key+'_'+keyCol+'_'+stat for stat in aggs_num[key]])
            
        # 创建衍生特征df
        features_num_new = features[col_num+[keyCol]].groupby(keyCol).agg(aggs_num).reset_index()
        features_num_new.columns = cols_num 
        
        # 当输入的特征有连续型也有离散型特征时
        if col_cat != None:        
            aggs_cat = {}
            colNames = col_num + col_cat

            # 创建agg方法所需字典
            for col in col_cat:
                aggs_cat[col] = cat_stat

            # 创建衍生特征名称列表
            cols_cat = [keyCol]
            for key in aggs_cat.keys():
                cols_cat.extend([key+'_'+keyCol+'_'+stat for stat in aggs_cat[key]])    

            # 创建衍生特征df
            features_cat_new = features[col_cat+[keyCol]].groupby(keyCol).agg(aggs_cat).reset_index()
            features_cat_new.columns = cols_cat
    
            # 合并连续变量衍生结果与离散变量衍生结果
            df_temp = pd.merge(features_num_new, features_cat_new, how='left',on=keyCol)
            features_new = pd.merge(features[keyCol], df_temp, how='left',on=keyCol)
            features_new.loc[:, ~features_new.columns.duplicated()]
            colNames_new = cols_num + cols_cat
            colNames_new.remove(keyCol)
            colNames_new.remove(keyCol)
         
        # 当只有连续变量时
        else:
            # merge连续变量衍生结果与原始数据，然后删除重复列
            features_new = pd.merge(features[keyCol], features_num_new, how='left',on=keyCol)
            features_new.loc[:, ~features_new.columns.duplicated()]
            colNames_new = cols_num
            colNames_new.remove(keyCol)
    
    # 当没有输入连续变量时
    else:
        # 但存在分类变量时，即只有分类变量时
        if col_cat != None:
            aggs_cat = {}
            colNames = col_cat

            for col in col_cat:
                aggs_cat[col] = cat_stat

            cols_cat = [keyCol]
            for key in aggs_cat.keys():
                cols_cat.extend([key+'_'+keyCol+'_'+stat for stat in aggs_cat[key]])    

            features_cat_new = features[col_cat+[keyCol]].groupby(keyCol).agg(aggs_cat).reset_index()
            features_cat_new.columns = cols_cat            
             
            features_new = pd.merge(features[keyCol], features_cat_new, how='left',on=keyCol)
            features_new.loc[:, ~features_new.columns.duplicated()]
            colNames_new = cols_cat
            colNames_new.remove(keyCol) 
    
    if quant:
        # 定义四分位计算函数
        def q1(x):
            """
            下四分位数
            """
            return x.quantile(0.25)

        def q2(x):
            """
            上四分位数
            """
            return x.quantile(0.75)

        aggs = {}
        for col in colNames:
            aggs[col] = ['q1', 'q2']

        cols = [keyCol]
        for key in aggs.keys():
            cols.extend([key+'_'+keyCol+'_'+stat for stat in aggs[key]])    

        aggs = {}
        for col in colNames:
            aggs[col] = [q1, q2]    

        features_temp = features[colNames+[keyCol]].groupby(keyCol).agg(aggs).reset_index()
        features_temp.columns = cols

        features_new = pd.merge(features_new, features_temp, how='left',on=keyCol)
        features_new.loc[:, ~features_new.columns.duplicated()]
        colNames_new = colNames_new + cols
        colNames_new.remove(keyCol)     
    
    features_new.drop([keyCol], axis=1, inplace=True)
        
    return features_new, colNames_new

def Binary_PolynomialFeatures(colNames, degree, features):
    """
    连续变量两变量多项式衍生函数
    
    :param colNames: 参与多项式衍生的列名称
    :param degree: 多项式最高阶
    :param features: 原始数据集
    
    :return：交叉衍生后的新特征和新列名称
    """
    
    
    # 创建空列表存储器
    colNames_new_l = []
    features_new_l = []
    
    # 提取需要进行多项式衍生的特征
    features = features[colNames]
    
    # 逐个进行多项式特征组合
    for col_index, col_name in enumerate(colNames):
        for col_sub_index in range(col_index+1, len(colNames)):
            col_temp = [col_name] + [colNames[col_sub_index]]
            array_new_temp = PolynomialFeatures(degree=degree, include_bias=False).fit_transform(features[col_temp])
            features_new_l.append(pd.DataFrame(array_new_temp[:, 2:]))
    
            # 逐个创建衍生多项式特征的名称
            for deg in range(2, degree+1):
                for i in range(deg+1):
                    col_name_temp = col_temp[0] + '**' + str(deg-i) + '*'+ col_temp[1] + '**' + str(i)
                    colNames_new_l.append(col_name_temp)
            
    
    # 拼接新特征矩阵
    features_new = pd.concat(features_new_l, axis=1)
    features_new.columns = colNames_new_l
    colNames_new = colNames_new_l
    
    return features_new, colNames_new

def Group_Statistics_Extension(colNames, keyCol, features):
    """
    双变量分组统计二阶特征衍生函数
    
    :param colNames: 参与衍生的特征名称
    :param keyCol: 分组参考的关键变量
    :param features: 原始数据集
    
    :return：交叉衍生后的新特征和新列名称
    """
    
    # 定义四分位计算函数
    def q1(x):
        """
        下四分位数
        """
        return x.quantile(0.25)

    def q2(x):
        """
        上四分位数
        """
        return x.quantile(0.75)   
    
    # 一阶特征衍生
    # 先定义用于生成列名称的aggs
    aggs = {}    
    for col in colNames:
        aggs[col] = ['mean', 'var', 'median', 'q1', 'q2']       
    cols = [keyCol]
    for key in aggs.keys():
        cols.extend([key+'_'+keyCol+'_'+stat for stat in aggs[key]])

    # 再定义用于进行分组汇总的aggs
    aggs = {}   
    for col in colNames:
        aggs[col] = ['mean', 'var', 'median', q1, q2] 
           
    features_new = features[colNames+[keyCol]].groupby(keyCol).agg(aggs).reset_index()
    features_new.columns = cols
             
    features_new = pd.merge(features[keyCol], features_new, how='left',on=keyCol)
    features_new.loc[:, ~features_new.columns.duplicated()]
    colNames_new = cols
    colNames_new.remove(keyCol)
    col1 = colNames_new.copy()
    print(col1)
    
    # 二阶特征衍生
    # 流量平滑特征
    for col_temp in colNames:
        col = col_temp+'_'+keyCol+'_'+'mean'
        features_new[col_temp+'_dive1_'+col] = features_new[keyCol] / (features_new[col] + 1e-5)
        colNames_new.append(col_temp+'_dive1_'+col)
        col = col_temp+'_'+keyCol+'_'+'median'
        features_new[col_temp+'_dive2_'+col] = features_new[keyCol] / (features_new[col] + 1e-5)
        colNames_new.append(col_temp+'_dive2_'+col)
        
    # 黄金组合特征
    for col_temp in colNames:
        col = col_temp+'_'+keyCol+'_'+'mean'
        features_new[col_temp+'_minus1_'+col] = features_new[keyCol] - features_new[col] 
        colNames_new.append(col_temp+'_minus1_'+col)
        features_new[col_temp+'_minus2_'+col] = features_new[keyCol] - features_new[col] 
        colNames_new.append(col_temp+'_minus2_'+col)
        
    # 组内归一化特征
    for col_temp in colNames:
        col_mean = col_temp+'_'+keyCol+'_'+'mean'
        col_var = col_temp+'_'+keyCol+'_'+'var'
        features_new[col_temp+'_norm_'+keyCol] = (features_new[keyCol] - features_new[col_mean]) / (np.sqrt(features_new[col_var]) + 1e-5)      
        colNames_new.append(col_temp+'_norm_'+keyCol)
    
    # Gap特征
    for col_temp in colNames:
        col_q1 = col_temp+'_'+keyCol+'_'+'q1'
        col_q2 = col_temp+'_'+keyCol+'_'+'q2'
        features_new[col_temp+'_gap_'+keyCol] = features_new[col_q2] - features_new[col_q1]  
        colNames_new.append(col_temp+'_gap_'+keyCol)
        
    # 数据倾斜特征
    for col_temp in colNames:
        col_mean = col_temp+'_'+keyCol+'_'+'mean'
        col_median = col_temp+'_'+keyCol+'_'+'median'
        features_new[col_temp+'_mag1_'+keyCol] = features_new[col_median] - features_new[col_mean]    
        colNames_new.append(col_temp+'_mag1_'+keyCol)
        features_new[col_temp+'_mag2_'+keyCol] = features_new[col_median] / (features_new[col_mean] + 1e-5)
        colNames_new.append(col_temp+'_mag2_'+keyCol)
        
    # 变异系数
    for col_temp in colNames:
        col_mean = col_temp+'_'+keyCol+'_'+'mean'
        col_var = col_temp+'_'+keyCol+'_'+'var'
        features_new[col_temp+'_cv_'+keyCol] = np.sqrt(features_new[col_var]) / (features_new[col_mean] + 1e-5)
        colNames_new.append(col_temp+'_cv_'+keyCol)

    features_new.drop([keyCol], axis=1, inplace=True)
    features_new.drop(col1, axis=1, inplace=True)
    colNames_new = list(features_new.columns)
    
    return features_new, colNames_new

def Multi_Cross_Combination(colNames, features, OneHot=True):
    """
    多变量组合交叉衍生
    
    :param colNames: 参与组合交叉衍生的列名称
    :param features: 原始数据集
    :param OneHot: 是否进行独热编码
    
    :return：交叉衍生后的新特征和新列名称
    """
    
    
    # 创建组合特征
    colNames_new = '&'.join([str(i) for i in colNames])
    features_new = features[colNames[0]].astype('str')

    for col in colNames[1:]:
        features_new = features_new + '&' + features[col].astype('str') 
    
    # 将组合特征转化为DataFrame
    features_new = pd.DataFrame(features_new, columns=[colNames_new])
    
    # 对新的特征列进行独热编码
    if OneHot == True:
        enc = preprocessing.OneHotEncoder()
        enc.fit_transform(features_new)
        colNames_new = cate_colName(enc, [colNames_new], drop=None)
        features_new = pd.DataFrame(enc.fit_transform(features_new).toarray(), columns=colNames_new)
        
    return features_new, colNames_new

def Multi_Group_Statistics(keyCol, 
                           features, 
                           col_num=None, 
                           col_cat=None, 
                           num_stat=['mean', 'var', 'max', 'min', 'skew', 'median'], 
                           cat_stat=['mean', 'var', 'max', 'min', 'median', 'count', 'nunique'], 
                           quant=True):
    """
    多变量分组统计特征衍生函数
    
    :param keyCol: 分组参考的关键变量
    :param features: 原始数据集
    :param col_num: 参与衍生的连续型变量
    :param col_cat: 参与衍生的离散型变量
    :param num_stat: 连续变量分组统计量
    :param cat_num: 离散变量分组统计量  
    :param quant: 是否计算分位数  

    :return：交叉衍生后的新特征和新特征的名称
    """
    # 生成原数据合并的主键
    features_key1, col1 = Multi_Cross_Combination(keyCol, features, OneHot=False)
    
    # 当输入的特征有连续型特征时
    if col_num != None:
        aggs_num = {}
        colNames = col_num
        
        # 创建agg方法所需字典
        for col in col_num:
            aggs_num[col] = num_stat 
            
        # 创建衍生特征名称列表
        cols_num = keyCol.copy()

        for key in aggs_num.keys():
            cols_num.extend([key+'_'+col1+'_'+stat for stat in aggs_num[key]])
            
        # 创建衍生特征df
        features_num_new = features[col_num+keyCol].groupby(keyCol).agg(aggs_num).reset_index()
        features_num_new.columns = cols_num 
        
        # 生成主键
        features_key2, col2 = Multi_Cross_Combination(keyCol, features_num_new, OneHot=False)
        
        # 创建包含合并主键的数据集
        features_num_new = pd.concat([features_key2, features_num_new], axis=1)
        
        
        # 当输入的特征有连续型也有离散型特征时
        if col_cat != None:        
            aggs_cat = {}
            colNames = col_num + col_cat

            # 创建agg方法所需字典
            for col in col_cat:
                aggs_cat[col] = cat_stat

            # 创建衍生特征名称列表
            cols_cat = keyCol.copy()
            
            for key in aggs_cat.keys():
                cols_cat.extend([key+'_'+col1+'_'+stat for stat in aggs_cat[key]])    

            # 创建衍生特征df
            features_cat_new = features[col_cat+keyCol].groupby(keyCol).agg(aggs_cat).reset_index()
            features_cat_new.columns = cols_cat
            
            # 生成主键
            features_key3, col3 = Multi_Cross_Combination(keyCol, features_cat_new, OneHot=False)

            # 创建包含合并主键的数据集
            features_cat_new = pd.concat([features_key3, features_cat_new], axis=1)            
    
    
            # 合并连续变量衍生结果与离散变量衍生结果
            # 合并新的特征矩阵
            df_temp = pd.concat([features_num_new, features_cat_new], axis=1)
            df_temp = df_temp.loc[:, ~df_temp.columns.duplicated()]
            # 将新的特征矩阵与原始数据集合并
            features_new = pd.merge(features_key1, df_temp, how='left',on=col1)
         
        
        # 当只有连续变量时
        else:
            # merge连续变量衍生结果与原始数据，然后删除重复列
            features_new = pd.merge(features_key1, features_num_new, how='left',on=col1)
            features_new = features_new.loc[:, ~features_new.columns.duplicated()]
    
    # 当没有输入连续变量时
    else:
        # 但存在分类变量时，即只有分类变量时
        if col_cat != None:
            aggs_cat = {}
            colNames = col_cat

            for col in col_cat:
                aggs_cat[col] = cat_stat

            cols_cat = keyCol.copy()
            
            for key in aggs_cat.keys():
                cols_cat.extend([key+'_'+col1+'_'+stat for stat in aggs_cat[key]])    

            features_cat_new = features[col_cat+keyCol].groupby(keyCol).agg(aggs_cat).reset_index()
            features_cat_new.columns = cols_cat            
             
            features_new = pd.merge(features_key1, features_cat_new, how='left',on=col1)
            features_new = features_new.loc[:, ~features_new.columns.duplicated()]
    
    if quant:
        # 定义四分位计算函数
        def q1(x):
            """
            下四分位数
            """
            return x.quantile(0.25)

        def q2(x):
            """
            上四分位数
            """
            return x.quantile(0.75)

        aggs = {}
        for col in colNames:
            aggs[col] = ['q1', 'q2']

        cols = keyCol.copy()
        
        for key in aggs.keys():
            cols.extend([key+'_'+col1+'_'+stat for stat in aggs[key]])    

        aggs = {}
        for col in colNames:
            aggs[col] = [q1, q2]    

        features_temp = features[colNames+keyCol].groupby(keyCol).agg(aggs).reset_index()
        features_temp.columns = cols
        features_new.drop(keyCol, axis=1, inplace=True)
    
        # 生成主键
        features_key4, col4 = Multi_Cross_Combination(keyCol, features_temp, OneHot=False)
        
        # 创建包含合并主键的数据集
        features_temp = pd.concat([features_key4, features_temp], axis=1)        

        # 合并新特征矩阵
        features_new = pd.merge(features_new, features_temp, how='left',on=col1)
        features_new = features_new.loc[:, ~features_new.columns.duplicated()]
  

    features_new.drop(keyCol+[col1], axis=1, inplace=True)
    colNames_new = list(features_new.columns)
    
    return features_new, colNames_new

def Multi_PolynomialFeatures(colNames, degree, features):
    """
    连续变量多变量多项式衍生函数
    
    :param colNames: 参与衍生的列名称
    :param degree: 多项式最高阶
    :param features: 原始数据集
    
    :return：交叉衍生后的新特征和新列名称
    """
    
    
    # 创建空列表容器
    colNames_new_l = []
    
    # 计算带入多项式计算的特征数
    n = len(colNames)
    
    # 提取需要进行多项式衍生的特征
    features = features[colNames]
    
    # 进行多项式特征组合
    array_new_temp = PolynomialFeatures(degree=degree, include_bias=False).fit_transform(features)
    # 选取衍生的特征
    array_new_temp = array_new_temp[:, n:]
    
    
    # 创建列名称列表
    deg = 2
    while deg <= degree:
        m = 1
        a1 = range(deg, -1, -1)
        a2 = []
        while m < n:
            a1 = list(product(a1, range(deg, -1, -1)))
            if m > 1:
                for i in a1:
                    i_temp = list(i[0])
                    i_temp.append(i[1])
                    a2.append(i_temp)
                a1 = a2.copy()
                a2 = []    
            m += 1
            
        a1 = np.array(a1)
        a3 = a1[a1.sum(1) == deg]
        
        for i in a3:
            colNames_new_l.append('&'.join(colNames) + '_' + ''.join([str(i) for i in i]))    
        
        deg += 1
    
    # 拼接新特征矩阵
    features_new = pd.DataFrame(array_new_temp, columns=colNames_new_l)
    colNames_new = colNames_new_l
    
    return features_new, colNames_new

def Features_Padding(features_train_new, 
                     features_test_new, 
                     colNames_train_new, 
                     colNames_test_new):
    """
    特征零值填补函数
    
    :param features_train_new: 训练集衍生特征
    :param features_test_new: 测试集衍生特征
    :param colNames_train_new: 训练集衍生列名称
    :param colNames_test_new: 测试集衍生列名称
    
    :return：0值填补后的新特征和特征名称
    """
    if len(colNames_train_new) > len(colNames_test_new):
        sub_colNames = list(set(colNames_train_new) - set(colNames_test_new))
        
        for col in sub_colNames:
            features_test_new[col] = 0
        
        features_test_new = features_test_new[colNames_train_new]
        colNames_test_new = list(features_test_new.columns)
            
    elif len(colNames_train_new) < len(colNames_test_new):
        sub_colNames = list(set(colNames_test_new) - set(colNames_train_new))
        
        for col in sub_colNames:
            features_train_new[col] = 0
        
        features_train_new = features_train_new[colNames_test_new]
        colNames_train_new = list(features_train_new.columns)    
    assert colNames_train_new  == colNames_test_new
    return features_train_new, features_test_new, colNames_train_new, colNames_test_new        



def test_features(keyCol,
                  X_train, 
                  X_test,
                  features_train_new,
                  multi=False):
    """
    测试集特征填补函数
    
    :param keyCol: 分组参考的关键变量
    :param X_train: 训练集特征
    :param X_test: 测试集特征
    :param features_train_new: 训练集衍生特征
    :param multi: 是否多变量参与分组
    
    :return：分组统计衍生后的新特征和新特征的名称
    """
    
    # 创建主键
    # 创建带有主键的训练集衍生特征df
    # 创建只包含主键的test_key
    if multi == False:
        keyCol = keyCol
        features_train_new[keyCol] = X_train[keyCol].reset_index()[keyCol]
        test_key = pd.DataFrame(X_test[keyCol])
    else:
        train_key, train_col = Multi_Cross_Combination(colNames=keyCol, features=X_train, OneHot=False)
        test_key, test_col = Multi_Cross_Combination(colNames=keyCol, features=X_test, OneHot=False)
        assert train_col == test_col
        keyCol = train_col
        features_train_new[keyCol] = train_key[train_col].reset_index()[train_col]
        
    # 利用groupby进行去重
    features_test_or = features_train_new.groupby(keyCol).mean().reset_index()
    
    # 和测试集进行拼接
    features_test_new = pd.merge(test_key, features_test_or, on=keyCol, how='left')
    
    # 删除keyCol列，只保留新衍生的列
    features_test_new.drop([keyCol], axis=1, inplace=True)
    features_train_new.drop([keyCol], axis=1, inplace=True)
    
    # 输出列名称
    colNames_train_new = list(features_train_new.columns)
    colNames_test_new = list(features_test_new.columns)
    
    return features_train_new, features_test_new, colNames_train_new, colNames_test_new



def Cross_Combination(colNames, 
                      X_train,
                      X_test,
                      multi=False,
                      OneHot=True):
    """
    交叉组合特征衍生函数
    
    :param colNames: 参与交叉衍生的列名称
    :param X_train: 训练集特征
    :param X_test: 测试集特征
    :param multi: 是否进行多变量交叉组合
    :param OneHot: 是否进行独热编码
    
    :return：交叉衍生后的新特征和特征名称
    """
    # 首先，训练集和测试集单独进行交叉组合特征衍生
    if multi == False:
        features_train_new, colNames_train_new = Binary_Cross_Combination(colNames=colNames, features=X_train, OneHot=OneHot)
        features_test_new, colNames_test_new = Binary_Cross_Combination(colNames=colNames, features=X_test, OneHot=OneHot)
    else:
        features_train_new, colNames_train_new = Multi_Cross_Combination(colNames=colNames, features=X_train, OneHot=OneHot)
        features_test_new, colNames_test_new = Multi_Cross_Combination(colNames=colNames, features=X_test, OneHot=OneHot)
        
    # 然后判断训练集和测试集的衍生特征是否存在差异
    if colNames_train_new != colNames_test_new:
        features_train_new, features_test_new, colNames_train_new, colNames_test_new = Features_Padding(features_train_new = features_train_new, 
                                                                                                        features_test_new = features_test_new, 
                                                                                                        colNames_train_new = colNames_train_new, 
                                                                                                        colNames_test_new = colNames_test_new)
    return features_train_new, features_test_new, colNames_train_new, colNames_test_new


def Polynomial_Features(colNames, 
                        degree, 
                        X_train, 
                        X_test, 
                        multi=False):   
    
    """
    多项式特征衍生函数
    
    :param colNames: 参与交叉衍生的列名称
    :param degree: 多项式最高阶
    :param X_train: 训练集特征
    :param X_test: 测试集特征
    :param multi: 是否进行多变量多项式组衍生
    
    :return：多项式衍生后的新特征和新列名称
    """
    if multi == False:
        features_train_new, colNames_train_new = Binary_PolynomialFeatures(colNames=colNames, degree=degree, features=X_train)
        features_test_new, colNames_test_new = Binary_PolynomialFeatures(colNames=colNames, degree=degree, features=X_test)
    else:
        features_train_new, colNames_train_new = Multi_PolynomialFeatures(colNames=colNames, degree=degree, features=X_train)
        features_test_new, colNames_test_new = Multi_PolynomialFeatures(colNames=colNames, degree=degree, features=X_test)
        
    assert colNames_train_new  == colNames_test_new
    return features_train_new, features_test_new, colNames_train_new, colNames_test_new


def Group_Statistics(keyCol, 
                     X_train, 
                     X_test, 
                     col_num=None, 
                     col_cat=None, 
                     num_stat=['mean', 'var', 'max', 'min', 'skew', 'median'], 
                     cat_stat=['mean', 'var', 'max', 'min', 'median', 'count', 'nunique'], 
                     quant=True, 
                     multi=False, 
                     extension=False):
    """
    分组统计特征衍生函数
    
    :param keyCol: 分组参考的关键变量
    :param X_train: 训练集特征
    :param X_test: 测试集特征
    :param col_num: 参与衍生的连续型变量
    :param col_cat: 参与衍生的离散型变量
    :param num_stat: 连续变量分组统计量
    :param cat_num: 离散变量分组统计量  
    :param quant: 是否计算分位数  
    :param multi: 是否进行多变量的分组统计特征衍生
    :param extension: 是否进行二阶特征衍生

    :return：分组统计衍生后的新特征和新特征的名称
    """
    
    # 进行训练集的特征衍生
    if multi == False:
        # 进行双变量的交叉衍生
        features_train_new, colNames_train_new = Binary_Group_Statistics(keyCol = keyCol, 
                                                                         features = X_train, 
                                                                         col_num = col_num, 
                                                                         col_cat = col_cat, 
                                                                         num_stat = num_stat, 
                                                                         cat_stat = cat_stat, 
                                                                         quant = quant)
        # 是否进一步进行二阶特征衍生
        if extension == True:
            if col_num == None:
                colNames = col_cat
            elif col_cat == None:
                colNames = col_num
            else:
                colNames = col_num + col_cat
                
            features_train_new_ex, colNames_train_new_ex = Group_Statistics_Extension(colNames = colNames,
                                                                                      keyCol = keyCol,
                                                                                      features = X_train)
            
            features_train_new = pd.concat([features_train_new, features_train_new_ex], axis=1)
            colNames_train_new.extend(colNames_train_new_ex)
            
        
    else:
        # 进行多变量的交叉衍生
        features_train_new, colNames_train_new = Multi_Group_Statistics(keyCol = keyCol, 
                                                                        features = X_train, 
                                                                        col_num = col_num, 
                                                                        col_cat = col_cat, 
                                                                        num_stat = num_stat, 
                                                                        cat_stat = cat_stat, 
                                                                        quant = quant)

    
    # 对测试集结果进行填补
    features_train_new, features_test_new, colNames_train_new, colNames_test_new = test_features(keyCol, 
                                                                                                 X_train, 
                                                                                                 X_test, 
                                                                                                 features_train_new,
                                                                                                 multi=multi)
    # 如果特征不一致，则进行0值填补
    # 对于分组统计特征来说一般不会出现该情况
    if colNames_train_new != colNames_test_new:
        features_train_new, features_test_new, colNames_train_new, colNames_test_new = Features_Padding(features_train_new = features_train_new, 
                                                                                                        features_test_new = features_test_new, 
                                                                                                        colNames_train_new = colNames_train_new, 
                                                                                                        colNames_test_new = colNames_test_new)
    
    
    assert colNames_train_new  == colNames_test_new
    return features_train_new, features_test_new, colNames_train_new, colNames_test_new


def Target_Encode(keyCol, 
                  X_train, 
                  y_train,
                  X_test, 
                  col_num=None, 
                  col_cat=None, 
                  num_stat=['mean', 'var', 'max', 'min', 'skew', 'median'], 
                  cat_stat=['mean', 'var', 'max', 'min', 'median', 'count', 'nunique'], 
                  quant=True, 
                  multi=False, 
                  extension=False,
                  n_splits=5, 
                  random_state=42):
    """
    目标编码
    
    :param keyCol: 分组参考的关键变量
    :param X_train: 训练集特征
    :param y_train: 训练集标签
    :param X_test: 测试集特征
    :param col_num: 参与衍生的连续型变量
    :param col_cat: 参与衍生的离散型变量
    :param num_stat: 连续变量分组统计量
    :param cat_num: 离散变量分组统计量  
    :param quant: 是否计算分位数  
    :param multi: 是否进行多变量的分组统计特征衍生
    :param extension: 是否进行二阶特征衍生
    :param n_splits: 进行几折交叉统计  
    :param random_state: 随机数种子  

    :return：目标编码后的新特征和新特征的名称
    """
        
    # 获取标签名称
    target = y_train.name
    # 合并同时带有特征和标签的完整训练集
    train = pd.concat([X_train, y_train], axis=1)
    
    folds = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    
    # 每一折验证集的结果存储容器
    df_l = []
    
    # 进行交叉统计
    for trn_idx, val_idx in folds.split(train):
        trn_temp = train.iloc[trn_idx]
        val_temp = train.iloc[val_idx]
        trn_temp_new, val_temp_new, colNames_trn_temp_new, colNames_val_temp_new = Group_Statistics(keyCol, 
                                                                                                    X_train = trn_temp, 
                                                                                                    X_test = val_temp, 
                                                                                                    col_num = col_num, 
                                                                                                    col_cat = col_cat, 
                                                                                                    num_stat = num_stat, 
                                                                                                    cat_stat = cat_stat, 
                                                                                                    quant = quant, 
                                                                                                    multi = multi, 
                                                                                                    extension = extension)
        val_temp_new.index = val_temp.index
        df_l.append(val_temp_new)
    
    # 创建训练集的衍生特征
    features_train_new = pd.concat(df_l).sort_index(ascending=True) 
    colNames_train_new = [col + '_kfold' for col in features_train_new.columns]
    features_train_new.columns = colNames_train_new
    
    # 对测试集结果进行填补
    features_train_new, features_test_new, colNames_train_new, colNames_test_new = test_features(keyCol = keyCol, 
                                                                                                 X_train = X_train, 
                                                                                                 X_test = X_test, 
                                                                                                 features_train_new = features_train_new,
                                                                                                 multi = multi)
   
    # 如果特征不一致，则进行0值填补
    if colNames_train_new != colNames_test_new:
        features_train_new, features_test_new, colNames_train_new, colNames_test_new = Features_Padding(features_train_new = features_train_new, 
                                                                                                        features_test_new = features_test_new, 
                                                                                                        colNames_train_new = colNames_train_new, 
                                                                                                        colNames_test_new = colNames_test_new)
        
    assert colNames_train_new  == colNames_test_new
    return features_train_new, features_test_new, colNames_train_new, colNames_test_new



# 数据准备

In [3]:
# 读取数据
data = pd.read_csv("D:\\机器学习文献\\数据集\\OULAD\\quarter4_noFeatureEng.csv", index_col = 0)
# 标注连续/离散字段
# 离散字段
category_cols = ["gender", "region", "highest_education", "disability", "imd_band", "age_band"]

# 连续字段
numeric_cols = ["num_of_prev_attempts", "studied_credits", "total_number_of_assessments", "dataplus"
               ,"dualpane", "externalquiz", "folder", "forumng", "glossary", "homepage"
               ,"htmlactivity", "oucollaborate", "oucontent", "ouelluminate", "ouwiki"
               ,"page", "questionnaire", "quiz", "repeatactivity", "resource", "sharedsubpage"
               ,"subpage", "url", "TA"]
 
# 标签
target = "final_result"

#ID列
# ID列
ID_col = ["code_module", "code_presentation", "id_student"]

# 验证是否划分能完全
assert len(category_cols) + len(numeric_cols) + 4 == data.shape[1]

# 连续字段转化
data[numeric_cols]= data[numeric_cols].astype(float)

#缺失值填补
si = SimpleImputer(missing_values = np.nan, strategy = "most_frequent")
si.fit(data.loc[:, "imd_band"].values.reshape(-1, 1))
data.loc[:, "imd_band"] = si.transform(data.loc[:, "imd_band"].values.reshape(-1, 1))

# 标签值手动转化 
data['final_result'].replace(to_replace=["Fail", "Withdrawn"], value=1, inplace=True)
data['final_result'].replace(to_replace=["Pass", "Distinction"],  value=0, inplace=True)

In [4]:
train, test = train_test_split(data, random_state=55914)
X_train = train.drop(columns=["code_module", "code_presentation", "id_student", "final_result"]).copy()
X_test = test.drop(columns=["code_module", "code_presentation", "id_student", "final_result"]).copy()

y_train = train['final_result'].copy()
y_test = test['final_result'].copy()

ord_enc = OrdinalEncoder()
ord_enc.fit(X_train[category_cols])

X_train_OE = pd.DataFrame(ord_enc.transform(X_train[category_cols]), columns=category_cols)
X_train_OE.index = X_train.index
X_train_OE = pd.concat([X_train_OE, X_train[numeric_cols]], axis=1)

X_test_OE = pd.DataFrame(ord_enc.transform(X_test[category_cols]), columns=category_cols)
X_test_OE.index = X_test.index
X_test_OE = pd.concat([X_test_OE, X_test[numeric_cols]], axis=1)

# 特征工程

## 多项式特征衍生

In [5]:
Poly_train, Poly_test, colNames_train_new, colNames_test_new = Polynomial_Features(numeric_cols, 
                                                                                   3, 
                                                                                   X_train_OE, 
                                                                                   X_test_OE, 
                                                                                   )

In [6]:
Poly_train.head()

Unnamed: 0,num_of_prev_attempts**2*studied_credits**0,num_of_prev_attempts**1*studied_credits**1,num_of_prev_attempts**0*studied_credits**2,num_of_prev_attempts**3*studied_credits**0,num_of_prev_attempts**2*studied_credits**1,num_of_prev_attempts**1*studied_credits**2,num_of_prev_attempts**0*studied_credits**3,num_of_prev_attempts**2*total_number_of_assessments**0,num_of_prev_attempts**1*total_number_of_assessments**1,num_of_prev_attempts**0*total_number_of_assessments**2,...,subpage**2*TA**1,subpage**1*TA**2,subpage**0*TA**3,url**2*TA**0,url**1*TA**1,url**0*TA**2,url**3*TA**0,url**2*TA**1,url**1*TA**2,url**0*TA**3
0,1.0,90.0,8100.0,1.0,90.0,8100.0,729000.0,1.0,13.0,169.0,...,19275518.0,96009270.0,478211800.0,36.0,4692.0,611524.0,216.0,28152.0,3669144.0,478211800.0
1,1.0,120.0,14400.0,1.0,120.0,14400.0,1728000.0,1.0,6.0,36.0,...,16184.0,53312.0,175616.0,0.0,0.0,3136.0,0.0,0.0,0.0,175616.0
2,0.0,0.0,3600.0,0.0,0.0,0.0,216000.0,0.0,0.0,121.0,...,1117800.0,46276920.0,1915864000.0,2025.0,55890.0,1542564.0,91125.0,2515050.0,69415380.0,1915864000.0
3,4.0,120.0,3600.0,8.0,240.0,7200.0,216000.0,4.0,12.0,36.0,...,367670376.0,1748592000.0,8316074000.0,961.0,62806.0,4104676.0,29791.0,1946986.0,127244956.0,8316074000.0
4,0.0,0.0,900.0,0.0,0.0,0.0,27000.0,0.0,0.0,81.0,...,99.0,363.0,1331.0,0.0,0.0,121.0,0.0,0.0,0.0,1331.0


In [7]:
Poly_train.shape

(24444, 1932)

## 多项式特征衍生的特征筛选

In [11]:
#方差过滤

In [17]:
sel = VarianceThreshold()
sel.fit(Poly_train)

In [18]:
Poly_cols = Poly_train.columns[sel.variances_ > 0]
Poly_cols = list(Poly_cols)
len(Poly_cols)

1863

In [15]:
Poly_train[Poly_cols].to_csv('fff/X_train_Poly.csv', index=False)
Poly_test[Poly_cols].to_csv('fff/X_test_Poly.csv', index=False)

## 分组统计特征衍生

In [19]:
#KeyCol筛选

In [20]:
cat_all = category_cols.copy()

In [21]:
MI = mutual_info_classif(X_train_OE[cat_all], y_train, discrete_features=True, random_state=22) #互信息法
MI

array([0.00027978, 0.00354377, 0.01101387, 0.00162818, 0.00673372,
       0.00247127])

In [22]:
MI_select_cols = []
MI_threshold = MI.mean() * 1

for MIvalue, colname in zip(MI, cat_all):
    if MIvalue > MI_threshold:
        MI_select_cols.append(colname)

print(len(MI_select_cols))        
MI_select_cols

2


['highest_education', 'imd_band']

In [23]:
keycol = MI_select_cols
keycol

['highest_education', 'imd_band']

In [24]:
# 创建一个未被选中离散变量的list
cat_rest = []
for col in cat_all:
    if col not in keycol:
        cat_rest.append(col)

cat_rest

['gender', 'region', 'disability', 'age_band']

In [25]:
#分组统计特征衍生

In [26]:
# 创建容器
col_temp = keycol.copy()
GroupStat_train = pd.DataFrame()
GroupStat_test = pd.DataFrame()

for i in range(len(col_temp)):
    keyCol = col_temp.pop(i)
    features_train1, features_test1, colNames_train, colNames_test = Group_Statistics(keyCol,
                                                                                      X_train_OE,
                                                                                      X_test_OE,
                                                                                      col_num=numeric_cols,
                                                                                      col_cat=col_temp+cat_rest, 
                                                                                      extension=True)
    
    GroupStat_train = pd.concat([GroupStat_train, features_train1],axis=1)
    GroupStat_test = pd.concat([GroupStat_test, features_test1],axis=1)
    
    col_temp = keycol.copy()

['num_of_prev_attempts_highest_education_mean', 'num_of_prev_attempts_highest_education_var', 'num_of_prev_attempts_highest_education_median', 'num_of_prev_attempts_highest_education_q1', 'num_of_prev_attempts_highest_education_q2', 'studied_credits_highest_education_mean', 'studied_credits_highest_education_var', 'studied_credits_highest_education_median', 'studied_credits_highest_education_q1', 'studied_credits_highest_education_q2', 'total_number_of_assessments_highest_education_mean', 'total_number_of_assessments_highest_education_var', 'total_number_of_assessments_highest_education_median', 'total_number_of_assessments_highest_education_q1', 'total_number_of_assessments_highest_education_q2', 'dataplus_highest_education_mean', 'dataplus_highest_education_var', 'dataplus_highest_education_median', 'dataplus_highest_education_q1', 'dataplus_highest_education_q2', 'dualpane_highest_education_mean', 'dualpane_highest_education_var', 'dualpane_highest_education_median', 'dualpane_highe

In [27]:
GroupStat_train.head()

Unnamed: 0,num_of_prev_attempts_highest_education_mean,num_of_prev_attempts_highest_education_var,num_of_prev_attempts_highest_education_max,num_of_prev_attempts_highest_education_min,num_of_prev_attempts_highest_education_skew,num_of_prev_attempts_highest_education_median,studied_credits_highest_education_mean,studied_credits_highest_education_var,studied_credits_highest_education_max,studied_credits_highest_education_min,...,resource_cv_imd_band,sharedsubpage_cv_imd_band,subpage_cv_imd_band,url_cv_imd_band,TA_cv_imd_band,highest_education_cv_imd_band,gender_cv_imd_band,region_cv_imd_band,disability_cv_imd_band,age_band_cv_imd_band
0,0.151384,0.211781,6.0,0.0,4.002943,0.0,81.043536,1660.329517,630.0,30.0,...,1.718787,20.960753,1.476565,1.801206,1.36414,0.984938,0.921609,0.638664,2.909407,1.54893
1,0.151384,0.211781,6.0,0.0,4.002943,0.0,81.043536,1660.329517,630.0,30.0,...,1.502766,14.572007,1.672021,3.189013,1.552777,0.86489,0.973784,0.595039,2.667573,1.691284
2,0.151384,0.211781,6.0,0.0,4.002943,0.0,81.043536,1660.329517,630.0,30.0,...,1.319883,24.890334,1.45341,1.897177,1.340288,1.013479,0.909843,0.640988,3.329169,1.508964
3,0.135035,0.184371,4.0,0.0,3.938394,0.0,79.202837,1532.664578,330.0,30.0,...,1.718787,20.960753,1.476565,1.801206,1.36414,0.984938,0.921609,0.638664,2.909407,1.54893
4,0.151384,0.211781,6.0,0.0,4.002943,0.0,81.043536,1660.329517,630.0,30.0,...,2.045887,18.891986,1.679956,1.753317,1.42551,0.961868,0.892683,0.628362,2.85852,1.530343


In [28]:
GroupStat_train.shape

(24444, 996)

## 分组统计特征衍生的特征筛选

In [31]:
#方差过滤

In [29]:
sel = VarianceThreshold()
sel.fit(GroupStat_train)

In [30]:
GroupStat_cols = list(GroupStat_train.columns[sel.variances_ > 0])
len(GroupStat_cols)

765

In [32]:
#方差分析

In [33]:
f_classif_p = f_classif(GroupStat_train[GroupStat_cols], y_train)[1]

In [34]:
f_classif_GroupStat_cols = []

for pValue, colname in zip(f_classif_p, GroupStat_cols):
    if pValue < 0.000000000001:
        f_classif_GroupStat_cols.append(colname)

print(len(f_classif_GroupStat_cols))

656


In [35]:
#互信息法

In [36]:
MI = mutual_info_classif(GroupStat_train[GroupStat_cols], y_train, random_state=22)
MI

array([1.24797242e-02, 1.15355014e-02, 1.23358484e-02, 1.05857086e-02,
       1.55379268e-02, 1.40222964e-02, 1.73406619e-02, 7.75502120e-03,
       9.52006611e-03, 6.58064797e-03, 1.12395405e-02, 1.22589329e-02,
       1.13917596e-02, 8.87158698e-03, 1.38583387e-02, 9.20078335e-03,
       1.34402897e-02, 1.20004273e-02, 4.67761720e-03, 1.08010141e-02,
       1.10804223e-02, 1.72257845e-02, 9.95839546e-03, 1.75075072e-02,
       1.31063828e-02, 1.67187313e-02, 1.17549584e-02, 1.14146818e-02,
       8.73730080e-03, 8.80205451e-03, 8.12630549e-03, 1.63212629e-02,
       1.18101982e-02, 1.55830807e-02, 1.24000345e-02, 7.05474563e-03,
       1.02886685e-02, 1.16653311e-02, 1.54757119e-02, 1.58560486e-02,
       1.34143599e-02, 1.33846352e-02, 1.26257993e-02, 1.07019284e-02,
       7.81890060e-03, 1.40103440e-02, 1.33969624e-02, 1.12842913e-02,
       1.41530848e-02, 1.38883030e-02, 1.48633431e-02, 1.53209824e-02,
       1.34245584e-02, 1.51622727e-02, 1.03988788e-02, 1.30189795e-02,
      

In [37]:
MI.mean()

0.009029063692514

In [38]:
MI_threshold = MI.mean() * 0.2
MI_threshold

0.0018058127385028

In [39]:
MI_GroupStat_cols = []

for MIvalue, colname in zip(MI, GroupStat_cols):
    if MIvalue > MI_threshold:
        MI_GroupStat_cols.append(colname)

print(len(MI_GroupStat_cols))   

734


In [40]:
GroupStat_cols_select = list(set(f_classif_GroupStat_cols) & set(MI_GroupStat_cols))
len(GroupStat_cols_select)

635

In [41]:
GroupStat_train[GroupStat_cols_select]

Unnamed: 0,highest_education_imd_band_var,page_cv_highest_education,studied_credits_norm_imd_band,disability_dive1_disability_imd_band_mean,num_of_prev_attempts_dive2_num_of_prev_attempts_imd_band_median,folder_dive2_folder_imd_band_median,gender_imd_band_mean,url_cv_imd_band,forumng_minus1_forumng_highest_education_mean,region_dive2_region_highest_education_median,...,repeatactivity_dive2_repeatactivity_imd_band_median,dataplus_minus2_dataplus_imd_band_mean,studied_credits_gap_imd_band,url_minus1_url_highest_education_mean,region_highest_education_q2,TA_mag2_imd_band,externalquiz_highest_education_mean,total_number_of_assessments_mag1_highest_education,forumng_norm_imd_band,gender_dive1_gender_imd_band_mean
0,0.885226,2.574216,-1.806523,28.385549,300000.0,300000.0,0.540821,1.801206,-246.179715,0.000000,...,300000.0,1.802735,60.0,-17.379261,9.0,0.486378,2.053357,0.084238,-0.457135,5.547024
1,1.011061,2.574216,-1.905989,0.000000,0.0,0.0,0.513363,3.189013,-246.179715,0.000000,...,0.0,-1.232739,60.0,-17.379261,9.0,0.396663,2.053357,0.084238,-0.401183,0.000000
2,0.916348,2.574216,-1.840788,60.398109,500000.0,500000.0,0.547204,1.897177,-246.179715,0.000000,...,500000.0,3.528412,30.0,-17.379261,9.0,0.560558,2.053357,0.084238,-0.412232,9.137200
3,0.885226,2.484635,-1.806523,28.385549,300000.0,300000.0,0.540821,1.801206,-312.192624,0.166666,...,300000.0,1.802735,60.0,-21.810496,8.0,0.486378,2.696454,0.292482,-0.457135,5.547024
4,0.929402,2.574216,-1.962085,18.337419,200000.0,200000.0,0.556604,1.753317,-246.179715,0.000000,...,200000.0,0.409811,60.0,-17.379261,9.0,0.501289,2.053357,0.084238,-0.460621,3.593156
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24439,0.885226,2.484635,-1.806523,28.385549,300000.0,300000.0,0.540821,1.801206,-312.192624,0.166666,...,300000.0,1.802735,60.0,-21.810496,8.0,0.486378,2.696454,0.292482,-0.457135,5.547024
24440,0.929402,3.193776,-1.962085,18.337419,200000.0,200000.0,0.556604,1.753317,-215.668532,0.333333,...,200000.0,0.409811,60.0,-13.633635,9.0,0.501289,1.617764,1.086987,-0.460621,3.593156
24441,0.929402,2.574216,-1.962085,18.337419,200000.0,200000.0,0.556604,1.753317,-246.179715,0.000000,...,200000.0,0.409811,60.0,-17.379261,9.0,0.501289,2.053357,0.084238,-0.460621,3.593156
24442,0.884510,2.484635,-1.782229,108.907296,700000.0,700000.0,0.593476,1.935865,-312.192624,0.166666,...,700000.0,5.245862,30.0,-21.810496,8.0,0.525379,2.696454,0.292482,-0.462172,11.794715


In [None]:
GroupStat_train[GroupStat_cols_select].to_csv('fff/X_train_GroupStat.csv', index=False)
GroupStat_test[GroupStat_cols_select].to_csv('fff/X_test_GroupStat.csv', index=False)

In [44]:
# 定义标签
col_cat = [target]
print(col_cat)

# 创建容器
col_temp = cat_all.copy()
TarEnc_train = pd.DataFrame()
TarEnc_test = pd.DataFrame()

for keyCol in col_temp:
    features_train1, features_test1, colNames_train_new, colNames_test_new = Target_Encode(keyCol, 
                                                                                           X_train_OE, 
                                                                                           y_train,
                                                                                           X_test_OE, 
                                                                                           col_cat=col_cat, 
                                                                                           extension=True)
    
    TarEnc_train = pd.concat([TarEnc_train, features_train1],axis=1)
    TarEnc_test = pd.concat([TarEnc_test, features_test1],axis=1)
    
    col_temp = cat_all.copy()

['final_result']
['final_result_gender_mean', 'final_result_gender_var', 'final_result_gender_median', 'final_result_gender_q1', 'final_result_gender_q2']
['final_result_gender_mean', 'final_result_gender_var', 'final_result_gender_median', 'final_result_gender_q1', 'final_result_gender_q2']
['final_result_gender_mean', 'final_result_gender_var', 'final_result_gender_median', 'final_result_gender_q1', 'final_result_gender_q2']
['final_result_gender_mean', 'final_result_gender_var', 'final_result_gender_median', 'final_result_gender_q1', 'final_result_gender_q2']
['final_result_gender_mean', 'final_result_gender_var', 'final_result_gender_median', 'final_result_gender_q1', 'final_result_gender_q2']
['final_result_region_mean', 'final_result_region_var', 'final_result_region_median', 'final_result_region_q1', 'final_result_region_q2']
['final_result_region_mean', 'final_result_region_var', 'final_result_region_median', 'final_result_region_q1', 'final_result_region_q2']
['final_result_re

In [45]:
TarEnc_train.head()

Unnamed: 0,final_result_gender_mean_kfold,final_result_gender_var_kfold,final_result_gender_max_kfold,final_result_gender_min_kfold,final_result_gender_median_kfold,final_result_gender_count_kfold,final_result_gender_nunique_kfold,final_result_gender_q1_kfold,final_result_gender_q2_kfold,final_result_dive1_final_result_gender_mean_kfold,...,final_result_age_band_q2_kfold,final_result_dive1_final_result_age_band_mean_kfold,final_result_dive2_final_result_age_band_median_kfold,final_result_minus1_final_result_age_band_mean_kfold,final_result_minus2_final_result_age_band_mean_kfold,final_result_norm_age_band_kfold,final_result_gap_age_band_kfold,final_result_mag1_age_band_kfold,final_result_mag2_age_band_kfold,final_result_cv_age_band_kfold
0,0.541227,0.248323,1,0,1,10794,2,0,1,1.847621,...,1,4.784199,200000.0,1.581967,1.581967,3.194089,1,-0.418033,0.0,1.184734
4,0.513285,0.249852,1,0,1,8769,2,0,1,0.0,...,1,0.0,0.0,-0.549122,-0.549122,-1.103521,1,0.450878,1.821056,0.906156
5,0.541227,0.248323,1,0,1,10794,2,0,1,1.847621,...,1,2.076287,100000.0,0.518381,0.518381,1.037351,1,-0.481619,0.0,1.037534
6,0.539927,0.248429,1,0,1,10757,2,0,1,1.852066,...,1,0.0,0.0,-0.553139,-0.553139,-1.112517,1,0.446861,1.807831,0.898829
8,0.516059,0.249771,1,0,1,8780,2,0,1,0.0,...,1,0.0,0.0,-0.549793,-0.549793,-1.105016,1,0.450207,1.818834,0.904929


In [46]:
TarEnc_train.shape

(24444, 108)

In [48]:
sel = VarianceThreshold()
sel.fit(TarEnc_train)

In [49]:
TarEnc_cols = list(TarEnc_train.columns[sel.variances_ > 0])
len(TarEnc_cols)

70

In [None]:
TarEnc_train[TarEnc_cols].reset_index(drop=True).to_csv('fff/X_train_TarEnc.csv', index=False)
TarEnc_test[TarEnc_cols].reset_index(drop=True).to_csv('fff/X_test_TarEnc.csv', index=False)

## 将新特征与旧特征组合起来

In [50]:
X_train_GroupStat = pd.read_csv('fff/X_train_GroupStat.csv')
X_train_Poly = pd.read_csv('fff/X_train_Poly.csv')
X_train_TarEnc = pd.read_csv('fff/X_train_TarEnc.csv')
X_test_GroupStat = pd.read_csv('fff/X_test_GroupStat.csv')
X_test_Poly = pd.read_csv('fff/X_test_Poly.csv')
X_test_TarEnc = pd.read_csv('fff/X_test_TarEnc.csv')

In [51]:
features_train_new = pd.concat([X_train_GroupStat,
                                X_train_Poly,
                                X_train_TarEnc,], axis=1)

In [52]:
features_train_new

Unnamed: 0,folder_imd_band_mean,highest_education_imd_band_median,num_of_prev_attempts_minus2_num_of_prev_attempts_imd_band_mean,forumng_minus2_forumng_imd_band_mean,ouwiki_highest_education_mean,imd_band_norm_highest_education,imd_band_highest_education_var,total_number_of_assessments_mag2_imd_band,questionnaire_imd_band_mean,gender_minus1_gender_imd_band_mean,...,final_result_age_band_median_kfold,final_result_age_band_count_kfold,final_result_dive1_final_result_age_band_mean_kfold,final_result_dive2_final_result_age_band_median_kfold,final_result_minus1_final_result_age_band_mean_kfold,final_result_minus2_final_result_age_band_mean_kfold,final_result_norm_age_band_kfold,final_result_mag1_age_band_kfold,final_result_mag2_age_band_kfold,final_result_cv_age_band_kfold
0,0.159552,1.0,2.834646,-218.242851,28.219567,-1.383679,6.547766,1.016266,1.764608,2.459179,...,0,122,4.784199,200000.0,1.581967,1.581967,3.194089,-0.418033,0.000000,1.184734
1,0.161841,1.5,-0.197847,-212.667038,28.219567,-1.383679,6.547766,1.145948,1.675947,-0.513363,...,1,13782,0.000000,0.0,-0.549122,-0.549122,-1.103521,0.450878,1.821056,0.906156
2,0.160626,1.0,4.837584,-276.434004,28.219567,-1.383679,6.547766,1.012800,1.910962,4.452796,...,0,5658,2.076287,100000.0,0.518381,0.518381,1.037351,-0.481619,0.000000,1.037534
3,0.159552,1.0,2.834646,-218.242851,38.353759,-1.031570,7.042642,1.016266,1.764608,2.459179,...,1,13794,0.000000,0.0,-0.553139,-0.553139,-1.112517,0.446861,1.807831,0.898829
4,0.185660,1.0,1.836604,-231.120377,28.219567,-1.383679,6.547766,0.997740,2.086415,1.443396,...,1,13747,0.000000,0.0,-0.549793,-0.549793,-1.105016,0.450207,1.818834,0.904929
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24439,0.159552,1.0,2.834646,-218.242851,38.353759,-1.031570,7.042642,1.016266,1.764608,2.459179,...,1,13758,0.000000,0.0,-0.552842,-0.552842,-1.111848,0.447158,1.808802,0.899369
24440,0.185660,1.0,1.836604,-231.120377,23.791230,-0.435004,6.158912,0.997740,2.086415,1.443396,...,0,5668,2.090698,100000.0,0.521701,0.521701,1.044273,-0.478299,0.000000,1.044456
24441,0.185660,1.0,1.836604,-231.120377,28.219567,-1.383679,6.547766,0.997740,2.086415,1.443396,...,1,13775,0.000000,0.0,-0.550708,-0.550708,-1.107061,0.449292,1.815812,0.903258
24442,0.198637,1.0,6.847128,-278.641675,38.353759,-1.031570,7.042642,1.029831,2.303797,6.406524,...,0,5658,2.076287,100000.0,0.518381,0.518381,1.037351,-0.481619,0.000000,1.037534


In [53]:
features_test_new = pd.concat([X_test_GroupStat,
                               X_test_Poly,
                               X_test_TarEnc,], axis=1)

In [54]:
features_test_new

Unnamed: 0,folder_imd_band_mean,highest_education_imd_band_median,num_of_prev_attempts_minus2_num_of_prev_attempts_imd_band_mean,forumng_minus2_forumng_imd_band_mean,ouwiki_highest_education_mean,imd_band_norm_highest_education,imd_band_highest_education_var,total_number_of_assessments_mag2_imd_band,questionnaire_imd_band_mean,gender_minus1_gender_imd_band_mean,...,final_result_age_band_median_kfold,final_result_age_band_count_kfold,final_result_dive1_final_result_age_band_mean_kfold,final_result_dive2_final_result_age_band_median_kfold,final_result_minus1_final_result_age_band_mean_kfold,final_result_minus2_final_result_age_band_mean_kfold,final_result_norm_age_band_kfold,final_result_mag1_age_band_kfold,final_result_mag2_age_band_kfold,final_result_cv_age_band_kfold
0,0.159552,1.0,2.834646,-218.242851,28.219567,-1.383679,6.547766,1.016266,1.764608,2.459179,...,0.704756,11342.314383,0.634977,30111.805950,-0.227953,-0.227953,-0.459336,0.175686,1.278784,0.945711
1,0.156481,1.0,5.863426,-244.789352,38.353759,-1.031570,7.042642,1.009109,2.143981,5.410185,...,0.704756,11342.314383,0.634977,30111.805950,-0.227953,-0.227953,-0.459336,0.175686,1.278784,0.945711
2,0.185660,1.0,1.836604,-231.120377,38.353759,-1.031570,7.042642,0.997740,2.086415,1.443396,...,0.702213,11307.037997,0.648910,30622.194707,-0.222349,-0.222349,-0.447849,0.173642,1.274177,0.946828
3,0.152442,1.0,3.852229,-260.009342,23.791230,-0.435004,6.158912,1.010674,2.015711,3.463694,...,0.702213,11307.037997,0.648910,30622.194707,-0.222349,-0.222349,-0.447849,0.173642,1.274177,0.946828
4,0.160626,1.0,4.837584,-276.434004,28.219567,-1.383679,6.547766,1.012800,1.910962,4.452796,...,0.704756,11342.314383,0.634977,30111.805950,-0.227953,-0.227953,-0.459336,0.175686,1.278784,0.945711
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8144,0.161841,1.5,-0.197847,-212.667038,28.219567,-1.383679,6.547766,1.145948,1.675947,-0.513363,...,0.702213,11307.037997,0.648910,30622.194707,-0.222349,-0.222349,-0.447849,0.173642,1.274177,0.946828
8145,0.154341,1.0,0.835844,-216.750719,23.791230,-0.435004,6.158912,1.024799,1.981723,0.472161,...,0.702213,11307.037997,0.648910,30622.194707,-0.222349,-0.222349,-0.447849,0.173642,1.274177,0.946828
8146,0.154341,1.0,0.835844,-216.750719,23.791230,-0.435004,6.158912,1.024799,1.981723,0.472161,...,0.704756,11342.314383,0.634977,30111.805950,-0.227953,-0.227953,-0.459336,0.175686,1.278784,0.945711
8147,0.198637,1.0,6.847128,-278.641675,28.219567,-1.383679,6.547766,1.029831,2.303797,6.406524,...,0.704756,11342.314383,0.634977,30111.805950,-0.227953,-0.227953,-0.459336,0.175686,1.278784,0.945711


In [55]:
features_train_new.shape

(24444, 2568)

In [56]:
features_test_new.shape

(8149, 2568)

In [57]:
features_train_new.to_csv('fff/features_train_new.csv', index=False)
features_test_new.to_csv('fff/features_test_new.csv', index=False)

In [58]:
features_train_new.index = X_train_OE.index
features_train_new = pd.concat([features_train_new, 
                                X_train_OE], axis=1)

In [59]:
features_train_new

Unnamed: 0,folder_imd_band_mean,highest_education_imd_band_median,num_of_prev_attempts_minus2_num_of_prev_attempts_imd_band_mean,forumng_minus2_forumng_imd_band_mean,ouwiki_highest_education_mean,imd_band_norm_highest_education,imd_band_highest_education_var,total_number_of_assessments_mag2_imd_band,questionnaire_imd_band_mean,gender_minus1_gender_imd_band_mean,...,ouwiki,page,questionnaire,quiz,repeatactivity,resource,sharedsubpage,subpage,url,TA
13134,0.159552,1.0,2.834646,-218.242851,28.219567,-1.383679,6.547766,1.016266,1.764608,2.459179,...,58.0,1.0,0.0,0.0,0.0,42.0,0.0,157.0,6.0,782.0
16539,0.161841,1.5,-0.197847,-212.667038,28.219567,-1.383679,6.547766,1.145948,1.675947,-0.513363,...,0.0,0.0,0.0,0.0,0.0,2.0,0.0,17.0,0.0,56.0
5927,0.160626,1.0,4.837584,-276.434004,28.219567,-1.383679,6.547766,1.012800,1.910962,4.452796,...,0.0,0.0,0.0,85.0,0.0,15.0,0.0,30.0,45.0,1242.0
16185,0.159552,1.0,2.834646,-218.242851,38.353759,-1.031570,7.042642,1.016266,1.764608,2.459179,...,108.0,0.0,0.0,0.0,0.0,99.0,0.0,426.0,31.0,2026.0
32134,0.185660,1.0,1.836604,-231.120377,28.219567,-1.383679,6.547766,0.997740,2.086415,1.443396,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3.0,0.0,11.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16218,0.159552,1.0,2.834646,-218.242851,38.353759,-1.031570,7.042642,1.016266,1.764608,2.459179,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
5753,0.185660,1.0,1.836604,-231.120377,23.791230,-0.435004,6.158912,0.997740,2.086415,1.443396,...,0.0,0.0,0.0,98.0,0.0,27.0,0.0,102.0,49.0,457.0
27509,0.185660,1.0,1.836604,-231.120377,28.219567,-1.383679,6.547766,0.997740,2.086415,1.443396,...,0.0,3.0,0.0,252.0,0.0,4.0,0.0,20.0,1.0,363.0
1126,0.198637,1.0,6.847128,-278.641675,38.353759,-1.031570,7.042642,1.029831,2.303797,6.406524,...,0.0,0.0,0.0,86.0,0.0,6.0,0.0,10.0,4.0,244.0


In [60]:
features_test_new.index = X_test_OE.index
features_test_new = pd.concat([features_test_new, 
                                X_test_OE], axis=1)

In [61]:
features_test_new

Unnamed: 0,folder_imd_band_mean,highest_education_imd_band_median,num_of_prev_attempts_minus2_num_of_prev_attempts_imd_band_mean,forumng_minus2_forumng_imd_band_mean,ouwiki_highest_education_mean,imd_band_norm_highest_education,imd_band_highest_education_var,total_number_of_assessments_mag2_imd_band,questionnaire_imd_band_mean,gender_minus1_gender_imd_band_mean,...,ouwiki,page,questionnaire,quiz,repeatactivity,resource,sharedsubpage,subpage,url,TA
28780,0.159552,1.0,2.834646,-218.242851,28.219567,-1.383679,6.547766,1.016266,1.764608,2.459179,...,22.0,7.0,27.0,1853.0,0.0,103.0,0.0,565.0,33.0,6360.0
18955,0.156481,1.0,5.863426,-244.789352,38.353759,-1.031570,7.042642,1.009109,2.143981,5.410185,...,0.0,0.0,0.0,0.0,0.0,101.0,0.0,268.0,28.0,1398.0
20720,0.185660,1.0,1.836604,-231.120377,38.353759,-1.031570,7.042642,0.997740,2.086415,1.443396,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
31586,0.152442,1.0,3.852229,-260.009342,23.791230,-0.435004,6.158912,1.010674,2.015711,3.463694,...,0.0,0.0,0.0,80.0,0.0,13.0,0.0,6.0,0.0,452.0
13656,0.160626,1.0,4.837584,-276.434004,28.219567,-1.383679,6.547766,1.012800,1.910962,4.452796,...,80.0,0.0,0.0,0.0,0.0,81.0,0.0,223.0,78.0,1213.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5926,0.161841,1.5,-0.197847,-212.667038,28.219567,-1.383679,6.547766,1.145948,1.675947,-0.513363,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,0.0,6.0
4678,0.154341,1.0,0.835844,-216.750719,23.791230,-0.435004,6.158912,1.024799,1.981723,0.472161,...,0.0,0.0,0.0,44.0,0.0,1.0,0.0,5.0,3.0,141.0
25563,0.154341,1.0,0.835844,-216.750719,23.791230,-0.435004,6.158912,1.024799,1.981723,0.472161,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20060,0.198637,1.0,6.847128,-278.641675,28.219567,-1.383679,6.547766,1.029831,2.303797,6.406524,...,145.0,0.0,0.0,147.0,0.0,10.0,0.0,28.0,104.0,2173.0


In [64]:
features_train_new.to_csv('fff/X_train.csv', index=False)
features_test_new.to_csv('fff/X_test.csv', index=False)

In [66]:
y_train.to_csv('fff/y_train.csv', index=False)
y_test.to_csv('fff/y_test.csv', index=False)