In [1]:
import pandas as pd
import numpy as np

from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler,Imputer, PolynomialFeatures
from sklearn.feature_selection import SelectPercentile
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression

In [2]:
# Remove pandas' display truncation for both columns and rows.
for option_name in ('display.max_columns', 'display.max_rows'):
    pd.set_option(option_name, None)

In [44]:
# Main application tables, read in the order train, test.
df_train_origin, df_test_origin = (
    pd.read_csv(csv_path)
    for csv_path in ('data/application_train.csv', 'data/application_test.csv')
)

# Both tables side by side so later cells can apply the same steps to each.
combine_origin = [df_train_origin, df_test_origin]

In [45]:
# All previous credits granted to the client by other financial institutions that were reported to the credit bureau (for clients who have a loan in our sample).
# For every loan in our sample there are as many rows as the client had credits in the credit bureau before the application date.
df_bureau = pd.read_csv('data/bureau.csv')

In [None]:
# Numeric bureau features, aggregated to one row per SK_ID_CURR.
# fillna(0) returns a new frame, so nothing below mutates a frame derived
# from df_bureau in place.
df_bureau_num = df_bureau.select_dtypes('number').fillna(0)

# Express the monetary columns in units of 10,000, rounded to 2 decimals.
# Vectorised column arithmetic replaces the per-row Python lambdas; Series.round
# can differ from Python's round() only in the last kept digit for values that
# sit on a rounding boundary.
bureau_amount_cols = [
    'AMT_CREDIT_MAX_OVERDUE',
    'AMT_CREDIT_SUM',
    'AMT_CREDIT_SUM_DEBT',
    'AMT_CREDIT_SUM_LIMIT',
    'AMT_CREDIT_SUM_OVERDUE',
    'AMT_ANNUITY',
]
for amount_col in bureau_amount_cols:
    df_bureau_num[amount_col] = (df_bureau_num[amount_col] / 10000).round(2)

# Drop SK_ID_BUREAU so it is not aggregated alongside the feature columns.
df_bureau_num = df_bureau_num.drop(['SK_ID_BUREAU'], axis=1)

# One row per SK_ID_CURR; columns become a (column, aggregate) MultiIndex,
# which later cells address with tuple labels such as ('AMT_CREDIT_SUM', 'max').
df_bureau_num = df_bureau_num.groupby('SK_ID_CURR').agg(['min', 'max', 'sum', 'mean'])

In [47]:
# Categorical bureau features: one-hot encode the object columns, attach the
# applicant id, then take the sum and mean of every dummy per SK_ID_CURR.
df_bureau_cate = (
    pd.get_dummies(df_bureau.select_dtypes('object'))
    .assign(SK_ID_CURR=df_bureau['SK_ID_CURR'])
    .groupby('SK_ID_CURR')
    .agg(['sum', 'mean'])
)

In [12]:
# Sanity check: (rows, columns) of the aggregated categorical bureau features.
df_bureau_cate.shape

(305811, 46)

In [85]:
# Monthly balances of previous credits in the credit bureau.
# The table has one row for each month of history of every previous credit reported to the credit bureau, i.e. it has (#loans in sample * #relative previous credits * #months for which history of the previous credits is observable) rows.
# NOTE(review): this frame is not referenced in the visible cells of the notebook.
df_bureau_balance = pd.read_csv('data/bureau_balance.csv')

In [88]:
# Monthly balance snapshots of previous credit cards that the applicant has with Home Credit.
# The table has one row for each month of history of every previous Home Credit credit (consumer credit and cash loans) related to loans in our sample, i.e. it has (#loans in sample * #relative previous credit cards * #months for which history of the previous credit card is available) rows.
# NOTE(review): this frame is not referenced in the visible cells of the notebook.
df_credit_card_balance = pd.read_csv('data/credit_card_balance.csv')

In [91]:
# Monthly balance snapshots of previous POS (point of sale) and cash loans that the applicant had with Home Credit.
# The table has one row for each month of history of every previous Home Credit credit (consumer credit and cash loans) related to loans in our sample, i.e. it has (#loans in sample * #relative previous credits * #months for which history of the previous credits is available) rows.
# NOTE(review): this frame is not referenced in the visible cells of the notebook.
df_POS_CASH_balance = pd.read_csv('data/POS_CASH_balance.csv')

In [94]:
# All previous Home Credit loan applications of clients who have loans in our sample.
# There is one row for each previous application related to loans in our data sample.
# NOTE(review): this frame is not referenced in the visible cells of the notebook.
df_previous_application = pd.read_csv('data/previous_application.csv')

In [None]:
# Repayment history for previously disbursed Home Credit credits related to the loans in our sample.
# There is a) one row for every payment that was made plus b) one row for every missed payment.
# One row is equivalent to one payment of one installment, or to one installment corresponding to one payment of one previous Home Credit credit related to loans in our sample.
# NOTE(review): this frame is not referenced in the visible cells of the notebook.
df_installments_payments = pd.read_csv('data/installments_payments.csv')

In [154]:
# df_train[["FLAG_OWN_CAR", "TARGET"]].groupby(['FLAG_OWN_CAR'], as_index=False).mean().sort_values(by='TARGET', ascending=False)

# Education level is related to the overdue (default) rate
# df_train_origin[["NAME_EDUCATION_TYPE", "TARGET"]].groupby(['NAME_EDUCATION_TYPE'], as_index=False).mean().sort_values(by='TARGET', ascending=False)

# Box plots suggest that the per-period repayment amount (annuity) affects the overdue rate
# df_train_origin[df_train_origin['TARGET'] == 1][['AMT_ANNUITY']].plot.box()
# df_train_origin[df_train_origin['TARGET'] == 0][['AMT_ANNUITY']].plot.box()

In [50]:
# Data preparation: missing values, anomalous values, encoding of
# number/object columns, feature engineering.
combine = []
le = preprocessing.LabelEncoder()

# Monetary columns, rescaled to units of 10,000 and rounded to 2 decimals.
APP_AMOUNT_COLS = ['AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE']
# Columns rounded to 3 decimals.
APP_ROUND3_COLS = ['REGION_POPULATION_RELATIVE', 'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']
# Day offsets, divided by -365 and rounded to 1 decimal.
APP_DAY_COLS = ['DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_REGISTRATION',
                'DAYS_ID_PUBLISH', 'DAYS_LAST_PHONE_CHANGE']
# Binary categorical columns encoded with LabelEncoder.
APP_BINARY_COLS = ['NAME_CONTRACT_TYPE', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY']
# Columns removed before one-hot encoding: every prefix below with each of the
# three suffixes, plus five stand-alone *_MODE columns.
APP_DROP_PREFIXES = [
    'APARTMENTS', 'BASEMENTAREA', 'YEARS_BEGINEXPLUATATION', 'YEARS_BUILD',
    'COMMONAREA', 'ELEVATORS', 'ENTRANCES', 'FLOORSMAX', 'FLOORSMIN',
    'LANDAREA', 'LIVINGAPARTMENTS', 'LIVINGAREA', 'NONLIVINGAPARTMENTS',
    'NONLIVINGAREA',
]
APP_DROP_COLS = [
    prefix + suffix
    for suffix in ['_AVG', '_MODE', '_MEDI']
    for prefix in APP_DROP_PREFIXES
] + [
    'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'TOTALAREA_MODE',
    'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE',
]


def _prepare_application(raw):
    """Clean, encode and extend one application table.

    Parameters
    ----------
    raw : pandas.DataFrame
        Application table as loaded from CSV. It is not modified.

    Returns
    -------
    keys : pandas.DataFrame
        ``SK_ID_CURR`` plus ``TARGET`` when ``raw`` has that column.
    features : pandas.DataFrame
        The cleaned, one-hot encoded table including the four ``RATE_*`` ratios.
    """
    # Explicit column check instead of a bare ``except`` used for control flow.
    key_cols = ['SK_ID_CURR', 'TARGET'] if 'TARGET' in raw.columns else ['SK_ID_CURR']
    keys = raw[key_cols].copy()

    # NaN and inf become 0 in every column. In object columns the 0 placeholder
    # is later one-hot encoded as '<COLUMN>_0'.
    dataset = raw.replace(np.nan, 0).replace(np.inf, 0)

    # Vectorised rescaling/rounding (Series.round can differ from Python's
    # round() only in the last kept digit for values on a rounding boundary).
    for col in APP_AMOUNT_COLS:
        dataset[col] = (dataset[col] / 10000).round(2)
    for col in APP_ROUND3_COLS:
        dataset[col] = dataset[col].round(3)
    for col in APP_DAY_COLS:
        dataset[col] = (dataset[col] / -365).round(1)

    # Anomalous employment length: values that are negative after the division
    # by -365 are replaced with the mean of the positive values.
    employed = dataset['DAYS_EMPLOYED']
    dataset.loc[employed < 0, 'DAYS_EMPLOYED'] = employed[employed > 0].mean()

    # Each binary column gets its own fit; the original reused the encoder fitted
    # on FLAG_OWN_CAR for FLAG_OWN_REALTY, which only works while both columns
    # happen to contain the same set of labels.
    for col in APP_BINARY_COLS:
        dataset[col] = le.fit_transform(dataset[col])

    dataset = dataset.drop(APP_DROP_COLS, axis=1)

    # Goods price and annuity: fill the 0 placeholders with the median of the
    # non-placeholder values (the original median included the zeros themselves).
    for col in ('AMT_GOODS_PRICE', 'AMT_ANNUITY'):
        is_placeholder = dataset[col] == 0
        dataset.loc[is_placeholder, col] = dataset.loc[~is_placeholder, col].median()

    # Remaining object columns are one-hot encoded.
    dataset = pd.get_dummies(dataset)

    # Income / credit amount
    dataset['RATE_INCOME_CREDIT'] = dataset['AMT_INCOME_TOTAL'] / dataset['AMT_CREDIT']
    # Income / goods price
    dataset['RATE_INCOME_GPRICE'] = dataset['AMT_INCOME_TOTAL'] / dataset['AMT_GOODS_PRICE']
    # Annuity / credit amount
    dataset['RATE_ANNUITY_CREDIT'] = dataset['AMT_ANNUITY'] / dataset['AMT_CREDIT']
    # Annuity / income
    dataset['RATE_ANNUITY_INCOME'] = dataset['AMT_ANNUITY'] / dataset['AMT_INCOME_TOTAL']

    return keys, dataset


# Always start from the untouched raw frames so that re-running this cell gives
# the same result instead of re-processing already processed data.
for i, raw_table in enumerate((df_train_origin, df_test_origin)):
    key_frame, combine_origin[i] = _prepare_application(raw_table)
    combine.append(key_frame)


In [52]:
def _with_flat_columns(frame):
    """Return a copy of ``frame`` whose column labels form a flat Index.

    MultiIndex labels become plain tuples such as ('AMT_CREDIT_SUM', 'max').
    """
    flat = frame.copy()
    flat.columns = flat.columns.to_flat_index()
    return flat


# The application frames have single-level columns while the aggregated bureau
# frames have two-level (column, aggregate) columns. Merging frames with a
# different number of column levels is deprecated and rejected by newer pandas,
# so the bureau columns are flattened to tuple labels first. The merged frame
# keeps the same tuple column labels that later cells select by.
bureau_num_flat = _with_flat_columns(df_bureau_num)
bureau_cate_flat = _with_flat_columns(df_bureau_cate)

for i, dataset in enumerate(combine_origin):
    dataset = dataset.merge(bureau_num_flat, on='SK_ID_CURR', how='left')
    dataset = dataset.merge(bureau_cate_flat, on='SK_ID_CURR', how='left')

    # Rows without a match in the bureau frames get 0 for every bureau feature.
    dataset = dataset.replace(np.nan, 0)

    combine_origin[i] = dataset



In [10]:
# Correlation analysis.
# Feature selection, option 1: keep the columns whose absolute correlation with
# TARGET exceeds a threshold. corrwith() correlates every column with TARGET
# directly instead of building the full column-by-column matrix just to read
# a single column from it.
MIN_ABS_TARGET_CORR = 0.01
train_merged = combine_origin[0]
corr = train_merged.corrwith(train_merged['TARGET']).rename('TARGET').sort_values()
features = corr[corr.abs() > MIN_ABS_TARGET_CORR]

# Feature selection, option 2 (kept for reference, not executed):
# select = SelectPercentile(percentile=50)
# select.fit(X_train, Y_train)

# X_train_selected = select.transform(X_train)
# support = select.get_support()

# support_list = []
# colunms = X_train.columns.values
# for i,s in enumerate(support):
#     if s:
#         support_list.append(colunms[i])

In [55]:
# Preview the first TARGET values of the first (training) frame in combine_origin.
combine_origin[0]['TARGET'].head()

0    1
1    0
2    0
3    0
4    0
Name: TARGET, dtype: int64

In [56]:
# Columns fed to the model. Tuple labels are (column, aggregate) pairs.
# Defined once, outside the loop, because the list is identical for both frames.
SELECTED_FEATURES = [
    'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH',
    ('CREDIT_ACTIVE_Closed', 'mean'), 'DAYS_EMPLOYED', 'EXT_SOURCE_1',
    'NAME_EDUCATION_TYPE_Higher education', 'DAYS_LAST_PHONE_CHANGE',
    'CODE_GENDER_F', 'DAYS_ID_PUBLISH', 'NAME_INCOME_TYPE_Pensioner',
    'ORGANIZATION_TYPE_XNA', 'DAYS_REGISTRATION', 'AMT_GOODS_PRICE',
    'OCCUPATION_TYPE_0', ('CREDIT_TYPE_Consumer credit', 'mean'),
    'REGION_POPULATION_RELATIVE', ('CREDIT_ACTIVE_Closed', 'sum'),
    'NAME_CONTRACT_TYPE', ('CREDIT_CURRENCY_currency 1', 'mean'),
    'AMT_CREDIT', 'FLAG_DOCUMENT_6',
    'NAME_HOUSING_TYPE_House / apartment',
    'NAME_FAMILY_STATUS_Married', 'HOUR_APPR_PROCESS_START',
    'FLAG_PHONE', ('CREDIT_TYPE_Mortgage', 'sum'),
    'NAME_INCOME_TYPE_State servant', ('AMT_CREDIT_SUM', 'max'),
    ('AMT_CREDIT_SUM', 'mean'), 'FLAG_OWN_CAR',
    ('CREDIT_TYPE_Car loan', 'sum'), 'OCCUPATION_TYPE_Accountants',
    ('CREDIT_TYPE_Consumer credit', 'sum'),
    ('CREDIT_TYPE_Mortgage', 'mean'), ('CREDIT_TYPE_Car loan', 'mean'),
    'OCCUPATION_TYPE_Core staff', 'NAME_FAMILY_STATUS_Widow',
    'OCCUPATION_TYPE_Managers', ('AMT_CREDIT_SUM', 'sum'),
    'AMT_REQ_CREDIT_BUREAU_MON',
    'OCCUPATION_TYPE_High skill tech staff',
    'ORGANIZATION_TYPE_School', 'AMT_ANNUITY',
    ('AMT_CREDIT_SUM_LIMIT', 'max'), ('AMT_CREDIT_SUM', 'min'),
    ('AMT_CREDIT_SUM_LIMIT', 'mean'),
    'NAME_INCOME_TYPE_Commercial associate', 'FLAG_DOCUMENT_16',
    'FLAG_DOCUMENT_13', ('AMT_CREDIT_SUM_LIMIT', 'sum'),
    'ORGANIZATION_TYPE_Medicine', 'ORGANIZATION_TYPE_Military',
    'ORGANIZATION_TYPE_Restaurant',
    'NAME_EDUCATION_TYPE_Lower secondary',
    ('AMT_CREDIT_SUM_OVERDUE', 'sum'), 'OCCUPATION_TYPE_Cooking staff',
    'RATE_ANNUITY_CREDIT', ('CREDIT_ACTIVE_Sold', 'mean'),
    'RATE_ANNUITY_INCOME', 'OCCUPATION_TYPE_Security staff',
    'ORGANIZATION_TYPE_Transport: type 3', 'CNT_CHILDREN',
    ('CREDIT_TYPE_Credit card', 'mean'), 'OCCUPATION_TYPE_Sales staff',
    'NAME_HOUSING_TYPE_Rented apartment',
    'ORGANIZATION_TYPE_Construction',
    ('CREDIT_TYPE_Credit card', 'sum'),
    'NAME_FAMILY_STATUS_Civil marriage',
    'ORGANIZATION_TYPE_Business Entity Type 3',
    'NAME_FAMILY_STATUS_Single / not married',
    ('DAYS_CREDIT_ENDDATE', 'max'), ('DAYS_ENDDATE_FACT', 'max'),
    'OCCUPATION_TYPE_Low-skill Laborers', 'FLAG_WORK_PHONE',
    'ORGANIZATION_TYPE_Self-employed',
    'NAME_HOUSING_TYPE_With parents', ('DAYS_CREDIT_UPDATE', 'max'),
    ('CREDIT_TYPE_Microloan', 'sum'), 'OCCUPATION_TYPE_Drivers',
    'DEF_60_CNT_SOCIAL_CIRCLE', 'DEF_30_CNT_SOCIAL_CIRCLE',
    'LIVE_CITY_NOT_WORK_CITY', ('DAYS_CREDIT_ENDDATE', 'min'),
    ('DAYS_CREDIT_ENDDATE', 'mean'), ('CREDIT_TYPE_Microloan', 'mean'),
    'OCCUPATION_TYPE_Laborers', ('CREDIT_ACTIVE_Active', 'sum'),
    'FLAG_DOCUMENT_3', 'REG_CITY_NOT_LIVE_CITY',
    ('DAYS_CREDIT_UPDATE', 'sum'), 'FLAG_EMP_PHONE',
    ('DAYS_CREDIT_ENDDATE', 'sum'), ('DAYS_CREDIT', 'sum'),
    ('DAYS_CREDIT_UPDATE', 'min'), ('CREDIT_ACTIVE_Active', 'mean'),
    'NAME_EDUCATION_TYPE_Secondary / secondary special',
    'REG_CITY_NOT_WORK_CITY', ('DAYS_ENDDATE_FACT', 'sum'),
    ('DAYS_CREDIT', 'max'), 'CODE_GENDER_M',
    'NAME_INCOME_TYPE_Working', 'REGION_RATING_CLIENT',
    'REGION_RATING_CLIENT_W_CITY', ('DAYS_ENDDATE_FACT', 'min'),
    ('DAYS_CREDIT_UPDATE', 'mean'), ('DAYS_CREDIT', 'min'),
    ('DAYS_ENDDATE_FACT', 'mean'), ('DAYS_CREDIT', 'mean'),
]

for i, dataset in enumerate(combine_origin):
    # Start from the identifier/label columns only, so that re-running this
    # cell does not append the feature columns a second time.
    id_cols = [col for col in ('SK_ID_CURR', 'TARGET') if col in combine[i].columns]
    df = pd.concat([combine[i][id_cols], dataset[SELECTED_FEATURES]], axis=1)

    # Polynomial feature. Only the squared goods price is used, so it is
    # computed directly rather than fitting a degree-3 PolynomialFeatures on
    # two columns and discarding all other terms. This also keeps the values
    # on dataset's own index and avoids the version-dependent
    # get_feature_names() call.
    df['AMT_GOODS_PRICE^2'] = dataset['AMT_GOODS_PRICE'] ** 2

    df = df.replace(np.nan, 0).replace(np.inf, 0)

    combine[i] = df

In [134]:
# features_list = ['AMT_CREDIT', 'AMT_GOODS_PRICE']
# features_list = ['REGION_RATING_CLIENT', 'REGION_RATING_CLIENT_W_CITY']
# features_list = ['DAYS_BIRTH', 'DAYS_EMPLOYED','DAYS_LAST_PHONE_CHANGE','DAYS_REGISTRATION','DAYS_ID_PUBLISH']

# poly_features = combine_origin[0][features_list]
# poly_target = combine_origin[0]['TARGET']
# poly_transformer = PolynomialFeatures(degree=3)
# poly_features = poly_transformer.fit_transform(poly_features)
# poly_features = pd.DataFrame(poly_features, columns=poly_transformer.get_feature_names(features_list))

# poly_features['TARGET'] = poly_target


In [61]:
# Preview the training labels. Read them from combine[0]: df_train is only
# bound in a later cell, so referencing it here fails on a fresh top-to-bottom
# run of the notebook.
combine[0]['TARGET'].head()

0    1
1    0
2    0
3    0
4    0
Name: TARGET, dtype: int64

In [60]:
# First entry is the training frame, second the test frame.
df_train, df_test = combine[0], combine[1]

In [62]:
# Separate the label from the training features; identifiers are not features.
Y_train = df_train["TARGET"]
X_train = df_train.drop(["TARGET", 'SK_ID_CURR'], axis='columns')

test_features = df_test.drop("SK_ID_CURR", axis='columns')
X_test = test_features.copy()

In [65]:
# Align the data: keep only the columns present in both frames (inner join along the column axis).
X_train, X_test = X_train.align(X_test, join='inner', axis=1)

Unnamed: 0,EXT_SOURCE_2,EXT_SOURCE_3,DAYS_BIRTH,"(CREDIT_ACTIVE_Closed, mean)",DAYS_EMPLOYED,EXT_SOURCE_1,NAME_EDUCATION_TYPE_Higher education,DAYS_LAST_PHONE_CHANGE,CODE_GENDER_F,DAYS_ID_PUBLISH,NAME_INCOME_TYPE_Pensioner,ORGANIZATION_TYPE_XNA,DAYS_REGISTRATION,AMT_GOODS_PRICE,OCCUPATION_TYPE_0,"(CREDIT_TYPE_Consumer credit, mean)",REGION_POPULATION_RELATIVE,"(CREDIT_ACTIVE_Closed, sum)",NAME_CONTRACT_TYPE,"(CREDIT_CURRENCY_currency 1, mean)",AMT_CREDIT,FLAG_DOCUMENT_6,NAME_HOUSING_TYPE_House / apartment,NAME_FAMILY_STATUS_Married,HOUR_APPR_PROCESS_START,FLAG_PHONE,"(CREDIT_TYPE_Mortgage, sum)",NAME_INCOME_TYPE_State servant,"(AMT_CREDIT_SUM, max)","(AMT_CREDIT_SUM, mean)",FLAG_OWN_CAR,"(CREDIT_TYPE_Car loan, sum)",OCCUPATION_TYPE_Accountants,"(CREDIT_TYPE_Consumer credit, sum)","(CREDIT_TYPE_Mortgage, mean)","(CREDIT_TYPE_Car loan, mean)",OCCUPATION_TYPE_Core staff,NAME_FAMILY_STATUS_Widow,OCCUPATION_TYPE_Managers,"(AMT_CREDIT_SUM, sum)",AMT_REQ_CREDIT_BUREAU_MON,OCCUPATION_TYPE_High skill tech staff,ORGANIZATION_TYPE_School,AMT_ANNUITY,"(AMT_CREDIT_SUM_LIMIT, max)","(AMT_CREDIT_SUM, min)","(AMT_CREDIT_SUM_LIMIT, mean)",NAME_INCOME_TYPE_Commercial associate,FLAG_DOCUMENT_16,FLAG_DOCUMENT_13,"(AMT_CREDIT_SUM_LIMIT, sum)",ORGANIZATION_TYPE_Medicine,ORGANIZATION_TYPE_Military,ORGANIZATION_TYPE_Restaurant,NAME_EDUCATION_TYPE_Lower secondary,"(AMT_CREDIT_SUM_OVERDUE, sum)",OCCUPATION_TYPE_Cooking staff,RATE_ANNUITY_CREDIT,"(CREDIT_ACTIVE_Sold, mean)",RATE_ANNUITY_INCOME,OCCUPATION_TYPE_Security staff,ORGANIZATION_TYPE_Transport: type 3,CNT_CHILDREN,"(CREDIT_TYPE_Credit card, mean)",OCCUPATION_TYPE_Sales staff,NAME_HOUSING_TYPE_Rented apartment,ORGANIZATION_TYPE_Construction,"(CREDIT_TYPE_Credit card, sum)",NAME_FAMILY_STATUS_Civil marriage,ORGANIZATION_TYPE_Business Entity Type 3,NAME_FAMILY_STATUS_Single / not married,"(DAYS_CREDIT_ENDDATE, max)","(DAYS_ENDDATE_FACT, max)",OCCUPATION_TYPE_Low-skill 
Laborers,FLAG_WORK_PHONE,ORGANIZATION_TYPE_Self-employed,NAME_HOUSING_TYPE_With parents,"(DAYS_CREDIT_UPDATE, max)","(CREDIT_TYPE_Microloan, sum)",OCCUPATION_TYPE_Drivers,DEF_60_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,LIVE_CITY_NOT_WORK_CITY,"(DAYS_CREDIT_ENDDATE, min)","(DAYS_CREDIT_ENDDATE, mean)","(CREDIT_TYPE_Microloan, mean)",OCCUPATION_TYPE_Laborers,"(CREDIT_ACTIVE_Active, sum)",FLAG_DOCUMENT_3,REG_CITY_NOT_LIVE_CITY,"(DAYS_CREDIT_UPDATE, sum)",FLAG_EMP_PHONE,"(DAYS_CREDIT_ENDDATE, sum)","(DAYS_CREDIT, sum)","(DAYS_CREDIT_UPDATE, min)","(CREDIT_ACTIVE_Active, mean)",NAME_EDUCATION_TYPE_Secondary / secondary special,REG_CITY_NOT_WORK_CITY,"(DAYS_ENDDATE_FACT, sum)","(DAYS_CREDIT, max)",CODE_GENDER_M,NAME_INCOME_TYPE_Working,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,"(DAYS_ENDDATE_FACT, min)","(DAYS_CREDIT_UPDATE, mean)","(DAYS_CREDIT, min)","(DAYS_ENDDATE_FACT, mean)","(DAYS_CREDIT, mean)",AMT_GOODS_PRICE^2
0,0.79,0.16,52.7,0.571429,6.4,0.753,1,4.8,1,2.2,0,0,14.2,45.0,1,1.0,0.019,4.0,0,1.0,56.88,0,1,1,18,0,0.0,0,37.8,20.761429,0,0.0,0,7.0,0.0,0.0,0,0,0,145.33,0.0,0,0,2.06,0.0,8.55,0.0,0,0,0,0.0,0,0,0,0,0.0,0,0.036217,0.0,0.152593,0,0,0,0.0,0,0,0,0.0,0,0,0,1778.0,0.0,0,0,0,0,-6.0,0.0,0,0.0,0.0,0,-1329.0,82.428571,0.0,0,3.0,1,0,-652.0,1,577.0,-5145.0,-155.0,0.428571,0,0,-3302.0,-49.0,0,1,2,2,-1328.0,-93.142857,-1572.0,-471.714286,-735.0,2025.0
1,0.292,0.433,49.5,0.333333,12.2,0.565,0,-0.0,0,4.4,0,0,25.0,18.0,0,0.666667,0.036,1.0,0,1.0,22.28,0,1,1,9,0,0.0,0,56.88,21.903333,0,0.0,0,2.0,0.0,0.0,0,0,0,65.71,0.0,0,0,1.74,0.0,2.98,0.0,0,0,0,0.0,0,0,0,0,0.0,0,0.078097,0.0,0.175758,0,0,0,0.333333,0,0,0,1.0,0,0,0,1324.0,0.0,1,0,1,0,-11.0,0.0,0,0.0,0.0,0,-128.0,439.333333,0.0,0,2.0,1,0,-163.0,1,1318.0,-572.0,-121.0,0.666667,1,0,-123.0,-62.0,1,1,2,2,-123.0,-54.333333,-373.0,-41.0,-190.666667,324.0
2,0.7,0.611,54.9,1.0,12.2,0.0,1,2.3,0,9.6,0,0,6.0,63.0,0,0.5,0.019,4.0,0,1.0,66.33,0,1,1,14,0,0.0,0,126.22,51.805,1,2.0,0,2.0,0.0,0.5,0,0,0,207.22,0.0,0,0,6.98,0.0,2.65,0.0,0,0,0,0.0,0,0,0,0,0.0,0,0.105231,0.0,0.344691,0,1,0,0.0,0,0,0,0.0,0,0,0,-567.0,-549.0,0,0,0,0,-4.0,0.0,1,0.0,0.0,0,-1707.0,-1068.0,0.0,0,0.0,0,0,-3102.0,1,-4272.0,-6950.0,-1334.0,0.0,0,0,-4219.0,-1210.0,1,1,2,2,-1334.0,-775.5,-2070.0,-1054.75,-1737.5,3969.0
3,0.51,0.613,38.3,0.583333,5.1,0.526,0,4.9,1,11.5,0,0,5.5,157.5,0,0.583333,0.026,7.0,0,1.0,157.5,0,1,1,11,1,0.0,0,39.38,12.674167,0,0.0,0,7.0,0.0,0.0,0,0,0,152.09,0.0,0,0,4.9,10.14,0.0,0.845,0,0,0,10.14,0,0,0,0,0.0,0,0.031111,0.0,0.155556,0,0,2,0.416667,1,0,0,5.0,0,1,0,30885.0,0.0,0,0,0,0,-20.0,0.0,0,0.0,0.0,0,-1862.0,1989.75,0.0,0,5.0,1,0,-7818.0,1,23877.0,-16821.0,-1564.0,0.416667,1,0,-8668.0,-269.0,0,1,2,2,-1862.0,-651.5,-2105.0,-722.333333,-1401.75,24806.25
4,0.426,0.0,35.7,0.0,6.0,0.202,0,2.2,0,11.7,0,0,11.0,62.55,1,0.0,0.01,0.0,0,0.0,62.55,0,1,1,5,0,0.0,0,0.0,0.0,1,0.0,0,0.0,0.0,0.0,0,0,0,0.0,0.0,0,0,3.21,0.0,0.0,0.0,0,0,0,0.0,0,0,0,0,0.0,0,0.051319,0.0,0.178333,0,0,1,0.0,0,0,0,0.0,0,1,0,0.0,0.0,0,1,0,0,0.0,0.0,0,0.0,0.0,1,0.0,0.0,0.0,0,0.0,1,0,0.0,1,0.0,0.0,0.0,0.0,1,1,0.0,0.0,1,1,2,2,0.0,0.0,0.0,0.0,0.0,3912.5025


In [66]:
# Feature scaling: cast both matrices to float64, fit the scaler on the
# training features only and apply the same transform to the test features.
sc = StandardScaler()
X_train = sc.fit_transform(X_train.astype(np.float64))
X_test = sc.transform(X_test.astype(np.float64))

In [67]:
# Preview the first training labels.
Y_train.head()

0    1
1    0
2    0
3    0
4    0
Name: TARGET, dtype: int64

In [174]:
# model = linear_model.LinearRegression()
# model.fit(X_train, Y_train)
# Y_pred = model.predict(X_test)

In [68]:
# Fit the classifier on the scaled training data, then score the test set
# with class probabilities rather than hard labels.
logreg = LogisticRegression(solver='liblinear', max_iter=10000).fit(X_train, Y_train)
Y_pred = logreg.predict_proba(X_test)

In [69]:
# Keep the second entry (index 1) of every prediction row.
pred = [class_probs[1] for class_probs in Y_pred]

In [70]:
# Number of rows in the test feature matrix.
len(X_test)

48744

In [71]:
# Two-column submission: the test identifiers alongside the predicted values.
submission = df_test_origin[["SK_ID_CURR"]].assign(TARGET=pred)

In [72]:
# Write the submission to CSV without the index column.
# NOTE(review): assumes the 'result/' directory already exists — confirm.
submission.to_csv('result/submission.csv', index=False)