In [21]:
import pandas as pd
import numpy as np

In [191]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [192]:
submitID = test[['SK_ID_CURR']]
target = train['TARGET']
train = train.drop(columns=['TARGET'])
trainID = train['SK_ID_CURR']
train = train.drop(columns=['Unnamed: 0','SK_ID_CURR'])
test = test.drop(columns=['Unnamed: 0', 'SK_ID_CURR'])

## remove Nan feature

In [44]:
def countNAN(df, threshold = .8):
    d = set()
    for c in df:
        count = df[c].isnull().sum()
        if count / len(df) > threshold:
            d.add(c)
    return list(d)
nan_features = countNAN(train)

In [45]:
train = train.drop(columns=nan_features)
test = test.drop(columns=nan_features)

In [46]:
corrs = train.corr()

## remove Collinear features

In [47]:
threshold = .8
above_threshold = {}
for c in corrs.columns:
    above_threshold[c] = list(corrs.index[np.abs(corrs[c]) > threshold])
    
    
columns_remove = set()
columns_seen = set()
for key, values in above_threshold.items():
    columns_seen.add(key)
    for v in values:
        if v != key:
            if v not in columns_seen:
                columns_remove.add(v)

columns_remove = list(columns_remove)

train = train.drop(columns=columns_remove)
test = test.drop(columns=columns_remove)

In [52]:
feature_name = list(train.columns)

In [54]:
from sklearn.preprocessing import Imputer

In [114]:
imputer = Imputer(strategy = 'median')

train_imputer = imputer.fit_transform(train)

## Flitering

In [115]:
corr = []
for i in range(len(train_imputer[0,:])):
    correlate = np.corrcoef(train_imputer[:,i], target)[0,1]
    corr.append(correlate)
    if correlate == np.nan:
        print(train_imputer[:,i])
        
corr = pd.Series(corr)
corr.index = feature_name

  c /= stddev[:, None]
  c /= stddev[None, :]


In [123]:
corrcoef_features = np.abs(corr).sort_values(ascending = False)[:200].index.tolist()

In [139]:
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler

In [131]:
train_imputer_norm = MinMaxScaler().fit_transform(train_imputer)

In [146]:
chi2_stat = pd.Series(chi2(train_imputer_norm, target)[0])
chi2_stat.index = feature_name

In [147]:
chi2_features = np.abs(chi2_stat).sort_values(ascending = False)[:200].index.tolist()

## Embeded

In [99]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression

In [93]:
train_imputer = pd.DataFrame(train_imputer)
train_imputer.columns = train.columns

In [100]:
train_imputer_norm = pd.DataFrame(train_imputer_norm)
train_imputer_norm.columns = train.columns

In [101]:
embeded_lr_selector = SelectFromModel(LogisticRegression(penalty="l1"), '1.25*median')
embeded_lr_selector.fit(train_imputer_norm, target)



SelectFromModel(estimator=LogisticRegression(C=1.0, class_weight=None,
                                             dual=False, fit_intercept=True,
                                             intercept_scaling=1, l1_ratio=None,
                                             max_iter=100, multi_class='warn',
                                             n_jobs=None, penalty='l1',
                                             random_state=None, solver='warn',
                                             tol=0.0001, verbose=0,
                                             warm_start=False),
                max_features=None, norm_order=1, prefit=False,
                threshold='1.25*median')

In [148]:
lr_features = list(train.iloc[:,embeded_lr_selector.get_support()].columns)

In [151]:
from sklearn.ensemble import RandomForestClassifier

embeded_rf_selector = SelectFromModel(RandomForestClassifier(n_estimators=200), threshold='1.25*median')
embeded_rf_selector.fit(train_imputer, target)

rf_features = list(train.iloc[:,embeded_rf_selector.get_support()].columns)

In [168]:
from sklearn.ensemble import AdaBoostClassifier

embeded_ada_selector = SelectFromModel(AdaBoostClassifier(n_estimators=200), threshold=1e-3)
embeded_ada_selector.fit(train_imputer, target)

ada_features = list(train.iloc[:, embeded_ada_selector.get_support()].columns)

In [172]:
all_features = list(
    set(corrcoef_features).union(set(chi2_features), set(lr_features), set(rf_features),
                                           set(ada_features))
)

In [193]:
train = train[all_features]
test = test[all_features]

In [194]:
test['SK_ID_CURR'] = submitID
train['SK_ID_CURR'] = trainID

In [195]:
train['TARGET'] = target

In [198]:
train.to_csv('train_after_selection.csv',index=False)
test.to_csv('test_after_selection.csv', index=False)

In [205]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
pipe_ada = Pipeline([('imputer', Imputer(strategy = 'median')),
                    ('scaler', StandardScaler()),
                    ('clf', AdaBoostClassifier(n_estimators=200))
                    ])

In [212]:
train = train.drop(columns='TARGET')
pipe_ada.fit(train, target)

Pipeline(memory=None,
         steps=[('imputer',
                 Imputer(axis=0, copy=True, missing_values='NaN',
                         strategy='median', verbose=0)),
                ('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('clf',
                 AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
                                    learning_rate=1.0, n_estimators=200,
                                    random_state=None))],
         verbose=False)

In [213]:
pipe_ada.predict_proba(test) 

array([[0.504319  , 0.495681  ],
       [0.50251793, 0.49748207],
       [0.50624599, 0.49375401],
       ...,
       [0.50658325, 0.49341675],
       [0.50328071, 0.49671929],
       [0.50135823, 0.49864177]])

In [214]:
submit = submitID
submit['TARGET'] = pipe_ada.predict_proba(test)[:,1]
submit.to_csv('submit_ada_reduce.csv', index=False)