In [17]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from pandas import DataFrame
from sklearn.preprocessing import Imputer
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier


# Label-encode the data and then one-hot encode it
def label_oh_encode(data, feature_names='all'):
    """Label-encode the chosen columns of ``data`` and one-hot encode them.

    Parameters
    ----------
    data : pandas.DataFrame
        Input table. It is copied before encoding, so the caller's frame
        keeps its original values.
    feature_names : 'all' or list, optional
        ``'all'`` (default) encodes every column. A list restricts the
        encoding to the named columns. Any other value is handed to
        ``OneHotEncoder`` unchanged and no label encoding is performed.

    Returns
    -------
    pandas.DataFrame
        Dense encoder output wrapped in a frame with default integer column
        labels.
    """
    # Work on a copy: writing the encoded columns straight into ``data``
    # would silently overwrite the caller's original values.
    data_encoded = data.copy()
    data_le = LabelEncoder()

    if feature_names == 'all':
        for feat in list(data_encoded):
            data_encoded[feat] = data_le.fit_transform(data_encoded[feat])
    elif isinstance(feature_names, list):
        for feat in feature_names:
            data_encoded[feat] = data_le.fit_transform(data_encoded[feat])
        # The encoder is given column positions rather than column labels.
        feature_names = get_feature_indices(data_encoded, feature_names)

    # NOTE(review): the legacy ``categorical_features`` option is documented
    # as stacking the non-encoded columns to the right of the one-hot
    # columns - confirm for the installed scikit-learn version.
    oh_encoder = OneHotEncoder(categorical_features=feature_names)
    dense = oh_encoder.fit_transform(data_encoded).toarray()
    return DataFrame(data=dense)


# Get column indices of features for the dataset
def get_feature_indices(data, feature_names):
    """Return the position of each requested column label within ``data``.

    Iterating ``data`` must yield its column labels (as a DataFrame does).
    Positions are returned in the order of ``feature_names``; a label that
    is not present raises ``ValueError`` from ``list.index``.
    """
    columns = list(data)
    return [columns.index(label) for label in feature_names]


# Add header row for data object of DataFrame type
def add_header(data, base_name, additional):
    """Assign column names to ``data`` in place and return the same object.

    The first ``data.shape[1] - len(additional)`` columns are named
    ``base_name`` followed by a running index starting at 0; the remaining
    columns take the labels from ``additional``, in order.
    """
    generated_count = data.shape[1] - len(additional)
    header = [base_name + str(idx) for idx in range(generated_count)]
    header.extend(additional)
    data.columns = header
    return data


# Replace missing values in the data
def imput_column_data(data, columns='all', imput_type='median'):
    """Fill missing values in ``data`` with scikit-learn's ``Imputer``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table that may contain missing values. It is not modified.
    columns : 'all' or list, optional
        ``'all'`` (default) imputes the whole table in one pass. A list
        imputes only the named columns, one column at a time. Any other
        value leaves the values untouched.
    imput_type : str, optional
        Forwarded to ``Imputer`` as ``strategy`` (default ``'median'``).

    Returns
    -------
    pandas.DataFrame
        New frame carrying the column labels of ``data``. In the ``'all'``
        branch it is rebuilt from the imputer's array output, so it gets a
        fresh default row index.
    """
    data_imputer = Imputer(strategy=imput_type)
    if columns == 'all':
        imputed_data = data_imputer.fit_transform(data)
    elif isinstance(columns, list):
        # Copy first so the per-column assignments below do not overwrite
        # the caller's frame.
        imputed_data = data.copy()
        for col in columns:
            # The imputer works on 2-D input, so hand it a one-column frame
            # (``[[col]]``) instead of a 1-D Series and flatten the result.
            filled = data_imputer.fit_transform(imputed_data[[col]])
            imputed_data[col] = filled[:, 0]
    else:
        imputed_data = data

    df = DataFrame(data=imputed_data)
    df.columns = list(data)
    return df


# Select the predictors that best fit the target
def select_predictors(data, target, base_name, oh_length=0, threshold=5.0, k_best=3, col_all=True):
    """Pick the columns of ``data`` that are significantly related to ``target``.

    Each candidate column is scored with the ANOVA F-test (``f_classif``);
    a column is kept when ``-log10(p-value) >= threshold``. The names
    ``base_name + '0'`` ... ``base_name + str(oh_length - 1)`` are then
    appended unconditionally.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding the candidate columns and the target column.
    target : str
        Label of the target column. It is never used as a candidate.
    base_name : str
        Prefix of the column names that are always included.
    oh_length : int, optional
        How many ``base_name``-prefixed names to append.
    threshold : float, optional
        Minimum ``-log10(p-value)`` for a column to be kept.
    k_best : int, optional
        ``k`` passed to ``SelectKBest``. Only the fitted p-values are read
        here, so it does not influence which names are returned.
    col_all : bool, optional
        ``True`` considers every column; ``False`` skips the first two
        columns and the last one.

    Returns
    -------
    list
        Selected column names without duplicates, in first-seen order.
    """
    header = list(data)
    if not col_all:
        # Skip the two leading columns and the trailing one (ID, Month and
        # TARGET in the layout the caller builds).
        header = header[2:-1]
    # Scoring the target against itself would always "select" it and leak
    # the label into the feature set, so drop it from the candidates.
    candidates = [name for name in header if name != target]

    # Fit the univariate F-test; only the p-values are used below.
    selector = SelectKBest(f_classif, k=k_best)
    selector.fit(data[candidates], data[target])

    # Scale scores logarithmically
    scores = -np.log10(selector.pvalues_)
    chosen = [name for name, score in zip(candidates, scores)
              if score >= threshold]

    # Append the forced names, skipping any that already passed the
    # threshold so that no column ends up listed twice.
    already_listed = set(chosen)
    for i in range(oh_length):
        forced = base_name + str(i)
        if forced not in already_listed:
            already_listed.add(forced)
            chosen.append(forced)
    return chosen


# Creates an unfitted random forest classifier
def create_rnd_forest(n_estim=100, random_state=None):
    """Build an unfitted random forest classifier.

    Parameters
    ----------
    n_estim : int, optional
        Number of trees, forwarded as ``n_estimators``.
    random_state : int or None, optional
        Seed forwarded to the classifier so results can be reproduced.
        ``None`` (default) keeps the previous unseeded behaviour.

    Returns
    -------
    RandomForestClassifier
        The configured, not yet fitted estimator.
    """
    return RandomForestClassifier(
        n_estimators=n_estim,
        random_state=random_state)


# Creates an unfitted support vector machine classifier
def create_svc(c=1.0, kernel='rbf'):
    """Build an unfitted support vector classifier.

    ``c`` is forwarded as the ``C`` argument and ``kernel`` as ``kernel``.
    """
    return SVC(kernel=kernel, C=c)


# Creates an unfitted bagging classifier with a decision tree as base estimator
def create_bag_cls(n_estim=200, random_state=None):
    """Build an unfitted bagging classifier over gini decision trees.

    Parameters
    ----------
    n_estim : int, optional
        Number of bagged trees, forwarded as ``n_estimators``.
    random_state : int or None, optional
        Seed forwarded to the bagging classifier so results can be
        reproduced. ``None`` (default) keeps the previous unseeded
        behaviour.

    Returns
    -------
    BaggingClassifier
        The configured, not yet fitted ensemble; ``n_jobs=-1`` is passed to
        it.
    """
    dcg_tree = DecisionTreeClassifier(criterion='gini')
    return BaggingClassifier(
        base_estimator=dcg_tree,
        n_estimators=n_estim,
        n_jobs=-1,
        random_state=random_state)


def main():
    """Train the voting ensemble and write test-set predictions to disk.

    Reads ``test.txt``, ``train.txt``, ``Base1.txt`` and ``Base2.txt``
    (tab-separated, header in the first row) from the working directory,
    builds one feature table keyed by ``ID``, fits a hard-voting ensemble of
    a random forest, an SVM and a bagged decision tree on the training rows,
    and writes the predictions for the test rows to
    ``ChankovYehor_test.txt``.
    """
    target_column = 'TARGET'
    # Minimum -log10(p-value) a feature needs in select_predictors.
    feature_threshold = 9.0

    test_class = pd.read_csv('test.txt', header=0, sep='\t')
    train_class = pd.read_csv('train.txt', header=0, sep='\t')
    base1_feat = pd.read_csv('Base1.txt', header=0, sep='\t')
    base2_feat = pd.read_csv('Base2.txt', header=0, sep='\t')

    # One-hot encode T1..T4; afterwards the columns are renamed T0, T1, ...
    # with the last column labelled ID.
    # NOTE(review): this relies on the encoder placing the untouched ID
    # column last - confirm.
    base2_feat_oh = label_oh_encode(base2_feat, feature_names=['T1', 'T2', 'T3', 'T4'])
    base2_feat_oh = add_header(base2_feat_oh, 'T', ['ID'])

    # Fill gaps in Base1, then average its rows per ID. Grouping on a copy
    # of the ID column keeps ID available as a regular column for merging.
    base1_feat_imp = imput_column_data(base1_feat)
    base1_feat_imp['key'] = base1_feat_imp['ID']
    full_data = base1_feat_imp.groupby('key').mean()

    # how='right' keeps every ID present in Base2.
    full_data = pd.merge(full_data, base2_feat_oh, on='ID', how='right')
    full_train_data = pd.merge(full_data, train_class, on='ID', how='inner')
    full_test_data = pd.merge(full_data, test_class, on='ID', how='inner')

    # NOTE(review): shape[1] - 2 leaves out the ID column and one more
    # one-hot column from the forced predictors - confirm this is intended.
    predictors = select_predictors(
        full_train_data,
        target_column,
        'T',
        base2_feat_oh.shape[1] - 2,
        threshold=feature_threshold,
        col_all=False)
    forest = create_rnd_forest()
    svm = create_svc()
    bag = create_bag_cls()
    vt_classifier = VotingClassifier([
        ('Random Forest', forest),
        ('SVM', svm),
        ('Bagging', bag)],
        voting='hard')
    vt_classifier = vt_classifier.fit(
        full_train_data[predictors],
        full_train_data[target_column])
    test_pred = vt_classifier.predict(full_test_data[predictors])

    # The ID column went through the imputer and the dense one-hot output,
    # both of which produce float arrays, so it would be written as e.g.
    # "17.0". Restore the dtype the IDs had when read from test.txt.
    submission_ids = full_test_data["ID"].astype(test_class["ID"].dtype)

    submission = pd.DataFrame({
        "ID": submission_ids,
        "TARGET": test_pred.astype(int)
    })
    submission.to_csv("ChankovYehor_test.txt", index=False, sep='\t')

In [18]:
# Run the whole pipeline; main() writes its predictions to ChankovYehor_test.txt.
main()

  177  179  181  195  199  212  230  232  236  237  245  251  255  261  274
  288  303  320  321  328  336  337  343  347  351  352  363  401  402  411
  419  429  444  455  461  462  465  485  494  500  502  508  509  511  516
  524  531  535  536  537  544  553  554  560  562  563  565  566  572  576
  581  589  590  591  593  595  609  614  617  620  623  632  638  645  651
  652  681  686  689  695  730  743  746  751  757  771  774  781  790  819
  820  824  827  834  839  840  841  843  849  850  853  856  858  861  864
  873  874  875  876  885  888  904  911  919  924  927  928  930  938  939
  955  959  967  969  977  983 1002 1025 1028 1032 1033 1041 1051 1056 1070
 1079 1086 1089 1090 1104 1113 1124 1144 1162 1163 1168 1172 1195 1197 1211
 1212 1214 1216 1217 1221 1239 1253 1265 1274 1279 1289 1296 1297 1304 1315
 1332 1341 1347 1353 1358 1359 1367 1369 1378 1381 1385 1401 1409 1410 1411
 1414 1421 1427 1429 1430 1443 1445 1448 1460 1473 1475 1479 1481 1489 1491
 1494 1495 1