In [20]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cross_validation import train_test_split as tts
from sklearn import preprocessing as pr
from sklearn.feature_extraction import DictVectorizer as DV
from sklearn import metrics
from sklearn import linear_model
from sklearn import cross_validation
from sklearn import ensemble
import category_encoders as ce
import warnings
import xgboost as xgb
warnings.simplefilter('ignore')

In [21]:
def get_numerical_features(sdata):
    """Return the names of the numerical columns present in ``sdata``.

    A column is treated as numerical when its name is one of
    ``Var1`` ... ``Var190``.

    Parameters
    ----------
    sdata : pandas.DataFrame
        Frame whose ``columns`` are inspected.

    Returns
    -------
    list
        The matching column names, without duplicates, in the order in which
        they appear in ``sdata.columns``.
    """
    numerical_names = set("Var%d" % i for i in range(1, 191))
    # Walk the columns instead of intersecting two sets: set iteration order
    # is arbitrary (and for strings can change between interpreter runs),
    # which made the resulting feature order non-reproducible.
    ordered, seen = [], set()
    for column in sdata.columns:
        if column in numerical_names and column not in seen:
            seen.add(column)
            ordered.append(column)
    return ordered


def get_categorial_features(sdata):
    """Return the names of the categorical columns present in ``sdata``.

    A column is treated as categorical when its name is one of
    ``Var191`` ... ``Var230``.

    Parameters
    ----------
    sdata : pandas.DataFrame
        Frame whose ``columns`` are inspected.

    Returns
    -------
    list
        The matching column names, without duplicates, in the order in which
        they appear in ``sdata.columns``.
    """
    categorical_names = set("Var%d" % i for i in range(191, 231))
    # Walk the columns instead of intersecting two sets: set iteration order
    # is arbitrary (and for strings can change between interpreter runs),
    # which made the resulting feature order non-reproducible.
    ordered, seen = [], set()
    for column in sdata.columns:
        if column in categorical_names and column not in seen:
            seen.add(column)
            ordered.append(column)
    return ordered


def get_kaggle_data(train_path='churn_train_data.csv',
                    test_path='churn_test_data.csv'):
    """Load the train and test CSV files and split off the labels.

    Parameters
    ----------
    train_path : str, optional
        CSV file with the training rows; must contain ``labels`` and ``ID``
        columns. Defaults to the file name the notebook always used.
    test_path : str, optional
        CSV file with the test rows; must contain an ``ID`` column.

    Returns
    -------
    tuple
        ``(churn_train_data, labels, churn_test_data)`` where ``labels`` is the
        ``labels`` column of the training file and both frames have had their
        ``ID`` column (and, for train, ``labels``) removed.
    """
    # Read the training data and separate the target from the features.
    raw_train = pd.read_csv(train_path)
    labels = raw_train['labels']
    churn_train_data = raw_train.drop(['labels', 'ID'], axis=1)

    # The test file carries no labels, only the identifier has to go.
    churn_test_data = pd.read_csv(test_path).drop(['ID'], axis=1)
    return churn_train_data, labels, churn_test_data


def write_ans(file_name, values):
    """Write the predictions to ``file_name`` as a two-column CSV.

    The file starts with the header ``ID,result``; every following line holds
    the zero-based position of a value and that value rounded to 10 decimals.
    """
    with open(file_name, 'w') as out:
        out.write('ID,result\n')
        # Lazily format one "<position>,<rounded value>" line per prediction.
        out.writelines(
            ",".join((str(position), str(round(value, 10)))) + '\n'
            for position, value in enumerate(values)
        )

In [22]:
def process_numerical_features(sdata, ndata):
    """Mean-impute the numerical columns and return them as arrays.

    For every numerical column of ``sdata`` the mean and standard deviation of
    its non-missing values are computed. Columns whose ``mean / std`` ratio is
    NaN or infinite (e.g. constant or entirely missing columns) are left out of
    the result. For the remaining columns, missing values in both ``sdata``
    and ``ndata`` are replaced with the mean computed on ``sdata``.

    Note: the imputed columns are written back into ``sdata`` and ``ndata``,
    so the frames passed in are modified.

    Parameters
    ----------
    sdata : pandas.DataFrame
        Training frame; statistics are computed on it.
    ndata : pandas.DataFrame
        Frame to transform with the training statistics; must contain the
        same numerical columns.

    Returns
    -------
    tuple of numpy.ndarray
        ``(train_matrix, test_matrix)`` with identical column order.
    """
    kept = []
    for feature in get_numerical_features(sdata):
        non_na = sdata[feature].dropna().values
        mean, std = non_na.mean(), non_na.std()
        ratio = mean / std
        if np.isnan(ratio) or np.isinf(ratio):
            # Degenerate column: nothing to learn from it, skip it entirely.
            continue

        # Assign the filled column back explicitly; ``fillna(inplace=True)``
        # on a column selected from the frame is a chained operation that is
        # not guaranteed to update the frame itself.
        sdata[feature] = sdata[feature].fillna(mean)
        ndata[feature] = ndata[feature].fillna(mean)
        kept.append(feature)

    # ``.values`` instead of ``.as_matrix()``, which newer pandas removed.
    return sdata[kept].values, ndata[kept].values


def remove_incomplete_features(sdata, ndata, features, threshold=0.9):
    """Drop the columns that are mostly missing in ``sdata`` from both frames.

    Parameters
    ----------
    sdata : pandas.DataFrame
        Frame on which the share of missing values is measured.
    ndata : pandas.DataFrame
        Second frame; the same columns are dropped from it.
    features : iterable
        Names of the columns to examine.
    threshold : float, optional
        A column is dropped when its share of missing values in ``sdata`` is
        strictly greater than this value. Defaults to 0.9.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(sdata, ndata)`` without the dropped columns. The input frames are
        not modified; when nothing has to be dropped they are returned as is.
    """
    n_rows = sdata.shape[0]
    if n_rows == 0:
        # No rows means no measurable share of missing values (and would
        # otherwise be a division by zero): keep everything.
        return sdata, ndata

    to_remove = [
        feature for feature in features
        if sdata[feature].isnull().sum() / float(n_rows) > threshold
    ]
    if not to_remove:
        return sdata, ndata

    # Drop all offending columns in one go rather than copying the frames
    # once per column.
    return sdata.drop(to_remove, axis=1), ndata.drop(to_remove, axis=1)

def apply_str(data):
    """Convert every categorical column of ``data`` to strings.

    Each value of each categorical column (missing values included) is passed
    through ``str`` and the converted column is stored back into ``data``.
    Returns a frame made of the categorical columns only.
    """
    categorical = get_categorial_features(data)
    for column in categorical:
        data[column] = data[column].apply(str)
    return data[categorical]


def separate_categorial_features(sdata, ndata):
    """Split the categorical columns by the number of distinct values.

    Columns with at most 15 distinct values in ``sdata`` form the first group,
    all other categorical columns the second one.

    Returns ``(sdata_first, sdata_second, ndata_first, ndata_second)``.
    """
    categorical = get_categorial_features(sdata)
    small = {
        name for name in categorical
        if len(set(sdata[name].values)) <= 15
    }
    large = set(categorical).difference(small)
    small_columns = list(small)
    large_columns = list(large)
    return (sdata[small_columns], sdata[large_columns],
            ndata[small_columns], ndata[large_columns])


def one_hot(sdata, ndata):
    """One-hot encode both frames with a ``DictVectorizer`` fitted on ``sdata``.

    Parameters
    ----------
    sdata : pandas.DataFrame
        Frame the encoder is fitted on.
    ndata : pandas.DataFrame
        Frame transformed with the fitted encoder.

    Returns
    -------
    tuple
        ``(encoded_sdata, encoded_ndata)`` as returned by the dense
        (``sparse=False``) vectorizer, one output row per input row.
    """
    encoder = DV(sparse = False)
    # Build one dict per row directly. The previous ``.T.to_dict().values()``
    # keyed the rows by index label, so rows sharing a label were collapsed
    # and the row order depended on dict ordering; it also transposed the
    # whole frame just to iterate over it.
    train_rows = sdata.to_dict(orient='records')
    test_rows = ndata.to_dict(orient='records')
    sdata = encoder.fit_transform(train_rows)
    ndata = encoder.transform(test_rows)
    return sdata, ndata


def label_encoder(sdata, ndata):
    """Label-encode the categorical columns of both frames.

    For every categorical column a fresh ``LabelEncoder`` is fitted on the
    union of the values seen in ``sdata`` and ``ndata``, so that transforming
    either frame never meets an unknown value. The encoded columns are written
    back into the frames that were passed in.

    Parameters
    ----------
    sdata, ndata : pandas.DataFrame
        Frames containing the same categorical columns.

    Returns
    -------
    tuple of numpy.ndarray
        The full contents of ``sdata`` and ``ndata`` after encoding.
    """
    for feature in get_categorial_features(sdata):
        # Fit on every value that occurs in either frame.
        encoder = pr.LabelEncoder()
        values = set(sdata[feature].values) | set(ndata[feature].values)
        encoder.fit(list(values))
        sdata[feature] = encoder.transform(sdata[feature])
        ndata[feature] = encoder.transform(ndata[feature])
    # ``.values`` instead of ``.as_matrix()``, which newer pandas removed.
    return sdata.values, ndata.values


def process_categorial_features(sdata, ndata):
    """Stringify, prune and label-encode the categorical columns.

    Both frames are first reduced to their categorical columns converted to
    strings, then every column that holds a single distinct value in ``sdata``
    is removed from both, and the remainder is label-encoded.

    Returns the pair of arrays produced by ``label_encoder``.
    """
    sdata = apply_str(sdata)
    ndata = apply_str(ndata)

    # Constant features carry no information: collect them, then drop them.
    constant = [
        name for name in get_categorial_features(sdata)
        if len(set(sdata[name].values)) == 1
    ]
    if constant:
        sdata = sdata.drop(constant, axis=1)
        ndata = ndata.drop(constant, axis=1)

    return label_encoder(sdata, ndata)


def prepare_train_data(sdata, ndata, labels, truncate=False):
    """Turn the raw frames into feature matrices plus a label array.

    ``truncate`` is accepted but not used by this function.

    Returns ``(train_matrix, labels_array, test_matrix)`` where each matrix
    holds the processed numerical columns followed by the processed
    categorical columns.
    """
    # Get rid of the features that consist mostly of empty cells.
    all_columns = list(sdata.columns)
    sdata, ndata = remove_incomplete_features(sdata, ndata, all_columns)

    # Numerical part first, categorical part second.
    num_train, num_test = process_numerical_features(sdata, ndata)
    cat_train, cat_test = process_categorial_features(sdata, ndata)

    # Glue the two parts together column-wise.
    train_matrix = np.concatenate((num_train, cat_train), axis=1)
    test_matrix = np.concatenate((num_test, cat_test), axis=1)
    return train_matrix, np.array(labels), test_matrix


# Load the raw train/test frames and the labels, then build the model inputs:
# X and y are the training matrix and label array, ansX is the test matrix.
churn_train_data, labels, churn_test_data = get_kaggle_data()
X, y, ansX = prepare_train_data(churn_train_data, churn_test_data, labels)

In [23]:
def XGB(X, y, X_test, tree_size = 50, depth = 3, child_weight = 10):
    """Cross-validate an XGBoost classifier, refit it and score ``X_test``.

    The mean ROC AUC over 5 cross-validation folds on ``(X, y)`` is printed,
    then the model is fitted on all of ``(X, y)``.

    Parameters
    ----------
    X, y : array-like
        Training features and labels.
    X_test : array-like
        Rows to predict for.
    tree_size : int
        Passed to the classifier as ``n_estimators``.
    depth : int
        Passed to the classifier as ``max_depth``.
    child_weight : int
        Passed to the classifier as ``min_child_weight``.

    Returns
    -------
    The second column of ``predict_proba(X_test)``.
    """
    model_params = dict(
        learning_rate=0.1,
        max_depth=depth,
        seed=42,
        n_estimators=tree_size,
        min_child_weight=child_weight,
    )
    model = xgb.XGBClassifier(**model_params)

    # Report the cross-validated quality before the final fit.
    auc_per_fold = cross_validation.cross_val_score(
        model, X, y, cv=5, scoring='roc_auc')
    print(auc_per_fold.mean())

    model.fit(X, y)
    probabilities = model.predict_proba(X_test)
    return probabilities[:, 1]

# Evaluate and fit a 100-tree model on (X, y), then save its predicted
# probabilities for ansX to ans15.csv.
values = XGB(X, y, ansX, tree_size = 100)
write_ans('ans15.csv', values)

0.738165255516
