In [11]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cross_validation import train_test_split as tts
from sklearn import preprocessing as pr
from sklearn.feature_extraction import DictVectorizer as DV
from sklearn import metrics
from sklearn import linear_model
from sklearn import cross_validation
from sklearn import ensemble


%matplotlib inline 
%pylab inline

def read_data():
    churn_data = pd.read_csv('orange_small_churn_data.train')
    with open('orange_small_churn_labels.train') as reader:
        churn_data['label'] = np.array(map(int, reader.read().split()))
    return churn_data


def get_train_data():
    np.random.seed(0)
    data = read_data()
    data_train, data_test = tts(data, test_size=0.25)
    # отбрасываем тестовые данные
    return data_train


def judge_classifier(classifier, X, y, metrics=None):
    # Метод для проверки классификатора на нужных метриках
    if not metrics:
        metrics = ['roc_auc', 'accuracy', 'recall', 'f1_weighted'] 
    # Объявим стратегию кросс-валидации
    cv_strategy = cross_validation.StratifiedKFold(y, n_folds=5, random_state = 2)
    print(classifier)
    for metric in metrics:
        # Считаем метрики
        scores = cross_validation.cross_val_score(classifier, X, y, scoring=metric, cv = cv_strategy)
        print("Scores for metric %s:" % metric)
        print(scores)
        print(scores.mean())
    print("=====")

Populating the interactive namespace from numpy and matplotlib


In [6]:
def normalize(s, labels):
    if sum(s.isnull()) == len(s):
        return (np.nan, 0, 0)
    mean = np.array(s.dropna()).mean()
    std = np.array(s.dropna()).std()
    s.fillna(mean, inplace=True)
    s.apply(lambda x: (x - mean) / std)
    a = s[labels < 0].mean()
    b = s[labels > 0].mean()
    return ((a - b), mean, std)

def prepare_train_data(data, truncate=False):
    bad_features = []
    columns = list(data.columns)[:-1]
    for (idx, column) in enumerate(columns[:190]):
        value, mean, std = normalize(data[column].copy(), data.label)
        # нормализуем данные
        if np.isnan(value) or np.isnan(mean / std) or np.isinf(mean / std):
            bad_features.append(column)
        else:
            data[column].fillna(mean, inplace=True)
            data[column] = data[column].apply(lambda x: (x - mean) / std)
    numerical_data = data[columns[:190]]
    for feature in bad_features:
        # удаляем "плохие признаки", являющиеся константными
        numerical_data = numerical_data.drop(feature, axis=1)
    categorial_features = [("Var%d" % i) for i in range(191, 230)]
    cat_bad_features = []
    for feature in categorial_features:
        if len(set(data[feature])) == 1:
            # признак константный, удаляем
            cat_bad_features.append(feature)
    cat_data = data[categorial_features]
    for cat in categorial_features:
        cat_data[cat] = cat_data[cat].apply(lambda x: str(x))
        value_counts = cat_data[cat].value_counts()
        if truncate:
            # Для того чтобы ускорить обучение ограничим данные
            cat_data[cat] = cat_data[cat].apply(lambda x: x if value_counts[x] >= 50 else 'nan')
    for cat in cat_bad_features:
        # удаляем "плохие признаки", являющиеся константными
        cat_data = cat_data.drop(cat, axis=1)
    
    encoder = DV(sparse = False)
    encoded_data = encoder.fit_transform(cat_data.T.to_dict().values())
    prepared_data = np.concatenate([numerical_data.as_matrix(), encoded_data], axis=1)
    return prepared_data, data.label

data = get_train_data()
X, y = prepare_train_data(data, truncate=True)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [12]:
judge_classifier(linear_model.LogisticRegression(random_state = 2), X, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=2, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
Scores for metric roc_auc:
[ 0.59604075  0.60908674  0.60102175  0.61145798  0.5879523 ]
0.601111905006
Scores for metric accuracy:
[ 0.92384603  0.92416667  0.92366667  0.925       0.92315386]
0.923966643594
Scores for metric recall:
[ 0.          0.00446429  0.00446429  0.          0.00669643]
0.003125
Scores for metric f1_weighted:
[ 0.88855663  0.88950307  0.88924876  0.88928139  0.88928679]
0.889175326778
=====


In [13]:
judge_classifier(ensemble.RandomForestClassifier(random_state = 2), X, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=2, verbose=0, warm_start=False)
Scores for metric roc_auc:
[ 0.567743    0.5602176   0.6022124   0.56645188  0.57550964]
0.574426905297
Scores for metric accuracy:
[ 0.9250125   0.92533333  0.925       0.92516667  0.9249875 ]
0.925099999167
Scores for metric recall:
[ 0.00222717  0.          0.          0.          0.        ]
0.000445434298441


  'precision', 'predicted', average, warn_for)


Scores for metric f1_weighted:
[ 0.88946426  0.88944783  0.88928139  0.88936461  0.88926318]
0.889364254479
=====


In [17]:
judge_classifier(ensemble.GradientBoostingClassifier(random_state = 2), X, y)

GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=3, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=2, subsample=1.0, verbose=0,
              warm_start=False)
Scores for metric roc_auc:
[ 0.7106045   0.70749179  0.72364387  0.71778268  0.71215571]
0.714335709911
Scores for metric accuracy:
[ 0.9250125   0.92483333  0.9255      0.92533333  0.92432072]
0.924999976941
Scores for metric recall:
[ 0.00445434  0.00669643  0.00892857  0.00669643  0.        ]
0.00535515431117
Scores for metric f1_weighted:
[ 0.88978617  0.89015966  0.89081904  0.89041597  0.88893006]
0.890022180676
=====
