In [31]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score

In [49]:
data = pd.read_csv('aps_failure_training_set.csv')
data

Unnamed: 0,class,aa_000,ab_000,ac_000,ad_000,ae_000,af_000,ag_000,ag_001,ag_002,...,ee_002,ee_003,ee_004,ee_005,ee_006,ee_007,ee_008,ee_009,ef_000,eg_000
0,neg,76698,na,2130706438,280,0,0,0,0,0,...,1240520,493384,721044,469792,339156,157956,73224,0,0,0
1,neg,33058,na,0,na,0,0,0,0,0,...,421400,178064,293306,245416,133654,81140,97576,1500,0,0
2,neg,41040,na,228,100,0,0,0,0,0,...,277378,159812,423992,409564,320746,158022,95128,514,0,0
3,neg,12,0,70,66,0,10,0,0,0,...,240,46,58,44,10,0,0,0,4,32
4,neg,60874,na,1368,458,0,0,0,0,0,...,622012,229790,405298,347188,286954,311560,433954,1218,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59995,neg,153002,na,664,186,0,0,0,0,0,...,998500,566884,1290398,1218244,1019768,717762,898642,28588,0,0
59996,neg,2286,na,2130706538,224,0,0,0,0,0,...,10578,6760,21126,68424,136,0,0,0,0,0
59997,neg,112,0,2130706432,18,0,0,0,0,0,...,792,386,452,144,146,2622,0,0,0,0
59998,neg,80292,na,2130706432,494,0,0,0,0,0,...,699352,222654,347378,225724,194440,165070,802280,388422,0,0


In [51]:
data['class'].replace({'neg': 0, 'pos': 1}, inplace=True)
data.replace({'na': np.nan}, inplace=True)

X = data.drop('class', axis=1)
y = data['class']

In [52]:
data['class'].sum()

1000

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [35]:
model = CatBoostClassifier(silent=True, random_state=1)
model.fit(X_train, y_train)
pred = model.predict(X_test)
f1_score(y_test, pred)

0.7821612349914236

In [89]:
def get_positives_unknown(df, n):
    positive = df[df['class']==1]
    rp = np.random.choice(positive.index, n_positives, replace=False)
    df_p = positive.loc[rp]
    df_u = df.loc[df.index.difference(rp)]
    return df_p, df_u

In [94]:
n_positives = 200
data_p, data_u = get_positives_unknown(data, n_positives)

In [95]:
rn = np.random.choice(data_u.index, n_positives, replace=False)
data_n = data_u.loc[rn]
data_n['class'] = 0
data_rns = pd.concat([data_p, data_n], axis=0)

In [96]:
model = CatBoostClassifier(silent=True, random_state=1)
model.fit(data_rns.drop('class', axis=1), data_rns['class'])
pred = model.predict(data_u.drop('class', axis=1))
f1_score(data_u['class'], pred)

0.32317995247353637

In [110]:
def eval_rns_model(**kwargs):
    data = kwargs['df']
    n_positives = kwargs['n']
    data_p, data_u = get_positives_unknown(data, n_positives)
    rn = np.random.choice(data_u.index, n_positives, replace=False)
    data_n = data_u.loc[rn]
    data_n['class'] = 0
    data_rns = pd.concat([data_p, data_n], axis=0)
    model = CatBoostClassifier(silent=True, random_state=1)
    model.fit(data_rns.drop('class', axis=1), data_rns['class'])
    pred = model.predict(data_u.drop('class', axis=1))
    f1 = f1_score(data_u['class'], pred)
    precision = precision_score(data_u['class'], pred)
    recall = recall_score(data_u['class'], pred)
    roc_auc = roc_auc_score(data_u['class'], pred)
    return {'n_positives': n_positives, 'f1_score': f1, 'precision': precision, 'recall': recall, 'roc_auc': roc_auc}

In [111]:
result = pd.DataFrame(columns=['n_positives', 'f1_score', 'precision', 'recall', 'roc_auc'])
for i in [200, 400, 600, 800]:
    result = result.append(eval_rns_model(df=data, n=i), ignore_index=True)

In [112]:
result

Unnamed: 0,n_positives,f1_score,precision,recall,roc_auc
0,200.0,0.31102,0.185854,0.9525,0.947962
1,400.0,0.361593,0.224719,0.925,0.940864
2,600.0,0.447182,0.29669,0.9075,0.939165
3,800.0,0.425545,0.279498,0.89125,0.930049


#### Видно, что модель слабо классифицирует объекты 1 класса, precision очень низкий, recall хороший, f1-мера падает довольно сильно, roc_auc хороший