In [88]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm
import seaborn as sns
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

In [89]:
def normalize(df):
    return (df-df.mean())/df.std()

In [90]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

train.drop(columns=['5'], inplace=True)
test.drop(columns=['5'], inplace=True)

num_cols = list(train.columns[:-1])
train[num_cols] = normalize(train[num_cols])
test[num_cols] = normalize(test[num_cols])

np.shape(train), np.shape(test)

((6963, 31), (3920, 30))

In [91]:
X_train, X_val, y_train, y_val = train_test_split(train[num_cols], train.iloc[:, -1], test_size=0.3, random_state=0)
X_test = test[num_cols]

### SVM 

In [92]:
model = svm.SVC()
model = model.fit(X_train, y_train)
y_val_pred = model.predict(X_val)

f1_score(y_val, y_val_pred), precision_score(y_val, y_val_pred), recall_score(y_val, y_val_pred)

(0.8801431127012522, 0.9479768786127167, 0.8213689482470785)

In [93]:
confusion_matrix(y_val, y_val_pred)

array([[1463,   27],
       [ 107,  492]], dtype=int64)

In [94]:
# X = pd.concat([X_train, X_val])
# y = pd.concat([y_train, y_val])
# model.fit(X, y)

# answer = pd.DataFrame()
# answer['target'] = model.predict(X_test)
# answer.target.value_counts()

### Oversampling

### Данное решение выше было загружено на All_Cups

In [96]:
# separate minority and majority class
from sklearn.utils import resample

def oversampling(X_train, y_train):
    train_data = pd.concat([X_train, y_train], axis=1)
    zero_class = train_data[train_data['target']==0] 
    one_class = train_data[train_data['target']==1]  # minority
    one_class_upsampled = resample(one_class, replace=True, 
                                  n_samples=len(zero_class),
                                  random_state=27)
    # combine majority and upsampled minority
    upsampled = pd.concat([zero_class, one_class_upsampled])

    X_train_up = upsampled.copy()
    X_train_up.drop(columns='target', inplace=True)
    y_train_up = upsampled['target']
    return X_train_up, y_train_up

In [97]:
model = svm.SVC() ### подобранные параметры kernel='rbf', C=1.33)
X_train_up, y_train_up = oversampling(X_train, y_train)
model = model.fit(X_train_up, y_train_up)
y_val_pred = model.predict(X_val)

f1_score(y_val, y_val_pred), precision_score(y_val, y_val_pred), recall_score(y_val, y_val_pred)

(0.8995057660626029, 0.8878048780487805, 0.9115191986644408)

In [98]:
confusion_matrix(y_val, y_val_pred)

array([[1421,   69],
       [  53,  546]], dtype=int64)

In [99]:
answer = pd.DataFrame()
answer['target'] = model.predict(X_test)
# answer.to_csv('svc_all_normal_sub.csv', index=False)

In [100]:
answer.target.value_counts()

0    2119
1    1801
Name: target, dtype: int64

### Detecting outliers

In [101]:
from sklearn.ensemble import IsolationForest
def isolation_outliers(X_train, y_train):
    iso = IsolationForest(contamination=0.01)
    y_hat = iso.fit_predict(X_train)
    mask = y_hat != -1
    X_train_iso, y_train_iso = X_train[mask], y_train[mask]
    return X_train_iso, y_train_iso

In [102]:
X_train_iso, y_train_iso = isolation_outliers(X_train_up, y_train_up)



In [103]:
model = svm.SVC() ### подобранные параметры kernel='rbf', C=1.33)
model = model.fit(X_train_iso, y_train_iso)
y_val_pred = model.predict(X_val)

f1_score(y_val, y_val_pred), precision_score(y_val, y_val_pred), recall_score(y_val, y_val_pred)

(0.8993399339933993, 0.8890701468189234, 0.9098497495826378)

In [104]:
confusion_matrix(y_val, y_val_pred)

array([[1422,   68],
       [  54,  545]], dtype=int64)

### SVM One Class

In [154]:
X_train_cl, y_train_cl = one_class(X_train_up, y_train_up)

one_class_model = svm.SVC()
one_class_model = one_class_model.fit(X_train_cl, y_train_cl)
y_val_pred = one_class_model.predict(X_val)

f1_score(y_val, y_val_pred), precision_score(y_val, y_val_pred), recall_score(y_val, y_val_pred)

(0.9008264462809917, 0.8919803600654664, 0.9098497495826378)

In [155]:
confusion_matrix(y_val, y_val_pred)

array([[1424,   66],
       [  54,  545]], dtype=int64)

In [190]:
answer = pd.DataFrame()
answer['target'] = one_class_model.predict(X_test)

In [191]:
answer.target.value_counts()

0    2126
1    1794
Name: target, dtype: int64

### Probability

In [164]:
### Trying mode with probability
from sklearn.calibration import CalibratedClassifierCV

proba_model = svm.SVC() ### подобранные параметры kernel='rbf', C=1.33)
clf = CalibratedClassifierCV(proba_model)
clf.fit(X_train_up, y_train_up)
y_proba = clf.predict_proba(X_val)

In [165]:
### Посмотрим, можно ли поиграться с cut-off для лучшего значения f1
f_scores = []
for i in range(1, 100):
    pred_prob_y = y_proba[:, 1] >= i/100
    pred_prob_y = pred_prob_y + 0
    f = f1_score(y_val, pred_prob_y)
    f_scores.append(f)

In [166]:
np.argmax(f_scores), np.max(f_scores)

(50, 0.8979253112033194)

In [184]:
y_proba = clf.predict_proba(X_val)
pred_prob_y = y_proba[:, 1] > 60/100
pred_prob_y = pred_prob_y + 0
confusion_matrix(y_val, pred_prob_y), f1_score(y_val, pred_prob_y)

(array([[1437,   53],
        [  70,  529]], dtype=int64),
 0.8958509737510585)

In [192]:
y_proba = clf.predict_proba(X_test)
pred_prob_y = y_proba[:, 1] >= 60/100
pred_prob_y = pred_prob_y + 0
answer['target_2'] = pred_prob_y

### Trying to combine probability and One class  

In [132]:
#answer[['target', 'target_ocl', 'target_prob']].value_counts()

In [133]:
y_target = answer.target_ocl|answer.target_prob
answer['target_2'] = y_target
answer[['target', 'target_2']].value_counts()

target  target_2
0       0           2115
1       1           1791
        0             10
0       1              4
dtype: int64

### RF 

In [186]:
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier(random_state=42, n_estimators=850)
rf_clf.fit(X_train_iso, y_train_iso)
y_val_pred = rf_clf.predict(X_val)
confusion_matrix(y_val, y_val_pred), f1_score(y_val, y_val_pred)

(array([[1479,   11],
        [ 282,  317]], dtype=int64),
 0.6839266450916937)

In [146]:
# f = []
# for i in range(500, 1000, 50):
#     n_estimators = i
#     rf_clf = RandomForestClassifier(random_state=42, n_estimators=n_estimators)
#     rf_clf.fit(X_train_iso, y_train_iso)
#     y_val_pred = rf_clf.predict(X_val)
#     f.append(f1_score(y_val, y_val_pred))

In [136]:
# from sklearn.ensemble import RandomForestClassifier
# rf_clf = RandomForestClassifier(random_state=42, n_estimators=1000)
# rf_clf.fit(X_train, y_train)
# y_val_pred = rf_clf.predict(X_val)
# confusion_matrix(y_val, y_val_pred), f1_score(y_val, y_val_pred)

(array([[1487,    3],
        [ 321,  278]], dtype=int64),
 0.6318181818181818)

In [193]:
### получили лишние 9 единиц по сравнению с предыдущей моделью
answer['target_3'] = rf_clf.predict(X_test)
answer[['target', 'target_2', 'target_3']].value_counts()

target  target_2  target_3
0       0         0           2116
1       1         1            992
                  0            709
        0         0             89
0       0         1             10
1       0         1              4
dtype: int64

In [198]:
answer['target'] = answer.target.values|answer.target_2.values|answer.target_3.values|answer.target_4.values
answer[['target']].value_counts()

target
0         2116
1         1804
dtype: int64

### Extra Tree

In [151]:
from sklearn.ensemble import ExtraTreesClassifier
ex_tree_clf = ExtraTreesClassifier(n_estimators=1000, max_features=27, criterion='gini', max_depth=None, random_state=42)
ex_tree_clf.fit(X_train_up, y_train_up)
y_val_pred = ex_tree_clf.predict(X_val)
confusion_matrix(y_val, y_val_pred), f1_score(y_val, y_val_pred)

(array([[1487,    3],
        [ 336,  263]], dtype=int64),
 0.608092485549133)

In [195]:
from sklearn.ensemble import ExtraTreesClassifier
ex_tree_clf = ExtraTreesClassifier(n_estimators=1000, max_features=27, criterion='gini', max_depth=None, random_state=42)
ex_tree_clf.fit(X_train, y_train)
y_val_pred = ex_tree_clf.predict(X_val)
confusion_matrix(y_val, y_val_pred), f1_score(y_val, y_val_pred)

(array([[1482,    8],
        [ 278,  321]], dtype=int64),
 0.6918103448275862)

In [196]:
answer['target_4'] = ex_tree_clf.predict(X_test)
answer[['target', 'target_2',  'target_3', 'target_4']].value_counts()

target  target_2  target_3  target_4
0       0         0         0           2116
1       1         1         1            858
                  0         0            609
                  1         0            134
                  0         1            100
        0         0         0             87
0       0         1         1              6
                            0              4
1       0         1         0              3
                  0         1              2
                  1         1              1
dtype: int64

In [197]:
answer['target'] = answer.target.values|answer.target_2.values|answer.target_3.values|answer.target_4.values
answer[['target']].value_counts()

target
0         2116
1         1804
dtype: int64

In [203]:
answer['target'].to_csv('models_ensemble.csv')