In [21]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score, accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC

In [1]:
# Mount Google Drive at /content/drive; the later cells read and write their
# CSV files under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [37]:
def postprocess_separate(submission_df, test_df=None, pure_df=None):
    """Shift the targets in submission_df so the two sides of the hyperplane don't overlap.

    A linear SVM with almost no regularization is fitted to pure_df, then a
    second one is fitted to the samples near the first model's decision
    boundary. The second model assigns every test row to class 0 or class 1.
    If the submitted targets of the two groups overlap, each group is shifted
    by a constant so that all class-0 targets end up <= -1 and all class-1
    targets end up >= 1. The order of the targets within each group is kept.

    Parameters
    ----------
    submission_df : pandas DataFrame with columns 'id' and 'target'.
        Modified in place.
    test_df : the competition's test data (101 columns, one row per
        submission row). Read from Google Drive when None.
    pure_df : the competition's original training data, shape (600000, 102).
        Read from Google Drive when None.

    Returns
    -------
    None. The result is written into submission_df['target'].

    Raises
    ------
    ValueError
        If pure_df or test_df does not have the expected shape.

    From https://www.kaggle.com/ambrosm/tpsnov21-007-postprocessing
    """
    if pure_df is None:
        pure_df = pd.read_csv('/content/drive/MyDrive/AI/dataset/tabular-playground-series-nov-2021/train.csv')
    if pure_df.shape != (600000, 102):
        raise ValueError("pure_df has the wrong shape")
    if test_df is None:
        test_df = pd.read_csv('/content/drive/MyDrive/AI/dataset/tabular-playground-series-nov-2021/test.csv')
    if test_df.shape[0] != submission_df.shape[0] or test_df.shape[1] != 101:
        raise ValueError("test_df has the wrong shape")

    def make_svm():
        # Both steps use the same pipeline: scaling + SVM with almost no regularization.
        return make_pipeline(StandardScaler(),
                             LinearSVC(C=1e5, tol=1e-7, penalty='l2', dual=False, max_iter=2000, random_state=1))

    # The feature matrix is needed several times; build it once.
    pure_features = pure_df.drop(columns=['id', 'target'])

    # Find the separating hyperplane for pure_df, step 1
    model1 = make_svm()
    model1.fit(pure_features, pure_df.target)
    pure_pred = model1.predict(pure_features)
    # Prints (misclassified, correct). The source notebook quotes "1 599999" here;
    # the counts depend on the training data that is supplied.
    print((pure_pred != pure_df.target).sum(), (pure_pred == pure_df.target).sum())

    # Find the separating hyperplane for pure_df, step 2
    # Fit a second SVM to a subset of the points which contains the support vectors
    margin = model1.decision_function(pure_features)
    subset_df = pure_df[(margin > -5) & (margin < 0.9)]
    model2 = make_svm()
    model2.fit(subset_df.drop(columns=['id', 'target']), subset_df.target)
    pure_pred = model2.predict(pure_features)
    # Prints (misclassified, correct). The source notebook quotes "0 600000" here.
    print((pure_pred != pure_df.target).sum(), (pure_pred == pure_df.target).sum())

    # Assign every test row to one side of the hyperplane.
    # 'target' is dropped only if present (errors='ignore').
    pure_test_pred = model2.predict(test_df.drop(columns=['id', 'target'], errors='ignore'))
    is_left = pure_test_pred == 0
    is_right = pure_test_pred == 1

    # With one side empty there is nothing to overlap (and max()/min() would be NaN).
    if not is_left.any() or not is_right.any():
        print("There is no overlap. No postprocessing needed.")
        return

    # Operate on the frame that was passed in, not on a notebook-level global.
    lmax = submission_df[is_left].target.max()
    rmin = submission_df[is_right].target.min()
    if lmax < rmin:
        print("There is no overlap. No postprocessing needed.")
        return

    # There is overlap. Remove this overlap:
    # the left side's maximum becomes -1, the right side's minimum becomes +1.
    submission_df.loc[is_left, 'target'] -= lmax + 1
    submission_df.loc[is_right, 'target'] -= rmin - 1
    print(submission_df[is_left].target.min(), submission_df[is_left].target.max(),
          submission_df[is_right].target.min(), submission_df[is_right].target.max())
    




In [38]:
# Load the sample submission, post-process its targets in place, save the result,
# and preview the first rows.
sample_submission_path = '/content/drive/MyDrive/AI/dataset/tabular-playground-series-nov-2021/sample_submission.csv'
output_path = "/content/drive/MyDrive/AI/dataset/tabular-playground-series-nov-2021/svm_statistic_sub"

sub = pd.read_csv(sample_submission_path)
postprocess_separate(submission_df=sub)  # returns None; works on `sub` in place
sub.to_csv(output_path, index=False)
sub.head(n=20)

157559 442441
157657 442343
-1.0 -1.0 1.0 1.0


Unnamed: 0,id,target
0,600000,1.0
1,600001,1.0
2,600002,1.0
3,600003,-1.0
4,600004,1.0
5,600005,-1.0
6,600006,-1.0
7,600007,1.0
8,600008,1.0
9,600009,-1.0


In [None]:
    # Print five classification scores and draw them as a bar chart.
    # NOTE(review): in the visible notebook `subset_df` and `pure_pred` are only
    # assigned inside postprocess_separate(); they must exist at notebook scope
    # (with matching lengths) before this cell can run — confirm.
    y_true, y_pred = subset_df.target, pure_pred

    # Compute each score once; the same values feed the printout and the plot.
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    auc = roc_auc_score(y_true, y_pred)

    print('정확도(accuracy_score) : ', accuracy)
    print('정밀도(precision_score) : ', precision)
    print('재현율(recall_score) : ', recall)
    print('F1 : ', f1)
    print('AUC : ', auc)

    x = np.array([accuracy, precision, recall, f1, auc])
    label = ['accuracy', 'precision', 'recall_score', 'f1_score', 'roc_auc']
    index = np.arange(len(label))

    fig, ax = plt.subplots()
    ax.bar(index, x, width=0.5)
    ax.set_title('evaluation index', fontsize=20)
    # The bars are fractions in [0, 1] (see the y-limits below), not percentages.
    ax.set_ylabel('score', fontsize=18)
    # One tick per metric, labelled vertically.
    ax.set_xticks(index)
    ax.set_xticklabels(label, fontsize=15, rotation=90)
    ax.set_ylim([0, 1])  # y-axis range: [ymin, ymax]
    plt.show()
