In [705]:
from imblearn.pipeline import make_pipeline
from sklearn.svm import LinearSVC
import pandas as pd 
import numpy as np 
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score

In [706]:
train = pd.read_csv("train_data.csv")

In [707]:
def clip_df(df, factor=2):
    for col in df.columns.values[:-1]:
        quantile25 = df[col].quantile(0.25)
        quantile75 = df[col].quantile(0.75)
        d = (quantile75-quantile25)*factor
        df[col] = np.clip(df[col], quantile25-d, quantile75+d)
    return df

In [708]:
def normalize_df(df):
    for col in df.columns.values[:-1]:
        mean = df[col].mean()
        std = df[col].std()
        df[col] = (df[col]-mean)/std
    return df

In [709]:
train = clip_df(train)
train = normalize_df(train)

In [710]:
train.shape

(7500, 301)

In [711]:
train.quantile([i/20 for i in range(21)]).head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,291,292,293,294,295,296,297,298,299,target
0.0,-3.348112,-3.374773,-3.382031,-3.393624,-3.392549,-3.37549,-3.34832,-3.413993,-3.389158,-3.38718,...,-3.353443,-3.354548,-3.358941,-3.347704,-3.363271,-3.388695,-3.320075,-3.26333,-3.390626,0.0
0.05,-1.655218,-1.652724,-1.634289,-1.644554,-1.631531,-1.66812,-1.660318,-1.640756,-1.624919,-1.66371,...,-1.667666,-1.640746,-1.65806,-1.661641,-1.672341,-1.64989,-1.671809,-1.664678,-1.648658,0.0
0.1,-1.274972,-1.288542,-1.279643,-1.286991,-1.285936,-1.299514,-1.282394,-1.284627,-1.277103,-1.298681,...,-1.280076,-1.275981,-1.297865,-1.281816,-1.272426,-1.282155,-1.287433,-1.287924,-1.287983,0.0


# Benchmark

In [712]:
from sklearn.model_selection import train_test_split

In [713]:
feature_cols = train.columns.values[:-1]
target_col = train.columns.values[-1]
X = train.loc[:, feature_cols]
Y = train.loc[:, target_col]
X.shape, Y.shape

((7500, 300), (7500,))

In [714]:
Y.value_counts()

0    6879
1     621
Name: target, dtype: int64

In [715]:
#test data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, stratify=Y, test_size=0.2)
X_train.shape, X_test.shape, Y_train.shape, Y_test.shape

((6000, 300), (1500, 300), (6000,), (1500,))

In [716]:
Y_train.value_counts()

0    5503
1     497
Name: target, dtype: int64

In [717]:
model0 = LinearSVC(random_state=42, tol = 0.1, C=0.1, class_weight={0:10, 1:50})
model0.fit(X_train,Y_train)
roc_auc_score(model0.predict(X_test), Y_test), 'f1', f1_score(model0.predict(X_test), Y_test)



(0.6579903147699758, 'f1', 0.30769230769230765)

## confusion matrix

In [718]:
import sklearn
matrix = sklearn.metrics.confusion_matrix
matrix(model0.predict(X_test), Y_test), matrix(model0.predict(X_train), Y_train)

(array([[1324,   92],
        [  52,   32]], dtype=int64),
 array([[5312,  322],
        [ 191,  175]], dtype=int64))

# features selection

In [719]:
model0.coef_[0][:3]

array([-0.02736833, -0.02336432,  0.00289404])

In [720]:
select_num = 150
arr = model0.coef_[0]
selected_ind = [ind for coeff, ind in sorted([ (abs(arr[i]), i) for i in range(len(arr))])[-select_num:]]

In [761]:
selected_features = feature_cols[selected_ind]

In [722]:
X_s = train.loc[:, selected_features]
Y_s = train.loc[:, 'target']
print(X_s.shape, Y_s.shape)
X_train_s, X_test_s, Y_train_s, Y_test_s = train_test_split(X_s, Y_s, stratify=Y_s, test_size=0.2)

(7500, 150) (7500,)


In [723]:
# model2 = LinearSVC(random_state=42, class_weight={0:1, 1:5})
model2 = LinearSVC(random_state=42, tol=0.1, C=0.1, class_weight={0:10, 1:50})
model2.fit(X_train_s,Y_train_s)
roc_auc_score(model2.predict(X_test_s), Y_test_s), 'f1', f1_score(model2.predict(X_test_s), Y_test_s)



(0.6614126880313931, 'f1', 0.35897435897435903)

In [724]:
matrix = sklearn.metrics.confusion_matrix
matrix(model2.predict(X_test_s), Y_test_s), matrix(model2.predict(X_train_s), Y_train_s)

(array([[1308,   82],
        [  68,   42]], dtype=int64),
 array([[5276,  318],
        [ 227,  179]], dtype=int64))

# data augmentation

In [752]:
pred = model0.predict(X_train)

In [753]:
false_pos_ind = [i for i in range(len(Y_train)) if pred[i] == 0 and Y_train.values[i]==1]

In [754]:
aug_boundary = pd.Series(model0.decision_function(X)[false_pos_ind]).quantile(0.80)
aug_boundary

-0.34199387604134046

In [755]:
aug_index = [ind for ind in false_pos_ind if model0.decision_function(X)[ind] > aug_boundary ]

In [756]:
X_false_pos = np.array(X_train)[aug_index]
Y_false_pos = np.array(Y_train)[aug_index]
X_false_pos.shape, Y_false_pos.shape 

((65, 300), (65,))

In [757]:
X_aug = np.concatenate([X_train, X_false_pos])
Y_aug = np.concatenate([Y_train, Y_false_pos])

In [758]:
X_aug.shape

(6065, 300)

In [759]:
model_aug = LinearSVC(random_state=42, tol=0.1, C=0.1, class_weight={0:10, 1:50})
model_aug.fit(X_aug, Y_aug)
roc_auc_score(model_aug.predict(X_test), Y_test),  'f1', f1_score(model_aug.predict(X_test), Y_test)



(0.6285590856027279, 'f1', 0.30042918454935624)

In [760]:
import sklearn
matrix = sklearn.metrics.confusion_matrix
matrix(model_aug.predict(X_test), Y_test), matrix(model_aug.predict(X_train), Y_train)

(array([[1302,   89],
        [  74,   35]], dtype=int64),
 array([[5220,  303],
        [ 283,  194]], dtype=int64))

# GridSearchCV

In [750]:
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV
iris = datasets.load_iris()
parameters = {'kernel':('linear', 'rbf'), 'C':[i for i in range(1,10)]}
svc = svm.SVC()
clf = GridSearchCV(svc, parameters)
clf.fit(iris.data, iris.target)
sorted(clf.cv_results_)

['mean_fit_time',
 'mean_score_time',
 'mean_test_score',
 'param_C',
 'param_kernel',
 'params',
 'rank_test_score',
 'split0_test_score',
 'split1_test_score',
 'split2_test_score',
 'split3_test_score',
 'split4_test_score',
 'std_fit_time',
 'std_score_time',
 'std_test_score']

In [751]:
clf.best_params_

{'C': 5, 'kernel': 'rbf'}

# test make_pipeline

In [618]:

from collections import Counter

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from imblearn.datasets import make_imbalance
from imblearn.under_sampling import NearMiss
from imblearn.pipeline import make_pipeline
from imblearn.metrics import classification_report_imbalanced

print(__doc__)

RANDOM_STATE = 42

# Create a folder to fetch the dataset
iris = load_iris()
X, y = make_imbalance(
    iris.data,
    iris.target,
    sampling_strategy={0: 25, 1: 50, 2: 50},
    random_state=RANDOM_STATE,
)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=RANDOM_STATE)

print(f"Training target statistics: {Counter(y_train)}")
print(f"Testing target statistics: {Counter(y_test)}")

# Create a pipeline
pipeline = make_pipeline(
    NearMiss(version=2), StandardScaler(), LogisticRegression(random_state=RANDOM_STATE)
)
pipeline.fit(X_train, y_train)

# Classify and report the results
print(classification_report_imbalanced(y_test, pipeline.predict(X_test)))

Automatically created module for IPython interactive environment
Training target statistics: Counter({1: 38, 2: 38, 0: 17})
Testing target statistics: Counter({1: 12, 2: 12, 0: 8})
                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      1.00      1.00      1.00      1.00      1.00         8
          1       0.88      0.58      0.95      0.70      0.74      0.53        12
          2       0.69      0.92      0.75      0.79      0.83      0.70        12

avg / total       0.84      0.81      0.89      0.81      0.84      0.71        32

