In [1]:
# env setting
import sys
sys.path.append("../src")
sys.path.append("../models")

import numpy as np
import pandas as pd

from sklearn.metrics import f1_score, precision_score
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
# from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer

# model import
from sklearn.ensemble import GradientBoostingClassifier, HistGradientBoostingClassifier, RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

#custom function
import config
import helpers
from custom_pipeline import Custom_Pipeline

### Binary

In [33]:
(train, test, origin, submission) = helpers.data_loader()
train.head(5)

train = train.replace('None', np.NaN)
test = test.replace('None', np.NaN)
origin = origin.replace('None', np.NaN)

categorical_features = config.CATEGORICAL_FEATURES
# categorical_features.append('number_of_treatements')

target = 'outcome'

numerical_features = list(set(train.columns) - set(categorical_features) - set(config.USELESS_FEATURES) - set([target]))

train = pd.concat(
    [train, origin], ignore_index=True
)

train_enth_ind = train.loc[train['outcome'] == 'euthanized'].index
train = train.drop_duplicates()

print(train.shape)
print(test.shape)

(1534, 29)
(824, 28)


In [73]:
# 안락사
X_euth = train.copy()
X_euth_test = test.copy()

train['outcome'] = train['outcome'].map({
    'died' : 0,
    'euthanized' : 1,
    'lived' : 0
})

y = train.outcome
USECOLS = categorical_features
DROPCOLS = ['lesion_2', 'lesion_3', 'id']
ALPHA = 0.5

pipe = Custom_Pipeline(X_euth, y)
X_euth = pipe.fit_transform(USECOLS, ALPHA, DROPCOLS)
X_euth_test = pipe.transform(X_euth_test, USECOLS, ALPHA, DROPCOLS)
print(X_euth.shape)

# Modeling
model = XGBClassifier()

x_train, x_val, y_train, y_val = train_test_split(X_euth, y, test_size=0.1, stratify=y, random_state=42)
folds = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

scores = cross_val_score(model, x_train, y_train,
                         cv=folds,
                         scoring='f1',
                         n_jobs=-1)

print(scores)
print(np.mean(scores))

#fit model to training data
model.fit(x_train, y_train)
prediction = model.predict(x_val)
#test our model on the test data
print(f1_score(y_val, prediction))
print(precision_score(y_val, prediction))

(1534, 27)
[0.70103093 0.66666667 0.66666667 0.6122449  0.48101266]
0.6255243634710833
[LightGBM] [Info] Number of positive: 265, number of negative: 1115
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000248 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1002
[LightGBM] [Info] Number of data points in the train set: 1380, number of used features: 27
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.192029 -> initscore=-1.436880
[LightGBM] [Info] Start training from score -1.436880
0.7924528301886793
0.9130434782608695


In [35]:
model.fit(X_euth, y)
euth_prediction = model.predict(X_euth_test)
euth_prediction

array([0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,

In [36]:
euth_prediction_df = pd.Series(euth_prediction)
euth_test_index = euth_prediction_df.loc[euth_prediction_df == 1].index
euth_test_index

Int64Index([  3,  12,  27,  33,  37,  39,  43,  58,  74,  81,
            ...
            763, 767, 786, 793, 795, 800, 804, 810, 815, 820],
           dtype='int64', length=103)

In [72]:
# 죽음 - 생존
(train, test, origin, submission) = helpers.data_loader()

train = train.replace('None', np.NaN)
test = test.replace('None', np.NaN)
origin = origin.replace('None', np.NaN)

target = 'outcome'

numerical_features = list(set(train.columns) - set(categorical_features) - set(config.USELESS_FEATURES) - set([target]))

train = pd.concat(
    [train, origin], ignore_index=True
)

train_enth_ind = train.loc[train['outcome'] == 'euthanized'].index
train = train.drop_duplicates()

print(train.shape)
print(test.shape)

# Split Died Lived

train_died_lived = train.drop(train_enth_ind, axis=0)
test_died_lived = test.drop(euth_test_index, axis=0)

X_died_lived = train_died_lived.copy()
X_died_lived_test = test_died_lived.copy()


train_died_lived['outcome'] = train_died_lived['outcome'].map({
    'died' : 0,
    'lived' : 1
})

y = train_died_lived['outcome']
USECOLS = categorical_features
DROPCOLS = ['lesion_2', 'lesion_3', 'id']
ALPHA = 0.5

pipe = Custom_Pipeline(X_died_lived, y)
X_died_lived = pipe.fit_transform(USECOLS, ALPHA, DROPCOLS)
X_died_lived_test = pipe.transform(X_died_lived_test, USECOLS, ALPHA, DROPCOLS)
print(X_died_lived.shape)

# Modeling
model = XGBClassifier(**config.XGB_PARAMS)

x_train, x_val, y_train, y_val = train_test_split(X_died_lived, y, test_size=0.1, stratify=y, random_state=42)
folds = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

scores = cross_val_score(model, x_train, y_train,
                         cv=folds,
                         scoring='f1',
                         n_jobs=-1)

print(scores)
print(np.mean(scores))

#fit model to training data
model.fit(x_train, y_train)
prediction = model.predict(x_val)
#test our model on the test data
print(f1_score(y_val, prediction))
print(precision_score(y_val, prediction))

(1534, 29)
(824, 28)
(1239, 27)
[0.89298893 0.89530686 0.88475836 0.88967972 0.85818182]
0.8841831373783304
0.8399999999999999
0.84


In [24]:
model.fit(X_died_lived, y)
died_lived_prediction = model.predict(X_died_lived_test)
died_lived_prediction

array([1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0,
       1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1,
       0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1,
       0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1,
       1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0,
       0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0,
       0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0,
       1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0,
       0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0,
       1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1,

In [49]:
euth_prediction_df = pd.Series(euth_prediction)
euth_prediction_df.value_counts()

0    721
1    103
dtype: int64

In [50]:
euth_prediction_df = euth_prediction_df.replace({
    1 : 'euthanized'
})
euth_prediction_df.value_counts()

0             721
euthanized    103
dtype: int64

In [51]:
died_lived_prediction_df = pd.Series(died_lived_prediction)
died_lived_prediction_df = died_lived_prediction_df.replace({
    0 : 'died',
    1 : 'lived'
})
died_lived_prediction_df.value_counts()
# euth_prediction_df.loc[euth_prediction_df == 0] = died_lived_prediction_df

lived    409
died     312
dtype: int64

In [58]:
submission = euth_prediction_df.copy()
submission.loc[submission == 0] = died_lived_prediction_df.values

In [60]:
submission.value_counts()
#submission.head()

lived         409
died          312
euthanized    103
dtype: int64

In [62]:
sample_submission = pd.read_csv(config.SUBMISSION_FILE)

sample_submission['outcome'] = submission.values
sample_submission.to_csv('../output/sample_submission_V6(two model - euth only, died-lived binary).csv',index=False)
sample_submission

Unnamed: 0,id,outcome
0,1235,lived
1,1236,died
2,1237,lived
3,1238,euthanized
4,1239,lived
...,...,...
819,2054,died
820,2055,euthanized
821,2056,died
822,2057,lived


In [4]:
import pandas as pd
import numpy as np

from sklearn.metrics import f1_score, precision_score
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression


sub1 = pd.read_csv("../input/sample_submission_V4(ensemble_sklearn 2nd).csv")
sub2 = pd.read_csv("../input/0.7500submission_1.csv")

sub_merge = pd.merge(sub1, sub2, 'inner', 'id').drop(['id'], axis=1)

In [5]:
meta_model = LogisticRegression()
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

cross_val_score(meta_model, sub_merge)

Unnamed: 0,id,outcome_x,outcome_y
0,1235,lived,lived
1,1236,died,lived
2,1237,lived,lived
3,1238,euthanized,euthanized
4,1239,lived,lived
...,...,...,...
819,2054,died,died
820,2055,euthanized,euthanized
821,2056,died,lived
822,2057,lived,lived


### sklearn MLP

In [2]:
(train, test, origin, submission) = helpers.data_loader()
train.head(5)

train = train.replace('None', np.NaN)
test = test.replace('None', np.NaN)
origin = origin.replace('None', np.NaN)

categorical_features = config.CATEGORICAL_FEATURES
categorical_features.append('number_of_treatements')
# categorical_features.remove('lesion_2')
# categorical_features.remove('lesion_3')
target = 'outcome'

numerical_features = list(set(train.columns) - set(categorical_features) - set(config.USELESS_FEATURES) - set([target]))

train = pd.concat(
    [train, origin], ignore_index=True
)
train = train.drop_duplicates()

print(train.shape)
print(test.shape)

(1534, 29)
(824, 28)


In [3]:
X_tr = train.copy()
X_test = test.copy()

train['outcome'] = train['outcome'].map({
    'died' : 0,
    'euthanized' : 1,
    'lived' : 2
})

y = train.outcome
USECOLS = categorical_features
DROPCOLS = ['lesion_2', 'lesion_3', 'id']
ALPHA = 0.5

pipe = Custom_Pipeline(X_tr, y)
X_tr = pipe.fit_transform(USECOLS, ALPHA, DROPCOLS)
X_test = pipe.transform(X_test, USECOLS, ALPHA, DROPCOLS)
print(X_tr.shape)

(1534, 27)


In [7]:
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler  
from sklearn.neural_network import MLPClassifier 
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import classification_report, confusion_matrix

#Load the digits data:

X_train, X_test, y_train, y_test = train_test_split(X_tr, y, test_size=0.1, random_state=42, stratify=y)

folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Finally for the MLP- Multilayer Perceptron
mlp = MLPClassifier(hidden_layer_sizes=(16, 32, 16, 8), max_iter=1000, random_state=42,
                    activation='relu', solver='adam', early_stopping=True)

clf = BaggingClassifier(mlp, n_estimators=300, n_jobs=-1, random_state=42)

scores = cross_val_score(clf, X_train, y_train, cv=folds, n_jobs=-1)
print(scores)
print(np.mean(scores))

[0.72101449 0.71014493 0.65942029 0.7173913  0.63043478]
0.68768115942029


In [5]:
y_pred = clf.predict(X_test)
cm = confusion_matrix(y_test, y_pred, labels=clf.classes_)
print(cm)

[[108   5  33]
 [ 23  39  24]
 [ 24   5 200]]


In [6]:
f1_score(y_test, y_pred, average='micro')

0.7527114967462039