In [22]:
import pandas as pd
import numpy as np

import gc

from imblearn.over_sampling import SMOTE

import catboost as cb
from sklearn import model_selection, metrics, linear_model, ensemble, naive_bayes, neighbors, svm, tree
from xgboost import XGBClassifier

In [20]:
bold = ['\033[1m', '\033[0m']

In [2]:
train = pd.read_csv(r'./data/train.csv')
test = pd.read_csv(r'./data/test.csv')
ss = pd.read_csv(r'./data/sample_submission.csv')

In [3]:
print(train.shape, test.shape)

(15304, 12) (10204, 11)


In [7]:
train.isnull().sum()

id                   0
gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

In [8]:
train.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,0,Male,28.0,0,0,Yes,Private,Urban,79.53,31.1,never smoked,0
1,1,Male,33.0,0,0,Yes,Private,Rural,78.44,23.9,formerly smoked,0
2,2,Female,42.0,0,0,Yes,Private,Rural,103.0,40.3,Unknown,0
3,3,Male,56.0,0,0,Yes,Private,Urban,64.87,28.8,never smoked,0
4,4,Female,24.0,0,0,No,Private,Rural,73.36,28.8,never smoked,0


In [9]:
train['stroke'].value_counts()

0    14672
1      632
Name: stroke, dtype: int64

In [10]:
target = train['stroke']

data = pd.concat([train.drop('stroke', axis = 1), test]).reset_index(drop = True)

data.shape

(25508, 11)

In [11]:
text_features = []

for column in data.columns:
    if data[column].dtype == 'object':
        text_features.append(column)

for text_feature in text_features:
    data = data.join(pd.get_dummies(data[text_feature]))
    data = data.drop(text_feature, axis = 1)

In [12]:
data.head()

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,Female,Male,Other,No,...,Never_worked,Private,Self-employed,children,Rural,Urban,Unknown,formerly smoked,never smoked,smokes
0,0,28.0,0,0,79.53,31.1,0,1,0,0,...,0,1,0,0,0,1,0,0,1,0
1,1,33.0,0,0,78.44,23.9,0,1,0,0,...,0,1,0,0,1,0,0,1,0,0
2,2,42.0,0,0,103.0,40.3,1,0,0,0,...,0,1,0,0,1,0,1,0,0,0
3,3,56.0,0,0,64.87,28.8,0,1,0,0,...,0,1,0,0,0,1,0,0,1,0
4,4,24.0,0,0,73.36,28.8,1,0,0,1,...,0,1,0,0,1,0,0,0,1,0


In [13]:
df_train = data.iloc[:len(target), :]

df_test = data.iloc[len(target):, :]

df_train.shape, target.shape, df_test.shape

((15304, 22), (15304,), (10204, 22))

In [14]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(df_train.drop('id', axis = 1), target, random_state = 42)
    
# sm = SMOTE(random_state = 42)
# X_train, y_train = sm.fit_resample(X_train, y_train)

In [11]:
# from lazypredict.Supervised import LazyClassifier

# clf = LazyClassifier(verbose=1,ignore_warnings=True, custom_metric=None)

# models,predictions = clf.fit(X_train, X_test, y_train, y_test)

# print(models)

In [2]:
FOLDS = 10
seed = 69

def norm_0to1(preds):
    return (preds - np.min(preds)) / (np.max(preds) - np.min(preds))

cb_params = {
    'depth': 3,
    'learning_rate': 0.01,
    'rsm': 0.5,
    'subsample': 0.931,
    'l2_leaf_reg': 69,
    'min_data_in_leaf': 20,
    'random_strength': 0.175,
    
    'random_seed': seed,
    'use_best_model': True,
    'task_type': 'CPU',
    'bootstrap_type': 'Bernoulli',
    'grow_policy': 'SymmetricTree',
    'loss_function': 'Logloss',
    'eval_metric': 'AUC'
}

f_imp = pd.DataFrame({'feature': df_train.columns})
predictions, scores = np.zeros(len(df_test)), []

k = model_selection.StratifiedKFold(n_splits=FOLDS, random_state=seed, shuffle=True)
for fold, (train_idx, val_idx) in enumerate(k.split(df_train, target)):    
    cb_train = cb.Pool(data=df_train.iloc[train_idx],
                       label=target.iloc[train_idx])
    cb_valid = cb.Pool(data=df_train.iloc[val_idx],
                       label=target.iloc[val_idx])
    
    model = cb.train(params=cb_params,
                     dtrain=cb_train,
                     num_boost_round=10000,
                     evals=cb_valid, 
                     early_stopping_rounds=500,
                     verbose=False)
    
    f_imp['fold_'+str(fold+1)] = model.get_feature_importance()
    val_preds = model.predict(cb_valid)
    val_score = metrics.roc_auc_score(target.iloc[val_idx], val_preds)
    scores.append(val_score)
    
    predictions += model.predict(df_test) / FOLDS
    print(f'- FOLD {fold+1} AUC: {bold[0]}{round(val_score, 4)}{bold[1]} -')
    
    del cb_train, cb_valid, val_preds, val_score, model
    gc.collect()

print('*'*45)
print(f'Mean AUC: {bold[0]}{round(np.mean(scores), 4)}{bold[1]}')

predictions = norm_0to1(predictions)
f_importance_plot(f_imp)
preds_plot(predictions)

NameError: name 'pd' is not defined

In [1]:
predictions = norm_0to1(predictions)
predictions


KeyboardInterrupt



In [29]:
cb_preds = predictions
ss['stroke'] = predictions
ss.to_csv('cb_submission.csv', index=False)

In [12]:
def train_and_report(model):
    
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_test)
    
    print(metrics.classification_report(y_test, y_pred))
    print(metrics.roc_auc_score(y_test, y_pred))
    
    return model

In [13]:
model = train_and_report(naive_bayes.BernoulliNB())

              precision    recall  f1-score   support

           0       0.96      0.99      0.98      3667
           1       0.26      0.05      0.08       159

    accuracy                           0.95      3826
   macro avg       0.61      0.52      0.53      3826
weighted avg       0.93      0.95      0.94      3826

0.5220211541660877


In [14]:
import lightgbm
model = train_and_report(lightgbm.LGBMClassifier())

              precision    recall  f1-score   support

           0       0.96      0.99      0.98      3667
           1       0.32      0.06      0.10       159

    accuracy                           0.96      3826
   macro avg       0.64      0.53      0.54      3826
weighted avg       0.93      0.96      0.94      3826

0.5257112132173233


In [78]:
submission = df_test.copy()

submission['output'] = model.predict(submission.drop('id', axis = 1))
submission[['no_stroke', 'stroke']] = model.predict_proba(submission.drop(['id', 'output'], axis = 1))

In [79]:
submission[['id', 'stroke']].to_csv('submission.csv', index = False)