In [None]:
from sklearn.model_selection import KFold
from lightgbm import LGBMClassifier
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.manifold import TSNE
import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

### Exploratory Data Analysis

In [None]:
# Show up to 100 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 100)

In [None]:
# Load the competition files. Only the sample submission is read with ';' as
# the separator; train and test rely on read_csv's default ','.
# NOTE(review): confirm sample_submission.csv really is ';'-delimited.
sample = pd.read_csv('/kaggle/input/training2/sample_submission.csv', sep=";")
test = pd.read_csv('/kaggle/input/training2/test.csv')
train = pd.read_csv('/kaggle/input/training2/train.csv')

In [None]:
# Show the Python type of the object returned by read_csv.
type(sample)

In [None]:
# Preview the first rows of the sample submission.
sample.head()

In [None]:
# Preview the first rows of the training data.
train.head()

In [None]:
# Selecting a single column by name returns a Series; preview its first values.
train['avg_glucose_level'].head()

In [None]:
# List the column names of the training data.
train.columns

In [None]:
# Per-column dtype and non-null count, which shows where values are missing.
train.info()

In [None]:
# Summary statistics of the training data.
train.describe()

In [None]:
# Histogram of avg_glucose_level over the training rows.
train['avg_glucose_level'].hist()

In [None]:
# Histograms of every numeric column. DataFrame.hist creates its own figure,
# so the size is passed to it directly; a separate plt.figure(figsize=...)
# call only left an empty figure behind and did not size the histograms.
train.hist(figsize=(12, 8));

In [None]:
# Boolean-mask indexing: keep the rows whose bmi is above 65.
train.loc[train['bmi'].gt(65)]

In [None]:
# Combine two conditions: older than 65 and stroke equal to 0.
train.loc[train['age'].gt(65) & train['stroke'].eq(0)]

In [None]:
# Median bmi among the rows with stroke == 1.
train['bmi'][train['stroke'].eq(1)].median()

In [None]:
# Median bmi among the rows with stroke == 0.
train['bmi'][train['stroke'].eq(0)].median()

In [None]:
# Grouping: report the median bmi separately for each value of stroke.
for stroke_flag, group in train.groupby('stroke'):
    if stroke_flag == 1:
        label = 'people with stroke'
    else:
        label = 'people without stroke'
    print('Median BMI for {} is {}'.format(label, group['bmi'].median()))

In [None]:
# Median bmi for every (gender, stroke) combination.
train.groupby(['gender', 'stroke'])['bmi'].median()

In [None]:
# Cross-tabulation: number of rows for each gender / stroke combination.
pd.crosstab(train['gender'], train['stroke'])

### Visualisation

In [None]:
# Box plot of age over all training rows.
sns.boxplot(x='age', data=train)

In [None]:
# Box plot of age, drawn separately for each value of stroke.
sns.boxplot(x='stroke', y='age', data=train)

### Visualisation of whole dataset

In [None]:
# Pairwise correlations between the numeric columns. Non-numeric columns are
# removed first: recent pandas versions raise on them instead of silently
# skipping them, and select_dtypes also works on older versions.
train.select_dtypes(include='number').corr()


In [None]:
# Heatmap of the correlation matrix of the numeric columns. Non-numeric
# columns are removed first because recent pandas versions raise on them
# inside corr() instead of silently skipping them.
sns.heatmap(train.select_dtypes(include='number').corr());

### Numeric

In [None]:
# Box plot of avg_glucose_level over all training rows.
sns.boxplot(x = 'avg_glucose_level', data = train)

In [None]:
# Histogram of avg_glucose_level; the trailing ';' hides the returned object's repr.
train['avg_glucose_level'].hist();

In [None]:
# Number of training rows per work_type value.
sns.countplot(x='work_type', data = train);

In [None]:
# Scatter plot of avg_glucose_level (x axis) against bmi (y axis).
plt.scatter(train['avg_glucose_level'], train['bmi']);

In [None]:
# Correlation of each numeric column with bmi. Non-numeric columns are removed
# first: recent pandas versions raise on them instead of silently skipping them.
train.select_dtypes(include='number').corrwith(train['bmi'])

In [None]:
# Cross-tabulation: number of rows for each gender / work_type combination.
pd.crosstab(train['gender'], train['work_type'])

In [None]:
# Row counts per gender, with one bar per work_type inside each gender.
sns.countplot(x='gender', hue = 'work_type', data = train);

In [None]:
# Pairwise scatter-plot grid of the numeric features.
# `numeric` is reused by later cells (NaN imputation and the derivation of the
# categorical column list), so this cell must run before them.
numeric = ['age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi']
sns.pairplot(train[numeric])

### Selecting features

In [None]:

# Replace missing numeric values with the column mean.
# The mean is always taken from the training data and applied to both frames:
# filling the test set with its own mean would impute train and test with
# different values and make preprocessing depend on the test set.
for col in numeric:
    train_mean = train[col].mean()
    train[col] = train[col].fillna(train_mean)
    test[col] = test[col].fillna(train_mean)


In [None]:
# Categorical columns = every training column that is neither numeric, the id,
# nor the target. Built in column order so the list is identical on every run
# (a set difference yields an arbitrary, run-dependent order).
_non_categoric = set(numeric + ['id', 'stroke'])
categoric = [col for col in train.columns if col not in _non_categoric]

In [None]:
# Missing categorical values become the literal string 'nan' in both frames.
for frame in (train, test):
    for col in categoric:
        frame[col] = frame[col].fillna('nan')

In [None]:
# Encode ever_married as 0/1 in both frames; values that are not keys of the
# mapping are left unchanged by replace().
ever_married_num = {'No' : 0, 'Yes' : 1}

for frame in (train, test):
    frame['ever_married'] = frame['ever_married'].replace(ever_married_num)

In [None]:
# Encode Residence_type as 0/1 in both frames; values that are not keys of the
# mapping are left unchanged by replace().
Residence_type_num = {'Rural' : 0, 'Urban' : 1}

for frame in (train, test):
    frame['Residence_type'] = frame['Residence_type'].replace(Residence_type_num)

In [None]:
# Encode gender as 0/1 in both frames.
# NOTE(review): 'Other' shares code 0 with 'Male' — confirm this is intended.
gender_num = {'Other' : 0, 'Male' : 0, 'Female' : 1}

for frame in (train, test):
    frame['gender'] = frame['gender'].replace(gender_num)

In [None]:
# One-hot encode the remaining text columns.
train_num = pd.get_dummies(train)
test_num = pd.get_dummies(test)

# The two frames are encoded independently, so a category that occurs in only
# one of them would give train and test different columns. Give the test frame
# exactly the training feature columns (everything except the target), in the
# same order; dummy columns absent from test are filled with 0.
_feature_columns = [col for col in train_num.columns if col != 'stroke']
test_num = test_num.reindex(columns=_feature_columns, fill_value=0)

### Algorithms

In [None]:
# Cross-validation

def cross_validation_for_roc_auc(clf, X, y, cv=5):
    """Return the ROC AUC obtained on each fold of a K-fold cross-validation.

    Parameters
    ----------
    clf : estimator exposing ``fit(X, y)`` and ``predict_proba(X)``.
        It is refit in place on every fold, so after the call it holds the
        fit from the last fold.
    X : features; copied and converted to a NumPy array, then indexed by row.
    y : targets; copied and converted to a NumPy array.
    cv : number of folds handed to ``KFold(n_splits=cv)``.

    Returns
    -------
    list
        One ``roc_auc_score`` per fold, in the order the folds are produced.
        The score is computed from column 1 of ``predict_proba``.
    """
    features = np.array(X.copy())
    targets = np.array(y.copy())

    def _fold_auc(fit_idx, eval_idx):
        # Fit on the training part of the fold, score on the held-out part.
        clf.fit(features[fit_idx], targets[fit_idx])
        held_out_scores = clf.predict_proba(features[eval_idx])[:, 1]
        return roc_auc_score(y_true=targets[eval_idx], y_score=held_out_scores)

    splitter = KFold(n_splits=cv)
    return [_fold_auc(fit_idx, eval_idx)
            for fit_idx, eval_idx in splitter.split(features)]

In [None]:
# Separate the target from the model inputs.
# The target is read from `train` (get_dummies leaves numeric columns as they
# are) and the drop tolerates an already-removed column, so re-running this
# cell no longer fails once 'stroke' has been dropped from train_num.
y = train['stroke']
train_num = train_num.drop(columns='stroke', errors='ignore')

In [None]:
# Baseline gradient boosting model, scored with cross-validated ROC AUC.
alg1 = GradientBoostingClassifier(n_estimators=100, max_depth=2, random_state=1)
cv1 = cross_validation_for_roc_auc(alg1, train_num, y)
print(cv1)
print(np.mean(cv1))

# Write this model's predictions to submit1.csv. Previously the untouched
# sample submission was saved, so the file contained no model output at all.
alg1.fit(train_num, y)
# train and test were one-hot encoded separately; give the test frame exactly
# the training columns, in the same order, filling absent ones with 0.
test_features = test_num.reindex(columns=train_num.columns, fill_value=0)
submission = sample.copy()
submission['stroke'] = alg1.predict_proba(test_features)[:, 1]
submission.to_csv('submit1.csv', index=False)

In [None]:
# Score each model input on its own to see how much signal it carries alone.
# The columns come from train_num, the frame that is actually indexed below:
# train.columns still lists 'stroke' and the pre-encoding categorical names,
# which do not exist in train_num and raised a KeyError.
all_features = train_num.columns
features = {}
for f in all_features:
    # Separate name so the baseline scores stored in cv1 are not overwritten.
    single_feature_scores = cross_validation_for_roc_auc(alg1, train_num[[f]], y)
    features[f] = np.mean(single_feature_scores)
    print(f, features[f])

In [None]:
# Cross-validate the baseline model on a hard-coded subset of the features.
features_selected = ['age', 'hypertension', 'heart_disease', 'ever_married', 'avg_glucose_level']
scores_01_fs = cross_validation_for_roc_auc(alg1, train_num.loc[:, features_selected], y)
for line in (scores_01_fs, np.mean(scores_01_fs)):
    print(line)

In [None]:
# Grid search over tree depth and number of trees; the mean cross-validated
# ROC AUC of every combination is stored in a table (rows: number of trees,
# columns: depth) and also printed as it is computed.
depths = [5, 6, 7, 8]
ns = [150, 180, 220, 250, 300]
hyperparameters = pd.DataFrame(index=ns, columns=depths)
for depth in depths:
    for n_trees in ns:
        alg1_hs = GradientBoostingClassifier(n_estimators=n_trees, max_depth=depth, random_state=1)
        cv2_hs = cross_validation_for_roc_auc(alg1_hs, train_num[features_selected], y)
        mean_auc = np.mean(cv2_hs)
        hyperparameters.loc[n_trees, depth] = mean_auc
        print(depth, n_trees, mean_auc)

In [None]:
# Fit the final gradient boosting model on the selected features and write its
# predicted probabilities (column 1 of predict_proba) for the test set.
# NOTE(review): n_estimators=150 / max_depth=5 are hard-coded; presumably taken
# from the grid search results — confirm.
alg1_final = GradientBoostingClassifier(n_estimators=150, max_depth=5, random_state=1)
alg1_final.fit(train_num[features_selected], y)
prediction = alg1_final.predict_proba(test_num[features_selected])[:, 1]
# Item assignment always writes a column; `sample.stroke = ...` only sets a
# plain attribute (and writes nothing to the CSV) if the column does not exist.
sample['stroke'] = prediction
sample.to_csv('submit1_1.csv', index=False)

In [None]:
# Logistic regression on all encoded features, scored with the same
# cross-validation helper as the other models.
alg2 = LogisticRegression(solver='lbfgs', max_iter=1000, random_state=1)
cv2 = cross_validation_for_roc_auc(alg2, train_num, y)
for line in (cv2, np.mean(cv2)):
    print(line)

In [None]:
# Boosting with LightGBM, scored with the same cross-validation helper.
alg3 = LGBMClassifier(n_estimators=100, max_depth=5, random_state=1)
# Evaluate alg3 itself; the previous call passed alg1, so the LightGBM model
# was never scored and the printed numbers repeated the baseline.
cv3 = cross_validation_for_roc_auc(alg3, train_num, y)
print(cv3)
print(np.mean(cv3))