In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Read the combined-subsets CSV into a DataFrame (path is relative to the working directory).
csv_path = 'data/combined_subsets.csv'
data = pd.read_csv(csv_path)

In [None]:
import imblearn
from imblearn.over_sampling import SMOTE
# Show which imbalanced-learn version is installed, for reproducibility of the SMOTE step.
print(imblearn.__version__)

In [None]:
from collections import Counter

In [None]:
# Remove every row that contains a missing value.
stroke_data_df = data.dropna()

# Target: the 'stroke' column, coded 0 (No) / 1 (Yes).
y = stroke_data_df['stroke']
target_names = ['0', '1']

# Features: every remaining column except the target and the 'id' column.
X = stroke_data_df.drop(columns=['stroke', 'id'])

# SMOTE generates synthetic minority rows at random; pin the seed (same value
# as the train/test split) so the resampled data is identical on every run.
oversample = SMOTE(random_state=42)

# Rebalance the classes by adding synthetic 'Yes' rows.
# NOTE(review): this resamples the complete data set. Any train/test split
# taken from the resampled X, y will contain synthetic rows interpolated from
# rows on the other side of the split, which inflates test scores.
X, y = oversample.fit_resample(X, y)

# Class counts after resampling (the default strategy should equalise them).
counter = Counter(y)
print(counter)

In [None]:
#over = SMOTE(sampling_strategy=0.1)
#under = RandomUnderSampler(sampling_strategy=0.5)

In [None]:
# Rebuild the null-free frame from the raw data (drop any row with a missing
# value) and preview its first five rows.
stroke_data_df = data.dropna(axis=0, how='any')
stroke_data_df.head(5)

In [None]:
# Split the cleaned records by outcome: stroke (1) versus no stroke (0).
stroke_positive = stroke_data_df[stroke_data_df['stroke'] == 1]
stroke_negative = stroke_data_df[stroke_data_df['stroke'] == 0]

# Draw an equally sized random sample from each class: 500 rows each, or fewer
# when a class is smaller (DataFrame.sample raises if asked for more rows than
# exist). The fixed seed makes the draw repeatable across runs.
sample_size = min(500, len(stroke_positive), len(stroke_negative))
stroke_negative_sample = stroke_negative.sample(sample_size, random_state=42)
stroke_positive_sample = stroke_positive.sample(sample_size, random_state=42)

# Stack the two samples row-wise. concat is the row-append operation; an outer
# merge would instead join on every shared column and re-sort the rows.
stroke_sample = pd.concat(
    [stroke_negative_sample, stroke_positive_sample], ignore_index=True)

stroke_sample.head()

# K Nearest Neighbor (KNN)

In [None]:
from sklearn.model_selection import train_test_split

# Split the real (un-resampled) records first. Splitting data that has already
# been SMOTE-resampled leaks information: synthetic training rows would be
# interpolated from rows that end up in the test set.
features = stroke_data_df.drop(columns=['stroke', 'id'])
labels = stroke_data_df['stroke']

# Stratify so both splits keep the original share of stroke cases.
X_train_raw, X_test, y_train_raw, y_test = train_test_split(
    features, labels, stratify=labels, random_state=42)

# Oversample the minority class in the training split only; the test split
# stays untouched so its scores reflect real, imbalanced data.
X_train, y_train = SMOTE(random_state=42).fit_resample(X_train_raw, y_train_raw)

In [None]:
# Fit a KNN classifier for each odd k from 1 to 39 and record its accuracy on
# the training and test splits.
k_values = range(1, 40, 2)  # defined once so the loop and the plot cannot drift apart
train_scores = []
test_scores = []
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    train_score = knn.score(X_train, y_train)
    test_score = knn.score(X_test, y_test)
    train_scores.append(train_score)
    test_scores.append(test_score)
    print(f"k: {k}, Train/Test Score: {train_score:.3f}/{test_score:.3f}")

# Plot both accuracy curves against k; the legend tells the two lines apart.
plt.plot(k_values, train_scores, marker='o', label="Train")
plt.plot(k_values, test_scores, marker="x", label="Test")
plt.xlabel("k neighbors")
plt.ylabel("Accuracy score")
plt.legend()
plt.show()

In [None]:
# Refit with the k picked from the sweep: k=5, where the author observed the
# scores plateau. Report accuracy on both splits.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

train_accuracy = knn.score(X_train, y_train)
test_accuracy = knn.score(X_test, y_test)

print('k=5 Train Acc: %.3f' % train_accuracy)
print('k=5 Test Acc: %.3f' % test_accuracy)

In [None]:
# Per-class classification report for the fitted model on the test split.
from sklearn.metrics import classification_report

predictions = knn.predict(X_test)

# Display names for classes 0 and 1, in that order.
class_labels = ["No Stroke", "Stroke"]
test_report = classification_report(y_test, predictions, target_names=class_labels)
print(test_report)

In [None]:
# Classification report over the complete X / y set instead of the test split.
# NOTE(review): X is the full resampled set from the SMOTE cell, which overlaps
# the data the model was trained on, so treat these scores as optimistic.
from sklearn.metrics import classification_report

predictions = knn.predict(X)

# Display names for classes 0 and 1, in that order.
class_labels = ["No Stroke", "Stroke"]
full_report = classification_report(y, predictions, target_names=class_labels)
print(full_report)

## Use cell below to make predictions with KNN model

### List of input values in order (with codification)
 - Gender (Female=0,Male=1,Other=2)
 - Age (actual value)
 - Hypertension (No=0,Yes=1)
 - Heart Disease (No=0,Yes=1)
 - Married (No=0,Yes=1)
 - Work Type (Private=0,Self-employed=1,children=2,Govt_job=3,Never_worked=4)
 - Residence Type (Urban=0,Rural=1)
 - Blood Glucose Level (actual value)
 - BMI (actual value)
 - Smoking (never smoked=0,formerly smoked=1,smokes=2,unknown=3)

### Output prediction value
 - Have you had a Stroke? (No=0,Yes=1)

In [None]:
# One record to classify. Field meanings follow the input list in the markdown
# cell above -- NOTE(review): confirm the order matches the training columns.
sample = [[
    1,    # gender
    27,   # age
    0,    # hypertension
    0,    # heart disease
    0,    # married
    0,    # work type
    0,    # residence type
    100,  # blood glucose level
    29,   # BMI
    1,    # smoking
]]

# Predicted class for the record: 0 = no stroke, 1 = stroke.
prediction = knn.predict(sample)
print(prediction)

## Save Model to File

In [None]:
import joblib

In [None]:
# Target path for the serialized classifier.
filename = 'knn_model.sav'

# Persist the fitted KNN model; the call's return value is left as the cell output.
joblib.dump(value=knn, filename=filename)

In [None]:
# Round-trip check: reload the model that was just written and classify the
# same sample with it.
# joblib files are pickle-based -- only load files from a trusted source.
loaded_model = joblib.load(filename=filename)
result = loaded_model.predict(sample)
print(result)