In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import seaborn as sns
import sklearn
from sklearn import metrics
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge, Lasso, ElasticNet, LogisticRegression
from sklearn.metrics import confusion_matrix

In [4]:
patient_data = pd.read_csv('train_datalabel.csv')
test = pd.read_csv('test_data.csv')
submission = pd.read_csv('sample_submission.csv')

In [5]:
def replace_nan(df):
    df = one_hot_encode(df)
    no_nan_df = without_nan(df)
    vals = {column: np.average(no_nan_df[column]) for column in no_nan_df}
    df = df.drop('id', axis = 1)
    return df.fillna(value = vals)


#this code removes all rows that contain N/A values
def without_nan(df):
    not_nan = df.notna()
    rows_no_nan = not_nan.all(axis=1)
    return df[rows_no_nan]

In [6]:
def one_hot_encode(df):
    df = pd.get_dummies(df, columns=["gender", "ever_married", "work_type", "Residence_type", "smoking_status"])
    return df

In [7]:
initial_boot = patient_data.copy()
initial_boot = replace_nan(initial_boot)
#filtering for just positive stroke == 1
just_strokes = initial_boot[initial_boot['stroke'] == 1]

bootstrap_1 = just_strokes.sample(200, replace = True)
bootstrap_1

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender_Female,gender_Male,gender_Other,ever_married_No,...,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
111,74.0,1,0,70.28,21.800000,1,1,0,0,0,...,0,1,0,0,0,1,0,0,1,0
76,74.0,0,0,231.61,34.600000,1,1,0,0,0,...,0,1,0,0,1,0,0,1,0,0
1,61.0,0,0,202.21,28.930552,1,1,0,0,0,...,0,0,1,0,1,0,0,0,1,0
82,79.0,0,0,93.05,24.200000,1,1,0,0,0,...,0,1,0,0,1,0,0,0,1,0
18,58.0,0,0,189.84,28.930552,1,0,1,0,0,...,0,1,0,0,1,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
178,49.0,0,0,104.86,31.900000,1,0,1,0,1,...,0,1,0,0,1,0,0,0,0,1
127,78.0,1,0,203.87,45.700000,1,1,0,0,1,...,0,1,0,0,0,1,0,0,1,0
73,58.0,0,1,240.59,31.400000,1,0,1,0,0,...,0,1,0,0,1,0,0,0,0,1
193,57.0,0,0,197.28,34.500000,1,0,1,0,0,...,0,1,0,0,1,0,0,1,0,0


In [8]:
def bootstrap(dataframe):
    '''Takes a cleaned dataframe of just stroke == 1 and bootstraps it
    '''
    df = dataframe.sample(200, replace = True)
    return df

In [9]:
for i in np.arange(19):
    new_boot = bootstrap(just_strokes)
    bootstrap_1 = bootstrap_1.append(new_boot)

len(bootstrap_1.index)

4000

In [10]:
final_patient_data = initial_boot.append(bootstrap_1)
len(final_patient_data)

8088

In [11]:
forest_factors = ["age", "hypertension", "heart_disease", "avg_glucose_level", "bmi", "gender_Male", "ever_married_Yes", "work_type_Self-employed", "Residence_type_Rural", "smoking_status_never smoked"]

In [12]:
patient_train, patient_validation = train_test_split(final_patient_data, 
                                             test_size = .25, 
                                             random_state = 0)

In [13]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators = 1000,
    min_samples_split = 2,
    min_samples_leaf = 1,
    max_features = 'sqrt',
    max_depth = None,
    bootstrap = False) # this was originally false

# now, I'm going to clean the data. We remove all irrelevant columns
patient_train_y = patient_train["stroke"]
#patient_train_x = patient_train[forest_factors]
patient_train_x = patient_train.drop(["stroke", "gender_Other"], axis=1)

model.fit(patient_train_x, patient_train_y)
forest_predictions = model.predict(patient_train_x)

# this shows the accuracy of the random forest
(forest_predictions == patient_train_y).sum() / patient_train_y.shape[0]

1.0

In [14]:
patient_validation_y = patient_validation["stroke"]
#patient_validation_x = patient_validation[forest_factors]
patient_validation_x = patient_validation.drop(["stroke", "gender_Other"], axis=1)
validation_forest_predictions = model.predict(patient_validation_x)
(validation_forest_predictions == patient_validation_y).sum() / patient_validation_y.shape[0]

0.9945598417408507

In [15]:
np.sum(patient_validation_y == 1)

1073

In [16]:
np.sum(validation_forest_predictions == 1)

1084

In [17]:
test_data = pd.read_csv('test_data.csv')
clean_test_data = replace_nan(test)

preds_for_submission = model.predict(clean_test_data)
#np.sum(preds_for_submission == 1) / len(preds_for_submission)
sum(preds_for_submission == 1)

13

In [18]:
submission["stroke"] = preds_for_submission
submission.head()

Unnamed: 0,id,stroke
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0


In [19]:
submission.to_csv('submission.csv', index=False)