# 3. Practicing Scikit-Learn Syntax
Let's get comfortable with scikit-learn syntax in this notebook.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Show up to 100 columns when a DataFrame is displayed
pd.options.display.max_columns = 100
%matplotlib inline

import warnings
# NOTE(review): this silences every warning for the rest of the notebook,
# including any raised while fitting the models below - consider narrowing the filter
warnings.filterwarnings('ignore')

# Load the heart dataset. The displayed frame has an 'Unnamed: 0' column, which
# suggests the CSV carries a saved index.
# NOTE(review): read_csv(..., index_col=0) would likely absorb it - confirm before changing
heart = pd.read_csv('../data/heart.csv')
heart.head()

Unnamed: 0,age,sex,chest_pain,rest_bp,chol,fbs,rest_ecg,max_hr,exang,old_peak,slope,ca,thal,disease
0,63,Male,typical,145,233,1,left ventricular hypertrophy,150,0,2.3,3,0.0,fixed,0
1,67,Male,asymptomatic,160,286,0,left ventricular hypertrophy,108,1,1.5,2,3.0,normal,1
2,67,Male,asymptomatic,120,229,0,left ventricular hypertrophy,129,1,2.6,2,2.0,reversable,1
3,37,Male,nonanginal,130,250,0,normal,187,0,3.5,3,0.0,normal,0
4,41,Female,nontypical,130,204,0,left ventricular hypertrophy,172,0,1.4,1,0.0,normal,0


# Let's build many different models from different parts of the library, slowly adding more features
* Let's build decision trees, random forests, k-nearest neighbors, and support vector machines:
* Predict, then Score, and write down your scores

In [10]:
# step 0 - take the 'rest_bp' column as the only feature, shaped (n_samples, 1)
X = heart['rest_bp'].values.reshape(-1, 1)
y = heart['disease'].values  # target column passed to fit/score below

In [11]:
# step 1 - import
from sklearn.linear_model import LogisticRegression

In [14]:
# Step 2 - instantiate the estimator; no arguments, so every hyperparameter keeps its default
logr = LogisticRegression()

In [15]:
# Step 3 - train with the fit method on the single-column X and target y
# (the call is the cell's last expression, so its return value is displayed)
logr.fit(X, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [16]:
# Step 4 - score on the same X, y used for fitting, so this measures fit to the
# training data rather than performance on unseen data
logr.score(X, y)

0.5676567656765676

In [None]:
# Score with rest_bp using logistic regression was about 0.57

## Next machine learning model - Decision Tree

In [2]:
# step 0 - select the 'chol' column and shape it as an (n_samples, 1) array
X = heart['chol'].values.reshape(-1, 1)
y = heart['disease'].values  # target column

In [3]:
# step 1 
from sklearn.tree import DecisionTreeClassifier

In [4]:
# step 2 - instantiate the decision tree with default hyperparameters
dtc = DecisionTreeClassifier()

In [5]:
# step 3 - train with the fit method on the 'chol' feature
dtc.fit(X, y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf='deprecated', min_samples_split=2,
            min_weight_fraction_leaf='deprecated', presort=False,
            random_state=None, splitter='best')

In [6]:
# step 4 - score the model and record the result
# (scored on the same X, y it was fit on, so this reflects fit to the training data)
dtc.score(X,y)

0.8151815181518152

## Random Forest

In [None]:
# step 0 - same single feature as the decision tree: 'chol' as an (n_samples, 1) array
X = heart['chol'].values.reshape(-1, 1)
y = heart['disease'].values  # target column

In [7]:
# step 1 - import
from sklearn.ensemble import RandomForestClassifier

In [8]:
# step 2 - instantiate the random forest with default hyperparameters
# NOTE(review): no random_state is passed, so the fitted forest (and therefore the
# recorded score) may differ from run to run - consider fixing a seed for reproducibility
rfc = RandomForestClassifier()

In [9]:
# step 3 - train with the fit method on the 'chol' feature
rfc.fit(X, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf='deprecated', min_samples_split=2,
            min_weight_fraction_leaf='deprecated', n_estimators=10,
            n_jobs=None, oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [10]:
# step 4 - score on the same X, y the forest was fit on (fit to the training data)
rfc.score(X, y)

0.7953795379537953

## SVM - linear SVM

In [11]:
# step 0 - single feature again: 'chol' reshaped to one column, plus the target
X = heart['chol'].values.reshape(-1, 1)
y = heart['disease'].values  # target column

In [12]:
# step 1 - import
from sklearn.svm import LinearSVC

In [14]:
# step 2 - instantiate the linear SVM classifier with default hyperparameters
svc1 = LinearSVC()

In [15]:
# step 3 - train with the fit method on the raw (unscaled) 'chol' values
svc1.fit(X, y)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [16]:
# step 4 - score on the same data the model was fit on
# NOTE(review): the recorded score is below 0.5, and warnings are silenced at the top of
# the notebook, so any warning raised during fit would not have been shown - TODO confirm
# whether the solver converged and whether scaling the feature helps
svc1.score(X, y)

0.45874587458745875

## Ensemble - AdaBoost 

In [19]:
# step 0 - 'chol' as the lone feature in (n_samples, 1) form, 'disease' as the target
X = heart['chol'].values.reshape(-1, 1)
y = heart['disease'].values  # target column

In [21]:
# step 1
from sklearn.ensemble import AdaBoostClassifier

In [22]:
# step 2 - instantiate AdaBoost with default hyperparameters
ada = AdaBoostClassifier()

In [23]:
# step 3 - train with the fit method on the 'chol' feature
ada.fit(X , y)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)

In [25]:
# step 4 - score on the same X, y used for fitting (fit to the training data)
ada.score(X, y)

0.6963696369636964

# Multiple Columns Classification

In [26]:
# Peek at the first two rows to see which columns are available as features
heart.head(2)

Unnamed: 0,age,sex,chest_pain,rest_bp,chol,fbs,rest_ecg,max_hr,exang,old_peak,slope,ca,thal,disease
0,63,Male,typical,145,233,1,left ventricular hypertrophy,150,0,2.3,3,0.0,fixed,0
1,67,Male,asymptomatic,160,286,0,left ventricular hypertrophy,108,1,1.5,2,3.0,normal,1


In [27]:
# Three columns to use together as the model's input features
cols = ['age', 'chol', 'max_hr']

In [29]:
# Selecting with a list of column names yields a 2-D array, one column per name in cols
X = heart[cols].values
y = heart['disease'].values  # target column

In [33]:
# step 1 - import
from sklearn.linear_model import LogisticRegression

In [34]:
# step 2 - fresh estimator with defaults; rebinding logr replaces any model previously held in it
logr = LogisticRegression()

In [35]:
X[: 2] # it's already a 2-D array, so there is no need to reshape

array([[ 63, 233, 150],
       [ 67, 286, 108]])

In [36]:
# step 3 - train with the fit method on the multi-column X
logr.fit(X, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [37]:
# step 4 - score on the same X, y used for fitting (fit to the training data)
logr.score(X, y)

0.6765676567656765

In [38]:
# Intercept term learned by fit (displayed as an array holding a single value)
logr.intercept_

array([1.35754509])

In [40]:
# Coefficients learned by fit: one value per column of X (age, chol, max_hr)
logr.coef_

array([[ 0.03678834,  0.00446042, -0.03125422]])

## Try other features

In [42]:
# Wider feature set: six columns instead of three
# NOTE(review): 'ca' is displayed as float (0.0, 3.0) although its values look integral,
# which may indicate missing values in that column - verify before fitting
cols = ['age', 'rest_bp', 'chol', 'max_hr', 'old_peak', 'ca']

In [44]:
# Rebuild X from the current cols list (2-D, one column per name); y is unchanged
X = heart[cols].values
y = heart['disease'].values  # target column

In [45]:
# step 1 import
from sklearn.linear_model import LogisticRegression

In [47]:
# step 2 - default estimator bound to a new name, so logr still refers to the earlier model
logr2 = LogisticRegression()

In [None]:
# step 3
