In [1]:
import sys
import scipy
import numpy
import matplotlib
import pandas
import sklearn

# Report the interpreter version first, then each library's version in turn.
print('Python: {}'.format(sys.version))
for label, module in (
    ('scipy', scipy),
    ('numpy', numpy),
    ('matplotlib', matplotlib),
    ('pandas', pandas),
    ('sklearn', sklearn),
):
    print('{}: {}'.format(label, module.__version__))

# Install a blanket "ignore" filter so no warnings are shown from here on.
import warnings
warnings.filterwarnings("ignore")

Python: 3.6.7 (default, Oct 22 2018, 11:32:17) 
[GCC 8.2.0]
scipy: 1.1.0
numpy: 1.14.6
matplotlib: 3.0.3
pandas: 0.22.0
sklearn: 0.20.3


## 1. Import libraries

Import all of the modules, functions, and objects we will use in this tutorial.

In [0]:
from pandas.plotting import scatter_matrix
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

## 2. Load the Dataset

We will be using the UCI arrhythmia dataset, which contains 452 observations. Each row has 279 attribute columns followed by a class label in the last column (column 279). Thirteen distinct class labels are present in this dataset.

The data can be loaded directly from the UCI Machine Learning Repository.

In [0]:
# Load Dataset
# The file has no header row, so columns are labelled 0..N-1 by pandas.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/arrhythmia/arrhythmia.data"

# Declaring '?' as a missing-value marker lets pandas parse every column as
# numeric. Swapping '?' for 0 after loading would leave the affected columns
# as object dtype (strings mixed with ints), and describe()/mean() silently
# skip such columns.
dataset = pandas.read_csv(url, header=None, na_values='?')

# Fill missing entries with 0 so downstream code receives a fully numeric
# table with no NaN values.
# NOTE(review): 0 is a crude stand-in for a missing measurement; a per-column
# mean/median imputation may be more appropriate — confirm before relying on
# the model scores.
dataset = dataset.fillna(0)

## 2.1 Dataset Properties

Let's take a look at the dataset by observing its dimensions, the first few rows of data, a statistical summary of the attributes, and a breakdown of the data by the class variable.

In [4]:
# Dimensions of the loaded frame as (rows, columns)
frame_shape = dataset.shape
print(frame_shape)

(452, 280)


In [5]:
# Preview the first 20 rows of the frame
first_rows = dataset.head(20)
print(first_rows)

    0    1    2    3    4    5    6    7    8    9   ...   270   271  272  \
0    75    0  190   80   91  193  371  174  121  -16 ...   0.0   9.0 -0.9   
1    56    1  165   64   81  174  401  149   39   25 ...   0.0   8.5  0.0   
2    54    0  172   95  138  163  386  185  102   96 ...   0.0   9.5 -2.4   
3    55    0  175   94  100  202  380  179  143   28 ...   0.0  12.2 -2.2   
4    75    0  190   80   88  181  360  177  103  -16 ...   0.0  13.1 -3.6   
5    13    0  169   51  100  167  321  174   91  107 ...  -0.6  12.2 -2.8   
6    40    1  160   52   77  129  377  133   77   77 ...   0.0   6.5  0.0   
7    49    1  162   54   78    0  376  157   70   67 ...   0.0   8.2 -1.9   
8    44    0  168   56   84  118  354  160   63   61 ...   0.0   7.0 -1.3   
9    50    1  167   67   89  130  383  156   73   85 ...  -0.6  10.8 -1.7   
10   62    0  170   72  102  135  401  156   83   72 ...  -0.5   9.0 -2.0   
11   45    1  165   86   77  143  373  150   65   12 ...   0.0   4.4 -2.2   

In [0]:
# Summary statistics (count, mean, std, min, quartiles, max) per column
summary_stats = dataset.describe()
print(summary_stats)

              0           1           2           3           4           5    \
count  452.000000  452.000000  452.000000  452.000000  452.000000  452.000000   
mean    46.471239    0.550885  166.188053   68.170354   88.920354  155.152655   
std     16.466631    0.497955   37.170340   16.590803   15.364394   44.842283   
min      0.000000    0.000000  105.000000    6.000000   55.000000    0.000000   
25%     36.000000    0.000000  160.000000   59.000000   80.000000  142.000000   
50%     47.000000    1.000000  164.000000   68.000000   86.000000  157.000000   
75%     58.000000    1.000000  170.000000   79.000000   94.000000  175.000000   
max     83.000000    1.000000  780.000000  176.000000  188.000000  524.000000   

              6           7           8           9       ...             270  \
count  452.000000  452.000000  452.000000  452.000000     ...      452.000000   
mean   367.207965  169.949115   90.004425   33.676991     ...       -0.278982   
std     33.385421   35.6330

In [0]:
# Number of rows for each distinct value in column 279 (the class label)
class_counts = dataset.groupby(279).size()
print(class_counts)

279
1     245
2      44
3      15
4      15
5      13
6      25
7       3
8       2
9       9
10     50
14      4
15      5
16     22
dtype: int64


## 2.2 Data Visualizations

Let's visualize the data so we can understand the distribution of the input attributes. We will use histograms of each attribute, as well as some multivariate plots so that we can view the interactions between variables.

## 3. Evaluate Algorithms

Let's create some models of the data and estimate their accuracy on unseen data.

We are going to,

* Create a validation dataset
* Set-up cross validation
* Build three different models to predict the arrhythmia class from the recorded measurements
* Select the best model

## 3.1 Create Validation Dataset

Let's split the loaded dataset into two. 80% of the data will be used for training, while 20% will be used for validation.

In [0]:
# Split-out validation dataset
# Work on the raw array: columns 0..278 are the inputs, column 279 the label.
array = dataset.values
X = array[:, :279]
Y = array[:, 279].astype('int')

# Hold out 20% of the rows; the fixed seed makes the split repeatable.
validation_size = 0.20
seed = 7
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(
    X,
    Y,
    test_size=validation_size,
    random_state=seed,
)

## 3.2 10-fold Cross Validation

This will split our dataset into 10 parts, train on 9 and test on 1, and repeat for all combinations of train-test splits.

In [0]:
# Evaluation settings: the seed value and the name of the scoring metric
seed, scoring = 7, 'accuracy'

## 3.3 Build Models

Let's evaluate three models (only LR is currently enabled in the cell below; the KNN and SVM entries are commented out):

* Logistic Regression (LR)
* K-Nearest Neighbors (KNN)
* Support Vector Machine (SVM)

In [0]:
# Candidate estimators as (short name, unfitted model) pairs.
# The KNN and SVM entries are kept for reference but disabled:
#   ('KNN', KNeighborsClassifier())
#   ('SVM', SVC())
models = [('LR', LogisticRegression())]

# evaluate each model in turn
# Accumulators for the per-model scores and the matching model names.
results, names = [], []



In [9]:
# Start from empty accumulators so re-running this cell does not append
# duplicate entries to lists left over from an earlier run.
results = []
names = []

# One splitter serves every model. The folds are unshuffled, so no
# random_state is passed: it has no effect without shuffle=True, and newer
# scikit-learn releases raise ValueError for that combination.
kfold = model_selection.KFold(n_splits=10)

for name, model in models:
    # Score `model` on each fold of the training split with the configured metric.
    cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    # Report the mean and standard deviation of the fold scores.
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

LR: 0.629129 (0.089640)


## 4. Make Predictions

Let's test the model on the validation set to make sure that our algorithms can generalize to new data. Otherwise, we may be overfitting the training data.

In [11]:
# Make predictions on validation dataset
# Class names in code order: class code k maps to out[k-1].
out = [
    "Normal",
    "Ischemic changes",
    "Old Anterior Myocardial Infarction",
    "Old Inferior Myocardial Infarction",
    "Sinus tachycardy",
    "Sinus bradycardy",
    "Ventricular Premature Contraction",
    "Supraventricular Premature Contraction",
    "Left bundle branch block",
    "Right bundle branch block",
    "1. degree AtrioVentricular block",
    "2. degree AV block",
    "3. degree AV block",
    "Left ventricule hypertrophy",
    "Atrial Fibrillation or Flutter",
    "Others",
]
for name, model in models:
    # Fit on the training split, then predict the held-out validation rows.
    model.fit(X_train, Y_train)
    predictions = model.predict(X_validation)
    print(name)
    print(accuracy_score(Y_validation, predictions))
    # Translate every predicted code into its class name (trailing space kept).
    pred = [out[code - 1] + " " for code in predictions]
    print(pred)
    print(predictions)

LR
0.6373626373626373
['Right bundle branch block ', 'Normal ', 'Ischemic changes ', 'Normal ', 'Normal ', 'Normal ', 'Normal ', 'Sinus bradycardy ', 'Normal ', 'Ischemic changes ', 'Sinus bradycardy ', 'Normal ', 'Right bundle branch block ', 'Normal ', 'Normal ', 'Right bundle branch block ', 'Normal ', 'Others ', 'Ischemic changes ', 'Normal ', 'Sinus tachycardy ', 'Normal ', 'Normal ', 'Others ', 'Sinus bradycardy ', 'Normal ', 'Normal ', 'Normal ', 'Normal ', 'Normal ', 'Normal ', 'Sinus bradycardy ', 'Right bundle branch block ', 'Normal ', 'Sinus tachycardy ', 'Ischemic changes ', 'Ischemic changes ', 'Ischemic changes ', 'Normal ', 'Normal ', 'Normal ', 'Normal ', 'Normal ', 'Normal ', 'Normal ', 'Normal ', 'Normal ', 'Right bundle branch block ', 'Normal ', 'Ischemic changes ', 'Normal ', 'Normal ', 'Normal ', 'Right bundle branch block ', 'Normal ', 'Normal ', 'Normal ', 'Normal ', 'Right bundle branch block ', 'Sinus bradycardy ', 'Normal ', 'Others ', 'Normal ', 'Right bund