In [1]:
%pylab inline
import pandas as pd

Populating the interactive namespace from numpy and matplotlib


In [2]:
# six biomechanical attributes:
names = ['pelvic incidence', 'pelvic tilt', 'lumbar lordosis angle', 'sacral slope', 'pelvic radius', 'grade of spondylolisthesis']

# DH (Disk Hernia), Spondylolisthesis (SL), Normal (NO) and Abnormal (AB)
actual_names = {'DH':'Disk Hernia', 'SL':'Spondylolisthesis','NO':'Normal', 'AB':'Abnormal'}

---
# Binary

In [3]:
df = pd.read_csv('./vertebral_column_data/column_2C.dat', ' ', header=None)

df = df.sample(frac=1)

In [4]:
X = df.iloc[:,:6]
X.columns = names
X.head()

Unnamed: 0,pelvic incidence,pelvic tilt,lumbar lordosis angle,sacral slope,pelvic radius,grade of spondylolisthesis
277,65.76,13.21,44.0,52.55,129.39,-1.98
297,45.58,18.76,33.77,26.82,116.8,3.13
265,48.17,9.59,39.71,38.58,135.62,5.36
101,79.94,18.77,63.31,61.16,114.79,38.54
275,67.29,16.72,51.0,50.57,137.59,4.96


from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=names)
X.head(10)

In [5]:
y = df.iloc[:,6]
labels = y.unique()
y.value_counts()

AB    210
NO    100
Name: 6, dtype: int64

In [6]:
# Split Data to train and test sets

from sklearn.model_selection import train_test_split

# X = data, y = labels
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2,
                                                   random_state=0,
                                                   shuffle=False)

# Reset index because otherwise row index start from none zero
X_test = X_test.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

In [7]:
from sklearn.linear_model import LogisticRegression

reg = LogisticRegression(solver='lbfgs')
reg.fit(X_train, y_train)

print('coef:',reg.coef_)
print('intercept:',reg.intercept_)

coef: [[-4.41361867e-06 -6.90286976e-02  1.86820040e-02  8.13683178e-02
   9.35532319e-02 -1.54578109e-01]]
intercept: [-13.19132631]


In [8]:
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score

y_pred = reg.predict(X_test)

cm = confusion_matrix(y_test, y_pred)
print('Confusion matrix:\n', cm.T)
print()

acc = (cm[0,0] + cm[1,1]) / np.sum(cm)
print(f'Accuracy: {acc:.3f}')
print()

cr = classification_report(y_test, y_pred, target_names=labels)
print('Classification report:\n',cr)

k=10
scores = cross_val_score(reg, X_test, y_test, scoring='accuracy', cv=k)

print(f'Accuracies from {k} individual folds:', 
      scores, 
      f'Accuracy calculated using {k}-fold cross validation = {scores.mean():.3f}', 
      sep='\n')

Confusion matrix:
 [[42  2]
 [ 2 16]]

Accuracy: 0.935

Classification report:
               precision    recall  f1-score   support

          NO       0.95      0.95      0.95        44
          AB       0.89      0.89      0.89        18

    accuracy                           0.94        62
   macro avg       0.92      0.92      0.92        62
weighted avg       0.94      0.94      0.94        62

Accuracies from 10 individual folds:
[1.         1.         0.42857143 0.85714286 1.         1.
 1.         0.83333333 1.         1.        ]
Accuracy calculated using 10-fold cross validation = 0.912




In [15]:
inputs = []
for i, name in enumerate(names):
    inputs.append(float(input(f'{i+1}. {name}: ')))
    
probs = reg.predict_proba([inputs]).flatten()
imax = probs.argmax()

print('\nPrediction:', actual_names[labels[imax]])
print(f'Certainty: {probs[imax]*100:.2f} %')

1. pelvic incidence: 0
2. pelvic tilt: 0
3. lumbar lordosis angle: 0
4. sacral slope: 0
5. pelvic radius: 0
6. grade of spondylolisthesis: 0

Prediction: Normal
Certainty: 100.00 %


---
# Tri-categorical

In [16]:
df2 = pd.read_csv('./vertebral_column_data/column_3C.dat', ' ', header=None)

df2 = df2.sample(frac=1)

In [17]:
X2 = df2.iloc[:,:6]
X2.columns = names
X2.head()

Unnamed: 0,pelvic incidence,pelvic tilt,lumbar lordosis angle,sacral slope,pelvic radius,grade of spondylolisthesis
222,56.1,13.11,62.64,43.0,116.23,31.17
72,84.97,33.02,60.86,51.95,125.66,74.33
52,50.21,29.76,36.1,20.45,128.29,5.74
118,65.54,24.16,45.78,41.38,136.44,16.38
83,81.1,24.79,77.89,56.31,151.84,65.21


In [18]:
y2 = df2.iloc[:,6]
labels2 = y2.unique()
y2.value_counts()

SL    150
NO    100
DH     60
Name: 6, dtype: int64

In [19]:
# Split Data to train and test sets

from sklearn.model_selection import train_test_split

# X = data, y = labels
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, 
                                                    test_size=0.2,
                                                   random_state=0,
                                                   shuffle=False)

# Reset index because otherwise row index start from none zero
X2_test = X2_test.reset_index(drop=True)
y2_test = y2_test.reset_index(drop=True)

In [20]:
reg2 = LogisticRegression(solver='liblinear', multi_class='ovr')
reg2.fit(X2_train, y2_train)

print('coef:',reg.coef_)
print('intercept:',reg.intercept_)

coef: [[-4.41361867e-06 -6.90286976e-02  1.86820040e-02  8.13683178e-02
   9.35532319e-02 -1.54578109e-01]]
intercept: [-13.19132631]


In [21]:
y_pred2 = reg2.predict(X2_test)

cm = confusion_matrix(y2_test, y_pred2)
print('Confusion matrix:\n', cm.T)
print()

acc = (cm[0,0] + cm[1,1] + cm[2,2]) / np.sum(cm)
print(f'Accuracy: {acc:.3f}')
print()

cr = classification_report(y2_test, y_pred2, target_names=labels2)
print('Classification report:\n',cr)

k=10
scores = cross_val_score(reg2, X2_test, y2_test, scoring='accuracy', cv=k)

print(f'Accuracies from {k} individual folds:', 
      scores, 
      f'Accuracy calculated using {k}-fold cross validation = {scores.mean():.3f}', 
      sep='\n')

Confusion matrix:
 [[ 4  8  0]
 [ 6 16  0]
 [ 0  0 28]]

Accuracy: 0.774

Classification report:
               precision    recall  f1-score   support

          NO       0.33      0.40      0.36        10
          SL       0.73      0.67      0.70        24
          DH       1.00      1.00      1.00        28

    accuracy                           0.77        62
   macro avg       0.69      0.69      0.69        62
weighted avg       0.79      0.77      0.78        62

Accuracies from 10 individual folds:
[0.85714286 0.71428571 0.85714286 0.85714286 0.83333333 0.83333333
 0.83333333 1.         0.8        0.6       ]
Accuracy calculated using 10-fold cross validation = 0.819


In [25]:
inputs = []
for i, name in enumerate(names):
    inputs.append(float(input(f'{i+1}. {name}: ')))
    
probs = reg2.predict_proba([inputs]).flatten()
imax = probs.argmax()

print('\nPrediction:', actual_names[labels2[imax]])
print(f'Certainty: {probs[imax]*100:.2f} %')

1. pelvic incidence: 12
2. pelvic tilt: 45
3. lumbar lordosis angle: 3
4. sacral slope: 5
5. pelvic radius: 2
6. grade of spondylolisthesis: 33

Prediction: Disk Hernia
Certainty: 50.72 %
