In [136]:
import pandas as pd
from sklearn.model_selection import train_test_split
  
# Load the CTG table from disk, skipping the first line of the file.
data = pd.read_csv("assets/CTG.csv", skiprows=1)

# Columns kept for the analysis: the numerical features, with the
# diagnosis code 'NSP' listed last.
selected_cols = [
    'LB', 'AC', 'FM', 'UC', 'DL', 'DS', 'DP', 'ASTV', 'MSTV', 'ALTV',
    'MLTV', 'Width', 'Min', 'Max', 'Nmax', 'Nzeros', 'Mode', 'Mean',
    'Median', 'Variance', 'Tendency',
    'NSP',
]

# Restrict to those columns and discard every row that has a missing value.
data = data.loc[:, selected_cols].dropna()

# Put the rows in a random order (fixed seed, so the order is repeatable).
data_shuffled = data.sample(frac=1.0, random_state=0)

# Input part X: every selected column except the diagnosis code.
# (The output part Y is built from the 'NSP' column further down.)
X = data_shuffled.drop(columns='NSP')

# Map the diagnosis code to a human-readable label.
def to_label(y):
    """Translate a numeric diagnosis code into its text label.

    Parameters
    ----------
    y : number or numeric string
        Diagnosis code; it is converted with ``int(y)`` before lookup, so
        ``2``, ``2.0`` and ``"2"`` are all treated as code 2.

    Returns
    -------
    str
        ``'normal'`` for 1, ``'suspect'`` for 2, ``'pathologic'`` for 3.

    Raises
    ------
    IndexError
        If the converted code is not 1, 2 or 3.  Codes such as 0 or negative
        numbers are rejected explicitly instead of yielding ``None`` or
        wrapping around to a label from the other end of the table.
        ``IndexError`` is used because that is what codes above 3 have
        always raised here.
    """
    labels = {1: 'normal', 2: 'suspect', 3: 'pathologic'}
    code = int(y)
    if code not in labels:
        raise IndexError(f"diagnosis code out of range (expected 1, 2 or 3): {y!r}")
    return labels[code]

# Output part Y: the 'NSP' diagnosis codes converted to their text labels.
Y = data_shuffled.loc[:, 'NSP'].apply(to_label)

# Hold out 20% of the rows as the test set; the fixed seed makes the
# partition repeatable.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    X,
    Y,
    random_state=0,
    test_size=0.2,
)

In [137]:
# Preview the first rows of the training inputs.
Xtrain.head()

Unnamed: 0,LB,AC,FM,UC,DL,DS,DP,ASTV,MSTV,ALTV,...,Width,Min,Max,Nmax,Nzeros,Mode,Mean,Median,Variance,Tendency
1690,137.0,13.0,0.0,3.0,4.0,0.0,0.0,56.0,2.1,0.0,...,136.0,52.0,188.0,3.0,0.0,160.0,144.0,161.0,39.0,1.0
12,131.0,4.0,57.0,6.0,2.0,0.0,0.0,28.0,1.4,0.0,...,66.0,88.0,154.0,5.0,0.0,135.0,134.0,137.0,7.0,1.0
950,142.0,4.0,0.0,6.0,0.0,0.0,0.0,34.0,1.1,0.0,...,67.0,94.0,161.0,5.0,1.0,151.0,146.0,149.0,8.0,1.0
480,140.0,0.0,0.0,1.0,0.0,0.0,0.0,60.0,0.8,32.0,...,120.0,79.0,199.0,9.0,0.0,141.0,141.0,142.0,3.0,0.0
25,124.0,0.0,0.0,0.0,0.0,0.0,0.0,86.0,0.3,72.0,...,12.0,118.0,130.0,1.0,0.0,124.0,124.0,125.0,0.0,0.0


In [138]:
from sklearn.model_selection import cross_val_score
import sklearn as SK
import warnings
from statistics import mean

# One (method name, average score or failure message) tuple per evaluated
# classifier, in evaluation order.
methods = []
def evalAndAppendClassifier(clf, suffix="", normalize=False):
    """Cross-validate ``clf`` on the training data and record the outcome.

    Parameters
    ----------
    clf : estimator
        Classifier instance handed to ``cross_val_score``.
    suffix : str, optional
        Extra text appended to the class name as ``"Name (suffix)"``; an
        empty suffix leaves the name unchanged.
    normalize : bool, optional
        When true, the training inputs are passed through
        ``sklearn.preprocessing.normalize`` (default settings) before
        cross-validation.  The labels are never transformed.

    Returns
    -------
    None
        The result is printed and appended to the module-level ``methods``
        list as ``(methodName, avgScore)``, where ``avgScore`` is the mean of
        the cross-validation scores or, if scoring raised ``TypeError``,
        ``ValueError`` or a warning-turned-exception, a
        ``"Could not compute (<ExceptionName>)"`` message.
    """
    # Only the inputs may be rescaled; the targets must remain the class
    # labels, otherwise the classifier is scored against the wrong thing.
    xt = SK.preprocessing.normalize(Xtrain) if normalize else Xtrain
    yt = Ytrain
    with warnings.catch_warnings():
        # Inside this block a ConvergenceWarning is raised as an exception,
        # so a model that did not converge is reported as a failure instead
        # of contributing a score.
        warnings.filterwarnings('error', category=SK.exceptions.ConvergenceWarning)
        try:
            scores = cross_val_score(clf, xt, yt)
            avgScore = mean(scores)
        except (TypeError, ValueError, Warning) as e:
            avgScore = f"Could not compute ({type(e).__name__})"
    methodName = f"{type(clf).__name__}"
    if suffix:
        methodName += f" ({suffix})"
    print(f"{methodName} => {avgScore}")
    methods.append((methodName, avgScore))


In [139]:
# Baseline: a dummy classifier using the "most_frequent" strategy.
evalAndAppendClassifier(SK.dummy.DummyClassifier(strategy="most_frequent"), suffix="most frequent")

DummyClassifier:most frequent => 0.7805882352941177


In [140]:
# Decision tree with default hyperparameters.
evalAndAppendClassifier(SK.tree.DecisionTreeClassifier())

DecisionTreeClassifier: => 0.9229411764705883


In [141]:
# Random forest with default hyperparameters.
evalAndAppendClassifier(SK.ensemble.RandomForestClassifier())

RandomForestClassifier: => 0.9405882352941176


In [142]:
# Gradient boosting with default hyperparameters.
evalAndAppendClassifier(SK.ensemble.GradientBoostingClassifier())

GradientBoostingClassifier: => 0.9488235294117647


In [143]:
# Perceptron with default hyperparameters.
evalAndAppendClassifier(SK.linear_model.Perceptron())

Perceptron: => 0.8252941176470588


In [144]:
# Logistic regression with default hyperparameters.
evalAndAppendClassifier(SK.linear_model.LogisticRegression())

LogisticRegression: => Could not compute (ValueError)


In [145]:
# Linear support vector classifier with default hyperparameters.
evalAndAppendClassifier(SK.svm.LinearSVC())

LinearSVC: => Could not compute (ValueError)


In [146]:
# Multi-layer perceptron classifier with default hyperparameters.
evalAndAppendClassifier(SK.neural_network.MLPClassifier())

MLPClassifier: => 0.8841176470588236
