In [1]:
import pandas as pd
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense

Using TensorFlow backend.


In [2]:
### pre-processing

# load data
df = pd.read_csv('df_points.txt', sep = '\t', usecols = ['x', 'y', 'z', 'label'])

# check data dtypes
print(df.dtypes)

# check summary statistics
print(df.describe())

# shuffle samples
df = shuffle(df)

# separate attributes and label
X, y = df[['x', 'y', 'z']], df['label']

# standardize attributes: z = (x - u) / s
scaler = StandardScaler()
X = scaler.fit_transform(X)

# split test/train samples
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)


x        float64
y        float64
z        float64
label    float64
dtype: object
                  x             y             z         label
count  10000.000000  10000.000000  10000.000000  10000.000000
mean       0.850362     -3.108769     -2.601124      0.502700
std      288.379928    287.120263    290.379789      0.500018
min     -499.802348   -499.899134   -499.952571      0.000000
25%     -249.199895   -248.954580   -258.005693      0.000000
50%        3.663472     -5.446168     -8.221000      1.000000
75%      248.879970    244.395864    252.930406      1.000000
max      499.872453    499.752418    499.872329      1.000000


In [3]:
### logistic regression

# try different C values
for C in [0.25, 0.5, 0.75, 1.0]:
    
    # try different solvers
    for solver in ['liblinear', 'newton-cg', 'lbfgs', 'sag', 'saga']:

        # initialize classifier
        clf = LogisticRegression(C = C, solver = solver)

        # train classifier
        clf.fit(X_train, y_train)

        # check results
        score = clf.score(X_test, y_test)
        print(' ')
        print('score obtained using C =', C, 'and solver =', solver)
        print(score)

print(' ')
print('Logistic regression not good here: no better than tossing a coin.')

 
score obtained using C = 0.25 and solver = liblinear
0.5124
 
score obtained using C = 0.25 and solver = newton-cg
0.512
 
score obtained using C = 0.25 and solver = lbfgs
0.512
 
score obtained using C = 0.25 and solver = sag
0.512
 
score obtained using C = 0.25 and solver = saga
0.512
 
score obtained using C = 0.5 and solver = liblinear
0.5124
 
score obtained using C = 0.5 and solver = newton-cg
0.5124
 
score obtained using C = 0.5 and solver = lbfgs
0.5124
 
score obtained using C = 0.5 and solver = sag
0.5124
 
score obtained using C = 0.5 and solver = saga
0.5124
 
score obtained using C = 0.75 and solver = liblinear
0.5124
 
score obtained using C = 0.75 and solver = newton-cg
0.5124
 
score obtained using C = 0.75 and solver = lbfgs
0.5124
 
score obtained using C = 0.75 and solver = sag
0.5124
 
score obtained using C = 0.75 and solver = saga
0.5124
 
score obtained using C = 1.0 and solver = liblinear
0.5124
 
score obtained using C = 1.0 and solver = newton-cg
0.5124
 


In [4]:
### try other algorithms

clf1 = GradientBoostingClassifier()
clf2 = SVC()
clf3 = RandomForestClassifier(n_estimators = 1000, min_samples_leaf = 10)

for clf in [clf1, clf2, clf3]:

    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    print(' ')
    print('score obtained with', clf)
    print(score)
    if score < 0.7:
        print('No good.')
    else:
        print('We have a candidate algorithm.')

 
score obtained with GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)
0.5928
No good.




 
score obtained with SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)
0.7324
We have a candidate algorithm.
 
score obtained with RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=10, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
0.78
We have a candidate algorithm.


In [5]:
### what if we combine the three attributes into one?
pca = PCA(n_components = 1)
X_pca = pca.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size = 0.25)

clf1 = GradientBoostingClassifier()
clf2 = SVC()
clf3 = RandomForestClassifier(n_estimators = 1000, min_samples_leaf = 10)

for clf in [clf1, clf2, clf3]:

    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    print(' ')
    print('score obtained with k=1 and', clf)
    print(score)
    if score < 0.7:
        print('No good.')
    else:
        print('We have a candidate algorithm.')

 
score obtained with k=1 and GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)
0.63
No good.




 
score obtained with k=1 and SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)
0.6444
No good.
 
score obtained with k=1 and RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=10, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
0.6036
No good.


In [6]:
### try a neural network

# go back to three attributes
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)

# build model
model = Sequential()
model.add(Dense(8, input_dim = 3, activation = 'relu'))
model.add(Dense(1, activation = 'softmax'))
model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
model.fit(X_train, y_train, batch_size = 32, epochs = 100)
loss, acc = model.evaluate(X_test, y_test, verbose = 0)
print(' ')
print('score obtained with a simple neural network')
print(acc)
if score < 0.7:
    print('No good.')
else:
    print('We have a candidate algorithm.')

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
 
score obtained with a simple neural network
0.4968
No good.


In [12]:
commentary = '''
commentary on results

Logistic regression did not work out: it classifies 
the samples correctly only ~50pct of the time, which
is what we would get by simply tossing a coin. Since
the samples are balanced (about 50pct belonging to each
class), this is a terrible result.

Gradient boosting and neural network yields a similar 
result. I.e., no good.

Support vector machine yields a much better result: over
70pct of correct classifications.

Random forest yields the best result from the ones obtained
in this exercise: almost 80pct of correct classifications.

Combining the 3 attributes into 1 does not improve the results.
(In fact it seems to worsen them.)

The instruction sheet says "Please do not spend too much 
time in it.", so I am stopping here. In a real-world 
application there would be lots more to do, like using
hyperparameter optimization to improve the random
forest results, trying different neural network 
architectures, etc. Also, I would use cross validation instead
of separating train/test samples only once (that would violate
item (a) of the instruction sheet though).
'''
print(commentary)


commentary on results

Logistic regression did not work out: it classifies 
the samples correctly only ~50pct of the time, which
is what we would get by simply tossing a coin. Since
the samples are balanced (about 50pct belonging to each
class), this is a terrible result.

Gradient boosting and neural network yields a similar 
result. I.e., no good.

Support vector machine yields a much better result: over
70pct of correct classifications.

Random forest yields the best result from the ones obtained
in this exercise: almost 80pct of correct classifications.

Combining the 3 attributes into 1 does not improve the results.
(In fact it seems to worsen them.)

The instruction sheet says "Please do not spend too much 
time in it.", so I am stopping here. In a real-world 
application there would be lots more to do, like using
hyperparameter optimization to improve the random
forest results, trying different neural network 
architectures, etc. Also, I would use cross validation instead
of se