In [1]:
import set_sys_path

In [2]:
from classy.controller.tasks import four

In [3]:
print(four.__doc__)


[Classification: Performance of the Naive Bayes algorithm on the given data set] Run the Naive
Bayes tool in Weka on the resulting version of train_gr_smpl. To be able to do this, you may need to
apply several Weka “Filters”. Explain the reason for choosing and using these filters. Once you can run
the algorithm, record, compare and analyse the classifier’s accuracy on different classes (as given by the
Weka Summary and the confusion matrix).




In [4]:
# Load data from task 2
from classy.model.data.read import Reader
reader = Reader()

In [5]:
reader.list_data_files()

['task_2.hdf', 'task_1.hdf', 'dataframe_with_class_attribute.hdf']

In [6]:
dataset = reader.load_data("dataframe_with_class_attribute")

In [7]:
# Instruct pandas show everything we ask it
# import pandas
# pandas.set_option('display.max_columns', None)
# pandas.set_option('display.max_rows', None)
# Not as informative as I'd hoped. Need my own visual feature reduction.

In [8]:
dataset.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2295,2296,2297,2298,2299,2300,2301,2302,2303,label
0,30,29,28,29,31,30,29,28,27,26,...,32,35,38,39,39,40,39,39,38,3
1,31,31,33,32,31,30,29,28,28,28,...,34,35,36,36,37,38,38,37,37,3
2,30,30,31,29,28,27,26,28,30,31,...,35,37,37,38,39,38,38,39,40,3
3,26,25,24,24,24,27,28,29,29,30,...,34,36,37,38,42,40,37,36,36,3
4,25,26,28,28,28,28,28,27,26,25,...,31,33,37,38,37,36,36,35,35,3


In [9]:
# The classifier outputs a label number, so keep a label_number -> label_name lookup table.
label_mappings = {
    3: "speed_limit_60",
    5: "speed_limit_80",
    6: "speed_limit_80_lifted",
    11: "right_of_way_crossing",
    12: "right_of_way_general",
    13: "give_way",
    14: "stop",
    32: "no_speed_limit_general",
    38: "turn_right_down",
    39: "turn_left_down",
}
label_mappings

{3: 'speed_limit_60',
 5: 'speed_limit_80',
 6: 'speed_limit_80_lifted',
 11: 'right_of_way_crossing',
 12: 'right_of_way_general',
 13: 'give_way',
 14: 'stop',
 32: 'no_speed_limit_general',
 38: 'turn_right_down',
 39: 'turn_left_down'}

#### Okay, now we're getting to it.

In [10]:
# First we want a stratified train test split.
from sklearn.model_selection import train_test_split

In [11]:
labels_numpy_array = dataset['label'].values

In [12]:
labels_numpy_array.shape

(12660,)

In [13]:
dataset.shape

(12660, 2305)

In [14]:
all_but_labels = dataset.iloc[:, :-1].values

In [15]:
print("Data:", dataset.shape, "Labels:" ,labels_numpy_array.shape)

Data: (12660, 2305) Labels: (12660,)


Now we have the labels, and the instances with their attributes, as numpy arrays, which is the form `train_test_split` expects.

In [16]:
# Now the actual train test split. Vamos!
X_train, X_test, y_train, y_test = train_test_split(
    all_but_labels,
    labels_numpy_array,
    # (70 30)% split
    test_size=0.3,
    # We want it stratified, labels equally distributed in the training and test datasets
    stratify=labels_numpy_array,
    # Fixed seed: without it the split (and every score below) changes on each re-run
    random_state=42,
)

Now we're ready to run Naive Bayes.
No dimensionality reduction via feature selection has been done on the data.
Scikit-learn has 5 naive bayes types. 

In [17]:
# Let's start with Gaussian Naive Bayes, see what accuracy we get with that

from sklearn.naive_bayes import GaussianNB

In [18]:
gnb = GaussianNB()

In [19]:
print("X:", X_train.shape, "y:" ,y_train.shape)

X: (8862, 2304) y: (8862,)


In [20]:
predictor = gnb.fit(X_train, y_train)

In [21]:
predicted = predictor.predict(X_test)

In [23]:
# What's the accuracy?
round(predictor.score(X_test, y_test), 2)

0.31

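The docs do say "In multi-label classification, this is the subset accuracy which is a harsh metric since you require for each sample that each label set be correctly predicted." That caveat does not apply here, though: every sample has exactly one label (multi-class, not multi-label), so the score is plain accuracy. Let's see if dimensionality reduction via feature selection helps with this.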

In [37]:
# What does the confusion matrix look like?

from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, predicted)

array([[ 60,  46, 225,   2,   0,  16,  47,  24,   3,   0],
       [ 75, 135, 242,   3,   2,  28,  21,  29,   0,  23],
       [  4,   3, 107,   0,   0,   0,   0,  12,   0,   0],
       [ 15,  66,  57, 166,   4,  37,   1,  33,   3,  14],
       [  8,  68, 166,  15, 134,  95,  26,  85,  18,  15],
       [  9, 106, 113,   3,   3, 287,  11,  87,   7,  22],
       [  3,  36,  14,   0,   4,  10, 144,   4,   3,  16],
       [  7,   9,  11,   1,   1,   0,   2,  41,   0,   0],
       [ 34,  72, 195,   4,  13, 115,  81,  18,  70,  19],
       [  0,  13,  18,   0,   0,  16,  14,   4,   0,  25]])

Not good. Let's reduce the features, see if that improves.

# Dimensionality reduction

We'll do the feature selection/reduction on the training data (X_train and y_train)

Let's start with ANOVA (Analysis Of Variance). The tests are univariate, so they discard any feature that is significant only in combination with another. (ref_needed)

scikit-learn has SelectKBest which takes a specified number of features to return, and SelectPercentile. We'll start with the latter.

In [38]:
from sklearn.feature_selection import SelectPercentile

In [39]:
# How many features do we have?
X_train.shape

(8862, 2304)

In [40]:
# 2304. Let's see what accuracy we get with half
# Define the selector here: the next cell calls selekta.fit(), which fails on a fresh kernel otherwise.
selekta = SelectPercentile(percentile=50)


In [41]:
selekta.fit(X_train, y_train)

SelectPercentile(percentile=50, score_func=<function f_classif at 0x1a44cd5b90>)

In [42]:
X_train_selected = selekta.transform(X_train)

In [43]:
# How many features do we have now?
X_train_selected.shape[1]

1152

In [44]:
# Alright. Now lets train the model with that and see if the accuracy improves.
gnb.fit(X_train_selected, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [45]:
# We need to select the X_test too
X_test_selected = selekta.transform(X_test)

In [47]:
# And see how the classifier scores on that
gnb.score(X_test_selected, y_test)

0.3665086887835703

36% accuracy now, but we can do better.

What about with 20% of the features?

In [48]:
# Keep the top 20% of features, then refit GaussianNB and score it on the reduced test set.
selekta = SelectPercentile(percentile=20)
X_train_selected = selekta.fit_transform(X_train, y_train)
X_test_selected = selekta.transform(X_test)
gnb.fit(X_train_selected, y_train).score(X_test_selected, y_test)

0.39046866771985256

In [57]:
# 39% much better! What about 10% ??
selekta = SelectPercentile(percentile=10)
# Fit the selector on the training data only, then apply the same column mask to the test data.
X_train_selected = selekta.fit_transform(X_train, y_train)
X_test_selected = selekta.transform(X_test)
gnb.fit(X_train_selected, y_train).score(X_test_selected, y_test)

0.36334913112164297

Okay, so 20% is the point of diminishing returns for this dataset.

But what if we use a model to work out which features are best? scikit-learn enables us to use a random forest classifier to do just that.

In [58]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier

In [62]:
selekta = SelectFromModel(
    RandomForestClassifier(
        n_estimators=100, # Chose this purely randomly
        random_state=42, # Deterministic tree generation
        n_jobs=4, # I have 4 cores
    ),
    threshold="median", # >= the median of feature importances, so same as for 50% with SelectPercentile
)

In [61]:
# Fit the forest-based selector on the training data and reduce both splits with it.
X_train_selected = selekta.fit_transform(X_train, y_train)
X_test_selected = selekta.transform(X_test)
# Refit GaussianNB on the reduced features; the cell displays the test accuracy.
gnb.fit(X_train_selected, y_train).score(X_test_selected, y_test)

0.37519747235387046

In [79]:
# 37%. Now keep only the features whose importance is at least 0.8x the mean importance.
selekta = SelectFromModel(
    RandomForestClassifier(
        n_estimators=100, # Chose this purely randomly
        random_state=42, # Deterministic tree generation
        n_jobs=4, # I have 4 cores
    ),
    threshold="0.80*mean", # Importance cutoff, not a percentile: kept 593 of 2304 features (~25.7%) in the run recorded below
)

In [80]:
# Fit the selector and reduce the training set in one step, then show how many features survived.
X_train_selected = selekta.fit_transform(X_train, y_train)
X_train_selected.shape

(8862, 593)

In [76]:
# Number of features a true 20% cut would keep.
# NOTE(review): X_train has 2304 feature columns, so 2303 looks off by one (20% of 2304 is 460.8).
(20*2303)/100

460.6

In [81]:
# Apply the already-fitted selector to the test split, then refit and score GaussianNB.
X_test_selected = selekta.transform(X_test)
gnb.fit(X_train_selected, y_train).score(X_test_selected, y_test)

0.41047919957872564

In [82]:
X_test.shape

(3798, 2304)

In [83]:
(593*100)/2304

25.73784722222222

So 25% of the features produce 41% accuracy ... with GaussianNB, maybe another NB will do better?

In [84]:
# Lets see how complement naive bayes does
from sklearn.naive_bayes import ComplementNB

In [85]:
cnb = ComplementNB()

In [86]:
# Train and score it exactly as we did GaussianNB, on the same selected features.
X_test_selected = selekta.transform(X_test)
cnb.fit(X_train_selected, y_train).score(X_test_selected, y_test)

0.7132701421800948

Jesus.

Bernoulli Naive Bayes is for binary data. So really CNB is the classifier for this data. I need to work out why.

In [87]:
# How does it perform on the full data, without dimensionality reduction?
cnb.fit(X_train, y_train).score(X_test, y_test)

0.4636650868878357

In [88]:
# Alright. So the dimensionality reduction makes it do better (as expected).
# But how about with 50% of the features selected by the tree?
selekta = SelectFromModel(
    RandomForestClassifier(
        n_estimators=100,  # Chose this purely randomly
        random_state=42,  # Deterministic tree generation
        n_jobs=4,  # I have 4 cores
    ),
    # Keep features with importance >= the median, so same as for 50% with SelectPercentile
    threshold="median",
)
X_train_selected = selekta.fit_transform(X_train, y_train)
X_test_selected = selekta.transform(X_test)
cnb.fit(X_train_selected, y_train).score(X_test_selected, y_test)

0.6045286993154292

Very nice. How about the top 25% of the features from SelectPercentile, to match the roughly 25% the random forest kept?

In [92]:
selekta = SelectPercentile(percentile=25)

In [93]:
# Fit the percentile selector on the training data and reduce both splits with it.
X_train_selected = selekta.fit_transform(X_train, y_train)
X_test_selected = selekta.transform(X_test)
# Refit ComplementNB on the reduced features; the cell displays the test accuracy.
cnb.fit(X_train_selected, y_train).score(X_test_selected, y_test)

0.6579778830963665

The results are conclusive, and here they are:

In [97]:
# Best configuration found: random-forest feature selection feeding Complement Naive Bayes.
selekta = SelectFromModel(
    RandomForestClassifier(
        n_estimators=100,  # Chose this purely randomly
        random_state=42,  # Deterministic tree generation
        n_jobs=4,  # I have 4 cores
    ),
    # Keep features with importance >= 0.8x the mean importance
    threshold="0.8*mean",
)
X_train_selected = selekta.fit_transform(X_train, y_train)
X_test_selected = selekta.transform(X_test)
cnb.fit(X_train_selected, y_train).score(X_test_selected, y_test)

0.7132701421800948

#### Model-based feature selection with a RandomForest as the model produces the features that give the best classification with a Complement Naive Bayes classifier.

In [101]:
# What does the confusion matrix look like?
# Rows are the true labels and columns the predicted labels, both in ascending label order
# (3, 5, 6, 11, 12, 13, 14, 32, 38, 39), so each row sums to that class's support.
predicted = cnb.predict(X_test_selected)
confusion_matrix(y_test, predicted)

array([[ 16, 173,   0,  18,  89, 114,   1,   0,  12,   0],
       [  0, 316,   0,  25,  79, 122,   0,   0,  16,   0],
       [  0,   0,   0,   3, 123,   0,   0,   0,   0,   0],
       [  0,   0,   0, 360,  35,   0,   0,   0,   1,   0],
       [  0,  22,   0,  21, 571,  14,   0,   0,   2,   0],
       [  0,   0,   0,  20,  12, 615,   1,   0,   0,   0],
       [  0,   0,   0,   2,   3,   1, 225,   0,   3,   0],
       [  0,   3,   0,   1,  66,   1,   0,   0,   1,   0],
       [  0,   0,   0,   0,  28,  17,   0,   0, 576,   0],
       [  0,   0,   0,   7,  10,  43,   0,   0,   0,  30]])

### MUCH better overall, although classes 6 and 32 are never predicted (their columns are all zero).

In [102]:
# And a more detailed report
from sklearn.metrics import classification_report

In [104]:
report = classification_report(
    y_test, predicted
)

In [105]:
print(report)

              precision    recall  f1-score   support

           3       1.00      0.04      0.07       423
           5       0.61      0.57      0.59       558
           6       0.00      0.00      0.00       126
          11       0.79      0.91      0.84       396
          12       0.56      0.91      0.69       630
          13       0.66      0.95      0.78       648
          14       0.99      0.96      0.98       234
          32       0.00      0.00      0.00        72
          38       0.94      0.93      0.94       621
          39       1.00      0.33      0.50        90

    accuracy                           0.71      3798
   macro avg       0.66      0.56      0.54      3798
weighted avg       0.73      0.71      0.66      3798



In [None]:
!pip install python-cv2