# Dummy Classifier as Zero Baseline model

## Imports

In [56]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os  # easier file path handling

# for data pre-processing
from sklearn.model_selection import train_test_split

# for model evaluation
from sklearn.metrics import multilabel_confusion_matrix, accuracy_score

from sklearn.dummy import DummyClassifier

In [57]:
# Install nb_black for autoformatting
!pip install nb_black --quiet
%load_ext lab_black

The lab_black extension is already loaded. To reload it, use:
  %reload_ext lab_black


## Load Data

## Run with smaller dataset

In [58]:
# load data
X = np.load("../data/X-data.npy")
y = np.load("../data/y-data.npy")

In [59]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

## Dummy Classifier

In [60]:
dummy_model = DummyClassifier(strategy="stratified", random_state=42)
dummy_model.fit(X_train, y_train)
y_pred = dummy_model.predict(X_test)
y_proba = dummy_model.predict_proba(X_test)

In [61]:
# convert one-hot encoded categories back to labels,
# e.g. 0, 1 and 2 instead of [1,0,0], [0,1,0], [0,0,1]

# reshaping probabilities for each entry to final array
# y_proba = np.array(y_proba)[:, :, 1:].reshape(67, 10)

y_test_original = y_test.copy()
y_pred_original = y_pred.copy()
y_test = np.argmax(y_test, axis=1).tolist()
y_pred = np.argmax(y_pred, axis=1).tolist()

In [62]:
# multilabel confusion matrix
multilabel_confusion_matrix(y_test, y_pred)

array([[[36, 23],
        [ 2,  6]],

       [[55,  6],
        [ 6,  0]],

       [[55,  4],
        [ 7,  1]],

       [[56,  7],
        [ 4,  0]],

       [[58,  1],
        [ 8,  0]],

       [[56,  4],
        [ 6,  1]],

       [[58,  3],
        [ 6,  0]],

       [[53,  6],
        [ 7,  1]],

       [[60,  1],
        [ 6,  0]],

       [[58,  3],
        [ 6,  0]]])

In [63]:
# accuracy score
accuracy_score(y_test, y_pred)

0.13432835820895522

## Run with whole dataset

In [64]:
df = pd.read_csv("../data/asl-signs/train.csv")

In [65]:
sign_map = {
    "TV": 0,
    "after": 1,
    "airplane": 2,
    "all": 3,
    "alligator": 4,
    "animal": 5,
    "another": 6,
    "any": 7,
    "apple": 8,
    "arm": 9,
    "aunt": 10,
    "awake": 11,
    "backyard": 12,
    "bad": 13,
    "balloon": 14,
    "bath": 15,
    "because": 16,
    "bed": 17,
    "bedroom": 18,
    "bee": 19,
    "before": 20,
    "beside": 21,
    "better": 22,
    "bird": 23,
    "black": 24,
    "blow": 25,
    "blue": 26,
    "boat": 27,
    "book": 28,
    "boy": 29,
    "brother": 30,
    "brown": 31,
    "bug": 32,
    "bye": 33,
    "callonphone": 34,
    "can": 35,
    "car": 36,
    "carrot": 37,
    "cat": 38,
    "cereal": 39,
    "chair": 40,
    "cheek": 41,
    "child": 42,
    "chin": 43,
    "chocolate": 44,
    "clean": 45,
    "close": 46,
    "closet": 47,
    "cloud": 48,
    "clown": 49,
    "cow": 50,
    "cowboy": 51,
    "cry": 52,
    "cut": 53,
    "cute": 54,
    "dad": 55,
    "dance": 56,
    "dirty": 57,
    "dog": 58,
    "doll": 59,
    "donkey": 60,
    "down": 61,
    "drawer": 62,
    "drink": 63,
    "drop": 64,
    "dry": 65,
    "dryer": 66,
    "duck": 67,
    "ear": 68,
    "elephant": 69,
    "empty": 70,
    "every": 71,
    "eye": 72,
    "face": 73,
    "fall": 74,
    "farm": 75,
    "fast": 76,
    "feet": 77,
    "find": 78,
    "fine": 79,
    "finger": 80,
    "finish": 81,
    "fireman": 82,
    "first": 83,
    "fish": 84,
    "flag": 85,
    "flower": 86,
    "food": 87,
    "for": 88,
    "frenchfries": 89,
    "frog": 90,
    "garbage": 91,
    "gift": 92,
    "giraffe": 93,
    "girl": 94,
    "give": 95,
    "glasswindow": 96,
    "go": 97,
    "goose": 98,
    "grandma": 99,
    "grandpa": 100,
    "grass": 101,
    "green": 102,
    "gum": 103,
    "hair": 104,
    "happy": 105,
    "hat": 106,
    "hate": 107,
    "have": 108,
    "haveto": 109,
    "head": 110,
    "hear": 111,
    "helicopter": 112,
    "hello": 113,
    "hen": 114,
    "hesheit": 115,
    "hide": 116,
    "high": 117,
    "home": 118,
    "horse": 119,
    "hot": 120,
    "hungry": 121,
    "icecream": 122,
    "if": 123,
    "into": 124,
    "jacket": 125,
    "jeans": 126,
    "jump": 127,
    "kiss": 128,
    "kitty": 129,
    "lamp": 130,
    "later": 131,
    "like": 132,
    "lion": 133,
    "lips": 134,
    "listen": 135,
    "look": 136,
    "loud": 137,
    "mad": 138,
    "make": 139,
    "man": 140,
    "many": 141,
    "milk": 142,
    "minemy": 143,
    "mitten": 144,
    "mom": 145,
    "moon": 146,
    "morning": 147,
    "mouse": 148,
    "mouth": 149,
    "nap": 150,
    "napkin": 151,
    "night": 152,
    "no": 153,
    "noisy": 154,
    "nose": 155,
    "not": 156,
    "now": 157,
    "nuts": 158,
    "old": 159,
    "on": 160,
    "open": 161,
    "orange": 162,
    "outside": 163,
    "owie": 164,
    "owl": 165,
    "pajamas": 166,
    "pen": 167,
    "pencil": 168,
    "penny": 169,
    "person": 170,
    "pig": 171,
    "pizza": 172,
    "please": 173,
    "police": 174,
    "pool": 175,
    "potty": 176,
    "pretend": 177,
    "pretty": 178,
    "puppy": 179,
    "puzzle": 180,
    "quiet": 181,
    "radio": 182,
    "rain": 183,
    "read": 184,
    "red": 185,
    "refrigerator": 186,
    "ride": 187,
    "room": 188,
    "sad": 189,
    "same": 190,
    "say": 191,
    "scissors": 192,
    "see": 193,
    "shhh": 194,
    "shirt": 195,
    "shoe": 196,
    "shower": 197,
    "sick": 198,
    "sleep": 199,
    "sleepy": 200,
    "smile": 201,
    "snack": 202,
    "snow": 203,
    "stairs": 204,
    "stay": 205,
    "sticky": 206,
    "store": 207,
    "story": 208,
    "stuck": 209,
    "sun": 210,
    "table": 211,
    "talk": 212,
    "taste": 213,
    "thankyou": 214,
    "that": 215,
    "there": 216,
    "think": 217,
    "thirsty": 218,
    "tiger": 219,
    "time": 220,
    "tomorrow": 221,
    "tongue": 222,
    "tooth": 223,
    "toothbrush": 224,
    "touch": 225,
    "toy": 226,
    "tree": 227,
    "uncle": 228,
    "underwear": 229,
    "up": 230,
    "vacuum": 231,
    "wait": 232,
    "wake": 233,
    "water": 234,
    "wet": 235,
    "weus": 236,
    "where": 237,
    "white": 238,
    "who": 239,
    "why": 240,
    "will": 241,
    "wolf": 242,
    "yellow": 243,
    "yes": 244,
    "yesterday": 245,
    "yourself": 246,
    "yucky": 247,
    "zebra": 248,
    "zipper": 249,
}

In [66]:
# applying dictionary on sign to create target column
df["target"] = df.sign.map(sign_map)
df.head()

Unnamed: 0,path,participant_id,sequence_id,sign,target
0,train_landmark_files/26734/1000035562.parquet,26734,1000035562,blow,25
1,train_landmark_files/28656/1000106739.parquet,28656,1000106739,wait,232
2,train_landmark_files/16069/100015657.parquet,16069,100015657,cloud,48
3,train_landmark_files/25571/1000210073.parquet,25571,1000210073,bird,23
4,train_landmark_files/62590/1000240708.parquet,62590,1000240708,owie,164


In [67]:
y = df.target
X = df.drop("target", axis=1)
X.head()

Unnamed: 0,path,participant_id,sequence_id,sign
0,train_landmark_files/26734/1000035562.parquet,26734,1000035562,blow
1,train_landmark_files/28656/1000106739.parquet,28656,1000106739,wait
2,train_landmark_files/16069/100015657.parquet,16069,100015657,cloud
3,train_landmark_files/25571/1000210073.parquet,25571,1000210073,bird
4,train_landmark_files/62590/1000240708.parquet,62590,1000240708,owie


In [68]:
y = pd.get_dummies(y)
y.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,240,241,242,243,244,245,246,247,248,249
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [69]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

## Dummy Classifier

In [70]:
dummy_model = DummyClassifier(strategy="stratified", random_state=42)
dummy_model.fit(X_train, y_train)
y_pred = dummy_model.predict(X_test)
y_proba = dummy_model.predict_proba(X_test)

In [71]:
np.array(y_test).shape

(18896, 250)

In [72]:
np.argmax(np.array(y_test), axis=1)

array([206,  20, 178, ...,   4,  95,   7])

In [74]:
y_test_original = y_test.copy()
y_pred_original = y_pred.copy()
y_test = np.argmax(np.array(y_test), axis=1).tolist()
y_pred = np.argmax(y_pred, axis=1).tolist()

In [75]:
# multilabel confusion matrix
multilabel_confusion_matrix(y_test, y_pred)

array([[[11845,  6980],
        [   37,    34]],

       [[18740,    81],
        [   74,     1]],

       [[18748,    73],
        [   73,     2]],

       [[18742,    69],
        [   84,     1]],

       [[18752,    70],
        [   74,     0]],

       [[18779,    53],
        [   64,     0]],

       [[18735,    69],
        [   89,     3]],

       [[18745,    80],
        [   71,     0]],

       [[18755,    72],
        [   69,     0]],

       [[18747,    78],
        [   70,     1]],

       [[18748,    74],
        [   74,     0]],

       [[18747,    61],
        [   87,     1]],

       [[18755,    78],
        [   63,     0]],

       [[18745,    71],
        [   80,     0]],

       [[18747,    77],
        [   72,     0]],

       [[18756,    72],
        [   68,     0]],

       [[18748,    78],
        [   70,     0]],

       [[18758,    66],
        [   72,     0]],

       [[18749,    68],
        [   79,     0]],

       [[18747,    71],
        [   77,     1]],



In [76]:
# accuracy score
accuracy_score(y_test, y_pred)

0.004498306519898391

## Summary for baseline model
* I am not 100% sure how well this works for multiclass classification because for y_pred you get several predictions for several classes

