# Baseline model
In this notebook we explore our options for a baseline model. We also look at what preprocessing steps are needed.

In [11]:
# NOTE(review): leftover smoke-test cell; it only prints a greeting and nothing
# below depends on it, so it can be deleted.
print("hello")

hello


In [12]:
import pandas as pd
import numpy as np
import os

# Seed passed as `random_state` to the pandas sampling/shuffling calls in this
# notebook so that the subsample and split can be repeated.
RANDOM_SEED = 21223

Unzip dataset

In [13]:
from zipfile import ZipFile
# Root data folder (relative to the notebook) and the folders each archive is
# extracted into.
dir_path = "../data"
train_path = f"{dir_path}/asl_train"
test_path = f"{dir_path}/asl_test"

def unzip_if_not_exists(zip_file_path, extract_to_path):
    """Extract a zip archive into a directory unless that directory already exists.

    Parameters
    ----------
    zip_file_path : str
        Path of the ``.zip`` archive to extract.
    extract_to_path : str
        Directory to extract into. If it already exists nothing is extracted.

    Raises
    ------
    FileNotFoundError
        If the archive does not exist (raised before any directory is created).
    zipfile.BadZipFile
        If the archive is not a valid zip file.

    Notes
    -----
    The existence of ``extract_to_path`` is the only "already done" marker, so
    the directory must never be left behind after a failed extraction;
    otherwise every later run would silently skip unzipping.
    """
    import shutil

    # check if the target directory already exists
    if os.path.exists(extract_to_path):
        print(f"The directory {extract_to_path} already exists. Skipped unzipping.")
        return

    # open the archive *before* creating the directory: a missing or corrupt
    # zip then fails without leaving an empty target directory behind
    with ZipFile(zip_file_path, 'r') as zip_ref:
        os.makedirs(extract_to_path)
        try:
            zip_ref.extractall(extract_to_path)
        except BaseException:
            # remove the partial extraction so the next run retries it
            shutil.rmtree(extract_to_path, ignore_errors=True)
            raise
    print(f"Successfully unzipped to {extract_to_path}")

# extract the train and test archives into their respective folders
for archive_name, target_dir in (
    ("asl_alphabet_train.zip", train_path),
    ("asl_alphabet_test.zip", test_path),
):
    unzip_if_not_exists(f"{dir_path}/{archive_name}", target_dir)

The directory ../data/asl_train already exists. Skipped unzipping.
The directory ../data/asl_test already exists. Skipped unzipping.


Store all data in a pandas df

In [14]:
# Point at the folders that hold the per-class subfolders inside each extracted
# archive. The paths are rebuilt from dir_path (instead of `+=` on the previous
# value) so that re-running this cell does not append the subfolder twice.
train_path = dir_path + "/asl_train/asl_alphabet_train/"
test_path = dir_path + "/asl_test/asl_alphabet_test/"

# map integer class ids (0-23) to their sign letters.
# "J" and "Z" are not included (presumably because those signs involve
# motion - TODO confirm).
categories = dict(enumerate("ABCDEFGHIKLMNOPQRSTUVWXY"))

def add_class_name_prefix(df, col_name):
    """Return ``df`` unchanged after looking up ``col_name``.

    NOTE(review): despite its name this is currently a placeholder; no prefix
    is added. The only observable effect is that a missing ``col_name`` raises
    the container's lookup error (``KeyError`` for a DataFrame or dict).
    """
    _ = df[col_name]  # lookup only; the value is intentionally discarded
    return df

# store all the file names in the dataset
filenames = []
# store the corresponding class id for each file
target = []

for category, letter in categories.items():
    # os.listdir returns entries in arbitrary (filesystem-dependent) order;
    # sort them so the seeded shuffle below gives the same result everywhere
    files = sorted(os.listdir(train_path + letter))
    filenames += files
    target += [category] * len(files)

df = pd.DataFrame({"filename": filenames, "category": target})
df = add_class_name_prefix(df, "filename")

# shuffle the dataframe (seeded, so the order is repeatable)
df = df.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)

Inspect the resulting DataFrame: column dtypes, non-null counts and the first few rows.

In [15]:
# info() prints the column dtypes and non-null counts; head() is the cell's
# last expression, so the first rows are rendered as the cell output.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72000 entries, 0 to 71999
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  72000 non-null  object
 1   category  72000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 1.1+ MB


Unnamed: 0,filename,category
0,V1957.jpg,20
1,M2695.jpg,11
2,C705.jpg,2
3,T1148.jpg,18
4,P600.jpg,14


Sample a smaller, class-balanced dataset (`n` images per class) and split it 80/20 into train and test sets.

In [16]:
# number of images kept per class, and how many of those go to the train set
n = 100
n_train = int(n*0.8)

# sample n random data points from each sign class
df_small = (
    df.groupby('category')
    .sample(n=n, random_state=RANDOM_SEED)
    .reset_index(drop=True)
)

# take n_train of those n points per class for the train set. Sampling from
# df_small (not from the full df) is what guarantees train is a subset of the
# small dataset; the row index is kept so the test set can be derived from it.
df_train = df_small.groupby('category').sample(n=n_train, random_state=RANDOM_SEED)

# obtain test set from the remaining points in the small df
df_test = df_small.drop(index=df_train.index).reset_index(drop=True)

# shuffle train set (seeded so the row order is repeatable)
df_train = df_train.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)

# the two splits must partition the small dataset exactly
assert len(df_train) + len(df_test) == len(df_small)
assert len(df_train) == n_train * df_small['category'].nunique()

print(df_train.shape)
print(df_test.shape)

(1920, 2)
(480, 2)


Preprocessing steps:
- Grayscale
- Flatten to 1D
- Normalize pixel values to 0-1
- Feature extraction/Dimensionality reduction -> TBD

In [17]:
from PIL import Image

# preprocess each row in the dataframes
def process_row(row):
    """Load the image referenced by one dataframe row and vectorise it.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide ``"filename"`` (image file name) and ``"category"``
        (integer key into ``categories``).

    Returns
    -------
    dict
        ``{'image_array': flattened 1D array of grayscale pixel values divided
        by 255, 'label': the row's category}``.

    Notes
    -----
    The image is always read from ``train_path/<class letter>/<filename>``.
    """
    filename = row["filename"]
    label = row["category"]
    # train_path already ends with "/"; os.path.join avoids the doubled
    # separator that plain string concatenation produced
    image_path = os.path.join(train_path, categories[label], filename)
    # open inside a context manager so the file handle is released even if
    # decoding or conversion fails
    with Image.open(image_path) as image:
        # convert to grayscale (mode 'L') and materialise as a 2d array
        image_arr = np.array(image.convert('L'))
    # normalize to 0-1 range
    normalized_arr = image_arr / 255
    # flatten 2d array to 1d
    flat_image = normalized_arr.ravel()
    return {'image_array': flat_image, 'label': label}

# run the per-row preprocessing over both splits; "expand" turns the returned
# dict into the columns 'image_array' and 'label'
df_train_processed, df_test_processed = (
    split.apply(process_row, axis=1, result_type="expand")
    for split in (df_train, df_test)
)

In [18]:
df_train_processed.head(2)

Unnamed: 0,image_array,label
0,"[0.12549019607843137, 0.13725490196078433, 0.1...",22
1,"[0.11764705882352941, 0.12941176470588237, 0.1...",15


extract X and y from df

In [21]:
def _features_and_labels(processed):
    """Split a processed dataframe into (feature array, label array)."""
    features = np.array(processed['image_array'].tolist())
    labels = np.array(processed['label'])
    return features, labels


X_train, y_train = _features_and_labels(df_train_processed)
X_test, y_test = _features_and_labels(df_test_processed)

# sanity check: number of samples in each array
for split_array in (X_train, y_train, X_test, y_test):
    print(len(split_array))

1920
1920
480
480


At this point each image is represented as a 1D array containing all of its ~40k grayscale pixel values, scaled to the 0-1 range.