In [15]:
from pathlib import Path
import re
import pandas as pd

# locate the working directory and load the tables listing each speaker's image file names
wkdir=Path(".")
csv_dir=wkdir/"preprocessed_data"
sam_df=pd.read_csv(csv_dir/"Sam_aiueoe_100.csv")
yc_df=pd.read_csv(csv_dir/"Yen-chen_aiueoe_100.csv")

# add the vowel category column: the leading run of non-digit characters of each label
leading_non_digits=re.compile(r"(^\D+)")
for frame_df in (sam_df, yc_df):
    frame_df["Vowel"]=[leading_non_digits.findall(label)[0] for label in frame_df["Label"]]

In [16]:
import os
import numpy as np
import cv2

# create folders to store preprocessed images
# exist_ok=True makes re-running this cell harmless when the folders already exist,
# while still raising FileExistsError if a path exists but is not a directory
for output_folder in ("sam_frames_preprocessed", "yenchen_frames_preprocessed"):
    os.makedirs(wkdir/output_folder, exist_ok=True)

# preprocess images: convert to grayscale, crop, and resize, then save to a different folder
# images are cropped to 700*480 pixels, then downsized to 140*96 pixels
CROP_ROWS=slice(82, 562)    # keeps 480 rows
CROP_COLS=slice(211, 911)   # keeps 700 columns
OUTPUT_SIZE=(140, 96)       # passed as dsize to cv2.resize, i.e. (width, height)

def preprocess_frames(labels, source_dir, target_dir):
    """Grayscale, crop, and downsize every image named in `labels`.

    Parameters
    ----------
    labels : iterable of str
        Image file names, relative to `source_dir`.
    source_dir : pathlib.Path
        Folder holding the original frames.
    target_dir : pathlib.Path
        Existing folder that receives the preprocessed frames under the same file names.

    Raises
    ------
    FileNotFoundError
        If an image cannot be read (cv2.imread returns None instead of raising).
    OSError
        If a preprocessed image cannot be written (cv2.imwrite returns False instead of raising).
    """
    for label in labels:
        source_path=source_dir/label
        image=cv2.imread(filename=str(source_path), flags=cv2.IMREAD_UNCHANGED)
        if image is None:
            raise FileNotFoundError(f"could not read image: {source_path}")
        # with IMREAD_UNCHANGED a single-channel file comes back as a 2-D array,
        # which is already grayscale and would make cvtColor fail
        if image.ndim == 2:
            gray_image=image
        else:
            gray_image=cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        cropped_gray_image=gray_image[CROP_ROWS, CROP_COLS].copy()
        resized_cropped_gray_image=cv2.resize(src=cropped_gray_image, dsize=OUTPUT_SIZE, interpolation=cv2.INTER_AREA)
        target_path=target_dir/label
        if not cv2.imwrite(filename=str(target_path), img=resized_cropped_gray_image):
            raise OSError(f"could not write image: {target_path}")

preprocess_frames(sam_df["Label"], wkdir/"sam_frames", wkdir/"sam_frames_preprocessed")
preprocess_frames(yc_df["Label"], wkdir/"yenchen_frames", wkdir/"yenchen_frames_preprocessed")

In [33]:
from sklearn.utils import Bunch

# transform preprocessed image data and corresponding vowel labels into a dictionary-like Bunch
# this object is in a format that can be passed to classification and clustering algorithms
def load_speaker_data(frame_df, image_dir):
    """Load the preprocessed images listed in `frame_df` and bundle them with their vowel labels.

    Parameters
    ----------
    frame_df : pandas.DataFrame
        Table with a "Label" column (image file names) and a "Vowel" column (vowel categories).
    image_dir : pathlib.Path
        Folder holding the preprocessed images.

    Returns
    -------
    sklearn.utils.Bunch
        `flat_images`: one flattened image per row; `target`: the vowel categories;
        `images`: the unflattened images, in the same row order.

    Raises
    ------
    FileNotFoundError
        If an image cannot be read (cv2.imread returns None instead of raising).
    """
    image_list=list()
    # iterate over the column values directly so the result does not depend on the index labels
    for label in frame_df["Label"]:
        image_path=image_dir/label
        # each file is read once; the flattened copy below is derived from the same array
        image=cv2.imread(filename=str(image_path), flags=cv2.IMREAD_UNCHANGED)
        if image is None:
            raise FileNotFoundError(f"could not read image: {image_path}")
        image_list.append(image)
    return Bunch(
        flat_images=np.array([image.flatten() for image in image_list]),
        target=np.array(list(frame_df["Vowel"])),
        images=np.array(image_list),
    )

sam_data=load_speaker_data(sam_df, wkdir/"sam_frames_preprocessed")
yc_data=load_speaker_data(yc_df, wkdir/"yenchen_frames_preprocessed")

In [78]:
import pickle

# write each speaker's data object to its own pickle file
pickle_dir=wkdir/"preprocessed_data"
for pickle_name, data_object in (("sam_data.pkl", sam_data), ("yc_data.pkl", yc_data)):
    with (pickle_dir/pickle_name).open("wb") as output_file:
        pickle.dump(data_object, output_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# for any further analysis, read in pickled data using the code below
# this cell defines everything it needs, so it also runs on a fresh kernel
# without executing the preprocessing cells first

import pickle
from pathlib import Path

wkdir=Path(".")

# NOTE: pickle.load can execute arbitrary code; only load pickle files created by this notebook
with open((wkdir/"preprocessed_data"/"sam_data.pkl"), "rb") as input_file:
    sam_data=pickle.load(input_file)
with open((wkdir/"preprocessed_data"/"yc_data.pkl"), "rb") as input_file:
    yc_data=pickle.load(input_file)