In [40]:
% tensorflow_version 2.x
import tensorflow as tf
from tensorflow import keras

# Print TF version, keep the version in mind when you look the documentation.
print('TensorFlow version: {}'.format(tf.__version__))

TensorFlow version: 2.1.0


In [0]:
import numpy as np
import os
import pandas as pd
import time, gc
import cv2
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import PIL.Image as Image, PIL.ImageDraw as ImageDraw, PIL.ImageFont as ImageFont
from matplotlib import pyplot as plt
import pyarrow.parquet as pq
from PIL import Image
from PIL import ImageEnhance
from keras.callbacks import ReduceLROnPlateau
from sklearn.metrics import accuracy_score
from scipy import stats
import pickle

In [0]:
path = "/content/drive/My Drive/Data/bengaliai-cv19/" # TCC
# path="/content/drive/My Drive/data 2040 midterm private folder/Data/bengaliai-cv19/" # LSY

In [43]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Obtain Data from CSV and Parquet

***The codes in model building and training are credit to [this pubic notebook](https://www.kaggle.com/kaushal2896/bengali-graphemes-starter-eda-multi-output-cnn)***

Train, test label from CSV


In [0]:
train_df_ = pd.read_csv(path + 'train.csv')
test_df_ = pd.read_csv(path + 'test.csv')
class_map_df = pd.read_csv(path + 'class_map.csv')
sample_sub_df = pd.read_csv(path + 'sample_submission.csv')

## Enhancement Functions

To increase training speed and model accuracy, the images are first cropped to fufill the image with characters and then resized to a 64 by 64 pixel image.

In [0]:
# resize the dataframe
from tqdm.auto import tqdm
def resize(df, size=96, need_progress_bar=True, sharpness=3.0):
    resized = {}
    resize_size=96
    if need_progress_bar:
        for i in tqdm(range(df.shape[0])):
            image=df.loc[df.index[i]].values.reshape(137,236)
            # Add sharpness
            # image = Image.fromarray(image)
            # image = ImageEnhance.Sharpness(image)
            # image = image.enhance(sharpness)
            # image = np.array(image)
            _, thresh = cv2.threshold(image, 30, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
            contours, _ = cv2.findContours(thresh,cv2.RETR_LIST,cv2.CHAIN_APPROX_SIMPLE)[-2:]

            idx = 0 
            ls_xmin = []
            ls_ymin = []
            ls_xmax = []
            ls_ymax = []
            for cnt in contours:
                idx += 1
                x,y,w,h = cv2.boundingRect(cnt)
                ls_xmin.append(x)
                ls_ymin.append(y)
                ls_xmax.append(x + w)
                ls_ymax.append(y + h)
            xmin = min(ls_xmin)
            ymin = min(ls_ymin)
            xmax = max(ls_xmax)
            ymax = max(ls_ymax)

            roi = image[ymin:ymax,xmin:xmax]
            resized_roi = cv2.resize(roi, (resize_size, resize_size),interpolation=cv2.INTER_AREA)
            resized[df.index[i]] = resized_roi.reshape(-1)
    else:
        for i in range(df.shape[0]):
            image=df.loc[df.index[i]].values.reshape(137,236)
            # Add sharpness
            # image = Image.fromarray(image)
            # image = ImageEnhance.Sharpness(image)
            # image = image.enhance(sharpness)
            # image = np.array(image)
            _, thresh = cv2.threshold(image, 30, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
            contours, _ = cv2.findContours(thresh,cv2.RETR_LIST,cv2.CHAIN_APPROX_SIMPLE)[-2:]
            idx = 0 
            ls_xmin = []
            ls_ymin = []
            ls_xmax = []
            ls_ymax = []
            for cnt in contours:
                idx += 1
                x,y,w,h = cv2.boundingRect(cnt)
                ls_xmin.append(x)
                ls_ymin.append(y)
                ls_xmax.append(x + w)
                ls_ymax.append(y + h)
            xmin = min(ls_xmin)
            ymin = min(ls_ymin)
            xmax = max(ls_xmax)
            ymax = max(ls_ymax)

            roi = image[ymin:ymax,xmin:xmax]
            resized_roi = cv2.resize(roi, (resize_size, resize_size),interpolation=cv2.INTER_AREA)
            resized[df.index[i]] = resized_roi.reshape(-1)
    resized = pd.DataFrame(resized).T
    return resized

In [0]:
# Build DataGenerator
class MultiOutputDataGenerator(keras.preprocessing.image.ImageDataGenerator):

    def flow(self,
             x,
             y=None,
             batch_size=32,
             shuffle=True,
             sample_weight=None,
             seed=None,
             save_to_dir=None,
             save_prefix='',
             save_format='png',
             subset=None):

        targets = None
        target_lengths = {}
        ordered_outputs = []
        for output, target in y.items():
            if targets is None:
                targets = target
            else:
                targets = np.concatenate((targets, target), axis=1)
            target_lengths[output] = target.shape[1]
            ordered_outputs.append(output)


        for flowx, flowy in super().flow(x, targets, batch_size=batch_size,
                                         shuffle=shuffle):
            target_dict = {}
            i = 0
            for output in ordered_outputs:
                target_length = target_lengths[output]
                target_dict[output] = flowy[:, i: i + target_length]
                i += target_length

            yield flowx, target_dict

In [0]:
IMG_SIZE=96
N_CHANNELS=1
batch_size = 256
epochs = 30
HEIGHT = 137
WIDTH = 236

## Model Bagging

In [0]:
vanilla_96 = tf.keras.models.load_model('/content/drive/My Drive/Data/bengaliai-cv19/saved_modelssss/vanilla_96_model.h5')
vanilla_96.load_weights('/content/drive/My Drive/Data/bengaliai-cv19/saved_modelssss/vanilla_96_model.h5')

In [10]:
converter = tf.lite.TFLiteConverter.from_keras_model(vanilla_96)
converter.optimizations = [tf.lite.Optimize.OPTIMIZE_FOR_SIZE]
vanilla_96 = converter.convert()
open('vanilla_96.tflite', 'wb').write(vanilla_96)

15009584

In [0]:
moredense_96 = tf.keras.models.load_model('/content/drive/My Drive/Data/bengaliai-cv19/saved_modelssss/96_more_dense_model.h5')
moredense_96.load_weights('/content/drive/My Drive/Data/bengaliai-cv19/saved_modelssss/96_more_dense_model.h5')

In [12]:
converter = tf.lite.TFLiteConverter.from_keras_model(moredense_96)
converter.optimizations = [tf.lite.Optimize.OPTIMIZE_FOR_SIZE]
moredense_96 = converter.convert()
open('moredense_96.tflite', 'wb').write(moredense_96)

15495136

In [0]:
moredense_96_moretrain = tf.keras.models.load_model('/content/drive/My Drive/Data/bengaliai-cv19/saved_modelssss/96_more_dense_model_moretrain.h5')
moredense_96_moretrain.load_weights('/content/drive/My Drive/Data/bengaliai-cv19/saved_modelssss/96_more_dense_model_moretrain.h5')

In [14]:
converter = tf.lite.TFLiteConverter.from_keras_model(moredense_96_moretrain)
converter.optimizations = [tf.lite.Optimize.OPTIMIZE_FOR_SIZE]
moredense_96_moretrain = converter.convert()
open('moredense_96_moretrain.tflite', 'wb').write(moredense_96_moretrain)

15495136

In [0]:
densenet2 = tf.keras.models.load_model('/content/drive/My Drive/Data/bengaliai-cv19/saved_modelssss/saved_models/LSY_MODEL_densenet2.h5')
densenet2.load_weights('/content/drive/My Drive/Data/bengaliai-cv19/saved_modelssss/saved_models/LSY_MODEL_densenet2.h5')

In [16]:
converter = tf.lite.TFLiteConverter.from_keras_model(densenet2)
converter.optimizations = [tf.lite.Optimize.OPTIMIZE_FOR_SIZE]
densenet2 = converter.convert()
open('densenet2.tflite', 'wb').write(densenet2)

9809736

In [0]:
densenet4_2 = tf.keras.models.load_model('/content/drive/My Drive/Data/bengaliai-cv19/saved_modelssss/saved_models/LSY_MODEL_densenet4.2.h5')
densenet4_2.load_weights('/content/drive/My Drive/Data/bengaliai-cv19/saved_modelssss/saved_models/LSY_MODEL_densenet4.2.h5')

In [18]:
converter = tf.lite.TFLiteConverter.from_keras_model(densenet4_2)
converter.optimizations = [tf.lite.Optimize.OPTIMIZE_FOR_SIZE]
densenet4_2 = converter.convert()
open('densenet4_2.tflite', 'wb').write(densenet4_2)

9829088

In [0]:
vanilla_96 = tf.lite.Interpreter(model_path="vanilla_96.tflite")
vanilla_96.allocate_tensors()
moredense_96 = tf.lite.Interpreter(model_path="moredense_96.tflite")
moredense_96.allocate_tensors()
moredense_96_moretrain = tf.lite.Interpreter(model_path="moredense_96_moretrain.tflite")
moredense_96_moretrain.allocate_tensors()
densenet2 = tf.lite.Interpreter(model_path="densenet2.tflite")
densenet2.allocate_tensors()
densenet4_2 = tf.lite.Interpreter(model_path="densenet4_2.tflite")
densenet4_2.allocate_tensors()

In [0]:
models = (vanilla_96, moredense_96, moredense_96_moretrain, densenet2, densenet4_2)
input_details = vanilla_96.get_input_details()
output_details = vanilla_96.get_output_details()
input_shape = input_details[0]['shape']
input_index = input_details[0]['index']

In [0]:
i = 0
df_test_img = pd.read_parquet(path + f'test_image_data_{i}.parquet') 
df_test_img.set_index('image_id', inplace=True)

X_test = resize(df_test_img, need_progress_bar=False)/255
X_test = tf.convert_to_tensor(X_test.values.reshape(-1, IMG_SIZE, IMG_SIZE, N_CHANNELS))

In [0]:
def bagging_pred_lite(x, models):
    """Bagging the input models, for lite models"""
    len_model = len(models)
    dict_pred_func = {}

    def predict_results(input_data):
        """Return mode of model preidctions"""
        # input_data_2 = np.array(input_data)
        # input_data_2 = input_data
        temp_pred = np.empty([3, len_model])
        for model_idx, model in enumerate(models):
            model.set_tensor(input_index, input_data)
            model.invoke()
            temp_pred[0][model_idx] = np.argmax(model.get_tensor(output_details[0]['index']))
            temp_pred[1][model_idx] = np.argmax(model.get_tensor(output_details[1]['index']))
            temp_pred[2][model_idx] = np.argmax(model.get_tensor(output_details[2]['index']))
        return stats.mode(temp_pred, axis=1)[0]

    @tf.function
    def map_predictions(data):
        return tf.map_fn(predict_results, data, parallel_iterations=1024, dtype=tf.float32)
    

    bagging_preds = np.transpose(map_predictions(x))[0]

    for key_idx, key in enumerate(('grapheme_root', 'vowel_diacritic', 'consonant_diacritic')):
        dict_pred_func[key] = bagging_preds[key_idx]

    return dict_pred_func

In [0]:
bagging_pred_lite(X_test, models)

In [0]:
def bagging_pred(x, models):
    """Bagging the input models, for full models"""
    len_model = len(models)
    dict_pred = {
    'grapheme_root': np.empty([x.shape[0], len_model]),
    'vowel_diacritic': np.empty([x.shape[0], len_model]),
    'consonant_diacritic': np.empty([x.shape[0], len_model]),
    }
    for model_idx, model in enumerate(models):
        preds = model.predict(x)
        for i, p in enumerate(dict_pred):
            dict_pred[p][:, model_idx] = np.argmax(preds[i], axis=1)
    
    for key in dict_pred.keys():
        dict_pred[key] = np.transpose(stats.mode(dict_pred[key], axis=1)[0])[0]

    return dict_pred

In [54]:
# Separate preidtions
dict_pred = {
    'grapheme_root': np.zeros(6),
    'vowel_diacritic': np.zeros(6),
    'consonant_diacritic': np.zeros(6),
    'total': np.zeros(6)
}
for model_count, model in enumerate(models):
    components = ['consonant_diacritic', 'grapheme_root', 'vowel_diacritic']
    count = 0
    for i in range(4):
        train_df = pd.merge(pd.read_parquet(path+f'train_image_data_{i}.parquet'), train_df_, on='image_id').drop(['image_id', 'grapheme'], axis=1)
        X_train = train_df.drop(['grapheme_root', 'vowel_diacritic', 'consonant_diacritic'], axis=1)
        
        Y_train_root = pd.get_dummies(train_df['grapheme_root']).values
        Y_train_vowel = pd.get_dummies(train_df['vowel_diacritic']).values
        Y_train_consonant = pd.get_dummies(train_df['consonant_diacritic']).values

        # Divide the data into training and validation set
        x_train, x_test, y_train_root, y_test_root, y_train_vowel, y_test_vowel, y_train_consonant, y_test_consonant = train_test_split(X_train, Y_train_root, Y_train_vowel, Y_train_consonant, test_size=0.08, random_state=666)
        del train_df
        del X_train
        del Y_train_root, Y_train_vowel, Y_train_consonant
        del x_train
        del y_train_root
        del y_train_vowel
        del y_train_consonant

        x_test = resize(x_test)/255
        x_test = x_test.values.reshape(-1, IMG_SIZE, IMG_SIZE, N_CHANNELS)

        preds = model.predict(x_test)
        y_true = np.array([
            np.argmax(y_test_root, axis=1), 
            np.argmax(y_test_vowel, axis=1), 
            np.argmax(y_test_consonant, axis=1)
            ])
        for pred_idx in range(len(preds)):
            preds[pred_idx] = np.argmax(preds[pred_idx], axis=1)

        acc = preds == y_true
        dict_pred['grapheme_root'][model_count] += np.sum(acc[0])
        dict_pred['vowel_diacritic'][model_count] += np.sum(acc[1])
        dict_pred['consonant_diacritic'][model_count] += np.sum(acc[2])
        ds_len = x_test.shape[0]
        dict_pred['total'][model_count] += np.sum(np.sum(acc, axis=0) == np.full((1, ds_len), 3))
        count += ds_len
        
        # Delete to reduce memory usage
        del x_test
        del y_test_root
        del y_test_vowel
        del y_test_consonant
        del y_true

    for key in dict_pred.keys():
        dict_pred[key][model_count] /= count

HBox(children=(IntProgress(value=0, max=4017), HTML(value='')))




AttributeError: ignored

In [0]:
# bagging predictions
count = 0
for i in range(4):
    train_df = pd.merge(pd.read_parquet(path+f'train_image_data_{i}.parquet'), train_df_, on='image_id').drop(['image_id', 'grapheme'], axis=1)
    X_train = train_df.drop(['grapheme_root', 'vowel_diacritic', 'consonant_diacritic'], axis=1)
    
    Y_train_root = pd.get_dummies(train_df['grapheme_root']).values
    Y_train_vowel = pd.get_dummies(train_df['vowel_diacritic']).values
    Y_train_consonant = pd.get_dummies(train_df['consonant_diacritic']).values

    # Divide the data into training and validation set
    x_train, x_test, y_train_root, y_test_root, y_train_vowel, y_test_vowel, y_train_consonant, y_test_consonant = train_test_split(X_train, Y_train_root, Y_train_vowel, Y_train_consonant, test_size=0.08, random_state=666)
    del train_df
    del X_train
    del Y_train_root, Y_train_vowel, Y_train_consonant
    del x_train
    del y_train_root
    del y_train_vowel
    del y_train_consonant

    x_test = resize(x_test)/255
    x_test = x_test.values.reshape(-1, IMG_SIZE, IMG_SIZE, N_CHANNELS)

    preds = bagging_pred_lite(x_test, models)
    y_true = np.array([
        np.argmax(y_test_root, axis=1), 
        np.argmax(y_test_vowel, axis=1), 
        np.argmax(y_test_consonant, axis=1)
        ])
    
    preds_array = []
    for pred_key in ['grapheme_root', 'vowel_diacritic', 'consonant_diacritic']:
        preds_array.append(preds[pred_key])

    acc = np.array(preds_array) == y_true
    dict_pred['grapheme_root'][5] += np.sum(acc[0])
    dict_pred['vowel_diacritic'][5] += np.sum(acc[1])
    dict_pred['consonant_diacritic'][5] += np.sum(acc[2])
    ds_len = x_test.shape[0]
    dict_pred['total'][5] += np.sum(np.sum(acc, axis=0) == np.full((1, ds_len), 3))
    count += ds_len
    
    # Delete to reduce memory usage
    del x_test
    del y_test_root
    del y_test_vowel
    del y_test_consonant
    del y_true

for key in dict_pred.keys():
    dict_pred[key][5] /= count

In [55]:
df_pred_eval = pd.DataFrame(dict_pred, index=['vanilla_96', 'moredense_96', 'moredense_96_moretrain', 'densenet2', 'densenet4_2', 'bagging'])
df_pred_eval

Unnamed: 0,grapheme_root,vowel_diacritic,consonant_diacritic,total
vanilla_96,0.0,0.0,0.0,0.0
moredense_96,0.0,0.0,0.0,0.0
moredense_96_moretrain,0.0,0.0,0.0,0.0
densenet2,0.0,0.0,0.0,0.0
densenet4_2,0.0,0.0,0.0,0.0
bagging,0.0,0.0,0.0,0.0


In [0]:
# Test Data
components = ['consonant_diacritic', 'grapheme_root', 'vowel_diacritic']
target=[] # model predictions placeholder
row_id=[] # row_id place holder
for i in range(4):
    df_test_img = pd.read_parquet(path + f'test_image_data_{i}.parquet') 
    df_test_img.set_index('image_id', inplace=True)

    X_test = resize(df_test_img, need_progress_bar=False)/255
    X_test = X_test.values.reshape(-1, IMG_SIZE, IMG_SIZE, N_CHANNELS)
    
    dict_pred = bagging_pred(X_test, models)

    for k,id in enumerate(df_test_img.index.values):  
        for i,comp in enumerate(components):
            id_sample=id+'_'+comp
            row_id.append(id_sample)
            target.append(dict_pred[comp][k])
    del df_test_img
    del X_test
    gc.collect()

df_sample = pd.DataFrame(
    {
        'row_id': row_id,
        'target':target
    },
    columns = ['row_id','target'] 
)
df_sample.to_csv('submission.csv',index=False)
df_sample.head()

Unnamed: 0,row_id,target
0,Test_0_consonant_diacritic,0
1,Test_0_grapheme_root,3
2,Test_0_vowel_diacritic,0
3,Test_1_consonant_diacritic,0
4,Test_1_grapheme_root,93


In [0]:
df_sample

Unnamed: 0,row_id,target
0,Test_0_consonant_diacritic,0.0
1,Test_0_grapheme_root,3.0
2,Test_0_vowel_diacritic,0.0
3,Test_1_consonant_diacritic,0.0
4,Test_1_grapheme_root,93.0
5,Test_1_vowel_diacritic,2.0
6,Test_2_consonant_diacritic,0.0
7,Test_2_grapheme_root,19.0
8,Test_2_vowel_diacritic,0.0
9,Test_3_consonant_diacritic,0.0


## For submission

In [0]:
# # This Python 3 environment comes with many helpful analytics libraries installed
# # It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# # For example, here's several helpful packages to load in 

# import numpy as np # linear algebra
# import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# # Input data files are available in the "../input/" directory.
# # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# # Any results you write to the current directory are saved as output.


import tensorflow as tf
from tensorflow import keras
import numpy as np
import os
import pandas as pd
import time, gc
import cv2
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import PIL.Image as Image, PIL.ImageDraw as ImageDraw, PIL.ImageFont as ImageFont
from matplotlib import pyplot as plt
import pyarrow.parquet as pq
from PIL import Image
from PIL import ImageEnhance
from keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.layers import Conv2D, MaxPool2D, Dropout, Flatten, Dense
from sklearn.metrics import accuracy_score
from scipy import stats
import pickle


# load models
path = '/kaggle/input/bagging'

vanilla_96 = tf.keras.models.load_model(path + '/vanilla_96_model.h5')
vanilla_96.load_weights(path + '/vanilla_96_model.h5')

moredense_96 = tf.keras.models.load_model(path + '/96_more_dense_model.h5')
moredense_96.load_weights(path + '/96_more_dense_model.h5')

moredense_96_moretrain = tf.keras.models.load_model(path + '/96_more_dense_model_moretrain.h5')
moredense_96_moretrain.load_weights(path + '/96_more_dense_model_moretrain.h5')

models = (vanilla_96, moredense_96, moredense_96_moretrain)


IMG_SIZE=96
N_CHANNELS=1
batch_size = 256
epochs = 30
HEIGHT = 137
WIDTH = 236


# bagging function
def bagging_pred(x, models):
    len_model = len(models)
    dict_pred = {
    'grapheme_root': np.empty([x.shape[0], len_model]),
    'vowel_diacritic': np.empty([x.shape[0], len_model]),
    'consonant_diacritic': np.empty([x.shape[0], len_model]),
    }
    for model_idx, model in enumerate(models):
        preds = model.predict(x)
        for i, p in enumerate(dict_pred):
            dict_pred[p][:, model_idx] = np.argmax(preds[i], axis=1)
    
    for key in dict_pred.keys():
        dict_pred[key] = np.transpose(stats.mode(dict_pred[key], axis=1)[0])[0]

    return dict_pred

# Clean Data
# resize the dataframe
from tqdm.auto import tqdm
def resize(df, size=96, need_progress_bar=True, sharpness=3.0):
    resized = {}
    resize_size=96
    if need_progress_bar:
        for i in tqdm(range(df.shape[0])):
            image=df.loc[df.index[i]].values.reshape(137,236)
            # Add sharpness
            image = Image.fromarray(image)
            image = ImageEnhance.Sharpness(image)
            image = image.enhance(sharpness)
            image = np.array(image)
            _, thresh = cv2.threshold(image, 30, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
            contours, _ = cv2.findContours(thresh,cv2.RETR_LIST,cv2.CHAIN_APPROX_SIMPLE)[-2:]

            idx = 0 
            ls_xmin = []
            ls_ymin = []
            ls_xmax = []
            ls_ymax = []
            for cnt in contours:
                idx += 1
                x,y,w,h = cv2.boundingRect(cnt)
                ls_xmin.append(x)
                ls_ymin.append(y)
                ls_xmax.append(x + w)
                ls_ymax.append(y + h)
            xmin = min(ls_xmin)
            ymin = min(ls_ymin)
            xmax = max(ls_xmax)
            ymax = max(ls_ymax)

            roi = image[ymin:ymax,xmin:xmax]
            resized_roi = cv2.resize(roi, (resize_size, resize_size),interpolation=cv2.INTER_AREA)
            resized[df.index[i]] = resized_roi.reshape(-1)
    else:
        for i in range(df.shape[0]):
            image=df.loc[df.index[i]].values.reshape(137,236)
            # Add sharpness
            image = Image.fromarray(image)
            image = ImageEnhance.Sharpness(image)
            image = image.enhance(sharpness)
            image = np.array(image)
            _, thresh = cv2.threshold(image, 30, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
            contours, _ = cv2.findContours(thresh,cv2.RETR_LIST,cv2.CHAIN_APPROX_SIMPLE)[-2:]
            idx = 0 
            ls_xmin = []
            ls_ymin = []
            ls_xmax = []
            ls_ymax = []
            for cnt in contours:
                idx += 1
                x,y,w,h = cv2.boundingRect(cnt)
                ls_xmin.append(x)
                ls_ymin.append(y)
                ls_xmax.append(x + w)
                ls_ymax.append(y + h)
            xmin = min(ls_xmin)
            ymin = min(ls_ymin)
            xmax = max(ls_xmax)
            ymax = max(ls_ymax)

            roi = image[ymin:ymax,xmin:xmax]
            resized_roi = cv2.resize(roi, (resize_size, resize_size),interpolation=cv2.INTER_AREA)
            resized[df.index[i]] = resized_roi.reshape(-1)
    resized = pd.DataFrame(resized).T
    return resized



# Test Data
# Test Data
components = ['consonant_diacritic', 'grapheme_root', 'vowel_diacritic']
target=[] # model predictions placeholder
row_id=[] # row_id place holder
for i in range(4):
    df_test_img = pd.read_parquet('/kaggle/input/bengaliai-cv19/test_image_data_{}.parquet'.format(i)) 
    df_test_img.set_index('image_id', inplace=True)

    X_test = resize(df_test_img, need_progress_bar=False)/255
    X_test = X_test.values.reshape(-1, IMG_SIZE, IMG_SIZE, N_CHANNELS)
    
    dict_pred = bagging_pred(X_test, models)

    for k,id in enumerate(df_test_img.index.values):  
        for i,comp in enumerate(components):
            id_sample=id+'_'+comp
            row_id.append(id_sample)
            target.append(dict_pred[comp][k])
    del df_test_img
    del X_test
    gc.collect()

df_sample = pd.DataFrame(
    {
        'row_id': row_id,
        'target':target
    },
    columns = ['row_id','target'] 
)
df_sample.to_csv('submission.csv',index=False)
df_sample