In [1]:
import os, sys, h5py, numpy as np, pandas as pd, torch

In [2]:
from sklearn.decomposition import PCA

In [3]:
import pickle

In [4]:
# Input locations. The defaults are the paths this notebook was originally run
# with; each can be overridden through an environment variable so the notebook
# can be executed on another machine without editing code.
#   COMICS_DATA_DIR -> folder that holds the VGG feature file
#   COMICS_OCR_CSV  -> OCR csv file
thesis_dataset_folder_path = os.environ.get(
    "COMICS_DATA_DIR", r"/Volumes/suraj/ovgu/thesis/dataset/COMICS/data"
)
ocr_filepath = os.environ.get(
    "COMICS_OCR_CSV", r"/Users/surajshashidhar/Downloads/COMICS_ocr_file_bkp.csv"
)
vgg_file_name = "vgg_features_bkp.h5"
vgg_file_path = os.path.join(thesis_dataset_folder_path, vgg_file_name)


In [5]:
# Load the OCR table from the configured csv and preview its first three rows.
ocr_df = pd.read_csv(filepath_or_buffer=ocr_filepath)
ocr_df.head(n=3)

Unnamed: 0,comic_no,page_no,panel_no,textbox_no,dialog_or_narration,text,x1,y1,x2,y2
0,0,0,0,0,,,,,,
1,0,0,1,0,1.0,account of s wiggins,11.0,17.0,361.0,190.0
2,0,0,1,1,1.0,10c,855.0,5.0,1067.0,154.0


In [6]:
# (rows, columns) of the raw OCR table.
ocr_df.shape

(2545728, 10)

In [35]:
# Per-book aggregates: highest page_no and the sum of panel_no over every OCR row.
# NOTE(review): the sum adds the panel_no value of each row of the book, so
# `per_book_panel_sum` is probably not a count of panels — confirm the intent.
max_page_per_book_df = (
    ocr_df
    .groupby('comic_no')
    .agg({'page_no': 'max', 'panel_no': 'sum'})
    .reset_index()
)
# Dense 0..N-1 book id taken from the row position after reset_index().
max_page_per_book_df['new_comic_no'] = max_page_per_book_df.index.copy()
max_page_per_book_df['max_page_no'] = max_page_per_book_df['page_no']

# Per-page aggregate: number of distinct panel_no values on each page.
total_panels_per_book_df = (
    ocr_df
    .groupby(['comic_no', 'page_no'])
    .agg({'panel_no': 'nunique'})
    .reset_index()
)

# Attach the per-book columns to every page row, then keep and rename the
# columns of interest.
merged_df = (
    total_panels_per_book_df
    .merge(max_page_per_book_df, on='comic_no', how='inner', suffixes=('_l', '_r'))
    [['comic_no', 'page_no_l', 'panel_no_l', 'panel_no_r', 'new_comic_no', 'max_page_no']]
    .rename(columns={
        'comic_no': 'old_comic_no',
        'page_no_l': 'per_book_page_no',
        'panel_no_l': 'per_page_panel_count',
        'panel_no_r': 'per_book_panel_sum',
    })
)
merged_df.head(5)

Unnamed: 0,old_comic_no,per_book_page_no,per_page_panel_count,per_book_panel_sum,new_comic_no,max_page_no
0,0,0,2,1674,0,51
1,0,1,7,1674,0,51
2,0,2,2,1674,0,51
3,0,3,7,1674,0,51
4,0,4,7,1674,0,51


In [37]:
# Persist the merged per-page table to the working directory.
merged_df.to_csv(path_or_buf="merged_df.csv")

In [17]:
# One row per (comic, page) holding the number of distinct panel_no values.
ocr_total_pages_df = (
    ocr_df
    .groupby(['comic_no', 'page_no'])['panel_no']
    .nunique()
    .reset_index()
)

In [18]:
# Preview the first five (comic, page) rows.
ocr_total_pages_df.head(n=5)

Unnamed: 0,comic_no,page_no,panel_no
0,0,0,2
1,0,1,7
2,0,2,2
3,0,3,7
4,0,4,7


In [23]:
# Preview the last five (comic, page) rows.
ocr_total_pages_df.tail(n=5)

Unnamed: 0,comic_no,page_no,panel_no
195790,3958,34,7
195791,3958,35,5
195792,3958,36,6
195793,3958,37,3
195794,3958,38,6


In [21]:
# Number of distinct books present in the per-page table.
ocr_total_pages_df['comic_no'].nunique()

3929

In [29]:
# Sum of the per-page distinct-panel counts over all pages.
ocr_total_pages_df['panel_no'].sum()

1229664

In [26]:
# Highest page_no per book, plus a dense book id (`new_comic_no`, the row
# position after reset_index) and a copy of the max under an explicit name.
max_page_per_book_df = (
    ocr_total_pages_df
    .groupby('comic_no')['page_no']
    .max()
    .reset_index()
    .assign(
        new_comic_no=lambda books: books.index,
        max_page_no=lambda books: books['page_no'],
    )
    .sort_values(['new_comic_no', 'page_no'], ascending=[True, True])
)
max_page_per_book_df.head(5)

Unnamed: 0,comic_no,page_no,new_comic_no,max_page_no
0,0,51,0,51
1,1,128,1,128
2,2,37,2,37
3,3,37,3,37
4,4,53,4,53


In [27]:
# Split books by dense id: [0, 2929) train, [2929, 3429) dev, [3429, ...) test.
ocr_train_pages_df = max_page_per_book_df.loc[
    max_page_per_book_df['new_comic_no'].lt(2929)
]
ocr_dev_pages_df = max_page_per_book_df.loc[
    max_page_per_book_df['new_comic_no'].ge(2929)
    & max_page_per_book_df['new_comic_no'].lt(3429)
]
ocr_test_pages_df = max_page_per_book_df.loc[
    max_page_per_book_df['new_comic_no'].ge(3429)
]

In [31]:
# Show the (rows, columns) of the train, dev and test book tables on one line.
print(*(split.shape for split in (ocr_train_pages_df, ocr_dev_pages_df, ocr_test_pages_df)))

(2929, 4) (500, 4) (500, 4)


In [30]:
# Join the per-book table onto the per-page table by comic_no; both sides carry
# a page_no column, so pandas applies its default _x/_y suffixes.
merged_df = max_page_per_book_df.merge(ocr_total_pages_df, on='comic_no', how='inner')
merged_df.shape

(195795, 6)

In [33]:
# Sanity check: total of the per-page panel counts after the merge. The value
# shown matches the pre-merge total of ocr_total_pages_df, i.e. the inner join
# neither dropped nor duplicated page rows.
merged_df['panel_no'].sum()

1229664

In [28]:
# All OCR rows that belong to dev-split books.
# The dev split is defined on the dense `new_comic_no` id (see ocr_dev_pages_df),
# not on the raw `comic_no`: the raw ids have gaps (3929 distinct books, ids up
# to 3958), so filtering raw ids with the same 2929/3429 bounds could pick a
# different set of books. Select by membership in the dev book list instead.
ocr_dev_alle_df = ocr_df[ocr_df['comic_no'].isin(ocr_dev_pages_df['comic_no'])]

In [29]:
# Highest panel_no per (comic, page) of the dev rows, written straight to csv.
(
    ocr_dev_alle_df
    .groupby(['comic_no', 'page_no'])['panel_no']
    .max()
    .reset_index()
    .to_csv('ocr_dev_alle_gp.csv')
)

In [7]:
# Drop rows in two passes, printing the shape after each: first rows whose four
# box coordinates are all missing, then rows missing both comic_no and page_no.
ocr_changed_df = ocr_df
for required_cols in (['x1', 'x2', 'y1', 'y2'], ['comic_no', 'page_no']):
    ocr_changed_df = ocr_changed_df.dropna(subset=required_cols, how='all')
    print(ocr_changed_df.shape)

(2498656, 10)
(2498656, 10)


In [16]:
# Highest page_no per book in the cleaned OCR rows, with a dense book id
# (`new_comic_no`, the row position) and the max repeated as `max_page_no`.
max_page_per_book_df = (
    ocr_changed_df
    .groupby('comic_no')['page_no']
    .max()
    .reset_index()
    .assign(
        new_comic_no=lambda books: books.index,
        max_page_no=lambda books: books['page_no'],
    )
)
max_page_per_book_df.head(n=5)

Unnamed: 0,comic_no,page_no,new_comic_no,max_page_no
0,0,51,0,51
1,1,128,1,128
2,2,37,2,37
3,3,37,3,37
4,4,53,4,53


In [11]:
# Highest panel_no per (comic, page) in the cleaned OCR rows.
# NOTE(review): this is a max, not a count. The first rows displayed are one
# below the nunique() counts computed from the raw table, which suggests
# panel_no is 0-based — confirm before using this column as "number of panels".
number_of_panels_per_book_df = (
    ocr_changed_df
    .groupby(['comic_no', 'page_no'])['panel_no']
    .max()
    .reset_index()
)
number_of_panels_per_book_df.head(n=5)

Unnamed: 0,comic_no,page_no,panel_no
0,0,0,1
1,0,1,6
2,0,2,1
3,0,3,6
4,0,4,6


In [12]:
# (rows, columns): one row per (comic, page) pair that survived the cleaning.
number_of_panels_per_book_df.shape

(194489, 3)

In [17]:
# Add the per-book columns to every page row. Columns that exist on both sides
# (page_no) get the '_DROP' suffix on the right-hand copy, and the regex filter
# then discards every column whose name contains '_DROP'.
comic_info_df = (
    number_of_panels_per_book_df
    .merge(max_page_per_book_df, on='comic_no', how='inner', suffixes=('', '_DROP'))
    .filter(regex='^(?!.*_DROP)')
)

In [19]:
# Preview the first two rows of the combined page/book table.
comic_info_df.head(n=2)

Unnamed: 0,comic_no,page_no,panel_no,new_comic_no,max_page_no
0,0,0,1,0,51
1,0,1,6,0,51


In [20]:
# Persist the combined page/book table to the working directory.
comic_info_df.to_csv(path_or_buf='comic_info_df.csv')

In [24]:
# Page rows of the dev books: dense book id in [2929, 3429).
dev_comic_info_df = comic_info_df.loc[
    comic_info_df['new_comic_no'].ge(2929) & comic_info_df['new_comic_no'].lt(3429)
]

In [25]:
# (rows, columns) of the dev subset; rows are (comic, page) pairs.
dev_comic_info_df.shape

(24429, 5)

In [26]:
# Persist the dev subset to the working directory.
dev_comic_info_df.to_csv(path_or_buf='dev_comic_info_df.csv')

# Read VGG Embeddings

In [9]:
# Open the VGG feature file read-only. The handle is deliberately left open:
# later cells index datasets obtained from it.
comics_fc7 = h5py.File(name=vgg_file_path, mode='r')

In [10]:
# Dataset handles for each split's 'vgg_features'. Nothing is sliced here, so
# creating the handle is cheap even for the training split.
# `training_all_embeddings` was commented out although the training-dictionary
# cell further down reads it, which raised NameError on a fresh kernel.
# NOTE(review): assumes the file has a 'train' group laid out like 'dev'/'test'.
training_all_embeddings = comics_fc7['train']['vgg_features']
test_all_embeddings = comics_fc7['test']['vgg_features']
dev_all_embeddings = comics_fc7['dev']['vgg_features']

In [15]:
# Read the complete dev feature dataset into an in-memory array.
dev_all_np = dev_all_embeddings[...]

In [16]:
# Keep only the (page, panel) slots that have at least one feature above 1e-7;
# the boolean mask over the first two axes flattens them into a single axis.
dev_filtered_np = dev_all_np[np.any(dev_all_np > 1e-7, axis=2)]

In [17]:
# Serialise the filtered dev features with the newest pickle protocol.
with open('dev_filtered_np.pickle', mode='wb') as handle:
    pickle.dump(obj=dev_filtered_np, file=handle, protocol=pickle.HIGHEST_PROTOCOL)

In [18]:
# Release the large dev arrays now that they are on disk.
del dev_all_np
del dev_filtered_np

In [19]:
# Same pipeline as for dev: load the test features, keep the (page, panel)
# slots with at least one feature above 1e-7, pickle them, then free the memory.
test_all_np = test_all_embeddings[...]
test_filtered_np = test_all_np[np.any(test_all_np > 1e-7, axis=2)]

with open('test_filtered_np.pickle', mode='wb') as handle:
    pickle.dump(obj=test_filtered_np, file=handle, protocol=pickle.HIGHEST_PROTOCOL)

del test_all_np
del test_filtered_np

In [11]:
# Build {'page_<i>': {'counter': <panels kept>, 'panel_num<k>': <embedding>}}
# for the dev split. Panels are scanned in order and scanning of a page stops at
# the first slot whose vector is constant (min == max); pages whose first slot
# is already constant keep an empty dict (no 'counter' key).
dev_comic_vgg_info_dict = {}
for page_num in range(dev_all_embeddings.shape[0]):
    page_key = f'page_{page_num}'
    dev_comic_vgg_info_dict[page_key] = {}
    counter = 0
    for panel_num in range(9):  # up to nine panel slots are inspected per page
        panel_embedding = dev_all_embeddings[page_num, panel_num, :]
        if np.min(panel_embedding) == np.max(panel_embedding):
            break
        counter += 1
        dev_comic_vgg_info_dict[page_key]['counter'] = counter
        dev_comic_vgg_info_dict[page_key][f'panel_num{panel_num}'] = panel_embedding




In [12]:
# Serialise the per-page dev embedding dictionary with the newest pickle protocol.
with open('dev_vgg_embedding.pickle', mode='wb') as handle:
    pickle.dump(obj=dev_comic_vgg_info_dict, file=handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Inspect the dev embeddings without materialising them: slicing with [:, :]
# read the entire dataset into memory only to echo it as cell output.
dev_all_embeddings.shape

In [27]:
# Build {'page_<i>': {'counter': <panels kept>, 'panel_num<k>': <embedding>}}
# for the test split, with the same layout and stop rule as the dev dictionary.
#
# Fix: the stop test used to be `np.min(panel_embedding) <= 1e-6`, which ends a
# page at the first panel containing even one near-zero feature. The dev loop
# stops only on a constant vector (min == max), so the same rule is used here
# to keep the splits consistent.
# NOTE(review): assumes empty panel slots are stored as constant vectors —
# confirm against how the feature file was written.
test_comic_vgg_info_dict = {}
for page_num in range(test_all_embeddings.shape[0]):
    page_key = 'page_' + str(page_num)
    test_comic_vgg_info_dict[page_key] = {}
    counter = 0
    for panel_num in range(9):  # up to nine panel slots are inspected per page
        panel_embedding = test_all_embeddings[page_num, panel_num, :]

        # A constant vector marks the first unused slot: stop scanning this page.
        if np.min(panel_embedding) == np.max(panel_embedding):
            break

        counter = counter + 1
        test_comic_vgg_info_dict[page_key]['counter'] = counter
        test_comic_vgg_info_dict[page_key]['panel_num' + str(panel_num)] = panel_embedding




In [28]:
# Serialise the per-page test embedding dictionary with the newest pickle protocol.
with open('test_vgg_embedding.pickle', mode='wb') as handle:
    pickle.dump(obj=test_comic_vgg_info_dict, file=handle, protocol=pickle.HIGHEST_PROTOCOL)

In [29]:
# Free both embedding dictionaries before the (larger) training split is built.
del test_comic_vgg_info_dict
del dev_comic_vgg_info_dict

In [30]:
# Build {'page_<i>': {'counter': <panels kept>, 'panel_num<k>': <embedding>}}
# for the training split, with the same layout and stop rule as the dev dictionary.
#
# Fix: the stop test used to be `np.min(panel_embedding) <= 1e-6`, which ends a
# page at the first panel containing even one near-zero feature. The dev loop
# stops only on a constant vector (min == max), so the same rule is used here
# to keep the splits consistent.
# NOTE(review): assumes empty panel slots are stored as constant vectors —
# confirm against how the feature file was written.
training_comic_vgg_info_dict = {}
for page_num in range(training_all_embeddings.shape[0]):
    page_key = 'page_' + str(page_num)
    training_comic_vgg_info_dict[page_key] = {}
    counter = 0
    for panel_num in range(9):  # up to nine panel slots are inspected per page
        panel_embedding = training_all_embeddings[page_num, panel_num, :]

        # A constant vector marks the first unused slot: stop scanning this page.
        if np.min(panel_embedding) == np.max(panel_embedding):
            break

        counter = counter + 1
        training_comic_vgg_info_dict[page_key]['counter'] = counter
        training_comic_vgg_info_dict[page_key]['panel_num' + str(panel_num)] = panel_embedding




In [31]:
# Serialise the per-page training embedding dictionary with the newest pickle protocol.
with open('training_vgg_embedding.pickle', mode='wb') as handle:
    pickle.dump(obj=training_comic_vgg_info_dict, file=handle, protocol=pickle.HIGHEST_PROTOCOL)