In [6]:
from PIL import Image, ImageDraw
import sys
import os
import re
import numpy as np
import cv2
import torch, torch.utils.data
import pickle
import torchvision.transforms as transforms

# Directory where folders with datasets are stored
dir='C:/Users/twest/Documents/school/Master/Deep Learning/Project/Assamese/'

# Get the character name for each label
GLOBAL_LABEL = 0
def get_label(cmap, character_name):
  global GLOBAL_LABEL
  if character_name not in cmap:
    cmap[character_name] = GLOBAL_LABEL
    GLOBAL_LABEL += 1
  return cmap[character_name]

# Create images from the txt files
def get_image(filename):
  f = open(filename, 'r')
  x = None
  y = None
  start = False
  img = Image.new( 'RGB', (4392,4868), "black") # create a new black image
  draw = ImageDraw.Draw(img)

  for line in f:
    if 'CHARACTER_NAME:' in line:
      character_name = line[len('CHARACTER_NAME:'):].replace(' ', '').rstrip()
    if 'PEN_DOWN' in line:
      start = True
      x = None
      y = None
    if 'PEN_UP' in line:
      start = False
      x = None
      y = None
    if not start:
      continue
    m = re.match(r'[0-9]+ +[0-9]+ +[0-9]+ +[0-9]+\.?', line)
    if m is not None:
      l = str(m.group()).split()
      if x is not None:
        draw.line((int(l[0]), int(l[1]), x, y), fill=255, width=80) 
      x = int(l[0])
      y = int(l[1])
  #img.show()
  return character_name, img

def change_image(img,img_sz):
    img_arr = np.asarray(img) # Convert to array
    img_arr = np.sum(img_arr,axis=2) # Get rid of RGB dimension
    img_arr = img_arr/np.max(img_arr) # Normalize
    img_res = cv2.resize(img_arr, dsize=(img_sz,img_sz), interpolation=cv2.INTER_CUBIC) # Shrink
    #img_res = img_res.where(img_res == 0, 1) # Convert to zeros and ones (binarize)
    return img_res

cmap = {"BD": 179, "NAA": 93, "BA": 111, "BB": 61, "BRR": 77, "BL": 178, "DRR": 74, "JR": 72, "PAC": 88, "GHA": 56, "JJ": 142, "DRA": 125, "GHN": 153, "CCA": 89, "JA": 94, "CCC": 141, "NNN": 168, "GHR": 71, "KHR": 69, "SPH": 28, "PA": 109, "RA": 115, "PRR": 76, "KHA": 34, "TINI": 86, "BHR": 42, "GR": 70, "XAT": 91, "MRR": 79, "HRR": 82, "BJ": 144, "GA": 45, "MNA": 102, "GN": 145, "GM": 152, "GL": 138, "THB": 43, "THA": 104, "OI": 172, "MND": 7, "PHA": 110, "PTTT": 0, "OU": 12, "A": 0, "NGG": 31, "NGC": 32, "HR": 49, "NGN": 35, "NGH": 53, "NGJ": 37, "NGK": 39, "ST": 26, "HN": 58, "E": 106, "HA": 122, "TSR": 80, "SKH": 30, "PS": 68, "KHYA": 123, "KR": 40, "TTT": 158, "SK": 20, "TTH": 55, "JB": 143, "TTB": 159, "PTT": 175, "PN": 57, "TM": 160, "PL": 176, "EK": 84, "EE": 117, "SL": 66, "DSR": 81, "O": 1, "SC": 15, "DDH": 48, "NKH": 52, "TRR": 73, "TRU": 41, "MF": 60, "MA": 113, "MB": 5, "NGKH": 54, "ML": 65, "MN": 3, "CBN": 131, "JJB": 147, "SSM": 22, "SSN": 24, "MP": 2, "DGH": 46, "MXA": 120, "FP": 33, "FT": 19, "ATH": 92, "NIYA": 97, "WA": 118, "FK": 8, "FN": 18, "REE": 150, "GGU": 50, "NM": 171, "NN": 156, "RRG": 164, "NA": 108, "NB": 169, "MDA": 100, "UU": 139, "MDD": 154, "NG": 67, "GGN": 51, "NT": 155, "DD": 47, "DUI": 85, "XN": 59, "NTR": 4, "XM": 36, "CC": 140, "CA": 78, "DHRR": 75, "NDD": 165, "SUNYA": 83, "CARI": 87, "DHRA": 126, "KA": 23, "NS": 170, "KK": 132, "SP": 27, "NTT": 163, "KM": 137, "KL": 136, "KS": 135, "NTH": 166, "SSTH": 21, "SN": 14, "SM": 16, "KT": 133, "GDH": 151, "SB": 17, "NMM": 157, "MM": 181, "JHA": 96, "MNTH": 38, "CAY": 90, "NDH": 167, "DG": 44, "ANSR": 129, "DB": 173, "DA": 105, "DV": 177, "BXG": 130, "LG": 148, "LD": 9, "LB": 62, "LA": 116, "LL": 10, "LM": 63, "TR": 162, "LK": 6, "TXA": 119, "TN": 146, "LT": 13, "MV": 182, "LP": 11, "TB": 180, "TA": 103, "AA": 95, "AE": 161, "KTT": 134, "AYA": 124, "STH": 29, "DHA": 107, "U": 128, "AJA": 114, "MTA": 98, "KTA": 127, "QJ": 174, "BHM": 64, "MDHA": 101, "BHA": 112, "MTHA": 99, "SSB": 25, "TT": 149, "DXA": 121}

def create_array(type,num_files,output_size):
    if type == 'train':
        folder_min = 1
        folder_max = 36
    elif type == 'test':
        folder_min = 37
        folder_max = 45
    else:
        print('error')
        crash

    arr = []
    arr2 = []
    arr3 = []
    arr4 = []
    arr5 = []
    orig = []
    x = []
    data = []
    labels = []
        
    for file_nr in range(1,num_files+1):
        print(file_nr)
        for folder in range(folder_min,folder_max+1):
            # Get correct file
            file = str(file_nr)+'.'+str(folder)+'.txt'
            # Convert
            character_name, img = get_image(dir + 'W' + str(folder) + '/' + file)
            label = get_label(cmap, character_name)
            #img_res = change_image(img, output_size[0])
            img_res2 = change_image(img, output_size[1])
            img_res3 = change_image(img, output_size[2])
            img_res4 = change_image(img, output_size[3])
            img_res5 = change_image(img, output_size[4])
            # Append
            #arr.append(img_res)
            arr2.append(img_res2)
            arr3.append(img_res3)
            arr4.append(img_res4)
            arr5.append(img_res5)
            #orig.append(img)
            x.append(label)
    #data = arr
    data2 = arr2
    data3 = arr3
    data4 = arr4
    data5 = arr5
    labels = x
    
    if type == 'train':
        #np.save('trainingdata'+str(output_size[0])+'_'+str(num_files),data)
        np.save('trainingdata'+str(output_size[1])+'_'+str(num_files),data2)
        np.save('trainingdata'+str(output_size[2])+'_'+str(num_files),data3)
        np.save('trainingdata'+str(output_size[3])+'_'+str(num_files),data4)
        np.save('trainingdata'+str(output_size[4])+'_'+str(num_files),data5)
        #np.save('alltrainingdata_'+str(num_files),orig)
        np.save('traininglabels_'+str(num_files),labels)
    elif type == 'test':
        #np.save('testdata'+str(output_size[0])+'_'+str(num_files),data)
        np.save('testdata'+str(output_size[1])+'_'+str(num_files),data2)
        np.save('testdata'+str(output_size[2])+'_'+str(num_files),data3)
        np.save('testdata'+str(output_size[3])+'_'+str(num_files),data4)
        np.save('testdata'+str(output_size[4])+'_'+str(num_files),data5)
        #np.save('alltestdata_'+str(num_files),orig)
        np.save('testlabels_'+str(num_files),labels)
    else:
        print('error')
        crash
    print('success')
        
class DataAssamese(torch.utils.data.Dataset):
    def __init__(self, data_path=None, labels_path=None, 
                 transform=None, dataset: torch.utils.data.Dataset =None, 
                 data : np.ndarray = None, labels: np.ndarray = None,
                 mean : float = None, std : float = None,
                 transform_data=False):
        torch.utils.data.Dataset.__init__(self)
        
        self.data = np.load(data_path)
        self.labels = np.load(labels_path)
        
        if mean == None and std == None:
                # stats of the dataset
            self.mean = np.mean(self.data[:,:,:])
            self.std = np.std(self.data[:,:,:])
            # for test set, use mean and std from
            # the train set to normalize
        else:
            self.mean = mean
            self.std = std
            # Normalize by default!
        self.normalize = transforms.Normalize(mean=(self.mean,),
                                            std = (self.std,))
        
        self.transforms = transforms.Compose([
            transforms.ToTensor(),
            self.normalize
        ])
        
        if transform is not None:
            self.transforms = transforms.Compose([
                  transform
              ])
        else:
            print("Using the default transformations")
            self.transforms = transforms.Compose([
                transforms.ToTensor(),
                self.normalize                                
            ])
            
        
    
    def __getitem__(self,n):
        
        if self.transform_data:
            data = self.data[n]
            label = self.labels[n]
            # we need to convert the x's to images to apply the transforms
            return np.array(self.transforms(data)), label
        
        else:
        
            data = self.data[n].astype(np.float32)
            
            label = self.labels[n].astype(np.float32)
            return self.transforms(_x1), self.transforms(_x2), label

    def __len__(self):
        return len(self.data)


In [7]:
# Create array

output_sizes = [384, 192, 96, 48, 24]
n_files = 183
create_array('train',n_files,output_sizes)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
success


In [11]:
#voor test
create_array('test',n_files,output_sizes)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
success


In [9]:
train_data_path = 'trainingdata96_183.npy'
train_labels_path = 'traininglabels96_183.npy'

test_data_path = 'testdata96_183.npy'
test_labels_path = 'testlabels96_183.npy'

train_d = DataAssamese(train_data_path, train_labels_path)
train_loader = torch.utils.data.DataLoader(train_d, batch_size=128, shuffle=True, pin_memory=True, num_workers=4)

test_d = DataAssamese(train_data_path, train_labels_path)
test_loader = torch.utils.data.DataLoader(train_d, batch_size=128, shuffle=True, pin_memory=True, num_workers=4)

FileNotFoundError: [Errno 2] No such file or directory: 'traininglabels96_183.npy'

In [None]:
import gc
from torch.utils import data

def augment_dataset(dat, label):
  """ Augments the dataset and returns a siamese dataset
  with 9x as much data, the original data in the argument dataset
  plus 8 affine transformations of that input data"""
  # Create a data loader of the dataset
  tensors_train = torch.Tensor(dat), torch.Tensor(label).long()
  loader = data.DataLoader(TensorDataset(*tensors_train), batch_size=10000)

  # Altered samples of the input data
  _altered = None
  mean = None
  std = None

  # Check the size of the batches and so on
  # Read in batches of 15000, and do it 
  for j in range(8):
      gc.collect()
      print("starting with round ",j)
      for i, (x1, _) in enumerate(loader):
          if i % 1 == 0:
              print(i)
          x1 = np.expand_dims(x1, 1)
          # concatenate the arrays by their second axis
          _data = x1
          _mean = np.mean(_data)
          _std = np.std(_data)
          if mean is None:
            mean = _mean
            std = _std
          else:
            mean = (mean*len(_altered) +  _mean*len(_data))/(len(_altered)+len(_data))
            std = (std*len(_altered) +  _std*len(_data))/(len(_altered)+len(_data))
          # add them to the dataset
          if _altered is None:
              _altered = _data
          else:
              # Concatenate the existing data and the new batch
              _altered = np.concatenate((_altered, _data), axis = 0)
      
      print(f'Size of the datasets -> {_altered.shape}')

  # Now create a new dataset with the newly defined data
  # Concatenate the original dataset with the new one
  d = tensors_train
  dat = dat[:,None,:,:]
  all_data = np.concatenate((dat, _altered), axis = 0)
  labels = np.tile(label, 9)
  # Add mean of the original datset
  #mean = (mean*len(_altered) +  d.mean*len(d))/(len(_altered)+len(d))
  #std = (std*len(_altered) +  d.std*len(d))/(len(_altered)+len(d))
  tensors = torch.Tensor(all_data), torch.Tensor(labels).long()
  d = DataLoader(TensorDataset(*tensors), batch_size=100)
  return d