In [None]:
# Mount Google Drive at /content/drive so the dataset paths used below are reachable.
# force_remount=True requests a remount even when the drive is already mounted.
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


In [None]:
import numpy as np
from multiprocessing.pool import Pool
from keras.preprocessing import image
from keras.applications.inception_v3  import preprocess_input

In [None]:
def loadImage(path):
    """Read one image file and return it as an Inception-V3 input batch of one.

    The image is loaded at 299x299, converted to an array, given a leading
    batch axis and passed through ``preprocess_input``.
    """
    pil_img = image.load_img(path, target_size=(299, 299))
    pixels = image.img_to_array(pil_img)
    # Indexing with np.newaxis adds the same leading axis as expand_dims(axis=0).
    return preprocess_input(pixels[np.newaxis, ...])

In [None]:
def loadBatch(img_paths, processes=8):
    """Load several images in parallel and stack them into a single array.

    Args:
        img_paths: Sequence of image file paths; each one is loaded with
            ``loadImage`` in a worker process.
        processes: Size of the worker pool. Defaults to 8, the value that
            used to be hard-coded.

    Returns:
        The arrays returned by ``loadImage`` stacked along the first axis
        with ``np.vstack``.

    Raises:
        ValueError: If ``img_paths`` is empty.
    """
    # list() also makes the emptiness check safe for numpy arrays of paths.
    img_paths = list(img_paths)
    if not img_paths:
        # np.vstack([]) would raise a cryptic "need at least one array" error,
        # and only after a worker pool had been started for nothing.
        raise ValueError("img_paths is empty: no images to load")
    with Pool(processes=processes) as pool:
        imgs = pool.map(loadImage, img_paths)
    return np.vstack(imgs)

In [None]:
def batchGenerator(img_paths, labels, batch_size):
    """Yield ``(images, labels)`` pairs for consecutive slices of the inputs.

    ``img_paths`` and ``labels`` are sliced with the same window of at most
    ``batch_size`` items; the paths of each window are loaded with
    ``loadBatch`` while the labels are passed through unchanged.
    """
    total = len(img_paths)
    for start in range(0, total, batch_size):
        window = slice(start, start + batch_size)
        paths_chunk, labels_chunk = img_paths[window], labels[window]
        yield loadBatch(paths_chunk), labels_chunk

In [None]:
from keras.applications.inception_v3 import InceptionV3
from sklearn.preprocessing import normalize

In [None]:
class FeatureExtractor:
    """Uses a pre-trained Inception-V3 network as a fixed feature extractor."""

    def __init__(self):
        """Load Inception-V3 with ImageNet weights, without its classification layer."""
        print("loading DeepNet (Inception-V3) ...")
        # Build the network without the classification head instead of popping
        # entries from ``model.layers`` and re-assigning ``model.outputs``: in
        # current Keras versions that does not rewire an already-built model, so
        # ``predict`` would keep returning class probabilities rather than the
        # deep-learning features of the second to last layer.
        # include_top=False drops the classifier; pooling='avg' applies global
        # average pooling so each image yields one feature vector.
        self.model = InceptionV3(weights='imagenet', include_top=False, pooling='avg')

    def get_features(self, batch):
        """Return one L2-normalised feature vector per image in ``batch``.

        Args:
            batch: Array of preprocessed images accepted by ``self.model.predict``.

        Returns:
            A 2-D array: the model output flattened to ``(-1, last_dim)`` with
            every row scaled to unit L2 norm.
        """
        features = self.model.predict(batch)
        # Collapse any leading axes so every row is a single feature vector.
        features = features.reshape(-1, features.shape[-1])
        return normalize(features, axis=1, norm='l2')

In [None]:
import numpy as np
from os import listdir, path
from os.path import isdir, isfile, join, exists, dirname
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import linear_model 
from sklearn.externals import joblib
from urllib.request import urlopen
from zipfile import ZipFile
from io import BytesIO
import os
import cv2



In [None]:
# Locations on the mounted drive.
dataPath = '/content/drive/MyDrive/Dataset'
stelePath, examplePath = (
    join(dataPath, sub_dir) for sub_dir in ("Manual/Preprocessed", "Examples")
)

# Output files written to the current working directory.
featurePath, labelsPath, svmPath = "features.npy", "labels.npy", "svm.pkl"

# Containers filled by the indexing cell, and the number of images per batch.
image_paths, labels = [], []
batch_size = 2000

In [None]:
print("indexing images...")
# Rebuild the index from scratch so that re-running this cell neither duplicates
# entries nor fails because `labels` is turned into an array further down.
image_paths = []
labels = []
# listdir returns entries in arbitrary order; sorting keeps the sample order
# (and therefore the seeded train/test split) reproducible between runs.
Steles = [ join(stelePath,f) for f in sorted(listdir(stelePath)) if isdir(join(stelePath,f)) ]
for stele in Steles:
    imagePaths = [ join(stele,f) for f in sorted(listdir(stele)) if isfile(join(stele,f)) ]
    # The loop variable is not called `path`: that name is imported from `os`.
    for img_path in imagePaths:
        image_paths.append(img_path)
        # The label is the text between the last "_" and the last "." of the path.
        labels.append(img_path[(img_path.rfind("_") + 1): img_path.rfind(".")])

if not image_paths:
    # Fail early with a readable message instead of an np.vstack error below.
    raise ValueError("no images found under {}".format(stelePath))

featureExtractor = FeatureExtractor()
features = []
print("computing features...")
for idx, (batch_images, _) in enumerate(batchGenerator(image_paths, labels, batch_size)):
    # Clamp the counter: the last batch is usually partial and must not report
    # more images than exist.
    print("{}/{}".format(min((idx+1) * batch_size, len(labels)), len(labels)))
    features_ = featureExtractor.get_features(batch_images)
    features.append(features_)
features = np.vstack(features)

labels = np.asarray(labels)
print("saving features...")
np.save(featurePath, features)
np.save(labelsPath, labels)

indexing images...
loading DeepNet (Inception-V3) ...
Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/inception_v3/inception_v3_weights_tf_dim_ordering_tf_kernels.h5
computing features...
200/4210
400/4210
600/4210
800/4210
1000/4210
1200/4210
1400/4210
1600/4210
1800/4210
2000/4210
2200/4210
2400/4210
2600/4210
2800/4210
3000/4210
3200/4210
3400/4210
3600/4210
3800/4210
4000/4210
4200/4210
4400/4210
saving features...


In [None]:
# Remove the Unknown class from the database, then hold out 20% for testing.
unknown_mask = (labels == "UNKNOWN")
tobeDeleted = np.where(unknown_mask)
features, labels = features[~unknown_mask], labels[~unknown_mask]
numImages = labels.shape[0]
trainSet, testSet, trainLabels, testLabels = train_test_split(
    features,
    labels,
    random_state=42,
    test_size=0.20,
)

In [None]:
print("training SVM...")
# Optional switch: True trains one classifier quickly; False (the default) searches
# the parameter space by training several classifiers to squeeze out a bit more accuracy.
train_single_classifier = False
if train_single_classifier:
    clf = linear_model.LogisticRegression(C=10000)
else:
    parameters = {'C':[0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]}
    svr = linear_model.LogisticRegression(max_iter=10000)
    clf = GridSearchCV(estimator=svr, param_grid=parameters, n_jobs=8)
clf.fit(trainSet, trainLabels)

print(clf)
print("finished training! saving...")
joblib.dump(clf, 'clf.pkl', compress=1)

# Fraction of held-out samples whose predicted label matches the true label.
prediction = clf.predict(testSet)
accuracy = np.mean(testLabels == prediction)
print("accuracy = {}%".format(accuracy*100))

training SVM...




GridSearchCV(cv=None, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=10000, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=8,
             param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)
finished training! saving...
accuracy = 64.56009913258984%


In [None]:
from xgboost import XGBClassifier

In [None]:
# Fit a default XGBoost classifier on the training features.
xgb = XGBClassifier()
xgb.fit(trainSet, trainLabels)

# The model is written to svmPath, the file the prediction cell loads later on.
joblib.dump(xgb, svmPath, compress=1)

# Share of test samples predicted correctly.
prediction = xgb.predict(testSet)
accuracy = (testLabels == prediction).mean()
print("accuracy = {}%".format(accuracy*100))

accuracy = 68.02973977695167%


In [None]:
 from sklearn.neural_network import MLPClassifier

In [None]:
# Fit a four-hidden-layer perceptron on the training features.
mlp = MLPClassifier(
    hidden_layer_sizes=(1000, 1000, 500, 171),
    solver='adam',
    alpha=0.0001,
    max_iter=1000,
)
mlp.fit(trainSet, trainLabels)

joblib.dump(mlp, "mlp.pkl", compress=1)

# Share of test samples predicted correctly.
prediction = mlp.predict(testSet)
accuracy = (testLabels == prediction).mean()
print("accuracy = {}%".format(accuracy*100))

accuracy = 64.80793060718712%


In [None]:
import os

In [None]:
# Classify the example images with the model stored at svmPath.
inputPath = examplePath
if isdir(inputPath):
    # Sorted for a stable output order; extensions are matched case-insensitively.
    imagePaths = [join(inputPath, f) for f in sorted(listdir(inputPath)) if f.lower().endswith(('.png', '.jpg'))]
else:
    imagePaths = [inputPath,]

print("loading images...")
Images = loadBatch(imagePaths)
print("loading SVM model...")
# joblib.load unpickles the file: only load model files you created yourself.
clf = joblib.load(svmPath)

print("Extracting features, this may take a while for large collections of images...") # should probably use batches for this as well
extractor = FeatureExtractor()
# Stored under its own name so the training `features` array is not overwritten.
example_features = extractor.get_features(Images)

# Predict with the model that was just loaded rather than with whatever `xgb`
# object happens to be alive in the kernel.
classes = clf.classes_
print("Predicting the Hieroglyph type...")
prob = np.asarray(clf.predict_proba(example_features))
# Keep the five most probable classes per image (fewer if the model has fewer
# classes), best match first, so the output matches the "top 5" header below.
top_k = min(5, prob.shape[1])
top5_i = np.argsort(-prob, axis=1)[:, :top_k]
top5_n = classes[top5_i]

print("{:<25} ::: {}".format("image name", "top 5 best matching hieroglyphs"))
for idx, img_path in enumerate(imagePaths):
    ranked = ", ".join(str(name) for name in top5_n[idx])
    print("{:<25} --> {}".format(os.path.basename(img_path), ranked))

loading images...
loading SVM model...
Extracting features, this may take a while for large collections of images...
loading DeepNet (Inception-V3) ...
Predicting the Hieroglyph type...
image name                ::: top 5 best matching hieroglyphs
200000_S29.png            --> S29
200001_V13.png            --> V13
200002_V13.png            --> V13
200003_G43.png            --> G43
200004_D21.png            --> D21
200005_O50.png            --> O50
200006_X1.png             --> X1
200007_M23.png            --> M23
200008_G43.png            --> G43
200009_S29.png            --> S29
200010_V13.png            --> V13
200011_M23.png            --> M23
200012_G43.png            --> G43
200013_D21.png            --> D21
200014_O50.png            --> O50
200015_V13.png            --> V13
200016_G43.png            --> G43
200017_S29.png            --> S29


In [None]:
from keras.preprocessing.image import ImageDataGenerator
import cv2

In [None]:
def augment_data(file_dir, n_generated_samples, save_to_dir, taking=None):
    """Write randomly augmented copies of images to ``save_to_dir``.

    Arguments:
        file_dir: A string representing the directory where the images that we want to augment are found.
        n_generated_samples: An integer, the number of augmented images generated from each input image.
        save_to_dir: A string representing the directory in which the generated images will be saved;
            it is created when missing.
        taking: An iterable of file names inside ``file_dir`` to augment. When omitted,
            every entry of ``file_dir`` is used.
    """
    if taking is None:
        taking = sorted(os.listdir(file_dir))
    os.makedirs(save_to_dir, exist_ok=True)

    data_gen = ImageDataGenerator(rotation_range=10,
                                  width_shift_range=0.1,
                                  height_shift_range=0.1,
                                  shear_range=0.1,
                                  brightness_range=(0.3, 1.0),
                                  horizontal_flip=True,
                                  vertical_flip=True,
                                  fill_mode='nearest'
                                 )

    for filename in taking:
        # load the image
        img = cv2.imread(os.path.join(file_dir, filename))
        if img is None:
            # cv2.imread returns None for anything it cannot decode (missing file,
            # sub-directory, non-image); skip it instead of crashing on reshape.
            print("skipping unreadable image: {}".format(filename))
            continue
        # cv2.imread returns channels in BGR order, while the generator writes its
        # output files assuming RGB; convert so colours are not swapped on disk.
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        # add the leading batch axis expected by flow()
        batch = img.reshape((1,) + img.shape)
        # prefix of the names for the generated samples
        save_prefix = 'aug_' + filename[:filename.rfind('.')]
        print(save_prefix)
        flow = data_gen.flow(x=batch, batch_size=1, save_to_dir=save_to_dir,
                             save_prefix=save_prefix, save_format='png')
        # Each next() produces and saves exactly one image. Pulling a fixed number
        # of batches avoids the earlier off-by-one, which wrote one extra sample.
        for _ in range(n_generated_samples):
            next(flow)

In [None]:
# Augmentation writes new files on every run, so it is skipped when the output
# directory already holds files (this cell used to carry a "DON'T RUN THIS CELL AGAIN" note).
augmented_data_path = '/content/drive/MyDrive/GlyphDataset/Augmented_hieroglyphs'

if os.path.isdir(augmented_data_path) and os.listdir(augmented_data_path):
    print("{} already contains files; skipping augmentation".format(augmented_data_path))
else:
    # Generate augmented samples for every reference glyph listed in `taking`.
    augment_data(file_dir='/content/drive/MyDrive/GlyphDataset/hieroglyphs', n_generated_samples=10, save_to_dir=augmented_data_path, taking=taking)

In [None]:
# Empty containers for the first image set, plus the batch size.
image_paths1, labels1, features1 = [], [], []
batch_size = 2000

In [None]:
print("indexing images...")
# Start from empty lists so re-running the cell does not append everything twice.
image_paths1 = []
labels1 = []
Steles = [ join(stelePath,f) for f in listdir(stelePath) if isdir(join(stelePath,f)) ]
for stele in Steles:
    imagePaths1 = [ join(stele,f) for f in listdir(stele) if isfile(join(stele,f)) ]
    # The loop variable is not called `path`: that name is imported from `os`.
    for img_path in imagePaths1:
        image_paths1.append(img_path)
        # The label is the text between the last "_" and the last "." of the path.
        labels1.append(img_path[(img_path.rfind("_") + 1): img_path.rfind(".")])

augmented_dir = "/content/drive/MyDrive/Dataset/Augmented"
for filename in os.listdir(augmented_dir):
    image_paths1.append(join(augmented_dir, filename))
    # Derive the label from the augmented file itself. The previous code sliced the
    # stale loop variable of the stele loop, so every augmented image received the
    # label of the last stele image.
    stem = filename[:filename.rfind(".")]
    if stem.startswith("aug_"):
        # NOTE(review): assumes names of the form "aug_<label>_<index>_<hash>", the
        # pattern produced by a save prefix of 'aug_' + label -- confirm against
        # the actual files in the Augmented folder.
        label = stem[len("aug_"):].rsplit("_", 2)[0]
    else:
        # Same rule as for the stele images: text after the last "_".
        label = stem[(stem.rfind("_") + 1):]
    labels1.append(label)
print(len(labels1))

indexing images...
63680


In [None]:
print("indexing images...")
glyph_dir = '/content/drive/MyDrive/GlyphDataset/hieroglyphs'
augmented_glyph_dir = '/content/drive/MyDrive/GlyphDataset/Augmented_hieroglyphs'

# Reference glyphs: the label is the file name up to its last ".".
image_paths2 = [join(glyph_dir, name) for name in taking]
labels2 = [name[:name.rfind(".")] for name in taking]

# Augmented glyphs: the label starts after the 4-character prefix and stops
# two characters before the last "_".
for filename in os.listdir(augmented_glyph_dir):
    image_paths2.append(join(augmented_glyph_dir, filename))
    label_end = filename.rfind("_") - 2
    labels2.append(filename[4:label_end])
print(len(labels2))

indexing images...
1967


In [None]:
# File names of the reference glyph images to use; populated in the following cell.
taking = []

In [None]:
# Keep only the reference glyphs whose name (file name up to the last ".") is a known label.
# A set makes each membership test constant-time instead of scanning the whole label
# list, and rebuilding `taking` here keeps the cell idempotent when it is re-run.
known_labels = set(labels1)
taking = [
  filename
  for filename in os.listdir('/content/drive/MyDrive/GlyphDataset/hieroglyphs')
  if filename[:filename.rfind('.')] in known_labels
]

In [None]:
len(taking)
taking

In [None]:
len(np.unique(labels1))

171

In [None]:
# Remove the Unknown class from the database.
# Convert to arrays first: comparing a plain Python list with a string gives a
# single False rather than an element-wise mask, so nothing would be removed.
labels1 = np.asarray(labels1)
image_paths1 = np.asarray(image_paths1)
known_mask = labels1 != "UNKNOWN"
tobeDeleted = np.nonzero(~known_mask)
image_paths1 = image_paths1[known_mask]
labels1 = labels1[known_mask]
numImages = len(labels1)

In [None]:
# Inspection only: prints every entry of labels2, one per line.
for i in labels2:
  print(i)

V4
V16
V24
W14
V6
V7
W11
W15
V28
V13
V25
V31
V22
V30
X1
Y3
W24
Y1
W22
W18
W25
Y2
W19
X6
Y5
X8
Z1
Z11
Z7
T21
T28
U15
U1
U35
U7
U28
U33
F31
F29
F35
F40
G1
G10
G17
G14
F9
F4
G50
G26
G39
G21
G40
G35
G37
G43
G29
G5
G25
G36
G4
I10
G7
H6
I5
I9
M16
M12
M17
L1
M18
M1
M3
M4
M40
M26
M42
M20
M23
M29
M44
N19
N37
M8
N17
N24
N16
N1
N14
N31
N25
N29
N5
N2
N18
N26
N35
N36
O29
O1
O28
O11
O34
O4
P8
O50
Q1
P6
O49
P1
O51
P13
Q7
Q3
R8
R4
S29
S28
S42
S24
S34
T20
T22
T30
A55
Aa28
Aa26
Aa27
Aa15
D156
D2
D10
D1
D19
D28
D21
D52
D35
D46
D36
D54
D62
D58
D60
D53
D56
D34
D4
D39
E1
E23
E17
E34
E9
F34
F18
F30
F23
F32
F26
F21
F16
F13
F12
F22
M40
M40
M40
M40
M40
M40
M40
M40
M40
M40
M40
M26
M26
M26
M26
M26
M26
M26
M26
M26
M26
M26
M42
M42
M42
M42
M42
M42
M42
M42
M42
M42
M42
M20
M20
M20
M20
M20
M20
M20
M20
M20
M20
M20
M23
M23
M23
M23
M23
M23
M23
M23
M23
M23
M23
M29
M29
M29
M29
M29
M29
M29
M29
M29
M29
M29
M44
M44
M44
M44
M44
M44
M44
M44
M44
M44
M44
N19
N19
N19
N19
N19
N19
N19
N19
N19
N19
N19
N37
N37
N37
N37
N37
N37
N37
N37
N

In [None]:
# Imported here because this cell runs before the notebook's later
# sklearn.preprocessing import cell; without it a fresh kernel raises NameError.
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the labels of the first set and use it to turn the labels
# of the second set into integer class indices.
label_encoder = LabelEncoder()
label_encoder.fit(labels1)
labels2_encoded = label_encoder.transform(labels2)

In [None]:
labels2_encoded

array([150, 143, 145, ...,  81,  81,  81])

In [None]:
np.unique(labels2_encoded)

array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  76,  77,  78,
        79,  80,  81,  82,  84,  85,  86,  87,  88,  89,  90,  91,  92,
        93,  94,  95,  96,  97,  99, 100, 101, 102, 104, 105, 106, 107,
       108, 110, 111, 112, 113, 114, 115, 116, 117, 118, 120, 121, 122,
       123, 124, 125, 126, 127, 128, 129, 131, 132, 133, 134, 135, 136,
       137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
       150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162,
       163, 164, 165, 166, 167, 168, 169, 170])

In [None]:
len(np.zeros(171,))

171

In [None]:
# One-hot encode the integer labels of the second set. The row width comes from
# the fitted encoder instead of a hard-coded 171, so it follows the label set.
num_classes = len(label_encoder.classes_)
labels2_one_hot_encoded = np.zeros((len(labels2_encoded), num_classes))
# Set column `labels2_encoded[r]` of row r to 1 for every row at once.
labels2_one_hot_encoded[np.arange(len(labels2_encoded)), labels2_encoded] = 1

In [None]:
# Class labels held by the fitted encoder; the bare expression below displays how many there are.
target = label_encoder.classes_
len(target)

171

In [None]:
# Imported here because this cell runs before the notebook's later
# `from keras.utils import to_categorical` cell.
from keras.utils import to_categorical

# to_categorical expects integer class indices, so the string labels are first
# encoded with the fitted label_encoder. The ndim check keeps the cell idempotent:
# once labels1 is a one-hot matrix it is left alone on re-runs.
if np.ndim(labels1) == 1:
    labels1 = to_categorical(label_encoder.transform(labels1), num_classes=len(label_encoder.classes_))

In [None]:
def getPixels(image, size):
  """Return ``image`` resized to ``size`` with ``cv2.resize``."""
  return cv2.resize(image, size)

In [None]:
image_paths2

In [None]:
#iterating in training set of data
rawImages2 = []
for i in image_paths2:
    img = cv2.imread(i, cv2.IMREAD_GRAYSCALE)
    #stores the raw pixel values of this image after resizing
    pixels = getPixels(img,(32,32))
    #stores the raw pixel values of images
    rawImages2.append(pixels)

In [None]:
# Convert the collected images and one-hot labels to numpy arrays.
rawImages2, labels2_one_hot_encoded = (
    np.asarray(seq) for seq in (rawImages2, labels2_one_hot_encoded)
)

In [None]:
labels2_one_hot_encoded.shape

(1967, 171)

In [None]:
 # Load-or-build cache for the 32x32 grayscale images of the first set.
 # NOTE(review): no cell in this notebook defines rawImages1, so on a fresh kernel
 # the plain np.save raised NameError. The rebuild below mirrors how rawImages2 is
 # created from image_paths2 -- confirm this matches the original preprocessing.
 raw_cache_path = '/content/drive/MyDrive/Dataset/rawimages_ejypt.npy'
 rawImages1 = np.load(raw_cache_path) if os.path.exists(raw_cache_path) else None
 # Rebuild when there is no cache or it does not match the current image list.
 if rawImages1 is None or len(rawImages1) != len(image_paths1):
     rawImages1 = np.asarray([getPixels(cv2.imread(p, cv2.IMREAD_GRAYSCALE), (32, 32)) for p in image_paths1])
     np.save(raw_cache_path, rawImages1)

In [None]:
from keras.utils import to_categorical

In [None]:
# Hold out 20% of the first set for testing, with a fixed seed.
X_train1, X_test1, y_train1, y_test1 = train_test_split(
    rawImages1,
    labels1,
    test_size=0.2,
    random_state=42,
)

In [None]:
import seaborn as sns

In [None]:
# Give both splits the (N, 32, 32, 1) layout declared as the network's input shape.
X_train1, X_test1 = (
    split.reshape(-1, 32, 32, 1) for split in (X_train1, X_test1)
)

In [None]:
# Reshape to (N, 32, 32, 1) and divide the pixel values by 255 in one step.
rawImages2 = rawImages2.reshape(-1, 32, 32, 1) / 255.0

In [None]:
# Divide the pixel values of both splits by 255.
X_train1, X_test1 = X_train1 / 255.0, X_test1 / 255.0

In [None]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

In [None]:
from tensorflow.keras import layers, models, utils, datasets
from keras.layers import Dense, Flatten, Conv2D, MaxPooling2D, Dropout, BatchNormalization
from keras.layers import LeakyReLU

In [None]:
# CNN for 32x32 single-channel glyph images: four 5x5 convolutions with
# LeakyReLU activations and two max-pooling stages, followed by three dense
# layers (two of them with dropout) and a 171-way softmax output.
model1 = models.Sequential([
    Conv2D(32, (5, 5), input_shape=(32, 32, 1)),
    LeakyReLU(alpha=0.1),
    MaxPooling2D(pool_size=(2, 2)),
    Conv2D(128, (5, 5)),
    LeakyReLU(alpha=0.1),
    Conv2D(64, (5, 5)),
    LeakyReLU(alpha=0.1),
    Conv2D(32, (5, 5)),
    LeakyReLU(alpha=0.1),
    MaxPooling2D(pool_size=(2, 2)),
    Flatten(),
    Dense(1000),
    LeakyReLU(alpha=0.1),
    Dropout(0.5),
    Dense(500),
    LeakyReLU(alpha=0.1),
    Dropout(0.5),
    Dense(250),
    LeakyReLU(alpha=0.1),
    Dense(171, activation='softmax'),
])
model1.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 28, 28, 32)        832       
_________________________________________________________________
leaky_re_lu (LeakyReLU)      (None, 28, 28, 32)        0         
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 14, 14, 32)        0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 10, 10, 128)       102528    
_________________________________________________________________
leaky_re_lu_1 (LeakyReLU)    (None, 10, 10, 128)       0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 6, 6, 64)          204864    
_________________________________________________________________
leaky_re_lu_2 (LeakyReLU)    (None, 6, 6, 64)          0

In [None]:
# Train for 50 epochs with Adam on categorical cross-entropy; 10% of the
# training data is set aside for validation.
fit_options = dict(epochs=50, validation_split=0.1)
model1.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
history1 = model1.fit(X_train1, y_train1, **fit_options)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [None]:
# Evaluate once and reuse the result: calling evaluate() inside each print ran
# the whole test set through the network twice.
test_loss, test_accuracy = model1.evaluate(X_test1, y_test1)
print("Loss of the model is - " , test_loss)
print("Accuracy of the model is - " , test_accuracy*100 , "%")

Loss of the model is -  0.17208269238471985
Accuracy of the model is -  97.07897305488586 %


In [None]:
# joblib.dump pickles its argument, which failed for this Keras model (the recorded
# output of this cell is a TypeError). Use the model's own save method instead.
model1.save("cnn.h5")

TypeError: ignored

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix

In [None]:
# Model outputs for the held-out test images; reduced to class indices with argmax below.
predict = model1.predict(X_test1)

In [None]:
# True class index of every test sample: position of the largest entry in each label row.
y_classes = list(np.argmax(y_test1, axis=1))

In [None]:
# Predicted class index of every test sample: position of the largest model output per row.
yp_classes = list(np.argmax(predict, axis=1))

In [None]:
# Accuracy on the test split, computed from the true and predicted class indices.
accuracy_score(y_classes, yp_classes)

0.9707897015982994

In [None]:
# Micro-averaged precision; the recorded value is identical to the accuracy above.
precision_score(y_classes, yp_classes, average='micro')

0.9707897015982994

In [None]:
# Micro-averaged recall; the recorded value is identical to the accuracy above.
recall_score(y_classes, yp_classes, average='micro')

0.9707897015982994

In [None]:
# Micro-averaged F1 score; the recorded value is identical to the accuracy above.
f1_score(y_classes, yp_classes, average='micro')

0.9707897015982994

In [None]:
# Distinct class indices present in the test labels, in order of first appearance.
# dict.fromkeys de-duplicates in one pass while keeping insertion order, replacing
# the quadratic "append if not already in list" loop.
num = list(dict.fromkeys(y_classes))

In [None]:
import seaborn as sns

In [None]:
# Save the trained model to an .h5 file in the current working directory.
model1.save('hierogylyphs.h5')

In [None]:
# Reload the model from the file written by the save cell, replacing the in-memory model1.
model1 = models.load_model('hierogylyphs.h5')

In [None]:
# Evaluate once on the second image set and reuse the result: calling evaluate()
# inside each print ran the whole set through the network twice.
set2_loss, set2_accuracy = model1.evaluate(rawImages2, labels2_one_hot_encoded)
print("Loss of the model is - " , set2_loss)
print("Accuracy of the model is - " , set2_accuracy*100 , "%")

Loss of the model is -  6175.41552734375
Accuracy of the model is -  0.6100661121308804 %
