In [32]:
import cv2 as cv

from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.cluster import MiniBatchKMeans
import pickle
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
# DataSetPATH = "../input/fonts-dataset-cmp/fonts-dataset" # for kaggle
DataSetPATH = "./Dataset" # for local
sift = cv.SIFT_create(
    nfeatures=200,  # Allow more keypoints
    contrastThreshold=0.005,  # Adjusted for better sensitivity
    nOctaveLayers=5,  # Adjusted for multi-scale detection
    edgeThreshold=10,  # More edge sensitivity
   
)
import import_ipynb

from preprocessing import *


<span style="color:#fff; font-family: 'Bebas Neue'; font-size: 3em;" >Data Extraction</span>

In [None]:
Scheherazade=[]
destination = "./Corrected/Scheherazade New"
if not os.path.exists(destination):
    os.makedirs(destination)

for filepath in tqdm(sorted(glob.glob(f"{DataSetPATH}/Scheherazade New/*.jpeg"))):
    filename = os.path.basename(filepath)
    img = io.imread(filepath)
    corrected = Preprocessing(img)
    if corrected is not None:
        Scheherazade.append(corrected)
        
        # Scheherazade.append(extract_features(img))
        # io.imsave(f"{destination}/{filename}", corrected, check_contrast=False)

In [7]:
destination = "./Corrected/IBM Plex Sans Arabic"
if not os.path.exists(destination):
    os.makedirs(destination)

IBM = []
for filepath in tqdm(sorted(glob.glob(f"{DataSetPATH}/IBM Plex Sans Arabic/*.jpeg"))):
    filename = os.path.basename(filepath)
    img = io.imread(filepath)
    corrected = Preprocessing(img)
    if corrected is not None:
        IBM.append(corrected)
        # IBM.append(extract_features(img))
        # io.imsave(
        #     f"./Corrected/IBM Plex Sans Arabic/{filename}",
        #     corrected,
        #     check_contrast=False,
        # )

100%|██████████| 1000/1000 [00:43<00:00, 22.77it/s]


In [8]:
destination = "./Corrected/Marhey"
if not os.path.exists(destination):
    os.makedirs(destination)

Marhey=[]
for filepath in tqdm(sorted(glob.glob(f"{DataSetPATH}/Marhey/*.jpeg"))):
    filename = os.path.basename(filepath)
    img = io.imread(filepath)
    corrected = Preprocessing(img)
    if corrected is not None:
        Marhey.append(corrected)
        # Marhey.append(extract_features(img))
        # io.imsave(
        #     f"./Corrected/Marhey/{filename}",
        #     corrected,
        #     check_contrast=False,
        # )


100%|██████████| 1000/1000 [00:46<00:00, 21.68it/s]


In [9]:
destination = "./Corrected/Lemonada"
if not os.path.exists(destination):
    os.makedirs(destination)

Lemonada=[]
for filepath in tqdm(sorted(glob.glob(f"{DataSetPATH}/Lemonada/*.jpeg"))):
    filename = os.path.basename(filepath)
    img = io.imread(filepath)
    corrected = Preprocessing(img)
    if corrected is not None:
        Lemonada.append(corrected)
        
        # Lemonada.append(extract_features(img))
        # io.imsave(
        #     f"./Corrected/Lemonada/{filename}",
        #     corrected,
        #     check_contrast=False, )

100%|██████████| 1000/1000 [00:53<00:00, 18.84it/s]


<span style="color:#fff; font-family: 'Bebas Neue'; font-size: 3em;" >Feature Extraction Module</span>

In [36]:
def extract_features(input):
     descriptors=[]
     data=[]
     for  img in (input):
         _, des = sift.detectAndCompute(img, None)
         data.append(des)
         if des is not None:
               for d in des:
                    descriptors.append(d)
     return  descriptors ,data

In [12]:
def generate_hist(input,kmeans):
    data = []
    kmeans.verbose = False
    for img in tqdm(input):     
                histo = np.zeros(k)
                nkp = np.size(len(img))
                if(img is not None):
                    for d in img:
                        idx = kmeans.predict([d])
                        histo[idx] += 1/nkp # Because we need normalized histograms, I prefere to add 1/nkp directly
                data.append(histo) 
    return  data
     

In [13]:
Lemonada = np.array(Lemonada)
IBM = np.array(IBM)
Marhey = np.array(Marhey)
Scheherazade = np.array(Scheherazade)

Data=np.concatenate((Scheherazade,Marhey,Lemonada,IBM))
labels = np.concatenate(
    (np.zeros(Scheherazade.shape[0]), np.ones(Marhey.shape[0]), np.ones(Lemonada.shape[0]) * 2, np.ones(IBM.shape[0]) * 3))


X_train, X_test, Y_train, Y_test = train_test_split( Data, labels, test_size=0.4, random_state=42)
X_validation, X_test, Y_validation, Y_test = train_test_split( X_test, Y_test, test_size=0.5, random_state=42)


In [39]:
validation_descriptors,data_validation=extract_features(X_validation)
train_descriptors,data_train=extract_features(X_train)
test_descriptors,data_test=extract_features(X_test)


In [40]:
k=1700
kmeans = MiniBatchKMeans(n_clusters=k, verbose=0).fit(train_descriptors)
with open( 'kmeans.pkl', 'wb') as file:
    pickle.dump(kmeans, file)
train_histo=generate_hist(data_train,kmeans)
validation_histo=generate_hist(data_validation,kmeans)
histo_test=generate_hist(data_test,kmeans)


  super()._check_params_vs_input(X, default_n_init=3)
100%|██████████| 2398/2398 [12:04<00:00,  3.31it/s]
100%|██████████| 799/799 [05:04<00:00,  2.63it/s]
100%|██████████| 800/800 [04:39<00:00,  2.86it/s]


<span style="color:#fff; font-family: 'Bebas Neue'; font-size: 3em;" >Model Selection/Training Module</span>

In [50]:
#  make a prediction with probability true to debug

svm_classifier = svm.SVC(C=4.0, gamma='scale')
svm_classifier.fit(train_histo, Y_train)
with open( 'svm_classifier.pkl', 'wb') as file:
    pickle.dump(svm_classifier, file)
y_pred = svm_classifier.predict(validation_histo)

print("SVM Model Accuracy",accuracy_score(Y_validation, y_pred) * 100)


SVM Model Accuracy 99.12390488110138


In [43]:
knn = KNeighborsClassifier(n_neighbors=10)
knn.fit(train_histo, Y_train)



y_pred = knn.predict(validation_histo)

print("knn Model Accuracy",accuracy_score(Y_validation, y_pred) * 100)



knn Model Accuracy 97.74718397997498


In [44]:

rf = RandomForestClassifier(random_state=42, n_jobs=-1, max_depth=40,
                                       n_estimators=200, oob_score=True)

rf.fit(train_histo, Y_train)
y_pred = rf.predict(validation_histo)
print("random forest Model Accuracy",accuracy_score(Y_validation, y_pred) * 100)


random forest Model Accuracy 98.6232790988736


In [45]:

xgb_clf = xgb.XGBClassifier(
    n_estimators=100,  # Number of boosting rounds
    max_depth=40,  # Maximum tree depth
    learning_rate=0.1,  # Learning rate for gradient boosting
    objective='multi:softmax',  # Multi-class objective
    num_class=4,
    random_state=42,
    n_jobs=-1  # Use all available CPU cores
)


xgb_clf.fit(train_histo, Y_train)
y_pred = xgb_clf.predict(validation_histo)
print("XGBoost Model Accuracy",accuracy_score(Y_validation, y_pred) * 100)


XGBoost Model Accuracy 98.3729662077597


In [46]:
kmeans = MiniBatchKMeans(n_clusters=k, verbose=0).fit(train_descriptors+validation_descriptors+test_descriptors)
with open( 'final_kmeans.pkl', 'wb') as file:
    pickle.dump(kmeans, file)

train_histo=generate_hist(data_train,kmeans)
validation_histo=generate_hist(data_validation,kmeans)
histo_test=generate_hist(data_test,kmeans)


final_data=train_histo+validation_histo+histo_test

final_labels=np.concatenate((Y_train,Y_validation,Y_test))

svm_classifier.fit(final_data, final_labels)
with open( 'final_svm.pkl', 'wb') as file:
    pickle.dump(svm_classifier, file)


  super()._check_params_vs_input(X, default_n_init=3)
100%|██████████| 2398/2398 [13:40<00:00,  2.92it/s] 
100%|██████████| 799/799 [03:44<00:00,  3.56it/s]
100%|██████████| 800/800 [03:50<00:00,  3.48it/s]


In [47]:
%store  train_descriptors 
%store  data_validation 
%store  data_train 
%store  Y_train 
%store  Y_validation 
%store  X_test 
%store  Y_test 
%store histo_test

Stored 'train_descriptors' (list)
Stored 'data_validation' (list)
Stored 'data_train' (list)
Stored 'Y_train' (ndarray)
Stored 'Y_validation' (ndarray)
Stored 'X_test' (ndarray)
Stored 'Y_test' (ndarray)
Stored 'histo_test' (list)
