In [16]:
import numpy as np
import os
import pandas as pd
import imageio
from PIL import Image,ImageOps
from random import sample
import hashlib
import cv2


In [17]:

def delete_duplicates(root_path):
    """Remove byte-identical ``.jpg`` files from the tree rooted at ``root_path``.

    The tree is traversed with ``os.walk``. Every file whose name ends in
    ``.jpg`` and does not start with ``._`` is hashed with MD5 over its full
    contents. The first file seen with a given digest is kept; every later
    file with the same digest is deleted from disk. Progress lines and a final
    summary are printed.

    Args:
        root_path: Directory to scan recursively.

    Returns:
        None. The printed "Total Files" figure counts every file encountered
        during the walk, not only the ``.jpg`` files that were hashed.
    """
    print('Root Directory:', root_path)

    seen_count = 0
    removed_count = 0
    first_path_by_digest = {}

    for folder, _subfolders, names in os.walk(root_path):
        print('Processing Directory:', folder)
        for name in names:
            seen_count += 1
            # Only real JPEG files are compared; "._"-prefixed entries are skipped.
            if name.startswith('._') or not name.endswith('.jpg'):
                continue

            full_path = os.path.join(folder, name)
            with open(full_path, 'rb') as handle:
                digest = hashlib.md5(handle.read()).hexdigest()

            if digest not in first_path_by_digest:
                # First occurrence of this content: remember it and keep the file.
                first_path_by_digest[digest] = full_path
                continue

            removed_count += 1
            print('Duplicate found:', name)
            os.remove(full_path)

    print('Total Files:', seen_count)
    print('Total Duplicates Removed:', removed_count)

# Root of the CROHME image tree to de-duplicate. The backslashes are escaped
# so that sequences such as "\A" or "\c" are not parsed as (invalid) escape
# sequences; the resulting string value is unchanged.
root_directory = "D:\\Ahmed\\AI\\OCR-Math\\Data\\crohme"
delete_duplicates(root_directory)

  root_directory = "D:\Ahmed\AI\OCR-Math\Data\crohme"


Root Directory: D:\Ahmed\AI\OCR-Math\Data\crohme
Processing Directory: D:\Ahmed\AI\OCR-Math\Data\crohme
Processing Directory: D:\Ahmed\AI\OCR-Math\Data\crohme\crohme
Processing Directory: D:\Ahmed\AI\OCR-Math\Data\crohme\crohme\(
Processing Directory: D:\Ahmed\AI\OCR-Math\Data\crohme\crohme\)
Processing Directory: D:\Ahmed\AI\OCR-Math\Data\crohme\crohme\+
Processing Directory: D:\Ahmed\AI\OCR-Math\Data\crohme\crohme\-
Processing Directory: D:\Ahmed\AI\OCR-Math\Data\crohme\crohme\0
Processing Directory: D:\Ahmed\AI\OCR-Math\Data\crohme\crohme\1
Processing Directory: D:\Ahmed\AI\OCR-Math\Data\crohme\crohme\2
Processing Directory: D:\Ahmed\AI\OCR-Math\Data\crohme\crohme\3
Processing Directory: D:\Ahmed\AI\OCR-Math\Data\crohme\crohme\4
Processing Directory: D:\Ahmed\AI\OCR-Math\Data\crohme\crohme\5
Processing Directory: D:\Ahmed\AI\OCR-Math\Data\crohme\crohme\6
Processing Directory: D:\Ahmed\AI\OCR-Math\Data\crohme\crohme\7
Processing Directory: D:\Ahmed\AI\OCR-Math\Data\crohme\crohme\8
Pr

In [11]:

def process_mapping(mapping_raw, mapping_processed, extra_chars):
    """Convert a raw id/code mapping file to CSV and append extra characters.

    The raw file is read as space-separated ``id code`` pairs without a header.
    A ``char`` column is derived from ``code`` with ``chr``. Each entry of
    ``extra_chars`` is then appended as a new row whose id continues from the
    number of rows already present and whose code is ``ord`` of the character.

    Args:
        mapping_raw: Path of the space-separated input mapping file.
        mapping_processed: Path of the CSV file to write (columns ``id``,
            ``code``, ``char``; no index column).
        extra_chars: Iterable of single-character strings to append.

    Returns:
        None.
    """
    mapping = pd.read_csv(mapping_raw, sep=' ', header=None, names=["id", "code"])
    mapping["char"] = mapping["code"].apply(chr)

    extra_chars = list(extra_chars)
    if extra_chars:
        # Build all new rows at once instead of enlarging the frame one row at
        # a time, which copies the frame repeatedly and can disturb dtypes.
        first_new_id = mapping.shape[0]
        extras = pd.DataFrame({
            "id": list(range(first_new_id, first_new_id + len(extra_chars))),
            "code": [ord(c) for c in extra_chars],
            "char": extra_chars,
        })
        mapping = pd.concat([mapping, extras], ignore_index=True)

    mapping.to_csv(mapping_processed, index=False)

# Input: the space-separated EMNIST "balanced" mapping file.
mapping_raw = "D:\\Ahmed\\AI\\OCR-Math\\Data\\emnist\\Original-EMNIST-Dataset\\emnist-balanced-mapping.txt"
# Output: the CSV written by process_mapping.
mapping_processed = "D:\\Ahmed\\AI\\OCR-Math\\Data\\emnist\\Processed-Dataset\\processed_mapping.csv"
# Extra symbols appended to the mapping after the existing rows; a later cell
# iterates over this list again when sampling images.
extra_chars = ['(', ')', '+', '-', '=']

process_mapping(mapping_raw, mapping_processed, extra_chars)


In [18]:
# Images in the EMNIST dataset are stored flipped horizontally and rotated by 90 degrees.
# The functions below restore the original orientation of each flattened 28x28 image.
# Pixel values are scaled to [0, 1] and binarized separately, in process_data.


def mirror(X, size=28):
    """Flip every flattened ``size`` x ``size`` image in ``X`` left-to-right.

    Args:
        X: 2-D array-like of shape ``(n_images, size * size)``; each row holds
            one image flattened row by row.
        size: Side length of the square images. Defaults to 28.

    Returns:
        A new float64 ``numpy.ndarray`` with the same shape as ``X`` in which
        the pixel order inside every image row is reversed. ``X`` itself is
        not modified.

    Raises:
        ValueError: If ``X`` is not 2-D with exactly ``size * size`` columns.
    """
    X = np.asarray(X)
    if X.ndim != 2 or X.shape[1] != size * size:
        raise ValueError(
            'expected an array of shape (n, {}), got {}'.format(size * size, X.shape)
        )

    # View each row as a (size, size) image and reverse its column axis.
    images = X.reshape(X.shape[0], size, size)
    flipped = images[:, :, ::-1]
    # astype always copies, so the result never aliases the input.
    return flipped.reshape(X.shape[0], size * size).astype(np.float64)
def rotate_clockwise(X, size=28):
    """Rotate every flattened ``size`` x ``size`` image in ``X`` by a quarter turn.

    With each row of ``X`` read as an image stored row by row, output pixel
    ``(a, b)`` is input pixel ``(b, size - 1 - a)``. Note that, despite the
    function name, in array coordinates (row 0 at the top) this mapping is a
    90-degree counter-clockwise turn; three applications give a clockwise one.

    Args:
        X: 2-D array-like of shape ``(n_images, size * size)``.
        size: Side length of the square images. Defaults to 28.

    Returns:
        A new float64 ``numpy.ndarray`` with the same shape as ``X`` holding
        the rotated images. ``X`` itself is not modified.

    Raises:
        ValueError: If ``X`` is not 2-D with exactly ``size * size`` columns.
    """
    X = np.asarray(X)
    if X.ndim != 2 or X.shape[1] != size * size:
        raise ValueError(
            'expected an array of shape (n, {}), got {}'.format(size * size, X.shape)
        )

    images = X.reshape(X.shape[0], size, size)
    # Transpose each image, then reverse its rows: out[a, b] = in[b, size-1-a].
    turned = images.transpose(0, 2, 1)[:, ::-1, :]
    # astype always copies, so the result never aliases the input.
    return turned.reshape(X.shape[0], size * size).astype(np.float64)

def rotate(X, times):
    """Apply ``rotate_clockwise`` to ``X`` ``times`` times in a row.

    Args:
        X: Array of flattened images accepted by ``rotate_clockwise``.
        times: Number of quarter turns to apply. For zero or a negative
            number no rotation is performed and ``X`` is returned as is.

    Returns:
        The result of the last ``rotate_clockwise`` call, or ``X`` itself when
        no rotation was applied.
    """
    rotated = X
    for _ in range(times):
        rotated = rotate_clockwise(rotated)

    return rotated

In [None]:
def process_data(file_from, file_to):
    """Binarize and re-orient an EMNIST image CSV and write it back out.

    The input is read as a headerless CSV whose first column is the label and
    whose remaining 784 columns are pixel values. Pixels are divided by 255,
    thresholded at 0.5 into 0/1, passed through ``rotate`` three times and
    then through ``mirror``. The result is written with a header row
    (``label``, ``px0`` ... ``px783``) and without an index column.

    Args:
        file_from: Path of the raw CSV to read.
        file_to: Path of the processed CSV to write.

    Returns:
        None.
    """
    label = 'label'
    pixel_columns = ["px" + str(i) for i in range(784)]
    raw = pd.read_csv(file_from, header=None, names=[label] + pixel_columns)

    labels = raw[label]
    scaled = raw.drop(labels=[label], axis=1) / 255

    # Threshold to a 0/1 image first, then restore the upright orientation.
    binary = np.where(scaled > 0.5, 1, 0)
    oriented = mirror(rotate(binary, 3))

    processed = pd.DataFrame(oriented, columns=pixel_columns, dtype='int')
    processed.insert(0, label, labels)
    processed.to_csv(file_to, index=False)

# Raw EMNIST "balanced" CSVs (inputs) and the processed CSVs that
# process_data writes (outputs), for the training and testing splits.
raw_training_cv = "D:\\Ahmed\\AI\\OCR-Math\\Data\\emnist\\Original-EMNIST-Dataset\\emnist-balanced-train.csv"
processed_training = "D:\\Ahmed\\AI\\OCR-Math\\Data\\emnist\\Processed-Dataset\\processed_training.csv"
raw_testing_cv = "D:\\Ahmed\\AI\\OCR-Math\\Data\\emnist\\Original-EMNIST-Dataset\\emnist-balanced-test.csv"
processed_testing = "D:\\Ahmed\\AI\\OCR-Math\\Data\\emnist\\Processed-Dataset\\processed_testing.csv"
# Convert the training split first, then the testing split.
process_data(raw_training_cv, processed_training)
process_data(raw_testing_cv, processed_testing)

In [None]:


def remove_mappings_with_ids(csv_file_path, ids_to_remove):
    """Write a copy of a mapping CSV without the rows whose ``id`` is listed.

    Args:
        csv_file_path: Path of the mapping CSV; it must have an ``id`` column.
        ids_to_remove: Collection of id values whose rows are dropped.

    Returns:
        Path of the written file: the input path with ``_filtered`` inserted
        before its extension (``mapping.csv`` -> ``mapping_filtered.csv``).
    """
    mapping = pd.read_csv(csv_file_path)

    # Keep only the rows whose id is NOT in the removal list.
    kept = mapping[~mapping['id'].isin(ids_to_remove)]

    # Insert the suffix before the real extension only. A plain
    # str.replace('.csv', ...) would also rewrite ".csv" inside directory
    # names and would leave the path unchanged (overwriting the input) when
    # the file does not end in ".csv".
    base, extension = os.path.splitext(csv_file_path)
    filtered_csv_file_path = base + '_filtered' + extension
    kept.to_csv(filtered_csv_file_path, index=False)

    return filtered_csv_file_path

# Mapping CSV produced by process_mapping.
csv_file_path = "D:\\Ahmed\\AI\\OCR-Math\\Data\\emnist\\Processed-Dataset\\processed_mapping.csv"
# Class ids to drop from the mapping: 10-32, 36-38 and 40-46. The same list is
# repeated in a later cell to drop the matching image rows.
ids_to_remove = [10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,36,37,38,40,41,42,43,44,45,46]
filtered_csv_file_path = remove_mappings_with_ids(csv_file_path, ids_to_remove)
print(f"Filtered CSV file saved to: {filtered_csv_file_path}")


In [None]:


def renumber_labels(csv_file_path):
    """Write a copy of a mapping CSV whose ``id`` values are renumbered from 0.

    Each id is replaced by the zero-based position of its row, so unique ids
    become ``0, 1, 2, ...`` in file order. If an id occurs more than once,
    every occurrence receives the position of its last occurrence.

    Args:
        csv_file_path: Path of the mapping CSV; it must have an ``id`` column.

    Returns:
        Path of the written file: the input path with ``_renumbered`` inserted
        before its extension.
    """
    mapping = pd.read_csv(csv_file_path)

    # old id -> new id, where the new id is the row position.
    new_id_by_old_id = {old_id: new_id for new_id, old_id in enumerate(mapping['id'])}
    mapping['id'] = mapping['id'].map(new_id_by_old_id)

    # Insert the suffix before the real extension only. A plain
    # str.replace('.csv', ...) would also rewrite ".csv" inside directory
    # names and would leave the path unchanged (overwriting the input) when
    # the file does not end in ".csv".
    base, extension = os.path.splitext(csv_file_path)
    updated_csv_file_path = base + '_renumbered' + extension
    mapping.to_csv(updated_csv_file_path, index=False)

    return updated_csv_file_path


# Renumber the filtered mapping so that its ids run consecutively from 0 again.
csv_file_path = "D:\\Ahmed\\AI\\OCR-Math\\Data\\emnist\\Processed-Dataset\\processed_mapping_filtered.csv"
renumbered_csv_file_path = renumber_labels(csv_file_path)
print(f"Renumbered CSV file saved to: {renumbered_csv_file_path}")


In [19]:


def remove_rows_with_labels(csv_file_path, labels_to_remove):
    """Write a copy of an image CSV without the rows whose ``label`` is listed.

    Args:
        csv_file_path: Path of the image CSV; it must have a ``label`` column.
        labels_to_remove: Collection of label values whose rows are dropped.

    Returns:
        Path of the written file: the input path with ``_filtered`` inserted
        before its extension.
    """
    images = pd.read_csv(csv_file_path)

    # One vectorized membership test instead of re-filtering the whole frame
    # once per label.
    kept = images[~images['label'].isin(list(labels_to_remove))]

    # Insert the suffix before the real extension only. A plain
    # str.replace('.csv', ...) would also rewrite ".csv" inside directory
    # names and would leave the path unchanged (overwriting the input) when
    # the file does not end in ".csv".
    base, extension = os.path.splitext(csv_file_path)
    filtered_csv_file_path = base + '_filtered' + extension
    kept.to_csv(filtered_csv_file_path, index=False)

    return filtered_csv_file_path

# Image CSVs written by process_data for the testing and training splits.
csv_file_testing = "D:\\Ahmed\\AI\\OCR-Math\\Data\\emnist\\Processed-Dataset\\processed_testing.csv"
csv_file_training = "D:\\Ahmed\\AI\\OCR-Math\\Data\\emnist\\Processed-Dataset\\processed_training.csv"
# Labels to drop; this is the same list as ids_to_remove used for the mapping file.
labels_to_remove = [10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,36,37,38,40,41,42,43,44,45,46]
# NOTE: the second assignment overwrites the first, so only the training
# output path remains in filtered_csv_file_path after this cell.
filtered_csv_file_path = remove_rows_with_labels(csv_file_testing, labels_to_remove)
filtered_csv_file_path = remove_rows_with_labels(csv_file_training, labels_to_remove)



In [None]:


def map_old_to_new_labels(old_mapping_file_path, new_mapping_file_path, image_csv_file_path):
    """Rewrite the ``label`` column of an image CSV using two mapping files.

    The old and new mapping CSVs are joined on their shared ``code`` column,
    which yields an ``old id -> new id`` translation. That translation is
    applied to the ``label`` column of the image CSV. Labels that have no
    entry in the translation become NaN in the output.

    Args:
        old_mapping_file_path: Mapping CSV (``id``, ``code`` columns) whose
            ids match the labels currently stored in the image CSV.
        new_mapping_file_path: Mapping CSV (``id``, ``code`` columns) holding
            the ids the labels should be translated to.
        image_csv_file_path: Image CSV with a ``label`` column.

    Returns:
        Path of the written file: the image CSV path with ``_mapped`` inserted
        before its extension.
    """
    old_mapping_df = pd.read_csv(old_mapping_file_path)
    new_mapping_df = pd.read_csv(new_mapping_file_path)

    # Rows are paired by character code; the id columns get _old/_new suffixes.
    merged_df = pd.merge(old_mapping_df, new_mapping_df, on='code', suffixes=('_old', '_new'))
    mapping_dict = dict(zip(merged_df['id_old'], merged_df['id_new']))

    image_df = pd.read_csv(image_csv_file_path)
    image_df['label'] = image_df['label'].map(mapping_dict)

    # Insert the suffix before the real extension only. A plain
    # str.replace('.csv', ...) would also rewrite ".csv" inside directory
    # names and would leave the path unchanged (overwriting the input) when
    # the file does not end in ".csv".
    base, extension = os.path.splitext(image_csv_file_path)
    updated_csv_file_path = base + '_mapped' + extension
    image_df.to_csv(updated_csv_file_path, index=False)

    return updated_csv_file_path

old_mapping_file_path = "D:\\Ahmed\\AI\\OCR-Math\\Data\\emnist\\Processed-Dataset\\processed_mapping_filtered.csv"
new_mapping_file_path = "D:\\Ahmed\\AI\\OCR-Math\\Data\\emnist\\Processed-Dataset\\processed_mapping_filtered_renumbered.csv"
# Both splits have to be relabelled: a later cell appends rows to
# processed_testing_filtered_mapped.csv AND processed_training_filtered_mapped.csv,
# so mapping only the testing file would leave the training file without its
# EMNIST rows when the notebook is run from top to bottom.
image_csv_file_paths = [
    "D:\\Ahmed\\AI\\OCR-Math\\Data\\emnist\\Processed-Dataset\\processed_testing_filtered.csv",
    "D:\\Ahmed\\AI\\OCR-Math\\Data\\emnist\\Processed-Dataset\\processed_training_filtered.csv",
]
for image_csv_file_path in image_csv_file_paths:
    mapped_image_csv_file_path = map_old_to_new_labels(old_mapping_file_path, new_mapping_file_path, image_csv_file_path)



In [22]:
# Look-up table from each character of the renumbered mapping to its class id.
df = pd.read_csv("D:\\Ahmed\\AI\\OCR-Math\\Data\\emnist\\Processed-Dataset\\processed_mapping_filtered_renumbered.csv")
char2code = dict(zip(df['char'], df['id']))

In [23]:
def convert_image_to_csv(filepath, char_code):
    """Turn one image file into a CSV row of the form ``label,px0,...,px783``.

    The image is loaded as grayscale, eroded once with a 3x3 kernel, resized
    to 28x28, inverted, scaled by 1/255 and thresholded at 0.5 into 0/1.

    Args:
        filepath: Path of the image file to load.
        char_code: Value written as the first field of the row.

    Returns:
        A comma-separated string: ``char_code`` followed by the 784 binary
        pixel values in row-major order.

    Raises:
        OSError: If the image cannot be read from ``filepath``.
    """
    img = cv2.imread(filepath, cv2.IMREAD_GRAYSCALE)
    if img is None:
        # cv2.imread reports a missing or unreadable file by returning None
        # instead of raising; fail here with the offending path rather than
        # with an opaque error inside cv2.erode.
        raise OSError('Could not read image file: {}'.format(filepath))

    kernel = np.ones((3, 3), np.uint8)
    # Erosion takes the local minimum, which grows the dark areas of the image.
    eroded = cv2.erode(img, kernel, iterations=1)

    resized = Image.fromarray(eroded).resize((28, 28))
    inverted = ImageOps.invert(resized)

    pixels = np.array(inverted).flatten() / 255
    binary = np.where(pixels > 0.5, 1, 0)

    csv_img = ','.join(str(value) for value in binary)
    return '{},{}'.format(char_code, csv_img)


In [24]:
# Number of images sampled per extra character for each split.
train_data_size = 2400
test_data_size = 400
test_output_path = "D:\\Ahmed\\AI\\OCR-Math\\Data\\emnist\\Processed-Dataset\\processed_testing_filtered_mapped.csv"
train_output_path = "D:\\Ahmed\\AI\\OCR-Math\\Data\\emnist\\Processed-Dataset\\processed_training_filtered_mapped.csv"

# NOTE: both files are opened in append mode, so running this cell a second
# time adds the sampled rows again. The sampling is unseeded, so the chosen
# images differ between runs.
# The with-statement closes both files even if a conversion fails midway.
with open(test_output_path, 'a') as f_test_output, \
        open(train_output_path, 'a') as f_train_output:
    for c in extra_chars:
        print('Processing "{}" character...'.format(c))
        current_directory = "D:\\Ahmed\\AI\\OCR-Math\\Data\\crohme\\crohme\\" + c + '\\'

        # Files directly inside the symbol directory (first os.walk entry);
        # a missing directory yields an empty list instead of an IndexError.
        file_names = next(os.walk(current_directory), (current_directory, [], []))[2]
        # Same filter as delete_duplicates: real .jpg images only, so that
        # "._" entries and other non-image files are never sampled.
        files = [
            file_name for file_name in file_names
            if file_name.endswith('.jpg') and not file_name.startswith('._')
        ]

        needed = train_data_size + test_data_size
        if len(files) < needed:
            raise ValueError(
                'Need {} images for "{}" but found only {} in {}'.format(
                    needed, c, len(files), current_directory
                )
            )

        # Disjoint train/test subsets drawn from one random sample.
        subset = sample(files, needed)
        train_subset = subset[0:train_data_size]
        test_subset = subset[train_data_size:needed]

        for filename in train_subset:
            csv_string = convert_image_to_csv(current_directory + filename, char2code[c])
            print(csv_string, file=f_train_output)

        for filename in test_subset:
            csv_string = convert_image_to_csv(current_directory + filename, char2code[c])
            print(csv_string, file=f_test_output)

Processing "(" character...
Processing ")" character...
Processing "+" character...
Processing "-" character...
Processing "=" character...
