All images must be of the same size. 
Resize all training images to the same size and convert to grayscale.

Ideally I would like to try a CNN with Keras, but my machine cannot run tensorflow>=2.2, so I'll use "more traditional" ML approaches.

In [None]:
import cv2
import numpy as np
import matplotlib.pyplot as plt
import os
from skimage import filters

def imgProcessing(img):
    """Turn a colour cell image into a 30x30 edge map used as classifier input.

    Steps, in order: shrink to 30x30, convert BGR to grayscale, equalise the
    contrast with CLAHE, smooth with a Gaussian (sigma=1) and apply the Sobel
    edge filter.

    Parameters
    ----------
    img : array
        Colour image in BGR channel order (the layout ``cv2.imread`` produces).

    Returns
    -------
    array
        The 30x30 Sobel response of the smoothed grayscale image.
    """
    img = cv2.resize(img, (30,30), interpolation = cv2.INTER_AREA)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8,8))
    img = clahe.apply(img)

    # The image is 2-D here, which scikit-image treats as single-channel by
    # default, so no channel argument is needed. The former
    # ``multichannel=False`` keyword was deprecated in scikit-image 0.19 and
    # later removed, where passing it raises a TypeError.
    img = filters.gaussian(img, sigma=1)
    img = filters.sobel(img)
    return img

def getLabelFromN(labels, n):
    """Return the label of the ``n``-th cell of a grid with 9 cells per row.

    Cells are numbered row by row, so index ``n`` corresponds to
    row ``n // 9`` and column ``n % 9`` of ``labels``.

    Parameters
    ----------
    labels : sequence of sequences
        Grid of labels, indexed as ``labels[row][col]``.
    n : int
        Flat, row-major cell index.
    """
    row, col = divmod(n, 9)
    return labels[row][col]

dataDir = './allForTraining/'
labelDir = 'D:\\repos\\sudokuSolver_py\\rawImg\\'
allFiles = os.listdir(dataDir)
X = []
y = []
# Parsed label grids keyed by source-image name, so each .dat file is read
# once instead of once per cell image that refers to it.
labelCache = {}
for i in allFiles:
    img = cv2.imread(dataDir+i)
    if img is None:
        # cv2.imread returns None rather than raising when a file cannot be
        # decoded; stop here with a message that names the offending file.
        raise ValueError('Cannot read training image: {}'.format(dataDir+i))
    img = imgProcessing(img)
    # NOTE(review): skimage's gaussian filter normally rescales uint8 input to
    # [0, 1] already, so this may scale a second time - confirm. Kept as-is so
    # the features match what previously trained models were fitted on.
    img = img/255
    X.append(np.ndarray.flatten(img))
    # File names are split as "<imgName>_<cellIndex>.<ext>".
    names = i.split('_')
    imgName = names[0].split('.')[0]
    imgNum = int(names[1].split('.')[0])
    if imgName not in labelCache:
        with open(labelDir+imgName+'.dat') as f:
            labels = [line.split()[0:9] for line in f]
        # The first two lines of the .dat file are skipped; the remaining
        # lines are used as rows of the label grid.
        labelCache[imgName] = labels[2:]
    labels = labelCache[imgName]
    thisLabel = getLabelFromN(labels, imgNum)
    y.append(thisLabel)


In [None]:
# check labels distribution
from collections import Counter
import pandas  as pd
# Count the occurrences of every label, then add each label's share of the
# whole data set as a percentage column.
c = Counter(y)
df = pd.DataFrame.from_dict(c, orient='index')
df = df.reset_index()
df['percentage'] = df[0].mul(100).div(df[0].sum())
df

Fairly evenly distributed, except for a lot of 0s. We'll see if we need to take this into account for the training.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Hold out 20% of the samples for testing; the fixed seed keeps the split
# reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)
n_train = len(y_train)
n_test = len(y_test)
print('Train size: {}'.format(n_train))
print('Test size: {}'.format(n_test))

In [None]:
def do_fit(clf, x_train, y_train, x_test, y_test, modelName):
    """Fit ``clf`` on the training data and print how it does on the test data.

    Printed, in order: the overall accuracy (prefixed with ``modelName``),
    the confusion matrix, one percentage per matrix row (diagonal entry
    divided by the row total), and a separator line of asterisks.

    Parameters
    ----------
    clf : estimator exposing ``fit`` and ``predict``
        The model to train; it is fitted in place.
    x_train, y_train : training features and labels.
    x_test, y_test : held-out features and labels used for the report.
    modelName : str
        Name shown in the accuracy line.

    Returns
    -------
    None
    """
    clf.fit(x_train, y_train)
    predictions = clf.predict(x_test)
    overall = accuracy_score(y_test, predictions)
    matrix = confusion_matrix(y_test, predictions)
    print('{} accuracy: {}'.format(modelName, overall))
    print(matrix)
    # Row k holds the samples whose true class is k, so diagonal / row total
    # is the percentage of that class that was predicted correctly.
    for idx, row in enumerate(matrix):
        print(row[idx]*100/sum(row))
    print(60*'*')

In [None]:
# Candidate models, evaluated in this order. For K-Neighbors a few values of
# k were tried; the accuracy was always pretty much the same.
candidates = [
    ('K-Neighbors', KNeighborsClassifier(5)),
    ('Decision Tree', DecisionTreeClassifier(random_state=0, max_depth=10)),
    ('Random Forest', RandomForestClassifier(random_state=1, max_depth=10, n_estimators=100)),
]
for model_name, clf in candidates:
    do_fit(clf, x_train, y_train, x_test, y_test, model_name)

Random forest is the most accurate of the three.

In [None]:
import pickle

# Fit a random forest on the training split and save it to disk with pickle.
clf = RandomForestClassifier(random_state=1, max_depth=10, n_estimators=100)
clf.fit(x_train, y_train)

# The context manager closes the file even if pickling raises.
with open('sudoku_clf.kb', 'wb') as file:
    pickle.dump(clf, file)