# Download and extract data

In [2]:
pip install gdown

Collecting gdown
  Downloading gdown-5.2.0-py3-none-any.whl.metadata (5.8 kB)
Downloading gdown-5.2.0-py3-none-any.whl (18 kB)
Installing collected packages: gdown
Successfully installed gdown-5.2.0
Note: you may need to restart the kernel to use updated packages.


In [3]:
import gdown
import zipfile
import os

# Local file name for the archive, and the Google Drive URL it is fetched from
output_zip = "data.zip"
google_drive_link = "https://drive.google.com/uc?id=1PdWkZe8Vt6xdsTj8KeqxhAd5Tmx4pyNh"

# Fetch the archive from Google Drive (quiet=False keeps the progress output)
gdown.download(google_drive_link, output_zip, quiet=False)

# Unpack the whole archive into "data_folder" (change this path to extract elsewhere)
with zipfile.ZipFile(output_zip, mode='r') as zip_ref:
    zip_ref.extractall(path="data_folder")

# The archive is no longer needed once its contents have been extracted
os.remove(output_zip)

Downloading...
From (original): https://drive.google.com/uc?id=1PdWkZe8Vt6xdsTj8KeqxhAd5Tmx4pyNh
From (redirected): https://drive.google.com/uc?id=1PdWkZe8Vt6xdsTj8KeqxhAd5Tmx4pyNh&confirm=t&uuid=66e19fe1-7b5a-499f-9322-17c5544f1c29
To: /kaggle/working/data.zip
100%|██████████| 105M/105M [00:00<00:00, 149MB/s]  


In [4]:
from PIL import Image
import cv2
import numpy as np

In [5]:
# Side lengths (in pixels) that threshold() passes to cv2.resize;
# presumably H is the target height and W the target width.
H = 128
W = 128

In [6]:
def read_image(image):
    '''
        Read a single image from a file path with OpenCV and return it in RGB channel order.

        Parameters:
            image: path of the image file to read.

        Returns:
            The array loaded by cv2.imread, converted from BGR to RGB.

        Raises:
            OSError: if cv2.imread cannot read the file (it returns None in that case).
    '''
    bgr = cv2.imread(image)
    # cv2.imread reports failure by returning None instead of raising. Fail here with the
    # offending path rather than with an opaque assertion error inside cv2.cvtColor.
    if bgr is None:
        raise OSError(f"Could not read image file: {image!r}")
    return cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)

def apply_ben_preprocessing(image):
    '''
        Apply Ben's preprocessing to an image in OpenCV format.

        The image is blended with a Gaussian-blurred copy of itself (sigma 10):
        weight 4 on the image, weight -4 on the blur, plus a constant offset of 128.
    '''
    blurred = cv2.GaussianBlur(image, (0, 0), 10)
    return cv2.addWeighted(image, 4, blurred, -4, 128)

def apply_denoising(image):
    '''
        Denoise a colour image in OpenCV format with cv2.fastNlMeansDenoisingColored.

        Both filter strengths are set to 20, which the original author describes as
        twice the strength recommended by the OpenCV documentation.
    '''
    strength = strength_color = 20
    template_window, search_window = 7, 21
    return cv2.fastNlMeansDenoisingColored(
        image, None, strength, strength_color, template_window, search_window
    )

In [7]:
def threshold(image_path):
    '''
        Load an image file and turn it into a resized, inverse-binary thresholded image.

        Pipeline: read_image -> apply_ben_preprocessing -> apply_denoising ->
        resize to W x H -> grayscale -> inverse binary threshold at 130.

        Parameters:
            image_path: path of the image file to process.

        Returns:
            The thresholded image returned by cv2.threshold (values 0 or 255).
    '''
    img = read_image(image_path)
    enhanced = apply_ben_preprocessing(img)
    denoised = apply_denoising(enhanced)
    # cv2.resize expects dsize as (width, height), not (height, width).
    resized = cv2.resize(denoised, (W, H))
    # NOTE(review): read_image returns RGB, yet the BGR->GRAY conversion is used here, so the
    # red and blue weights are swapped. Left unchanged because the saved model was presumably
    # trained on images prepared this way - confirm before switching to COLOR_RGB2GRAY.
    gray = cv2.cvtColor(resized, cv2.COLOR_BGR2GRAY)
    # THRESH_BINARY_INV: pixels above 130 become 0, all others become 255.
    _, binary = cv2.threshold(gray, 130, 255, cv2.THRESH_BINARY_INV)
    return binary

In [8]:
# Build the list of preprocessed images for one directory
def load_images_from_folder(folder):
    '''
        Run threshold() on every entry of `folder` and collect the results in a list.

        Entries are visited in os.listdir order; each one is joined to `folder`
        and handed to threshold() as a path.
    '''
    return [threshold(os.path.join(folder, entry)) for entry in os.listdir(folder)]


# Load the saved model and label encoder

In [9]:
import gdown
# Google Drive sources for the trained model and its label encoder
google_drive_url_model = "https://drive.google.com/uc?id=1mmwUU13pbi9sCGDKUKAQXekDuSvXR-LM"
google_drive_url_label_encoder = "https://drive.google.com/uc?id=1ABYxcD3f10BbE0k0OF8RiwLFW7JB9fdu"

# Destination paths on Kaggle for the model file and the label encoder file
output_path_model = "/kaggle/working/model_chu_nom.keras"
output_path_label_encoder = "/kaggle/working/label_encoder.joblib"

# Download the model first, then the label encoder
gdown.download(google_drive_url_model, output_path_model, quiet=False)
gdown.download(google_drive_url_label_encoder, output_path_label_encoder, quiet=False)

Downloading...
From (original): https://drive.google.com/uc?id=1mmwUU13pbi9sCGDKUKAQXekDuSvXR-LM
From (redirected): https://drive.google.com/uc?id=1mmwUU13pbi9sCGDKUKAQXekDuSvXR-LM&confirm=t&uuid=01460748-49f8-415c-b67d-4c232d647092
To: /kaggle/working/model_chu_nom.keras
100%|██████████| 48.5M/48.5M [00:01<00:00, 31.9MB/s]
Downloading...
From: https://drive.google.com/uc?id=1ABYxcD3f10BbE0k0OF8RiwLFW7JB9fdu
To: /kaggle/working/label_encoder.joblib
100%|██████████| 34.4k/34.4k [00:00<00:00, 30.1MB/s]


'/kaggle/working/label_encoder.joblib'

In [10]:
import joblib

# Load the encoder from the exact path it was downloaded to, so this cell does not depend
# on the current working directory being /kaggle/working.
# NOTE(security): joblib.load unpickles the file and can execute arbitrary code; only load
# files that come from a source you trust.
label_encoder1 = joblib.load(output_path_label_encoder)


# Predict

In [11]:
import os
from keras.models import load_model


# Prediction for a single image
def predict_image(image_path, model, label_encoder):
    '''
        Predict the label of one image file.

        The image is preprocessed with threshold(), given a leading batch axis,
        passed to model.predict, and the index of the highest score is mapped
        back to a label with label_encoder.inverse_transform.
    '''
    batch = np.expand_dims(threshold(image_path), axis=0)  # add the batch axis
    scores = model.predict(batch)
    class_index = np.argmax(scores)
    decoded = label_encoder.inverse_transform([class_index])
    return decoded[0]

# Predict a label for every file in a folder
def load_images_and_predict(folder, model, label_encoder):
    '''
        Call predict_image() on every entry of `folder`.

        Returns a list with one dict per entry (in os.listdir order):
        'image_name' is the file name without its extension and 'label'
        is the value returned by predict_image().
    '''
    return [
        {
            'image_name': os.path.splitext(entry)[0],
            'label': predict_image(os.path.join(folder, entry), model, label_encoder),
        }
        for entry in os.listdir(folder)
    ]

# Load the trained model from the path it was downloaded to, instead of a relative file
# name that only resolves when the working directory is /kaggle/working
model = load_model(output_path_model)

# Predict a label for every image in the validation folder
images_dir = '/kaggle/working/data_folder/wb_recognition_dataset/val/images'
predictions = load_images_and_predict(images_dir, model, label_encoder1)


2024-05-22 13:55:06.275649: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-22 13:55:06.275740: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-22 13:55:06.373269: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step


I0000 00:00:1716386120.608312     114 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33

# Save prediction results and calculate accuracy with labels.csv

In [13]:
import pandas as pd

# Turn the list of prediction records into a table, cast the image names to integers
# so they sort numerically, and order the rows by image name
predictions_df = (
    pd.DataFrame(predictions)
    .astype({'image_name': 'int64'})
    .sort_values(by='image_name')
)

# Write the sorted table to a CSV file without the index column
predictions_df.to_csv('predictions.csv', index=False)

In [14]:
def count_differences(file1_path, file2_path):
    '''
        Count the lines that differ between two text files, compared position by position.

        Lines are compared after stripping leading and trailing whitespace. Every line
        takes part in the comparison, including a header row if the files have one.

        Parameters:
            file1_path: path of the first file.
            file2_path: path of the second file.

        Returns:
            The number of line positions at which the two files differ.

        Raises:
            ValueError: if the files do not contain the same number of lines.
    '''
    # Read as UTF-8 explicitly: the labels are non-ASCII characters, and the platform's
    # default encoding is not guaranteed to be UTF-8.
    with open(file1_path, 'r', encoding='utf-8') as file1, \
            open(file2_path, 'r', encoding='utf-8') as file2:
        file1_lines = file1.readlines()
        file2_lines = file2.readlines()

    # A positional comparison is only meaningful when both files have the same length
    if len(file1_lines) != len(file2_lines):
        raise ValueError("Files do not have the same number of lines.")

    return sum(
        line1.strip() != line2.strip()
        for line1, line2 in zip(file1_lines, file2_lines)
    )


# Compare the saved predictions with the ground-truth labels and report the match rate
file1_path = '/kaggle/working/predictions.csv'
file2_path = '/kaggle/working/data_folder/wb_recognition_dataset/val/labels.csv'
differences = count_differences(file1_path, file2_path)

# Take the number of evaluated images from the predictions table instead of hard-coding it
# (the original run used 1394, presumably the size of the validation set - confirm), so the
# figures stay correct if the dataset changes.
total_images = len(predictions_df)
print(f"Number of different lines: {differences}")
print(f"Number of similar lines: {total_images - differences}")
print("Percentage of similarity: " + str((total_images - differences) / total_images * 100))

Number of different lines: 158
Number of similar lines: 1236
Percentage of similarity: 88.66571018651362
