In [None]:
!pip install easyocr

In [None]:
import os
import unicodedata
import warnings
from difflib import SequenceMatcher

import easyocr
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from PIL import Image
warnings.filterwarnings('ignore')

In [None]:
# The run is treated as a Kaggle session when the /kaggle/input directory exists.
IS_KAGGLE = os.path.exists('/kaggle/input')

if not IS_KAGGLE:
    # Local run: the CSV and the image folder are resolved relative to the working directory.
    CSV_PATH, DATASET_DIR = 'ocr-data.csv', 'dataset'
    print('Dang chay tren LOCAL')
else:
    # Kaggle run: both paths live under the attached dataset's input folder.
    KAGGLE_DATASET_NAME = 'ocr-data'
    CSV_PATH = f'/kaggle/input/{KAGGLE_DATASET_NAME}/ocr-data.csv'
    DATASET_DIR = f'/kaggle/input/{KAGGLE_DATASET_NAME}/dataset'
    print('Dang chay tren KAGGLE')

# Echo the resolved locations, one per line.
print(f'CSV_PATH: {CSV_PATH}', f'DATASET_DIR: {DATASET_DIR}', sep='\n')

In [None]:
# Load the annotation table from the path chosen in the configuration cell.
df = pd.read_csv(CSV_PATH)
print(f"Cac cot trong CSV: {df.columns.tolist()}")
print(f"\nTong so mau: {df.shape[0]}")

# The fourth column (position 3) is used as the text-type label.
type_col = df.columns[3]
type_counts = df[type_col].value_counts()
print("\nPhan bo theo loai:")
for text_type, count in type_counts.items():
    print("  - {}: {} anh".format(text_type, count))

# Last expression of the cell: rich preview of the first rows.
df.head()

In [None]:
# Build one EasyOCR reader (Vietnamese + English) that is reused for every image.
print("Dang khoi tao EasyOCR reader...")
reader = easyocr.Reader(lang_list=['vi', 'en'], gpu=True)
print("Khoi tao thanh cong!")

In [None]:
def extract_text_from_image(reader, image_path):
    """Run OCR on one image and return all recognised text as a single string.

    Args:
        reader: Object exposing ``readtext(image_path)``; each returned item is
            unpacked as a 3-element sequence whose middle element is the text
            (EasyOCR's default output is ``(bbox, text, confidence)``).
        image_path: Path of the image handed to ``reader.readtext``.

    Returns:
        The text fragments joined with single spaces and stripped of
        leading/trailing whitespace. Any exception raised while reading or
        unpacking is reported with ``print`` and ``""`` is returned instead
        (best effort, so one bad image does not stop a batch run).
    """
    try:
        fragments = []
        for detection in reader.readtext(image_path):
            _bbox, fragment, _confidence = detection
            fragments.append(fragment)
        return " ".join(fragments).strip()
    except Exception as e:
        print(f"Loi khi xu ly anh {image_path}: {e}")
        return ""

def normalize_text(text):
    """Canonicalise text before comparison.

    The text is lower-cased, folded to Unicode NFC and has every run of
    whitespace collapsed to a single space (leading/trailing whitespace is
    dropped).

    NFC matters for Vietnamese: the same accented letter can be stored either
    as one precomposed code point or as a base letter followed by combining
    marks. Without normalisation the two spellings look identical but compare
    as different characters, which lowers the similarity score.

    Args:
        text: Value to normalise.

    Returns:
        The normalised string, or ``""`` when ``text`` is not a ``str``.
    """
    if not isinstance(text, str):
        return ""
    # Normalise after lower-casing so both inputs end up in one composed form.
    composed = unicodedata.normalize("NFC", text.lower())
    # split() with no argument already discards leading/trailing whitespace.
    return " ".join(composed.split())

def calculate_similarity(text1, text2):
    """Return the similarity ratio of two texts after normalisation.

    Both inputs go through ``normalize_text`` and are then compared with
    ``difflib.SequenceMatcher(...).ratio()``.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        The ``SequenceMatcher`` ratio as a float, or ``0.0`` when either
        normalised text is empty (this includes the case where both are empty).
    """
    left, right = normalize_text(text1), normalize_text(text2)
    if left and right:
        return SequenceMatcher(None, left, right).ratio()
    return 0.0

In [None]:
results = []

# Columns are addressed by position: 0 = id, 1 = image file name,
# 2 = ground-truth label, 3 = text type.
id_col, image_col, label_col, type_col = (df.columns[position] for position in range(4))

total_images = len(df)
print(f"Dang xu ly {total_images} anh...")
print("LUU Y: So sanh su dung CHU THUONG\n")

for idx, row in df.iterrows():
    image_name = row[image_col]
    text_type = row[type_col]
    # The label is coerced to str before it is compared and measured.
    ground_truth = str(row[label_col])

    extracted_text = extract_text_from_image(reader, os.path.join(DATASET_DIR, image_name))
    similarity = calculate_similarity(ground_truth, extracted_text)

    # One record per image; key order defines the column order of the results table.
    record = {
        'ID': row[id_col],
        'Image': image_name,
        'Text Type': text_type,
        'Ground Truth': ground_truth,
        'Ground Truth (lowercase)': normalize_text(ground_truth),
        'OCR Result': extracted_text,
        'OCR Result (lowercase)': normalize_text(extracted_text),
        'Similarity': similarity,
        'Char Count': len(ground_truth),
    }
    results.append(record)

    # idx is the DataFrame index label; idx + 1 serves as the progress counter.
    print(f"[{idx + 1}/{total_images}] [{text_type}] {image_name} - Similarity: {similarity:.2%}")

print("\nHoan thanh xu ly tat ca anh!")

In [None]:
# Collect the per-image records into one table.
results_df = pd.DataFrame(results)

# Split by the text-type label recorded for each image.
text_type_labels = results_df['Text Type']
typed_df = results_df.loc[text_type_labels.eq('Chu danh may')]
handwritten_df = results_df.loc[text_type_labels.eq('Chu viet tay')]

print(f"Tong: {results_df.shape[0]} anh")
print(f"Chu danh may: {typed_df.shape[0]} anh")
print(f"Chu viet tay: {handwritten_df.shape[0]} anh")

In [None]:
report_rule = "=" * 80
print(report_rule)
print("BAO CAO 1: THONG KE TONG QUAN")
print(report_rule)
print("(So sanh su dung CHU THUONG)")

# Mean similarity overall and per text type; later cells reuse these names.
avg_all = results_df['Similarity'].mean()
avg_typed = typed_df['Similarity'].mean()
avg_handwritten = handwritten_df['Similarity'].mean()


def _format_stats_row(label, frame, mean_value):
    """Format one table row: label, row count, mean, min, max and std of 'Similarity'."""
    scores = frame['Similarity']
    return (
        f"{label:<20} {len(frame):<10} {mean_value:<12.2%} "
        f"{scores.min():<10.2%} {scores.max():<10.2%} {scores.std():<10.3f}"
    )


print(f"\n{'Loai chu':<20} {'So luong':<10} {'Trung binh':<12} {'Min':<10} {'Max':<10} {'Std':<10}")
print("-" * 72)
for group_label, group_frame, group_mean in (
    ('Chu danh may', typed_df, avg_typed),
    ('Chu viet tay', handwritten_df, avg_handwritten),
):
    print(_format_stats_row(group_label, group_frame, group_mean))
print("-" * 72)
print(_format_stats_row('TONG THE', results_df, avg_all))

# Gap between the two group means, then which group scored higher.
# Anything other than "typed strictly greater" falls through to the handwritten message.
mean_gap = abs(avg_typed - avg_handwritten)
print(f"\n=> CHENH LECH: {mean_gap:.2%}")
print(
    "   EasyOCR nhan dang CHU DANH MAY tot hon."
    if avg_typed > avg_handwritten
    else "   EasyOCR nhan dang CHU VIET TAY tot hon."
)

In [None]:
section_rule = "=" * 80
print("\n" + section_rule)
print("BAO CAO 2: PHAN LOAI THEO DO CHINH XAC (4 NGUONG)")
print(section_rule)

def get_category_counts(df):
    """Count rows of ``df`` in each of four similarity bands.

    Bands on the 'Similarity' column:
    - excellent: >= 0.9
    - good:      >= 0.8 and < 0.9
    - average:   >= 0.5 and < 0.8
    - poor:      < 0.5

    Args:
        df: DataFrame with a numeric 'Similarity' column.

    Returns:
        Tuple ``(excellent, good, average, poor)`` of plain ints.
    """
    scores = df['Similarity']
    at_least_90 = scores >= 0.9
    at_least_80 = scores >= 0.8
    at_least_50 = scores >= 0.5

    excellent = int(at_least_90.sum())
    good = int((at_least_80 & (scores < 0.9)).sum())
    average = int((at_least_50 & (scores < 0.8)).sum())
    poor = int((scores < 0.5).sum())
    return excellent, good, average, poor

# Band counts per group, each a tuple (excellent, good, average, poor).
typed_cats = get_category_counts(typed_df)
handwritten_cats = get_category_counts(handwritten_df)
all_cats = get_category_counts(results_df)

# Row captions, in the same order as the tuples above.
level_labels = ['Xuat sac (>=90%)', 'Tot (80-90%)', 'Trung binh (50-80%)', 'Kem (<50%)']

print(f"\n{'Muc do':<25} {'Chu danh may':<15} {'Chu viet tay':<15} {'Tong':<10}")
print("-" * 65)
for level, typed_n, handwritten_n, all_n in zip(level_labels, typed_cats, handwritten_cats, all_cats):
    print(f"{level:<25} {typed_n:<15} {handwritten_n:<15} {all_n:<10}")
print("-" * 65)


def _percent_of(count, total):
    """Return ``count`` as a percentage of ``total``; 0.0 when ``total`` is 0.

    The guard keeps the report running when a group is empty (for example
    when no row carries that text-type label) instead of raising
    ZeroDivisionError.
    """
    return count / total * 100 if total else 0.0


typed_total = len(typed_df)
handwritten_total = len(handwritten_df)

# Share of each band within its own group, in percent.
typed_excellent_pct, typed_good_pct, typed_avg_pct, typed_poor_pct = (
    _percent_of(n, typed_total) for n in typed_cats
)
handwritten_excellent_pct, handwritten_good_pct, handwritten_avg_pct, handwritten_poor_pct = (
    _percent_of(n, handwritten_total) for n in handwritten_cats
)

print(f"\nTY LE PHAN TRAM:")
print(f"\n{'Muc do':<25} {'Chu danh may':<15} {'Chu viet tay':<15}")
print("-" * 55)
typed_pct_values = (typed_excellent_pct, typed_good_pct, typed_avg_pct, typed_poor_pct)
handwritten_pct_values = (
    handwritten_excellent_pct, handwritten_good_pct, handwritten_avg_pct, handwritten_poor_pct
)
for level, typed_pct, handwritten_pct in zip(level_labels, typed_pct_values, handwritten_pct_values):
    # Attach '%' to the number before padding so the sign stays next to the
    # value and the columns line up with the 15-wide header cells.
    typed_cell = f"{typed_pct:.1f}%"
    handwritten_cell = f"{handwritten_pct:.1f}%"
    print(f"{level:<25} {typed_cell:<15} {handwritten_cell:<15}")

# Share at "good" or better (>= 80%).
typed_good_up = _percent_of(typed_cats[0] + typed_cats[1], typed_total)
handwritten_good_up = _percent_of(handwritten_cats[0] + handwritten_cats[1], handwritten_total)

# Share at "average" or better (>= 50%).
typed_accept = _percent_of(typed_cats[0] + typed_cats[1] + typed_cats[2], typed_total)
handwritten_accept = _percent_of(
    handwritten_cats[0] + handwritten_cats[1] + handwritten_cats[2], handwritten_total
)

print(f"\nTONG HOP:")
print(f"  Ty le TOT tro len (>=80%):")
print(f"    - Chu danh may: {typed_good_up:.1f}%")
print(f"    - Chu viet tay: {handwritten_good_up:.1f}%")
print(f"  Ty le CHAP NHAN DUOC (>=50%):")
print(f"    - Chu danh may: {typed_accept:.1f}%")
print(f"    - Chu viet tay: {handwritten_accept:.1f}%")

In [None]:
section_rule = "=" * 80
print("\n" + section_rule)
print("BAO CAO 3: PHAN TICH THEO DO DAI VAN BAN")
print(section_rule)

def categorize_length(char_count):
    """Map a character count to one of three length-bucket labels.

    Args:
        char_count: Number of characters in the text.

    Returns:
        'Ngan (<=30 ky tu)' for counts up to 30, 'Trung binh (31-80)' for
        counts up to 80, and 'Dai (>80 ky tu)' for everything else.
    """
    # Upper bounds are checked in ascending order; the first match wins.
    for upper_bound, bucket in ((30, 'Ngan (<=30 ky tu)'), (80, 'Trung binh (31-80)')):
        if char_count <= upper_bound:
            return bucket
    return 'Dai (>80 ky tu)'

# Tag every result row with its length bucket (adds a column to results_df in place).
results_df['Length Category'] = results_df['Char Count'].apply(categorize_length)

print(f"\n{'Do dai':<25} {'Loai chu':<15} {'So luong':<10} {'TB Similarity':<15}")
print("-" * 65)

# One row per (length bucket, text type) pair; pairs without any image are skipped.
for length_cat in ('Ngan (<=30 ky tu)', 'Trung binh (31-80)', 'Dai (>80 ky tu)'):
    in_bucket = results_df['Length Category'] == length_cat
    for text_type in ('Chu danh may', 'Chu viet tay'):
        subset = results_df[in_bucket & (results_df['Text Type'] == text_type)]
        if subset.empty:
            continue
        mean_similarity = subset['Similarity'].mean()
        print(f"{length_cat:<25} {text_type:<15} {len(subset):<10} {mean_similarity:<15.2%}")

In [None]:
section_rule = "=" * 80
print("\n" + section_rule)
print("BAO CAO 4: BIEU DO TRUC QUAN")
print(section_rule)

# 2 x 3 grid of panels used by the rest of this cell.
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(18, 10))

# 1. Bar chart comparing the mean similarity of each group.
mean_ax = axes[0, 0]
categories = ['Chu danh may', 'Chu viet tay', 'Tong the']
averages = [avg_typed, avg_handwritten, avg_all]
colors_bar = ['#3498db', '#e74c3c', '#2ecc71']
bars = mean_ax.bar(categories, averages, color=colors_bar, edgecolor='black')
mean_ax.set_ylabel('Do tuong dong', fontsize=11)
mean_ax.set_title('So sanh do tuong dong trung binh', fontsize=12, fontweight='bold')
mean_ax.set_ylim(0, 1)
# Write each mean as a percentage just above its bar, centred horizontally.
for rect, value in zip(bars, averages):
    label_x = rect.get_x() + rect.get_width() / 2
    label_y = rect.get_height() + 0.02
    mean_ax.text(label_x, label_y, f'{value:.1%}', ha='center', fontsize=10, fontweight='bold')

# 2 and 3. Similarity histograms, one panel per text type.
# Each entry: (axes, data frame, mean, bar colour, mean-line colour, 80%-line colour, title).
histogram_panels = (
    (axes[0, 1], typed_df, avg_typed, '#3498db', 'red', 'blue', 'Phan phoi - CHU DANH MAY'),
    (axes[0, 2], handwritten_df, avg_handwritten, '#e74c3c', 'blue', 'purple', 'Phan phoi - CHU VIET TAY'),
)
for hist_ax, frame, mean_value, bar_color, mean_color, line80_color, panel_title in histogram_panels:
    hist_ax.hist(frame['Similarity'], bins=12, edgecolor='black', color=bar_color, alpha=0.7)
    # Dashed line at the group mean, dotted lines at the three band thresholds.
    hist_ax.axvline(mean_value, color=mean_color, linestyle='--', linewidth=2, label=f'TB: {mean_value:.1%}')
    hist_ax.axvline(0.9, color='green', linestyle=':', linewidth=2, label='90%')
    hist_ax.axvline(0.8, color=line80_color, linestyle=':', linewidth=2, label='80%')
    hist_ax.axvline(0.5, color='orange', linestyle=':', linewidth=2, label='50%')
    hist_ax.set_xlabel('Do tuong dong')
    hist_ax.set_ylabel('So luong')
    hist_ax.set_title(panel_title, fontsize=12, fontweight='bold')
    hist_ax.legend(fontsize=7)

# 4. Boxplot comparing the two similarity distributions.
box_ax = axes[1, 0]
bp = box_ax.boxplot([typed_df['Similarity'], handwritten_df['Similarity']], patch_artist=True)
# Tick labels are set explicitly instead of through boxplot(labels=...): that
# keyword was renamed to `tick_labels` in Matplotlib 3.9 and the old spelling
# is deprecated. With default positions the two boxes sit at x = 1 and x = 2.
box_ax.set_xticks([1, 2])
box_ax.set_xticklabels(['Chu danh may', 'Chu viet tay'])
bp['boxes'][0].set_facecolor('#3498db')
bp['boxes'][1].set_facecolor('#e74c3c')
# Dotted reference lines at the three band thresholds.
box_ax.axhline(0.9, color='green', linestyle=':', linewidth=2, label='90%')
box_ax.axhline(0.8, color='blue', linestyle=':', linewidth=2, label='80%')
box_ax.axhline(0.5, color='orange', linestyle=':', linewidth=2, label='50%')
box_ax.set_ylabel('Do tuong dong')
box_ax.set_title('Boxplot so sanh', fontsize=12, fontweight='bold')
box_ax.legend(fontsize=8)

# 5. Stacked bar: image counts per accuracy band, one bar per text type.
stack_ax = axes[1, 1]
x = np.arange(2)
width = 0.6
# Pair typed/handwritten counts band by band (tuple order: excellent, good, average, poor).
excellent_vals, good_vals, average_vals, poor_vals = (
    [typed_count, handwritten_count]
    for typed_count, handwritten_count in zip(typed_cats, handwritten_cats)
)

stack_levels = (
    ('Xuat sac (>=90%)', '#27ae60', excellent_vals),
    ('Tot (80-90%)', '#3498db', good_vals),
    ('Trung binh (50-80%)', '#f39c12', average_vals),
    ('Kem (<50%)', '#e74c3c', poor_vals),
)
# Running height of the stack; each band is drawn on top of the previous ones.
stack_base = np.zeros(2)
for level_label, level_color, level_vals in stack_levels:
    stack_ax.bar(x, level_vals, width, bottom=stack_base, label=level_label, color=level_color)
    stack_base = stack_base + np.array(level_vals)

stack_ax.set_xticks(x)
stack_ax.set_xticklabels(['Chu danh may', 'Chu viet tay'])
stack_ax.set_ylabel('So luong anh')
stack_ax.set_title('Phan loai theo 4 nguong', fontsize=12, fontweight='bold')
stack_ax.legend(fontsize=8)

# 6. Grouped bar: percentage of images per band, typed next to handwritten.
pct_ax = axes[1, 2]
x2 = np.arange(4)
width2 = 0.35
typed_pcts = [typed_excellent_pct, typed_good_pct, typed_avg_pct, typed_poor_pct]
handwritten_pcts = [handwritten_excellent_pct, handwritten_good_pct, handwritten_avg_pct, handwritten_poor_pct]

# The two series are shifted half a bar width left/right of each tick.
half_width = width2 / 2
pct_ax.bar(x2 - half_width, typed_pcts, width2, label='Chu danh may', color='#3498db')
pct_ax.bar(x2 + half_width, handwritten_pcts, width2, label='Chu viet tay', color='#e74c3c')
pct_ax.set_xticks(x2)
pct_ax.set_xticklabels(['Xuat sac\n>=90%', 'Tot\n80-90%', 'TB\n50-80%', 'Kem\n<50%'])
pct_ax.set_ylabel('Ty le (%)')
pct_ax.set_title('So sanh ty le theo nguong', fontsize=12, fontweight='bold')
pct_ax.legend()

# Lay out the panels, write the figure to disk, then display it.
plt.tight_layout()
plt.savefig('easyocr_analysis_report.png', dpi=150, bbox_inches='tight')
plt.show()

print("\nDa luu bieu do: 'easyocr_analysis_report.png'")

In [None]:
section_rule = "=" * 80
print("\n" + section_rule)
print("BAO CAO 5: TOP 5 TOT NHAT VA KEM NHAT")
print(section_rule)

for text_type, type_df in [('CHU DANH MAY', typed_df), ('CHU VIET TAY', handwritten_df)]:
    print(f"\n--- {text_type} ---")

    # Same listing twice: five highest, then five lowest similarity scores.
    for heading, pick_rows in (
        ("\nTop 5 TOT NHAT:", type_df.nlargest),
        ("\nTop 5 KEM NHAT:", type_df.nsmallest),
    ):
        print(heading)
        ranked = pick_rows(5, 'Similarity')[['ID', 'Image', 'Similarity']]
        for image_name, score in zip(ranked['Image'], ranked['Similarity']):
            print(f"  {image_name}: {score:.2%}")

In [None]:
section_rule = "=" * 80
print("\n" + section_rule)
print("BAO CAO 6: KET QUA CHI TIET - CHU DANH MAY")
print(section_rule)

# Limit the displayed width of text cells (global pandas display option).
pd.set_option('display.max_colwidth', 40)

# Last expression of the cell: typed-text results, best similarity first.
detail_columns = ['ID', 'Image', 'Similarity', 'Ground Truth (lowercase)', 'OCR Result (lowercase)']
typed_df[detail_columns].sort_values(by='Similarity', ascending=False)

In [None]:
section_rule = "=" * 80
print("\n" + section_rule)
print("BAO CAO 6: KET QUA CHI TIET - CHU VIET TAY")
print(section_rule)

# Last expression of the cell: handwritten results, best similarity first.
detail_columns = ['ID', 'Image', 'Similarity', 'Ground Truth (lowercase)', 'OCR Result (lowercase)']
handwritten_df[detail_columns].sort_values(by='Similarity', ascending=False)

In [None]:
# Write the full table and the two per-type tables to CSV.
# 'utf-8-sig' is the encoding passed to every export; the index is not written.
csv_exports = (
    (results_df, 'easyocr_all_results.csv'),
    (typed_df, 'easyocr_typed_results.csv'),
    (handwritten_df, 'easyocr_handwritten_results.csv'),
)
for frame, csv_name in csv_exports:
    frame.to_csv(csv_name, index=False, encoding='utf-8-sig')

print("Da xuat ket qua:")
print("  - easyocr_all_results.csv")
print("  - easyocr_typed_results.csv")
print("  - easyocr_handwritten_results.csv")
print("  - easyocr_analysis_report.png")

In [None]:
section_rule = "=" * 80
print("\n" + section_rule)
print("TONG KET DANH GIA MO HINH EASYOCR")
print(section_rule)

# Overall shares, computed up front so the template below only formats values.
overall_total = len(results_df)
overall_good_up = (all_cats[0]+all_cats[1])/overall_total*100
overall_accept = (all_cats[0]+all_cats[1]+all_cats[2])/overall_total*100

print(f"""
MO HINH: EasyOCR
NGON NGU: Tieng Viet (vi), Tieng Anh (en)
TONG SO ANH: {len(results_df)}

PHUONG PHAP: So sanh su dung CHU THUONG
PHAN LOAI 4 NGUONG:
  - Xuat sac: >= 90%
  - Tot: 80% - 90%
  - Trung binh: 50% - 80%
  - Kem: < 50%

============ KET QUA CHINH ============

1. CHU DANH MAY ({len(typed_df)} anh):
   - Trung binh: {avg_typed:.2%}
   - Xuat sac (>=90%): {typed_cats[0]} anh ({typed_excellent_pct:.1f}%)
   - Tot (80-90%): {typed_cats[1]} anh ({typed_good_pct:.1f}%)
   - Trung binh (50-80%): {typed_cats[2]} anh ({typed_avg_pct:.1f}%)
   - Kem (<50%): {typed_cats[3]} anh ({typed_poor_pct:.1f}%)

2. CHU VIET TAY ({len(handwritten_df)} anh):
   - Trung binh: {avg_handwritten:.2%}
   - Xuat sac (>=90%): {handwritten_cats[0]} anh ({handwritten_excellent_pct:.1f}%)
   - Tot (80-90%): {handwritten_cats[1]} anh ({handwritten_good_pct:.1f}%)
   - Trung binh (50-80%): {handwritten_cats[2]} anh ({handwritten_avg_pct:.1f}%)
   - Kem (<50%): {handwritten_cats[3]} anh ({handwritten_poor_pct:.1f}%)

3. TONG THE ({len(results_df)} anh):
   - Trung binh: {avg_all:.2%}
   - Ty le Tot tro len (>=80%): {overall_good_up:.1f}%
   - Ty le Chap nhan (>=50%): {overall_accept:.1f}%

============ DANH GIA ============
""")

def _rating_label(mean_similarity):
    """Return the rating text for a mean similarity.

    Floors are checked from highest to lowest: >= 0.9, >= 0.8, >= 0.5;
    anything that meets none of them is rated "KEM".
    """
    for floor, label in ((0.9, "XUAT SAC!"), (0.8, "TOT"), (0.5, "TRUNG BINH")):
        if mean_similarity >= floor:
            return label
    return "KEM"


# One rating line per text type: caption first, rating on the same line.
print("CHU DANH MAY: ", end="")
print(_rating_label(avg_typed))

print("CHU VIET TAY: ", end="")
print(_rating_label(avg_handwritten))

# Conclusion: typed text wins only when its mean is strictly greater;
# every other case falls through to the handwritten message.
print(f"\nKET LUAN: ", end="")
typed_is_better = avg_typed > avg_handwritten
if not typed_is_better:
    print(f"EasyOCR phu hop hon voi CHU VIET TAY (chenh lech {abs(avg_typed-avg_handwritten):.2%})")
else:
    print(f"EasyOCR phu hop hon voi CHU DANH MAY (chenh lech {abs(avg_typed-avg_handwritten):.2%})")