In [1]:
pip install googletrans pandas openpyxl


Collecting googletrans
  Downloading googletrans-3.0.0.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting httpx==0.13.3 (from googletrans)
  Downloading httpx-0.13.3-py3-none-any.whl.metadata (25 kB)
Collecting hstspreload (from httpx==0.13.3->googletrans)
  Downloading hstspreload-2024.6.1-py3-none-any.whl.metadata (2.1 kB)
Collecting chardet==3.* (from httpx==0.13.3->googletrans)
  Downloading chardet-3.0.4-py2.py3-none-any.whl.metadata (3.2 kB)
Collecting idna==2.* (from httpx==0.13.3->googletrans)
  Downloading idna-2.10-py2.py3-none-any.whl.metadata (9.1 kB)
Collecting rfc3986<2,>=1.3 (from httpx==0.13.3->googletrans)
  Downloading rfc3986-1.5.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting httpcore==0.9.* (from httpx==0.13.3->googletrans)
  Downloading httpcore-0.9.1-py3-none-any.whl.metadata (4.6 kB)
Collecting h11<0.10,>=0.8 (from httpcore==0.9.*->httpx==0.13.3->googletrans)
  Downloading h11-0.9.0-py2.py3-none-any.whl.metadata (8.1 kB)
Collecting h2==

In [2]:
import pandas as pd
from googletrans import Translator
import time

# Load the Excel file of OCR results; later statements in this cell read its
# 'Extracted Text' and 'Image Name' columns.
file_path = '/kaggle/input/ocrtext/Ocr500 (2).xlsx'
data = pd.read_excel(file_path)

# Initialize the translator.
# translate_to_english() looks this object up as a module-level name.
translator = Translator()

# Function to translate text to English with retry logic and logging
def translate_to_english(text, attempts=3, delay=1.0):
    """Translate ``text`` to English using the module-level ``translator``.

    Parameters
    ----------
    text : Any
        Cell value to translate. Anything that is not a non-blank string
        (None, NaN, numbers, empty or whitespace-only strings) is returned
        unchanged without calling the translator.
    attempts : int, default 3
        Maximum number of calls to ``translator.translate``; values below 1
        are treated as 1.
    delay : float, default 1.0
        Seconds to wait between a failed attempt and the next one.

    Returns
    -------
    str or the original value
        The translated text. If every attempt raises, the string
        ``"Translation error after <attempts> attempts: <last error>"`` is
        returned instead of raising, so a column-wide ``.apply`` keeps going.
    """
    # Checking the type first avoids calling pd.isnull() on non-scalar values,
    # whose ambiguous truth value was the reason for the old outer try/except.
    if not isinstance(text, str) or text.strip() == '':
        return text

    attempts = max(1, int(attempts))
    last_error = None
    for attempt in range(attempts):
        try:
            return translator.translate(text, dest='en').text
        except Exception as e:  # broad on purpose: any failure is retried
            last_error = e
            if attempt < attempts - 1:
                time.sleep(delay)  # Wait before retrying

    return f"Translation error after {attempts} attempts: {str(last_error)}"

# Apply the translation function to the 'Extracted Text' column.
# This makes one translate_to_english() call per row; a row whose translation
# failed holds the returned error-message string, not a translation.
data['translated_text'] = data['Extracted Text'].apply(translate_to_english)

# Save the results to a new Excel file (index column omitted)
output_file_path = '/kaggle/working/Ocr500_translated.xlsx'
data.to_excel(output_file_path, index=False)

# Display the translated text data: the first rows, original next to translation
data[['Image Name', 'Extracted Text', 'translated_text']].head()


Unnamed: 0,Image Name,Extracted Text,translated_text
0,PRS208C4002621,"শীতল ক্যান্টনমেন্ট ডায়াগনস্টিক সেন্টার, রংপুর...",Translation error after 3 attempts: 'NoneType'...
1,PRS208C4017521,IBN SINA\nIBN SINA MEDICAL COLLEGE HOSPITAL\n1...,Translation error after 3 attempts: 'NoneType'...
2,PRS208C4018544,অধ্যাপক ডাঃ শেখ নুরুল ফাত্তাহ রুমি\nএমবিবিএস (...,Translation error after 3 attempts: 'NoneType'...
3,PRS208C4018582,"অধ্যাপক ডাঃ মওদুদুল হক\nএমবিবিএস, এমডি, পিএইচড...",Translation error after 3 attempts: 'NoneType'...
4,PRS208C4018608,"অধ্যাপক ডাঃ বেগম হোসনে আরা\nএমবিবিএস, এফসিপিএস...",Translation error after 3 attempts: 'NoneType'...


In [None]:
import os
import shutil
import pytesseract
from PIL import Image
import pandas as pd

# Ensure that Tesseract OCR is installed
!sudo apt-get install tesseract-ocr

# Install pytesseract
!pip install pytesseract



# Source directory
source_dir = '/kaggle/input/prescription-image'  # Adjust the path accordingly

# List all files in the source directory
files = os.listdir(source_dir)

# Initialize lists to store filenames and extracted texts
filenames = []
extracted_texts = []

# Loop through each file in the source directory
for filename in files:
    # Construct file path
    file_path = os.path.join(source_dir, filename)

    # Open the image file
    image = Image.open(file_path)

    # Perform OCR on the image
    extracted_text = pytesseract.image_to_string(image)

    # Append filename and extracted text to lists
    filenames.append(filename)
    extracted_texts.append(extracted_text)

# Create a DataFrame from the lists
data = {"Filename": filenames, "Extracted Text": extracted_texts}
df = pd.DataFrame(data)

# Display the DataFrame
print(df)


In [7]:
import pandas as pd

# Load the Excel file
file_path = '/kaggle/input/translated/Ocr500_transliterated4.xlsx'

# Read both sheets through a single ExcelFile handle and close it afterwards;
# the handle was previously left open for the rest of the session.
with pd.ExcelFile(file_path) as excel_data:
    sheet1 = pd.read_excel(excel_data, sheet_name='Sheet1')
    sheet3 = pd.read_excel(excel_data, sheet_name='Sheet3')

# Headings (Bengali and English) that identify a document as a hospital
# ticket/slip. They are searched for as substrings of the OCR text, and
# match_row_labels() returns the first entry, in list order, that occurs.
# NOTE(review): several entries look like OCR spelling variants of the same
# heading - presumably intentional; confirm before de-duplicating.
new_row_labels = ["বহির্বিভাগীয় রোগীর টিকিট","OPD/EMERGENCY TICKET","OPD /EMERGENCY TICKET", "OPD / EMERGENCY TICKET","বহির্বিভাগীয় রোগীর টিকেট","জরুরী বিভাগ রোগীর টিকিট",
                   "বহিবিভাগীয় রোগীর টিকেট","জরুরী বিভাগ",
                  "বহিঃ বিভাগের রোগীর টিকিট",
                  "বহিঃ বিভাগ টিকিট",  "বহির্বিভাগ রোগীর টিকিট","বহিঃ বিভাগ চিকিট",  "বহিঃ বিভাগ",
                "বহির্বিভাগের রোগীর টিকিট", "OPD PRESCRIPTION",  "বাংলাদেশ ফরম নং ৭৬৯","OPD ROOM","বহির্বিভাগের রোগীর টিকেট", "বহির্বিভাগ","ব্যবস্থাপত্র",
                        "OPD TICKET","Outpatient Ticket","বহি বিভাগ রোগীর টিকেট",
               "রোগী ভর্তির ফরম ও রোগ বৃত্তান্ত","রোগী ভর্তির ফরম ও রোগ বৃত্তাত্ত","রোগী ভর্তির ফরম","OUTDOOR CHECKUP TICKET",
                 "বহিরবিভাগের রোগীর টিকেট",
                   "Registration Card",    "বহিঃবিভাগীয় রোগীর টিকিট",  "জরুরী" ,  "বর্হিবিভাগ রোগীর টিকিট",
                  "বর্হিবিভাগীয় রোগীর টিকিট", "বহি বিভাগ",  "বিভাগের রোগীর টিকিট",  "ৰৰ্হি বিভাগীয় রোগীর টিকিট"
]

# Function to match Row Labels with Extracted Text
def match_row_labels(extracted_text, labels):
    """Return the first label that occurs as a substring of ``extracted_text``.

    Parameters
    ----------
    extracted_text : Any
        OCR text of one row. Non-string values (for example the NaN pandas
        uses for empty cells) never match.
    labels : iterable of str
        Candidate labels, checked in iteration order.

    Returns
    -------
    str or None
        The first matching label, or None when nothing matches.
    """
    # `label in <float NaN>` raises TypeError, so reject non-strings up front.
    if not isinstance(extracted_text, str):
        return None
    return next((label for label in labels if label in extracted_text), None)

# Tag every row with the first slip heading found in its OCR text (None if absent)
sheet1['SLIP_Identifier'] = sheet1['Extracted Text'].apply(
    lambda ocr_text: match_row_labels(ocr_text, new_row_labels)
)

# Rows that received an identifier are typed as 'slip'; all others stay empty
sheet1['Type1'] = sheet1['SLIP_Identifier'].apply(
    lambda identifier: None if pd.isna(identifier) else 'slip'
)

# Write both sheets to a new workbook under their original sheet names
output_file_path = '/kaggle/working/path_to_your_updated_file_with_type1.xlsx'
with pd.ExcelWriter(output_file_path) as writer:
    for sheet_name, frame in (('Sheet1', sheet1), ('Sheet3', sheet3)):
        frame.to_excel(writer, sheet_name=sheet_name, index=False)

print(f"Updated file saved at {output_file_path}")


Updated file saved at /kaggle/working/path_to_your_updated_file_with_type1.xlsx


In [6]:
import pandas as pd
import re

# Workbook whose 'Extracted Text' column is searched for doctor names further
# down in this cell.
file_path = '/kaggle/input/physician-name/M05D28T17Alamin_Missing1.xlsx'  # Replace with the actual path to your Excel file
data = pd.read_excel(file_path)

def extract_keyword_and_two_words(text):
    # Look for 'Dr' or 'ডাঃ' and extract the keyword and the two words following these keywords
    match = re.search(r'(Dr|ডা)\.?([\S]+) ([\S]+)', text, re.IGNORECASE)
    if match:
        return f"{match.group(1)} {match.group(2)} {match.group(3)}"
    return None 

# Apply the function to extract the keyword and two words after the keyword from the 'Extracted Text' column
data['doctor_name'] = data['Extracted Text'].apply(extract_keyword_and_two_words)

# Save the frame, including the new 'doctor_name' column, to a new workbook
output_file_path = '/kaggle/working/extracted_doctor_names.xlsx'
data.to_excel(output_file_path, index=False)

# A notebook renders only the final expression of a cell, so the preview must
# come last; in the middle of the cell it displayed nothing.
data.head()



In [None]:
import os
import pytesseract
from PIL import Image, ImageEnhance, ImageFilter
import pandas as pd
from IPython.display import display

# Ensure Tesseract OCR is installed
!apt-get update
!apt-get install -y tesseract-ocr
!apt-get install -y tesseract-ocr-ben
!apt-get install -y training-tools
!pip install pytesseract pillow pandas

# Source directory
source_dir = '/kaggle/input/prescription-image'  # Adjust the path accordingly

# List all files in the source directory
files = os.listdir(source_dir)

# Initialize lists to store filenames and extracted texts
filenames = []
extracted_texts = []

# Set the batch size
batch_size = 50
num_batches = (len(files) + batch_size - 1) // batch_size

# Function to preprocess an image
def preprocess_image(image_path, output_path):
    """Write a grayscale, contrast-boosted, denoised, sharpened, down-scaled copy.

    Parameters
    ----------
    image_path : str
        Path of the image to read.
    output_path : str
        Path the processed image is saved to; the format follows the file
        extension, as decided by ``Image.save``.

    Returns
    -------
    None
    """
    with Image.open(image_path) as img:
        # Convert to grayscale
        img = img.convert("L")

        # Double the contrast
        img = ImageEnhance.Contrast(img).enhance(2)

        # Reduce noise, then sharpen
        img = img.filter(ImageFilter.MedianFilter())
        img = img.filter(ImageFilter.SHARPEN)

        # Resize to one-third of the original dimensions, never below 1 px so
        # that very small inputs do not produce an invalid zero-sized image.
        original_width, original_height = img.size
        target_size = (max(1, original_width // 3), max(1, original_height // 3))

        # Image.ANTIALIAS was removed in Pillow 10; Image.LANCZOS is the same filter.
        img_resized = img.resize(target_size, Image.LANCZOS)

        # Save the preprocessed image
        img_resized.save(output_path)

import shutil

# Image types handled by the pipeline, matched case-insensitively (the old
# '.png'-only test silently skipped every other image type).
image_extensions = ('.png', '.jpg', '.jpeg', '.tif', '.tiff', '.bmp')
image_files = [
    os.path.join(source_dir, f) for f in files
    if f.lower().endswith(image_extensions)
]

# Preprocessed copies go to their own writable directory. Saving them under
# source_dir gave every copy the same path as its original, i.e. each source
# image was overwritten in place (and /kaggle/input is the dataset mount).
preprocessed_dir = '/kaggle/working/preprocessed'
os.makedirs(preprocessed_dir, exist_ok=True)

preprocessed_image_files = []
box_files = []
for file in image_files:
    preprocessed_image_path = os.path.join(preprocessed_dir, os.path.basename(file))
    preprocess_image(file, preprocessed_image_path)
    preprocessed_image_files.append(preprocessed_image_path)

    # Keep each image's .box file (when one exists) beside the preprocessed
    # copy, mirroring the layout the in-place preprocessing used to produce.
    source_box = os.path.splitext(file)[0] + '.box'
    target_box = os.path.splitext(preprocessed_image_path)[0] + '.box'
    if os.path.exists(source_box):
        shutil.copyfile(source_box, target_box)
    box_files.append(target_box)


def train_tesseract(image_files, box_files, output_dir):
    """Run the Tesseract training command sequence on images and box files.

    Parameters
    ----------
    image_files : list of str
        Image paths, paired positionally with ``box_files``.
    box_files : list of str
        Box-file paths passed to the training tools.
    output_dir : str
        Directory that receives ``font_properties`` and is referenced by the
        mftraining / combine_tessdata commands.

    Returns
    -------
    None
        Exit codes of the external commands are not checked.

    NOTE(review): the command sequence itself is kept as originally written;
    verify it against the training guide for the installed Tesseract version.
    """
    import shlex  # local import: only this function builds shell command lines

    quote = shlex.quote  # protects paths containing spaces or shell metacharacters
    box_file_list = " ".join(quote(box_file) for box_file in box_files)
    font_properties = os.path.join(output_dir, "font_properties")

    # Generate the unicharset file
    os.system(f"unicharset_extractor {box_file_list}")

    # Create the font_properties file
    with open(font_properties, "w") as f:
        f.write("Bangla 0 0 0 0 0")

    # Generate the .tr files, one tesseract run per image
    for image_file, box_file in zip(image_files, box_files):
        output_base = os.path.splitext(image_file)[0]
        os.system(f"tesseract {quote(image_file)} {quote(output_base)} -l ben nobatch box.train")

    # Generate the character set files
    os.system(f"mftraining -F {quote(font_properties)} -U unicharset -O {quote(os.path.join(output_dir, 'output_unicharset'))} {box_file_list}")
    os.system(f"cntraining {box_file_list}")

    # Combine the training data into a single traineddata file
    os.system(f"combine_tessdata {quote(os.path.join(output_dir, 'ben.unicharset'))}")


# Train only when every image has its box file; otherwise each training
# command would just fail without being noticed.
missing_boxes = [box_file for box_file in box_files if not os.path.exists(box_file)]
if preprocessed_image_files and not missing_boxes:
    train_tesseract(preprocessed_image_files, box_files, "/kaggle/working")
else:
    print(f"Skipping Tesseract training: {len(missing_boxes)} of {len(box_files)} box files are missing.")

# OCR the preprocessed images in batches.
# NOTE(review): lang='ben' selects the Bengali language data by name; nothing in
# this cell installs the output of train_tesseract() under that name - confirm
# which model is actually in use.
for start_idx in range(0, len(preprocessed_image_files), batch_size):
    batch_paths = preprocessed_image_files[start_idx:start_idx + batch_size]
    batch_extracted_texts = []

    for file_path in batch_paths:
        filename = os.path.basename(file_path)

        with Image.open(file_path) as preprocessed_image:
            extracted_text = pytesseract.image_to_string(preprocessed_image, lang='ben')
            # Display inside the `with` block, while the image is still open
            display(preprocessed_image)

        filenames.append(filename)
        batch_extracted_texts.append(extracted_text)
        print(f"Extracted Text from {filename}:\n{extracted_text}\n")

    # Add batch extracted texts to the main extracted texts list
    extracted_texts.extend(batch_extracted_texts)

# Create a DataFrame from the lists
data = {"Filename": filenames, "Extracted Text": extracted_texts}
df = pd.DataFrame(data)

# Preview the first rows with the notebook's rich table rendering
df.head()


In [None]:
import os
import pytesseract
from PIL import Image
import pandas as pd
from IPython.display import display

# Ensure that Tesseract OCR is installed
!sudo apt-get install tesseract-ocr

# Install pytesseract
!pip install pytesseract

# Source directory
source_dir = '/kaggle/input/prescription-image'  # Adjust the path accordingly

# List all files in the source directory
files = os.listdir(source_dir)

# Initialize lists to store filenames and extracted texts
filenames = []
extracted_texts = []

# Set the batch size
batch_size = 50
num_batches = (len(files) + batch_size - 1) // batch_size

# Function to load an image and resample it (at its original size by default)
def resize_image(image_path, divisor=1):
    """Open an image and return a resampled in-memory copy.

    Parameters
    ----------
    image_path : str
        Path of the image to read.
    divisor : int, default 1
        Both dimensions are integer-divided by this value. The default keeps
        the original size, which is what this cell has always done despite the
        earlier "one-third" comment.

    Returns
    -------
    PIL.Image.Image
        The resampled image, created before the source file is closed.
    """
    with Image.open(image_path) as img:
        original_width, original_height = img.size
        # Never go below 1 px, which would be an invalid image size
        target_size = (max(1, original_width // divisor), max(1, original_height // divisor))
        # Image.ANTIALIAS was removed in Pillow 10; Image.LANCZOS is the same filter.
        img_resized = img.resize(target_size, Image.LANCZOS)
    return img_resized

# Walk the file list in consecutive slices of `batch_size`. Slicing past the
# end of a list is safe, so the last (short) batch needs no explicit clamp.
batches = (
    files[batch_idx * batch_size:(batch_idx + 1) * batch_size]
    for batch_idx in range(num_batches)
)

for batch_filenames in batches:
    batch_extracted_texts = []

    for filename in batch_filenames:
        # Load the image via resize_image(), then OCR the result
        resized_image = resize_image(os.path.join(source_dir, filename))
        extracted_text = pytesseract.image_to_string(resized_image)

        # Record the pair
        filenames.append(filename)
        batch_extracted_texts.append(extracted_text)

        # Show the image followed by its text
        display(resized_image)
        print(f"Extracted Text from {filename}:\n{extracted_text}\n")

    # Fold this batch's texts into the overall list
    extracted_texts += batch_extracted_texts

# Assemble the results table and print it
data = {"Filename": filenames, "Extracted Text": extracted_texts}
df = pd.DataFrame(data)
print(df)


In [None]:
import os
import pytesseract
from PIL import Image, ImageEnhance, ImageFilter
import pandas as pd
from IPython.display import display

# Ensure that Tesseract OCR is installed
!sudo apt-get install tesseract-ocr

# Install pytesseract
!pip install pytesseract

# Source directory
source_dir = '/kaggle/input/prescription-image'  # Adjust the path accordingly

# List all files in the source directory
files = os.listdir(source_dir)

# Initialize lists to store filenames and extracted texts
filenames = []
extracted_texts = []

# Set the batch size
batch_size = 50
num_batches = (len(files) + batch_size - 1) // batch_size

# Function to resize an image to one-third of its original size
def resize_image(image_path, divisor=3):
    """Open an image and return a copy scaled down by ``divisor``.

    Parameters
    ----------
    image_path : str
        Path of the image to read.
    divisor : int, default 3
        Both dimensions are integer-divided by this value (one-third size by
        default).

    Returns
    -------
    PIL.Image.Image
        The resized image, created before the source file is closed.
    """
    with Image.open(image_path) as img:
        original_width, original_height = img.size
        # Never go below 1 px: images narrower or shorter than `divisor`
        # pixels would otherwise get an invalid zero dimension.
        target_size = (max(1, original_width // divisor), max(1, original_height // divisor))
        # Image.ANTIALIAS was removed in Pillow 10; Image.LANCZOS is the same filter.
        img_resized = img.resize(target_size, Image.LANCZOS)
    return img_resized

# Function to enhance image quality
def enhance_image(image):
    """Return a contrast-boosted, lightly blurred version of ``image``.

    The contrast is multiplied by 2.0 first; a Gaussian blur of radius 1 is
    then applied as noise removal. The result of the blur is returned.
    """
    boosted = ImageEnhance.Contrast(image).enhance(2.0)
    return boosted.filter(ImageFilter.GaussianBlur(radius=1))

# OCR every file batch by batch, showing each enhanced image with its text
for batch_idx in range(num_batches):
    batch_start = batch_idx * batch_size
    # A slice end beyond the list length is clamped automatically
    batch_filenames = files[batch_start:batch_start + batch_size]
    batch_extracted_texts = []

    for filename in batch_filenames:
        # Resize, then enhance, then OCR
        enhanced_image = enhance_image(resize_image(os.path.join(source_dir, filename)))
        extracted_text = pytesseract.image_to_string(enhanced_image)

        # Record the pair
        filenames.append(filename)
        batch_extracted_texts.append(extracted_text)

        # Show the enhanced image followed by its text
        display(enhanced_image)
        print(f"Extracted Text from {filename}:\n{extracted_text}\n")

    # Fold this batch's texts into the overall list
    extracted_texts.extend(batch_extracted_texts)

# Assemble the results table and print it
data = {"Filename": filenames, "Extracted Text": extracted_texts}
df = pd.DataFrame(data)
print(df)


In [None]:
import os
import pytesseract
from PIL import Image
import pandas as pd

# Ensure that Tesseract OCR is installed
!sudo apt-get install tesseract-ocr

# Install pytesseract
!pip install pytesseract

# Source directory
source_dir = '/kaggle/input/prescription-image'  # Adjust the path accordingly

# List all files in the source directory
files = os.listdir(source_dir)

# Initialize lists to store filenames and extracted texts
filenames = []
extracted_texts = []

# Loop through each file in the source directory
for filename in files[:10]:  # Process only the first 5 files
    # Construct file path
    file_path = os.path.join(source_dir, filename)

    # Open the image file
    image = Image.open(file_path)

    # Perform OCR on the image
    extracted_text = pytesseract.image_to_string(image)

    # Append filename and extracted text to lists
    filenames.append(filename)
    extracted_texts.append(extracted_text)

    # Display the image and extracted text
    display(image)
    print(f"Extracted Text from {filename}:\n{extracted_text}\n")

# Create a DataFrame from the lists
data = {"Filename": filenames, "Extracted Text": extracted_texts}
df = pd.DataFrame(data)

# Display the DataFrame
print(df)



In [None]:
import os
import shutil
import pytesseract
from PIL import Image
import pandas as pd

# Ensure that Tesseract OCR is installed
!sudo apt-get install tesseract-ocr

# Install pytesseract
!pip install pytesseract

# Source directory
source_dir = '/kaggle/input/prescription-image'  # Adjust the path accordingly
output_dir = '/kaggle/working/renamed-images'     # New directory for renamed images

# Create the output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# List all files in the source directory
files = os.listdir(source_dir)

# Initialize lists to store filenames and extracted texts
extracted_texts = []

# Loop through each file in the source directory
for filename in files:
    # Construct file paths
    file_path = os.path.join(source_dir, filename)
    output_file_path = os.path.join(output_dir, filename)

    # Open the image file
    image = Image.open(file_path)

    # Perform OCR on the image
    extracted_text = pytesseract.image_to_string(image)

    # Rename the file with the extracted text
    new_filename = os.path.join(output_dir, extracted_text + '.jpg')  # Assuming it's a jpg file
    shutil.copyfile(file_path, new_filename)

    # Append extracted text to the list
    extracted_texts.append(extracted_text)

# Create a DataFrame from the extracted texts
data = {"Original Filename": files, "Extracted Text": extracted_texts}
df = pd.DataFrame(data)

# Display the DataFrame
print(df)


In [None]:
import os
import pytesseract
from PIL import Image
import pandas as pd

# Ensure that Tesseract OCR is installed
!sudo apt-get install tesseract-ocr

# Install pytesseract
!pip install pytesseract

# Source directory
source_dir = '/kaggle/input/prescription-image'  # Adjust the path accordingly

# List all files in the source directory
files = os.listdir(source_dir)

# Initialize lists to store renamed filenames and extracted texts
renamed_filenames = []
extracted_texts = []

# Loop through each file in the source directory
for idx, filename in enumerate(files[:5], start=1):  # Process only the first 5 files
    # Construct file path
    file_path = os.path.join(source_dir, filename)

    # Open the image file
    image = Image.open(file_path)

    # Perform OCR on the image
    extracted_text = pytesseract.image_to_string(image)

    # Clean the extracted text
    cleaned_text = ''.join(c for c in extracted_text if c.isprintable())

    # Rename the filename
    new_filename = f"pres{idx}.jpg"

    # Append renamed filename and cleaned text to lists
    renamed_filenames.append(new_filename)
    extracted_texts.append(cleaned_text)

    # Save the image with the new filename
    new_file_path = os.path.join(source_dir, new_filename)
    os.rename(file_path, new_file_path)

# Create a DataFrame from the lists
data = {"Filename": renamed_filenames, "Extracted Text": extracted_texts}
df = pd.DataFrame(data)

# Save the DataFrame to an Excel file
excel_file_path = '/kaggle/working/extracted_data.xlsx'  # Adjust the path as needed
df.to_excel(excel_file_path, index=False)

# Display the path to the saved Excel file
print(f"Excel file saved to: {excel_file_path}")


In [None]:
import difflib

def calculate_match_percentage(physician_name, extracted_text):
    """Return the case-insensitive similarity of two strings as a whole percentage.

    The score is ``difflib.SequenceMatcher(...).ratio()`` times 100, rounded.
    The ratio is computed over the combined length of both strings, so a short
    name inside a long OCR text scores low even when it occurs verbatim.

    Parameters
    ----------
    physician_name : Any
        Reference name. Non-string values give a score of 0.
    extracted_text : Any
        OCR text to compare against. Non-string values give a score of 0.

    Returns
    -------
    int
        Similarity from 0 to 100.
    """
    # Missing values (None/NaN) have no .lower(); score them as "no match".
    if not isinstance(physician_name, str) or not isinstance(extracted_text, str):
        return 0

    # Convert both strings to lowercase for better comparison
    physician_name = physician_name.lower()
    extracted_text = extracted_text.lower()

    # autojunk=False: for second sequences of 200+ items SequenceMatcher
    # otherwise treats frequently occurring characters as junk, which hides
    # real matches in long OCR text.
    similarity_ratio = difflib.SequenceMatcher(
        None, physician_name, extracted_text, autojunk=False
    ).ratio()

    # Convert the ratio to a percentage
    return round(similarity_ratio * 100)

# Test data
data = [
    {"Filename": "PRS208C6014044.jpg", "Extracted Text": "oo weworoyrss aaapoy Â© SE PIIEA JONUs sy tyiedrey)Be", "PHY_NM": "DR. ANJAN BEPARY", "PHY_ID": "DHA25813"},
    {"Filename": "PRS208C6020458.jpg", "Extracted Text": "Wis Gils It PET Seysanta, Fie, Cras), awa (Raa fb)Ie, Fy, er Cas cee coe aresorte dee eee Ser eet(cee, Btreie aetem, Bem)â€˜aaa Teco ves atest Creme cam)seh sete Cre, s,s eMSe fee worm Set, teDr. Md. Noor Kutubul AlamMBBS, BCS (Health) FCPS(EN.1)Ear, Nose Throat Spestals & Head-Neck SurgeonAdvanced Training in Endoscope Thytoid & Neck Surgery(Korea, Tata Cencer Hospital, Indi)Advanced! Training in Miro ear Surgery (France & Bangali)Assstant Profesor, Department of ENTJashore Medical College Hospital, ashoreBMDC No, A:39409 RS Mukter hossain 1D wai 32) wf; 08/04/202508047024510Phone: 018852648051 Cap. Esolok 20 mg .Chief Complaint Sens >t sean oieÂ© FBSENSATIONIN 2 Tab. Vifas 120 mgTHROAT rest oa rem en+ cough3 Tab. Deflux 10 mgiagnosi Se yet saa toraagnosis :+ CHRONIC PHARYNGITIS: 4. RHINOMIST NASAL SPRAYR40raca cl tea Signature", "PHY_NM": "DR. MD. ZAHANGIR ALAM (BABU)", "PHY_ID": "DHA14913"},
    # Add more data entries as needed
]

# Score every record in place: physician name versus OCR text
for entry in data:
    entry["Match_Percentage"] = calculate_match_percentage(
        entry["PHY_NM"], entry["Extracted Text"]
    )

# Tab-separated report: header line first, then one line per record
print("Filename\tExtracted Text\tPHY_NM\tPHY_ID\tMatch_Percentage")
report_keys = ('Filename', 'Extracted Text', 'PHY_NM', 'PHY_ID', 'Match_Percentage')
for entry in data:
    print("\t".join(f"{entry[key]}" for key in report_keys))


In [None]:
import os
import pytesseract
from PIL import Image
import pandas as pd

# Ensure that Tesseract OCR is installed
!sudo apt-get install tesseract-ocr

# Install pytesseract
!pip install pytesseract

# Source directory
source_dir = '/kaggle/input/prescription-image'  # Adjust the path accordingly

# Output directory for renamed files
output_dir = '/kaggle/output'  # Adjust the path accordingly

# Create the output directory if it does not exist
os.makedirs(output_dir, exist_ok=True)

# List all files in the source directory
files = os.listdir(source_dir)

# Initialize lists to store renamed filenames and extracted texts
renamed_filenames = []
extracted_texts = []

# Loop through each file in the source directory
for idx, filename in enumerate(files[:100], start=1):  # Process only the first 5 files
    # Construct file paths
    file_path = os.path.join(source_dir, filename)
    new_file_path = os.path.join(output_dir, f"pres{idx}.jpg")

    # Open the image file
    image = Image.open(file_path)

    # Perform OCR on the image with fine-tuning
    extracted_text = pytesseract.image_to_string(image, config='--oem 1 --psm 6')

    # Clean the extracted text
    cleaned_text = ''.join(c for c in extracted_text if c.isprintable())

    # Append renamed filename and cleaned text to lists
    renamed_filenames.append(new_file_path)
    extracted_texts.append(cleaned_text)

    # Save the image with the new filename
    image.save(new_file_path)

# Create a DataFrame from the lists
data = {"Filename": renamed_filenames, "Extracted Text": extracted_texts}
df = pd.DataFrame(data)

# Save the DataFrame to an Excel file
excel_file_path = '/kaggle/working/extracted_data.xlsx'  # Adjust the path as needed
df.to_excel(excel_file_path, index=False)

# Display the path to the saved Excel file
print(f"Excel file saved to: {excel_file_path}")


In [None]:
import os
import pytesseract
from PIL import Image
import pandas as pd

# Ensure that Tesseract OCR is installed
!sudo apt-get install tesseract-ocr

# Install pytesseract
!pip install pytesseract

# Source directory
source_dir = '/kaggle/input/prescription-image'  # Adjust the path accordingly

# List all files in the source directory
files = os.listdir(source_dir)

# Initialize lists to store filenames and extracted texts
filenames = []
extracted_texts = []

# Loop through each file in the source directory
for filename in files[:309]:  # Process only the first 5 files
    # Construct file path
    file_path = os.path.join(source_dir, filename)

    # Open the image file
    image = Image.open(file_path)

    # Perform OCR on the image
    extracted_text = pytesseract.image_to_string(image)

    # Clean the extracted text
    cleaned_text = ''.join(c for c in extracted_text if c.isprintable())

    # Append filename and cleaned text to lists
    filenames.append(filename)
    extracted_texts.append(cleaned_text)

# Create a DataFrame from the lists
data = {"Filename": filenames, "Extracted Text": extracted_texts}
df = pd.DataFrame(data)

# Save the DataFrame to an Excel file
excel_file_path = '/kaggle/working/extracted_data.xlsx'  # Adjust the path as needed
df.to_excel(excel_file_path, index=False)

# Display the path to the saved Excel file
print(f"Excel file saved to: {excel_file_path}")


In [None]:

import os
import shutil
import pytesseract
from PIL import Image
import pandas as pd

# Ensure that Tesseract OCR is installed
!sudo apt-get install tesseract-ocr

# Install pytesseract
!pip install pytesseract

# Source directory
source_dir = '/kaggle/input/prescription-image'  # Adjust the path accordingly

# New directory for renamed files
renamed_dir = '/kaggle/working/renamed_images'
os.makedirs(renamed_dir, exist_ok=True)

# List all files in the source directory
files = os.listdir(source_dir)

# Initialize lists to store renamed filenames and extracted texts
renamed_filenames = []
extracted_texts = []

# Loop through each file in the source directory
for idx, filename in enumerate(files[:309], start=1):  # Process only the first 309 files
    # Construct file paths
    file_path = os.path.join(source_dir, filename)
    new_filename = f"pres{idx}.jpg"
    new_file_path = os.path.join(renamed_dir, new_filename)

    # Copy the image file with the new filename
    shutil.copyfile(file_path, new_file_path)

    # Open the image file
    image = Image.open(new_file_path)

    # Perform OCR on the image
    extracted_text = pytesseract.image_to_string(image)

    # Clean the extracted text
    cleaned_text = ''.join(c for c in extracted_text if c.isprintable())

    # Append renamed filename and cleaned text to lists
    renamed_filenames.append(new_filename)
    extracted_texts.append(cleaned_text)

# Create a DataFrame from the lists
data = {"Filename": renamed_filenames, "Extracted Text": extracted_texts}
df = pd.DataFrame(data)

# Save the DataFrame to an Excel file
excel_file_path = '/kaggle/working/extracted_data.xlsx'  # Adjust the path as needed
df.to_excel(excel_file_path, index=False)

# Display the path to the saved Excel file
print(f"Excel file saved to: {excel_file_path}")


In [None]:

import os
from PIL import Image

# Source directory: every entry in it is passed to generate_training_data()
# by the loop at the end of this cell.
source_dir = '/kaggle/input/prescription-image'  # Adjust the path accordingly

# Output directory for training data: generate_training_data() writes one
# "<name>.gt.txt" and one "<name>.tif" per image into it.
training_data_dir = '/kaggle/working/training_data'  # Adjust the path as needed

# Create the output directory if it doesn't exist
os.makedirs(training_data_dir, exist_ok=True)

# Function to generate training data files
def generate_training_data(image_file, text):
    """Write one ground-truth/image pair into ``training_data_dir``.

    For an input named ``<name>.<ext>`` this creates ``<name>.gt.txt``
    containing ``text`` and ``<name>.tif`` containing the image re-saved as TIFF.

    Parameters
    ----------
    image_file : str
        Path of the source image.
    text : str
        Ground-truth transcription for that image.

    Returns
    -------
    None
    """
    base_name = os.path.splitext(os.path.basename(image_file))[0]

    # Explicit UTF-8 so that non-ASCII transcriptions are written the same way
    # regardless of the platform's default encoding.
    with open(os.path.join(training_data_dir, f"{base_name}.gt.txt"), 'w', encoding='utf-8') as f:
        f.write(text)

    # Save a TIFF copy next to the transcription; the context manager closes
    # the source file afterwards.
    with Image.open(image_file) as image:
        image.save(os.path.join(training_data_dir, f"{base_name}.tif"))

# Loop through each file in the source directory
for filename in os.listdir(source_dir):
    file_path = os.path.join(source_dir, filename)
    # Here you need to extract the ground truth text for each image
    # You can manually annotate the text or use an existing dataset with annotations
    # NOTE(review): as written, the single hard-coded string below is stored as
    # the ground truth of EVERY image in source_dir; it is a placeholder that
    # must be replaced by a per-image lookup before the output is used for training.
    ground_truth_text = "Dr. Asit BaranPGT (Ortho), D-orthe (Course)9 arg, wo, sre| eres 9 Grope mrers wD Medical Officercarn, Gan wratet sm mea 5| ret corre Bors ES r       t MetivbboyKeod sf RestAeacderd\ ¢  Foe we er oe rae seecontaSHIP o"
    generate_training_data(file_path, ground_truth_text)


In [None]:
import cv2
import os
import numpy as np
from matplotlib import pyplot as plt

# Load images
image_files = [
    '/mnt/data/PRS208C4002621.jpg',
    '/mnt/data/PRS208C4017521.jpg',
    '/mnt/data/PRS208C4018544.jpg',
    '/mnt/data/PRS208C4018582.jpg',
    '/mnt/data/PRS208C4018608.jpg',
    '/mnt/data/PRS208C4018624.jpg',
    '/mnt/data/PRS208C4018627.jpg'
]

# Binarized 128x32 versions of every image that could be read
processed_images = []

for file in image_files:
    # Read the image as single-channel grayscale
    image = cv2.imread(file, cv2.IMREAD_GRAYSCALE)

    # cv2.imread() does not raise for a missing or unreadable file; it returns
    # None, which would only surface later as an obscure error in cv2.resize().
    if image is None:
        print(f"Skipping unreadable image: {file}")
        continue

    # Resize the image to a fixed 128x32 (width x height); aspect ratio is not preserved
    resized_image = cv2.resize(image, (128, 32))

    # Binarize the image. With THRESH_OTSU the threshold is computed from the
    # image, so the 128 passed here is not the value actually applied.
    _, binary_image = cv2.threshold(resized_image, 128, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    # Append processed image
    processed_images.append(binary_image)

# Display the first processed image as an example (if any image was loaded)
if processed_images:
    plt.imshow(processed_images[0], cmap='gray')
    plt.show()
else:
    print("No images could be loaded; nothing to display.")
