In [None]:
## Script to rename images based on the labels within them
# uses tesseract to read the images

In [131]:
import os
from PIL import Image
import pytesseract
import pandas as pd
import re

In [27]:
## Before we can get this to work, we have to install tesseract because pytesseract is just a wrapper script. I installed it with homebrew
#brew install tesseract

# if calling tesseract in your terminal doesn't work, you can set the directory of tesseract with this:

#get the full path of tesseract
#brew info tesseract
#pytesseract.pytesseract.tesseract_cmd = r'/usr/local/Cellar/tesseract/5.3.4_1/bin'

In [None]:
#### DO NOT RUN IN JUPYTER NOTEBOOK ###########

# I want to convert the images to tiff because I don't think tesseract with read the raw images.
# Update: it can read .CR2 files, but it takes longer

#This can be accomplished with this for loop, but I also want to correct for white balance, which you can do pretty easily in RawTherepee
for i in ../all_raw/*.CR2
do
    BASE=$( basename $i | sed 's/.CR2//g')
    echo $BASE
    
    sips -s format tiff $i --out ./${BASE}.tiff
done

## Pre-processing the images before I rename them

I wanted to preprocess them into .tif images before I rename them, so that it doesn't take tesseract as much time to go through them. I shot the images in raw (.CR2 for canon), so we can easily correct white balance after the fact. If anyone is doing this again, I would try turn the auto white balance off on the camera, so you hopefully don't have to correct for white balance. In the grape image processing guide here: https://github.com/underhillanna/GrapeImageAnalysis, they also corrected for white balance. I tested it out to make sure that correcting the white balance actually makes a difference and the black background looks a lot more consistent in the white-balanced photos, so I am going to do it with all the images.

I'm going to use RawTherapee (https://rawtherapee.com/) to run through this and export as a 8 bit tif file quickly.


1. In the "File Browser" tab of RawTherapee, open up the folder with all the .CR2 files.
2. Go to the Editor tab. Double click on the first image (make sure the check mark in the left hand corner shows up) Click on the white balance tab (on the right side, the thrid option). 
3. Click White Balance to expand the menu. Select "Method:Custom".
4. Select "Pick"
5. Once the dropper comes up, click on the white label. This will automatically correct the white balance. This will save a .pp3 file to where that image is saved. Now we can use this to correct the white balance for all the images.
6. Double click on the next image (again make sure that the check mark shows up)
7. Under "Processing Profiles" select the folder. Select the .pp3 from the first image that you fixed.
8. Right click on the image that you just edited. Go to "Processing profile operations" and select "Copy".
9. Right click on the image again and click "Select all" (make sure all the squares are now highlighted.
10. Right click again. Go to "Processing profile operations" and select "paste". It'll take a little bit to process if you have a lot of images. Mine froze and got the spinning ball of death for ~3 min and then was able to process everything. All together it took no more than 5 min. At this point I would go through and make sure all the images are the right direction and make sure the white balance didn't get weird on anything. If something is wrong, double click the image and fix whatever you want. It won't have any impact on the other images.
11. Make sure you have all images selected. Right click again and use the "Put to queue" with the gears next to it. This will put the images in the processing queue. From there you can export as a 8 bit tif file to where ever you want. 16 bit tif files would need to be converted to 8 bit if you are going to use them in the other downstream processing I at least have planned.
12. Once you have where you want to save all the images and the file format set up, toggle the on/off switch for the "Queue".

At this point, I separated the images into folders for the the ones that have full inflorescences, four bracts, and only 1 bract. This will make it easier for renaming.


Notes: 
* F2-06-113 doesn't have 3 full inflorescence pictures (find IMG_2637.CR2 is a double)
* F2-06-225 doesn't have 3 full inflorescence pictures (noted in sheet)
* F2-06-104 doesn't have 3 full inflorescence pictures ( no idea why)
* F2-06-248 has the 262 label in it
* f2-06-235 only has one full inflorescence and one four-bract and one bract

Full inflorescence pictures: 346
Four bract pictures: 349
One bract pictures: 349

In [121]:
# Function to extract text from an image using Tesseract OCR --psm 1 or 12 works pretty well 12 results in some weird things though
# image_path = path to image
# psm = which page segmentation option to use
    # options:
        #   0    Orientation and script detection (OSD) only.
        #   1    Automatic page segmentation with OSD.
        #   2    Automatic page segmentation, but no OSD, or OCR.
        #   3    Fully automatic page segmentation, but no OSD. (Default)
        #   4    Assume a single column of text of variable sizes.
        #   5    Assume a single uniform block of vertically aligned text.
        #   6    Assume a single uniform block of text.
        #   7    Treat the image as a single text line.
        #   8    Treat the image as a single word.
        #   9    Treat the image as a single word in a circle.
        #  10    Treat the image as a single character.
        #  11    Sparse text. Find as much text as possible in no particular order.
        #  12    Sparse text with OSD.
        #  13    Raw line. Treat the image as a single text line,
        #                         bypassing hacks that are Tesseract-specific.
        
# greyscale = which pillow greyscale method to use
    # options:
        # none
        # LA
        # L

def extract_text_from_image(image_path, seg, greyscale):
    
    psm_config = f"--psm {seg} -c tessedit_char_whitelist=0123456789F-"
    
    if greyscale == "none":
        image = Image.open(image_path)
        text = pytesseract.image_to_string(image, config=psm_config)
        
    if greyscale == "LA":
        image = Image.open(image_path).convert('LA')
        text = pytesseract.image_to_string(image, config=psm_config)
        
    if greyscale == "L":
        image = Image.open(image_path).convert('L')
        text = pytesseract.image_to_string(image, config=psm_config)
    
    return text.strip()

In [122]:
# Function to call the extract text from image function and then actually change the name of the files
# options:
    # image_dir = directory where all the files are that you want to iterate through
    # file_end = what to end the file name with
    # seg1 = first segmentation to try
    # d = dataframe to output name conversions to
    # seg2 = second segmentation option to try

def rename_image(image_dir, file_end, seg1, d, seg2):
    # clear the label list to ensure the correct numbering happens
    label_list = []

    for filename in sorted(os.listdir(image_dir)):
        if filename.endswith(".tif"):
            image_path = os.path.join(image_dir, filename)
            # Extract text from the image
            label = extract_text_from_image(image_path, seg1, "none")

            # if it was able to read the ID, finish it
            if label.startswith("F2"):
                label_list.append(label)
                # Define your renaming logic here
                new_filename = f"{label}_{label_list.count(label)}_{file_end}.tif"
                #output conversion to a dataframe
                d.append((new_filename, filename, 'No_greyscale', seg1))
                # Rename the file
                os.rename(image_path, os.path.join(image_dir, new_filename))

                #print(label + str(label_list.count(label)) + "\t" + filename)


            # if it was not able to read the ID, move on to trying to convert the image to greyscale
            else: 
                label_LA = extract_text_from_image(image_path, seg1, "LA")
                if label_LA.startswith("F2"):
                    label_list.append(label_LA)
                    # Define your renaming logic here
                    new_filename = f"{label_LA}_{label_list.count(label_LA)}_{file_end}.tif"
                    #output conversion to a dataframe
                    d.append((new_filename, filename, 'LA', seg1))
                    # Rename the file
                    os.rename(image_path, os.path.join(image_dir, new_filename))

                    #print("LA" + label_LA + str(label_list.count(label_LA)) + "\t" + filename)

                # If the LA greyscale convert didn't work, try the other one
                else:
                    label_L = extract_text_from_image(image_path, seg1, "L")
                    if label_L.startswith("F2"):
                        label_list.append(label_L)
                        # Define your renaming logic here
                        new_filename = f"{label_L}_{label_list.count(label_L)}_{file_end}.tif"
                        #output conversion to a dataframe
                        d.append((new_filename, filename, 'L', seg1))
                        # Rename the file
                        os.rename(image_path, os.path.join(image_dir, new_filename))

                        #print("L" + label_L + str(label_list.count(label_L)) + "\t" + filename)

                    else:
                        # Extract text from the image
                        label = extract_text_from_image(image_path, seg2, "none")

                        # if it was able to read the ID, finish it
                        if label.startswith("F2"):
                            label_list.append(label)
                            # Define your renaming logic here
                            new_filename = f"{label}_{label_list.count(label)}_{file_end}.tif"
                            #output conversion to a dataframe
                            d.append((new_filename, filename, 'No_greyscale', seg2))
                            # Rename the file
                            os.rename(image_path, os.path.join(image_dir, new_filename))

                            #print(label + str(label_list.count(label)) + "\t" + filename)


                        # if it was not able to read the ID, move on to trying to convert the image to greyscale
                        else: 
                            label_LA = extract_text_from_image(image_path, seg2, "LA")
                            if label_LA.startswith("F2"):
                                label_list.append(label_LA)
                                # Define your renaming logic here
                                new_filename = f"{label_LA}_{label_list.count(label_LA)}_{file_end}.tif"
                                #output conversion to a dataframe
                                d.append((new_filename, filename, 'LA', seg2))
                                # Rename the file
                                os.rename(image_path, os.path.join(image_dir, new_filename))

                                #print("LA" + label_LA + str(label_list.count(label_LA)) + "\t" + filename)


                            # If the LA greyscale convert didn't work, try the other one
                            else:
                                label_L = extract_text_from_image(image_path, seg2, "L")
                                if label_L.startswith("F2"):
                                    label_list.append(label_L)
                                    # Define your renaming logic here
                                    new_filename = f"{label_L}_{label_list.count(label_L)}_{file_end}.tif"
                                    #output conversion to a dataframe
                                    d.append((new_filename, filename, 'L', seg2))
                                    # Rename the file
                                    os.rename(image_path, os.path.join(image_dir, new_filename))

                                    #print("L" + label_L + str(label_list.count(label_L)) + "\t" + filename)


                            # if none of that worked, name the file unknown.
                                else:
                                    label_list.append("unknown")
                                    # Define your renaming logic here
                                    new_filename = f"unknown_{label_list.count(label_L)}_{file_end}.tif"
                                    #output conversion to a dataframe
                                    d.append((new_filename, filename, 'L', seg2))
                                    # Rename the file
                                    os.rename(image_path, os.path.join(image_dir, new_filename))

                                    #print("unknown" + "\t" + filename)

In [123]:
d = []

rename_image(image_dir = "pictures/all_tif/1.full_inflorescence/", 
             file_end = "full",
             seg1 = 1,
             d = d,
             seg2 = 12)

rename_image(image_dir = "pictures/all_tif/2.four_bracts/", 
             file_end = "four",
             seg1 = 1,
             d = d,
             seg2 = 12)

rename_image(image_dir = "pictures/all_tif/3.one_bract/", 
             file_end = "one",
             seg1 = 1,
             d = d,
             seg2 = 12)

In [126]:
conversion = pd.DataFrame(d, columns=('New_Name', 'Old_Name', 'Grey_scale', 'segmentation'))

In [None]:
# keep a record of original names and what it was converted to
conversion.to_excel("pictures/all_tif/rename_conversions.xlsx")

In [157]:
#Now that the pictures are renamed, I want to make sure that we have all the same names for each group of pictures

full_names = os.listdir("pictures/all_tif/1.full_inflorescence/")

full_names_short = []

for i in full_names:
    new_name = re.sub('_full.tiff', '', i)
    full_names_short.append(new_name)
    
    
#full_names_short

In [143]:
four_names = os.listdir("pictures/all_tif/2.four_bracts/")

four_names_short = []

for i in four_names:
    new_name = re.sub('_four.tiff', '', i)
    four_names_short.append(new_name)
    
#four_names_short

In [145]:
one_names = os.listdir("pictures/all_tif/3.one_bract/")

one_names_short = []

for i in one_names:
    new_name = re.sub('_one.tiff', '', i)
    one_names_short.append(new_name)
    
#one_names_short

In [158]:
# convert to tuple and see what's different about them

full_names_set = set(full_names_short)
four_names_set = set(four_names_short)
one_names_set = set(one_names_short)

In [147]:
list(four_names_set - one_names_set)

['F2-06-248_1_four.tif']

In [149]:
list(one_names_set - four_names_set)

['F2-06-248_1_one.tif']

In [153]:
list(full_names_set - four_names_set)

['F2-06-248_1_full.tif']

In [154]:
list(full_names_set - one_names_set)

['F2-06-248_1_full.tif']

In [159]:
list(four_names_set - full_names_set)

['F2-06-225_3', 'F2-06-113_3', 'F2-06-248_1_four.tif', 'F2-06-104_3']

In [160]:
list(one_names_set - full_names_set)

['F2-06-225_3', 'F2-06-104_3', 'F2-06-113_3', 'F2-06-248_1_one.tif']

In [None]:
## 248 is just the the difference between .tiff and .tif, so that's not a big desl. 225, 113, and 104 all had one
# full inflorescence missing, so that's okay too. We're good to move on to image analysis now!!!
# first I'm going to replace .tiff with .tif though