### Extract each character from the original captcha

- Data preprocessing with OpenCV
- Cut by contours and width (rule-based thresholds defined from the historical dataset)

In [4]:
import os
import os.path
import cv2
import glob
import imutils



### Extract letters from the image



In [6]:
## Preview cell: load the first captcha image found in the folder, convert it
## to grayscale and show the original in an OpenCV window until a key is pressed.
CAPTCHA_IMAGE_FOLDER = "image_label_file_amazon"

## glob.glob returns every path matching the pattern (an empty list when the
## folder is missing or empty).
captcha_image_files = glob.glob(os.path.join(CAPTCHA_IMAGE_FOLDER, "*"))

if not captcha_image_files:
    ## Indexing [0] on an empty list would raise IndexError; report it instead.
    print("no captcha images found in {}".format(CAPTCHA_IMAGE_FOLDER))
else:
    first_image_file = captcha_image_files[0]
    ## os.path.basename keeps only the file name part of the path
    filename = os.path.basename(first_image_file)
    image = cv2.imread(first_image_file)

    if image is None:
        ## cv2.imread returns None (it does not raise) when the file cannot
        ## be decoded, and cv2.cvtColor would then fail with a cryptic error.
        print("could not read image: {}".format(first_image_file))
    else:
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        cv2.imshow('image', image)
        ## Block until any key is pressed, then close the preview window
        cv2.waitKey(0)
        cv2.destroyAllWindows()



- `glob.glob` >> returns the paths that match a file pattern
- `os.path.basename` >> extracts the file name from a path

In [7]:
## Extraction cell: split every captcha image into single-letter images and
## save each letter under OUTPUT_FOLDER/<letter>/<running number>.png.
CAPTCHA_IMAGE_FOLDER = "image_label_file_amazon"
OUTPUT_FOLDER = "extracted_letter_images_amazon"

## Number of letter regions an image must yield to be saved
EXPECTED_LETTERS = 6

## (pieces, minimum contour width in pixels) rules, tried from the widest to
## the narrowest. Widths are integer pixel counts, so the strict "w > 68"
## test for three letters is written as ">= 69".
WIDTH_SPLIT_RULES = ((5, 125), (4, 95), (3, 69), (2, 38))


def split_letter_region(x, y, w, h, letters_left):
    """Split one contour bounding box into per-letter bounding boxes.

    A contour that is wide enough is treated as several conjoined letters
    and cut into equal-width pieces. A rule only applies while at least that
    many letters are still unassigned.

    Args:
        x, y: top-left corner of the bounding box.
        w, h: width and height of the bounding box.
        letters_left: how many letters are still unassigned in this image.

    Returns:
        A list of (x, y, w, h) tuples ordered left to right. It holds the
        box unchanged when no split rule applies.
    """
    for pieces, min_width in WIDTH_SPLIT_RULES:
        if w >= min_width and letters_left >= pieces:
            piece_width = w // pieces
            return [(x + k * piece_width, y, piece_width, h) for k in range(pieces)]
    return [(x, y, w, h)]


## Get a list of all captcha images
## glob.glob >> file pattern matching
captcha_image_files = glob.glob(os.path.join(CAPTCHA_IMAGE_FOLDER, "*"))

## Next running file number per letter (a letter not seen yet starts at 1)
counts = {}

## Loop over files
for (i, captcha_image_file) in enumerate(captcha_image_files):
    print("processing image {}/{}".format(i + 1, len(captcha_image_files)))

    ## The file name (without extension) is the captcha text
    filename = os.path.basename(captcha_image_file)
    captcha_correct_text = os.path.splitext(filename)[0]

    ## Load image and convert to grayscale
    image = cv2.imread(captcha_image_file)
    if image is None:
        ## cv2.imread returns None for files it cannot decode; the "*" pattern
        ## can match non-image files, so skip them instead of crashing.
        print("could not read image, skipped: {}".format(captcha_image_file))
        continue
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    ## Add some extra padding around the image (constant white border)
    gray = cv2.copyMakeBorder(gray, 20, 20, 20, 20, cv2.BORDER_CONSTANT, value=(255, 255, 255))

    ## Threshold the image (convert to black and white, inverted, Otsu level)
    ## https://blog.csdn.net/on2way/article/details/46812121
    thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU)[1]

    ## Find contours (findContours only accepts a single-channel image)
    ## RETR_EXTERNAL >> detect outer contours only
    found = cv2.findContours(thresh.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    ## OpenCV 2.x and 4.x return (contours, hierarchy) while OpenCV 3.x
    ## returns (image, contours, hierarchy). Selecting by tuple length works
    ## for all of them; a "not OpenCV 2" test picks the hierarchy on 4.x.
    contours = found[0] if len(found) == 2 else found[1]

    letter_image_regions = []

    ## Loop through each of the contours and extract the letters;
    ## contours that are too wide are cut into several letters.
    letter_left = EXPECTED_LETTERS
    for contour in contours:
        ## Get the rectangle that contains the contour
        (x, y, w, h) = cv2.boundingRect(contour)

        ## Contours 10 px wide or narrower are ignored (presumably noise specks)
        if w <= 10:
            continue

        regions = split_letter_region(x, y, w, h, letter_left)
        letter_image_regions.extend(regions)
        letter_left = letter_left - len(regions)

    if len(letter_image_regions) != EXPECTED_LETTERS:
        print("length not equal to six")
        print(captcha_correct_text)
        continue

    ## Order the regions left to right so they line up with the text
    letter_image_regions = sorted(letter_image_regions, key=lambda region: region[0])

    ## Save each letter as a single image
    for letter_bounding_box, letter_text in zip(letter_image_regions, captcha_correct_text):
        # Grab the coordinates of the letter in the image
        x, y, w, h = letter_bounding_box

        # Extract the letter from the padded grayscale image with a 2-pixel
        # margin around the edge
        letter_image = gray[y - 2:y + h + 2, x - 2:x + w + 2]

        # Get the folder to save the image in
        save_path = os.path.join(OUTPUT_FOLDER, letter_text)

        # If the output directory does not exist, create it
        if not os.path.exists(save_path):
            os.makedirs(save_path)

        # Write the letter image to a file named by its zero-padded number
        count = counts.get(letter_text, 1)
        p = os.path.join(save_path, "{}.png".format(str(count).zfill(6)))
        cv2.imwrite(p, letter_image)

        # Increment the count for the current key
        counts[letter_text] = count + 1


                
        
        
        

    
    
    

processing image 1/733
processing image 2/733
processing image 3/733
processing image 4/733
processing image 5/733
processing image 6/733
processing image 7/733
processing image 8/733
processing image 9/733
processing image 10/733
processing image 11/733
processing image 12/733
processing image 13/733
processing image 14/733
processing image 15/733
processing image 16/733
processing image 17/733
processing image 18/733
processing image 19/733
processing image 20/733
processing image 21/733
processing image 22/733
processing image 23/733
processing image 24/733
processing image 25/733
processing image 26/733
processing image 27/733
processing image 28/733
processing image 29/733
processing image 30/733
processing image 31/733
processing image 32/733
processing image 33/733
processing image 34/733
processing image 35/733
processing image 36/733
processing image 37/733
processing image 38/733
processing image 39/733
processing image 40/733
processing image 41/733
processing image 42/733
p