# Crack captcha with OCR

## Download the captchas (run this only once)

```
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import time
import random
```

```
# Browser-like User-Agent header value sent with the HTTP requests in this notebook.
user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36"
```

```
# Fetch the captcha examples page. The timeout keeps a stalled connection from
# hanging the notebook, and raise_for_status() stops an HTTP error page from
# being parsed as if it were the real listing.
response = requests.get(
    "https://captcha.com/captcha-examples.html?cst=corg",
    headers={"USER-AGENT": user_agent},
    timeout=30,
)
response.raise_for_status()
```

```
# Parse the HTML of the fetched page using the lxml parser.
soup = BeautifulSoup(response.text,"lxml")
```

```
# Every sample image on the page carries the "captcha_sample" class.
base_url = "https://captcha.com/"
# Absolute URL of each sample: site root + the tag's "src" attribute.
image_urls = [base_url + tag["src"] for tag in soup.select(".captcha_sample")]
# The captcha type is the third space-separated word of the tag's "alt" text.
image_captcha_type = [
    tag["alt"].split(" ")[2] for tag in soup.select(".captcha_sample")
]
df = pd.DataFrame(
    {
        "captcha_type": image_captcha_type,
        "captcha_url": image_urls,
    }
)
# Placeholder columns, filled in later.
for empty_column in ("local_path", "text"):
    df[empty_column] = np.nan
```

```
# The "captchas" folder must already exist; open() below does not create it.
# Paths are collected in a list and assigned to the column after the loop,
# because the rows yielded by iterrows() are copies: writing to
# row["local_path"] would never reach df.
local_paths = []
for _, row in df.iterrows():
    response = requests.get(
        row["captcha_url"],
        headers={"USER-AGENT": user_agent},
        timeout=30,
    )
    # Do not save an HTTP error page under a .jpg name.
    response.raise_for_status()
    local_path = f"captchas/{row['captcha_type']}.jpg"
    with open(local_path, "wb") as f:
        f.write(response.content)
    local_paths.append(local_path)
df["local_path"] = local_paths
```

## Label them manually (by modifying the file name)

Rename each downloaded file to `<original file name>-<text>.jpg`, where `<text>` is the text shown in the captcha.

# Load all information

In [143]:
import os

In [144]:
# List the folder once so that all three columns are derived from the same
# file names in the same order.
captcha_files = os.listdir("captchas")
image_path = ["captchas/" + name for name in captcha_files]
images_df = pd.DataFrame({"image_path": image_path})
# File names are split as "<captcha type>-<text>.<extension>".
images_df["captcha_type"] = [name.split("-")[0] for name in captcha_files]
images_df["text"] = [name.split("-")[1].split(".")[0] for name in captcha_files]

# OCR

## No Pre Processing

### OCR - pytesseract

In [145]:
from PIL import Image
import pytesseract
# Tell pytesseract where the Tesseract executable lives. This is a hard-coded
# absolute path and has to be adjusted on any other machine.
pytesseract.pytesseract.tesseract_cmd = r'D:\Working_Space\Tesseract-OCR\tesseract'

In [146]:
# Baseline: Tesseract on the unmodified captcha images.
images_df["pytesseract_no"] = np.nan
for idx, record in images_df.iterrows():
    # OCR the image as-is and trim surrounding whitespace.
    recognised = pytesseract.image_to_string(Image.open(record["image_path"])).strip()
    # Score 1 for an exact match with the label, 0 otherwise.
    images_df.at[idx, "pytesseract_no"] = int(recognised == record["text"])

In [147]:
images_df["pytesseract_no"].mean()

0.0

All failed

### Google Cloud Vision API

In [148]:
import io
import os

# Imports the Google Cloud client library
from google.cloud import vision
from google.cloud.vision import types

# Instantiates a client, authenticated with the service-account key file at
# the relative path below (the file must exist in the working directory tree).
client = vision.ImageAnnotatorClient.from_service_account_json("credential/cloud-vision-key.json")

In [149]:
def detect_text(path):
    """Run Google Cloud Vision text detection on a single image file.

    Args:
        path: Path of the image file whose bytes are sent to the API.

    Returns:
        The whitespace-stripped ``description`` of the first text annotation,
        or ``""`` when the response contains no text annotations.

    Raises:
        RuntimeError: If the API response carries an error message, so that a
            failed request is not silently scored as a wrong OCR answer.
    """
    # Only the raw bytes are needed; the file is closed as soon as they are read.
    with open(path, "rb") as image_file:
        content = image_file.read()

    image = vision.types.Image(content=content)
    # Hint the recogniser towards English and Chinese text.
    image_context = vision.types.ImageContext(language_hints=["en", "zh"])

    response = client.text_detection(image=image, image_context=image_context)
    if response.error.message:
        raise RuntimeError(
            f"Cloud Vision request failed for {path}: {response.error.message}"
        )

    texts = response.text_annotations
    if len(texts) == 0:
        return ""
    return texts[0].description.strip()

In [150]:
# Baseline: Cloud Vision on the unmodified captcha images.
images_df["api_no"] = np.nan
for idx, record in images_df.iterrows():
    # Send the original file to the API and trim the answer.
    recognised = detect_text(record["image_path"]).strip()
    # Score 1 for an exact match with the label, 0 otherwise.
    images_df.at[idx, "api_no"] = int(recognised == record["text"])

In [151]:
images_df["api_no"].mean()

0.06666666666666667

About 7% Success Rate

## Preprocessing - gray, remove noise, Gaussian blur

In [152]:
import cv2

### OCR - pytesseract

In [153]:
# Tesseract after grayscale conversion, dilate/erode and Gaussian blur.
images_df["pytesseract_1"] = np.nan
for idx, record in images_df.iterrows():
    # Load the captcha and reduce it to a single grayscale channel.
    gray = cv2.cvtColor(cv2.imread(record["image_path"]), cv2.COLOR_BGR2GRAY)
    # Dilate, then erode, with a 1x1 kernel.
    # NOTE(review): with a 1x1 kernel these two steps presumably leave the
    # image unchanged - confirm whether a larger kernel was intended.
    unit_kernel = np.ones((1, 1), np.uint8)
    dilated = cv2.dilate(gray, unit_kernel, iterations=1)
    denoised = cv2.erode(dilated, unit_kernel, iterations=1)
    # Smooth the edges with a 5x5 Gaussian blur.
    smoothed = cv2.GaussianBlur(denoised, (5, 5), 0)
    # Save the result to a temporary JPEG and OCR that file.
    cv2.imwrite("temp.jpg", smoothed)
    recognised = pytesseract.image_to_string(Image.open("temp.jpg")).strip()
    # Score 1 for an exact match with the label, 0 otherwise.
    images_df.at[idx, "pytesseract_1"] = int(recognised == record["text"])

In [155]:
images_df["pytesseract_1"].mean()

0.0

All Failed

### Google Cloud Vision API

In [156]:
# Cloud Vision after grayscale conversion, dilate/erode and Gaussian blur.
images_df["api_1"] = np.nan
for idx, record in images_df.iterrows():
    # Load the captcha and reduce it to a single grayscale channel.
    gray = cv2.cvtColor(cv2.imread(record["image_path"]), cv2.COLOR_BGR2GRAY)
    # Dilate, then erode, with a 1x1 kernel.
    unit_kernel = np.ones((1, 1), np.uint8)
    dilated = cv2.dilate(gray, unit_kernel, iterations=1)
    denoised = cv2.erode(dilated, unit_kernel, iterations=1)
    # Smooth the edges with a 5x5 Gaussian blur.
    smoothed = cv2.GaussianBlur(denoised, (5, 5), 0)
    # Save the result to a temporary JPEG and send that file to the API.
    cv2.imwrite("temp.jpg", smoothed)
    recognised = detect_text("temp.jpg").strip()
    # Score 1 for an exact match with the label, 0 otherwise.
    images_df.at[idx, "api_1"] = int(recognised == record["text"])

In [159]:
images_df["api_1"].mean()

0.1

10% success rate

## Preprocessing - gray, remove noise, Gaussian blur, binary

### OCR - pytesseract

In [162]:
# Tesseract after grayscale, dilate/erode, Gaussian blur and Otsu binarization.
images_df["pytesseract_2"] = np.nan
for idx, record in images_df.iterrows():
    # Load the captcha and reduce it to a single grayscale channel.
    gray = cv2.cvtColor(cv2.imread(record["image_path"]), cv2.COLOR_BGR2GRAY)
    # Dilate, then erode, with a 1x1 kernel.
    unit_kernel = np.ones((1, 1), np.uint8)
    dilated = cv2.dilate(gray, unit_kernel, iterations=1)
    denoised = cv2.erode(dilated, unit_kernel, iterations=1)
    # Smooth the edges with a 5x5 Gaussian blur.
    smoothed = cv2.GaussianBlur(denoised, (5, 5), 0)
    # Binarize to pure black and white; the THRESH_OTSU flag is set, so the
    # threshold value 0 passed here is only a placeholder argument.
    _, binary = cv2.threshold(smoothed, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    # Save the result to a temporary JPEG and OCR that file.
    cv2.imwrite("temp.jpg", binary)
    recognised = pytesseract.image_to_string(Image.open("temp.jpg")).strip()
    # Score 1 for an exact match with the label, 0 otherwise.
    images_df.at[idx, "pytesseract_2"] = int(recognised == record["text"])

In [163]:
images_df["pytesseract_2"].mean()

0.016666666666666666

Success Rate 2%

### Google Cloud Vision API

In [164]:
# Cloud Vision after grayscale, dilate/erode, Gaussian blur and Otsu binarization.
images_df["api_2"] = np.nan
for idx, record in images_df.iterrows():
    # Load the captcha and reduce it to a single grayscale channel.
    gray = cv2.cvtColor(cv2.imread(record["image_path"]), cv2.COLOR_BGR2GRAY)
    # Dilate, then erode, with a 1x1 kernel.
    unit_kernel = np.ones((1, 1), np.uint8)
    dilated = cv2.dilate(gray, unit_kernel, iterations=1)
    denoised = cv2.erode(dilated, unit_kernel, iterations=1)
    # Smooth the edges with a 5x5 Gaussian blur.
    smoothed = cv2.GaussianBlur(denoised, (5, 5), 0)
    # Binarize to pure black and white using the Otsu flag.
    _, binary = cv2.threshold(smoothed, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    # Save the result to a temporary JPEG and send that file to the API.
    cv2.imwrite("temp.jpg", binary)
    recognised = detect_text("temp.jpg").strip()
    # Score 1 for an exact match with the label, 0 otherwise.
    images_df.at[idx, "api_2"] = int(recognised == record["text"])

In [165]:
images_df["api_2"].mean()

0.05

The success rate dropped to 5%. It seems that binarization doesn't work well with the Google Cloud Vision API.

Reference:
1. https://medium.freecodecamp.org/getting-started-with-tesseract-part-i-2a6a6b1cf75e
2. https://medium.freecodecamp.org/getting-started-with-tesseract-part-ii-f7f9a0899b3f

## Preprocessing - Median blurring

### OCR - pytesseract

In [166]:
# Tesseract after grayscale conversion, dilate/erode and median blur.
images_df["pytesseract_3"] = np.nan
for idx, record in images_df.iterrows():
    # Load the captcha and reduce it to a single grayscale channel.
    gray = cv2.cvtColor(cv2.imread(record["image_path"]), cv2.COLOR_BGR2GRAY)
    # Dilate, then erode, with a 1x1 kernel.
    unit_kernel = np.ones((1, 1), np.uint8)
    dilated = cv2.dilate(gray, unit_kernel, iterations=1)
    denoised = cv2.erode(dilated, unit_kernel, iterations=1)
    # Smooth with a median blur of aperture size 3.
    smoothed = cv2.medianBlur(denoised, 3)
    # Save the result to a temporary JPEG and OCR that file.
    cv2.imwrite("temp.jpg", smoothed)
    recognised = pytesseract.image_to_string(Image.open("temp.jpg")).strip()
    # Score 1 for an exact match with the label, 0 otherwise.
    images_df.at[idx, "pytesseract_3"] = int(recognised == record["text"])

In [169]:
images_df["pytesseract_3"].mean()

0.016666666666666666

2%

### Google Cloud Vision API

In [170]:
# Cloud Vision after grayscale, dilate/erode, median blur and Otsu binarization.
# NOTE(review): the pytesseract cell of this section stops after the blur and
# does not binarize - confirm whether the two cells are meant to differ.
images_df["api_3"] = np.nan
for idx, record in images_df.iterrows():
    # Load the captcha and reduce it to a single grayscale channel.
    gray = cv2.cvtColor(cv2.imread(record["image_path"]), cv2.COLOR_BGR2GRAY)
    # Dilate, then erode, with a 1x1 kernel.
    unit_kernel = np.ones((1, 1), np.uint8)
    dilated = cv2.dilate(gray, unit_kernel, iterations=1)
    denoised = cv2.erode(dilated, unit_kernel, iterations=1)
    # Smooth with a median blur of aperture size 3.
    smoothed = cv2.medianBlur(denoised, 3)
    # Binarize to pure black and white using the Otsu flag.
    _, binary = cv2.threshold(smoothed, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    # Save the result to a temporary JPEG and send that file to the API.
    cv2.imwrite("temp.jpg", binary)
    recognised = detect_text("temp.jpg").strip()
    # Score 1 for an exact match with the label, 0 otherwise.
    images_df.at[idx, "api_3"] = int(recognised == record["text"])

In [171]:
images_df["api_3"].mean()

0.1

10% - the same success rate as with Gaussian blurring.

## Preprocessing - Bilateral blurring

### OCR - pytesseract

In [173]:
# Tesseract after grayscale conversion, dilate/erode and bilateral filtering.
images_df["pytesseract_4"] = np.nan
for idx, record in images_df.iterrows():
    # Load the captcha and reduce it to a single grayscale channel.
    gray = cv2.cvtColor(cv2.imread(record["image_path"]), cv2.COLOR_BGR2GRAY)
    # Dilate, then erode, with a 1x1 kernel.
    unit_kernel = np.ones((1, 1), np.uint8)
    dilated = cv2.dilate(gray, unit_kernel, iterations=1)
    denoised = cv2.erode(dilated, unit_kernel, iterations=1)
    # Smooth with a bilateral filter (arguments 9, 75, 75).
    smoothed = cv2.bilateralFilter(denoised, 9, 75, 75)
    # Save the result to a temporary JPEG and OCR that file.
    cv2.imwrite("temp.jpg", smoothed)
    recognised = pytesseract.image_to_string(Image.open("temp.jpg")).strip()
    # Score 1 for an exact match with the label, 0 otherwise.
    images_df.at[idx, "pytesseract_4"] = int(recognised == record["text"])

In [175]:
images_df["pytesseract_4"].mean()

0.016666666666666666

### Google Cloud Vision API

In [176]:
# Cloud Vision after grayscale, dilate/erode, bilateral filter and Otsu binarization.
# NOTE(review): the pytesseract cell of this section stops after the filter
# and does not binarize - confirm whether the two cells are meant to differ.
images_df["api_4"] = np.nan
for idx, record in images_df.iterrows():
    # Load the captcha and reduce it to a single grayscale channel.
    gray = cv2.cvtColor(cv2.imread(record["image_path"]), cv2.COLOR_BGR2GRAY)
    # Dilate, then erode, with a 1x1 kernel.
    unit_kernel = np.ones((1, 1), np.uint8)
    dilated = cv2.dilate(gray, unit_kernel, iterations=1)
    denoised = cv2.erode(dilated, unit_kernel, iterations=1)
    # Smooth with a bilateral filter (arguments 9, 75, 75).
    smoothed = cv2.bilateralFilter(denoised, 9, 75, 75)
    # Binarize to pure black and white using the Otsu flag.
    _, binary = cv2.threshold(smoothed, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    # Save the result to a temporary JPEG and send that file to the API.
    cv2.imwrite("temp.jpg", binary)
    recognised = detect_text("temp.jpg").strip()
    # Score 1 for an exact match with the label, 0 otherwise.
    images_df.at[idx, "api_4"] = int(recognised == record["text"])

In [177]:
images_df["api_4"].mean()

0.1

##  Adaptive - Simple Thresholds

### OCR - pytesseract

In [183]:
# Tesseract after grayscale, dilate/erode, bilateral filter and adaptive thresholding.
images_df["pytesseract_5"] = np.nan
for idx, record in images_df.iterrows():
    # Load the captcha and reduce it to a single grayscale channel.
    gray = cv2.cvtColor(cv2.imread(record["image_path"]), cv2.COLOR_BGR2GRAY)
    # Dilate, then erode, with a 1x1 kernel.
    unit_kernel = np.ones((1, 1), np.uint8)
    dilated = cv2.dilate(gray, unit_kernel, iterations=1)
    denoised = cv2.erode(dilated, unit_kernel, iterations=1)
    # Smooth with a bilateral filter (arguments 9, 75, 75).
    smoothed = cv2.bilateralFilter(denoised, 9, 75, 75)
    # Binarize with a Gaussian adaptive threshold (block size 31, constant 2).
    binary = cv2.adaptiveThreshold(
        smoothed,
        255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY,
        31,
        2,
    )
    # Save the result to a temporary JPEG and OCR that file.
    cv2.imwrite("temp.jpg", binary)
    recognised = pytesseract.image_to_string(Image.open("temp.jpg")).strip()
    # Score 1 for an exact match with the label, 0 otherwise.
    images_df.at[idx, "pytesseract_5"] = int(recognised == record["text"])

In [184]:
images_df["pytesseract_5"].mean()

0.0

### Google Cloud Vision API

In [185]:
# Cloud Vision after grayscale, dilate/erode, bilateral filter and adaptive thresholding.
images_df["api_5"] = np.nan
for idx, record in images_df.iterrows():
    # Load the captcha and reduce it to a single grayscale channel.
    gray = cv2.cvtColor(cv2.imread(record["image_path"]), cv2.COLOR_BGR2GRAY)
    # Dilate, then erode, with a 1x1 kernel.
    unit_kernel = np.ones((1, 1), np.uint8)
    dilated = cv2.dilate(gray, unit_kernel, iterations=1)
    denoised = cv2.erode(dilated, unit_kernel, iterations=1)
    # Smooth with a bilateral filter (arguments 9, 75, 75).
    smoothed = cv2.bilateralFilter(denoised, 9, 75, 75)
    # Binarize with a Gaussian adaptive threshold (block size 31, constant 2).
    binary = cv2.adaptiveThreshold(
        smoothed,
        255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY,
        31,
        2,
    )
    # Save the result to a temporary JPEG and send that file to the API.
    cv2.imwrite("temp.jpg", binary)
    recognised = detect_text("temp.jpg").strip()
    # Score 1 for an exact match with the label, 0 otherwise.
    images_df.at[idx, "api_5"] = int(recognised == record["text"])

In [186]:
images_df["api_5"].mean()

0.03333333333333333

## Conclusion:

It's hard to recognize captchas with OCR or text recognition services, since captchas are designed not to be recognizable by machines. The highest success rate is 10%, using the Google Cloud Vision (text recognition) service.

However, if we really need to solve a particular kind of captcha, we can use captcha solving services (people manually solving captchas for us) or machine learning after we collect enough samples.