In [1]:
from urllib.parse import parse_qs, urlparse

try:
    import Image
except ImportError:
    from PIL import Image
import pytesseract

import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import numpy as np

from bs4 import BeautifulSoup
import requests

%matplotlib inline

In [2]:
# Directory containing the input images, given relative to the notebook's
# working directory. The value already ends with a slash; keep that in mind
# when joining file names onto it.
IMAGES_PATH = '../../data/mosaic/'

_GOOGLE_SEARCH_URL = 'https://www.google.co.uk/search'
_MOSAIC_URL = 'https://mosaicscience.com/'
_REQUEST_TIMEOUT = 10  # seconds; without a timeout requests.get can block forever


def _first_google_result(query):
    """Search Google for `query` and return the first `<h3 class="r">` tag, or None."""
    response = requests.get(
        _GOOGLE_SEARCH_URL, params={'q': query}, timeout=_REQUEST_TIMEOUT
    )
    # The HTTP status is deliberately not checked: an error or captcha page
    # simply contains no matching <h3>, which callers treat as "no result".
    soup = BeautifulSoup(response.text, 'html.parser')
    return soup.find('h3', {'class': 'r'})


def _extract_result_url(result):
    """Return the target URL of a search-result tag, or None if it has none."""
    anchor = result.a if result is not None else None
    href = anchor.get('href') if anchor is not None else None
    if not href:
        return None

    parsed = urlparse(href)
    if parsed.path == '/url':
        # Redirect wrapper of the form "/url?q=<target>&sa=...": the target is
        # the percent-encoded `q` parameter, which parse_qs decodes.
        targets = parse_qs(parsed.query).get('q')
        if targets:
            return targets[0]
    if parsed.scheme in ('http', 'https'):
        # The result already links straight to the page.
        return href
    return None


def find_mosaic_url(img_path, verbose=False, rotation=-90):
    """Find the web page matching a photographed page via OCR and Google.

    The image is rotated, run through Tesseract, and the first non-empty
    paragraph of the recognised text is used as the title. The title is
    searched on Google restricted to the Mosaic site; if that yields no
    result, the full OCR text is searched without the site restriction.

    Parameters
    ----------
    img_path : str
        Path of the image file to read.
    verbose : bool, optional
        When true, print the OCR text, the extracted title and the resulting
        URL, and show the rotated image with matplotlib.
    rotation : int or float, optional
        Angle in degrees passed to ``Image.rotate`` before OCR. Defaults to
        -90, the value this function previously hard-coded.

    Returns
    -------
    str or None
        URL of the first search result, or None when OCR produced no text or
        neither search returned a usable result.

    Notes
    -----
    Errors from opening the image or from the HTTP request (including
    timeouts) are not caught and propagate to the caller.
    """
    # Close the underlying file as soon as the rotated copy exists.
    # NOTE(review): rotate() is called without expand=True, so the output keeps
    # the input's dimensions and a non-square image is cropped -- confirm this
    # is intended before changing it, since the labelled examples were run
    # with this behaviour.
    with Image.open(img_path) as source:
        img = source.rotate(rotation)

    ocr_text = pytesseract.image_to_string(img)
    # Use the first paragraph that actually contains text; OCR output may
    # start with blank paragraphs, which would otherwise give an empty title.
    paragraphs = (part.strip() for part in ocr_text.split('\n\n'))
    ocr_title = next((part for part in paragraphs if part), '')

    if verbose:
        print(f"OCR text: {ocr_text}")
        print(f"OCR title: {ocr_title}")
        plt.imshow(np.asarray(img))

    result_url = None
    # An empty title means the OCR text holds nothing but whitespace; searching
    # would only return an arbitrary page, so skip the network round trips.
    if ocr_title:
        first_result = _first_google_result(f"site:{_MOSAIC_URL} {ocr_title}")
        if first_result is None:
            # Fall back to an unrestricted search on the whole OCR text.
            first_result = _first_google_result(ocr_text)
        result_url = _extract_result_url(first_result)

    if verbose:
        print(result_url)

    return result_url

In [3]:
# Labelled examples for checking find_mosaic_url: each entry pairs an image
# file name (looked up under IMAGES_PATH) with the URL expected for it.
dataset = [
    (
        'IMG_3896.JPG',
        'https://mosaicscience.com/story/parkinsons-disease-psychosis',
    ),
    (
        'IMG_5346.JPG',
        'https://mosaicscience.com/story/experience-expertise-care-health-research',
    ),
    (
        'IMG_1473.JPG',
        'https://mosaicscience.com/story/developmental-delay-disorders-milestones-childhood-genetic',
    ),
    (
        'IMG_4463.JPG',
        'https://mosaicscience.com/story/why-good-people-turn-bad-online-science-trolls-abuse',
    ),
    (
        'IMG_4910.JPG',
        'https://mosaicscience.com/story/sick-building-syndrome-buildings-or-people',
    ),
    (
        'IMG_5888.JPG',
        'https://mosaicscience.com/story/deep-brain-stimulation-depression-clinical-trial',
    ),
    (
        'IMG_4199.JPG',
        'https://mosaicscience.com/story/anosognosia-assisted-outpatient-treatment-lauras-law',
    ),
    (
        'IMG_7357.JPG',
        'https://mosaicscience.com/story/gonorrhoea-gonorrhea-STI-Thailand-clap-USA-AMR-super',
    ),
    (
        'IMG_1663.JPG',
        'https://mosaicscience.com/story/violence-crime-knife-chicago-glasgow-gang-epidemic-gun-health-prevention',
    ),
]

# Run the lookup on every labelled image and print one row per image:
# file name, expected URL, retrieved URL, and whether the two are equal.
for img_name, mosaic_url in dataset:
    img_path = f"{IMAGES_PATH}/{img_name}"
    retrieved_mosaic_url = find_mosaic_url(img_path)

    success = retrieved_mosaic_url == mosaic_url
    print(img_name, mosaic_url, retrieved_mosaic_url, success)

IMG_3896.JPG https://mosaicscience.com/story/parkinsons-disease-psychosis https://mosaicscience.com/story/parkinsons-disease-psychosis True
IMG_5346.JPG https://mosaicscience.com/story/experience-expertise-care-health-research https://mosaicscience.com/story/experience-expertise-care-health-research True
IMG_1473.JPG https://mosaicscience.com/story/developmental-delay-disorders-milestones-childhood-genetic https://mosaicscience.com/story/developmental-delay-disorders-milestones-childhood-genetic True
IMG_4463.JPG https://mosaicscience.com/story/why-good-people-turn-bad-online-science-trolls-abuse https://mosaicscience.com/story/why-good-people-turn-bad-online-science-trolls-abuse True
IMG_4910.JPG https://mosaicscience.com/story/sick-building-syndrome-buildings-or-people https://mosaicscience.com/story/sick-building-syndrome-buildings-or-people True
IMG_5888.JPG https://mosaicscience.com/story/deep-brain-stimulation-depression-clinical-trial https://mosaicscience.com/story/deep-brain-s