In [1]:
import json
import re
from google.cloud import vision
from google.cloud import storage

In [13]:
import os

# Expose the Google credentials file through the
# GOOGLE_APPLICATION_CREDENTIALS environment variable.
os.environ.update(GOOGLE_APPLICATION_CREDENTIALS="infared-81804c88c517.json")

In [14]:
def async_detect_document(gcs_source_uri, gcs_destination_uri,
                          mime_type='application/pdf', batch_size=2,
                          timeout=420):
    """OCR with PDF/TIFF as source files on GCS.

    Submits an asynchronous DOCUMENT_TEXT_DETECTION request for the file at
    ``gcs_source_uri``, waits for it to finish, lists the JSON result files
    written under ``gcs_destination_uri`` and prints the full text of the
    first page found in the first result file.

    Args:
        gcs_source_uri: ``gs://`` URI of the input document.
        gcs_destination_uri: ``gs://bucket/prefix`` URI under which the
            result files are written; it is also used as the listing prefix.
        mime_type: Input type. Supported mime_types are 'application/pdf'
            and 'image/tiff'.
        batch_size: How many pages should be grouped into each json output
            file.
        timeout: Seconds passed to ``operation.result()`` while waiting for
            the operation to finish.

    Raises:
        ValueError: If ``gcs_destination_uri`` is not of the form
            ``gs://<bucket>/<prefix>``.
        RuntimeError: If no ``.json`` result file exists under the
            destination prefix once the operation has finished.
    """
    # Validate the destination first: a malformed URI should fail here, not
    # after the (slow) OCR operation has already been submitted and awaited.
    match = re.match(r'gs://([^/]+)/(.+)', gcs_destination_uri)
    if match is None:
        raise ValueError(
            "gcs_destination_uri must look like 'gs://<bucket>/<prefix>', "
            "got {!r}".format(gcs_destination_uri))
    bucket_name, prefix = match.groups()

    client = vision.ImageAnnotatorClient()

    feature = vision.Feature(
        type_=vision.Feature.Type.DOCUMENT_TEXT_DETECTION)

    gcs_source = vision.GcsSource(uri=gcs_source_uri)
    input_config = vision.InputConfig(
        gcs_source=gcs_source, mime_type=mime_type)

    gcs_destination = vision.GcsDestination(uri=gcs_destination_uri)
    output_config = vision.OutputConfig(
        gcs_destination=gcs_destination, batch_size=batch_size)

    async_request = vision.AsyncAnnotateFileRequest(
        features=[feature], input_config=input_config,
        output_config=output_config)

    operation = client.async_batch_annotate_files(
        requests=[async_request])

    print('Waiting for the operation to finish.')
    operation.result(timeout=timeout)

    # Once the request has completed and the output has been
    # written to GCS, we can list all the output files.
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_name)

    # The prefix listing can also return unrelated objects that merely share
    # the prefix (for example the source document itself), so keep only the
    # JSON result files.
    blob_list = [blob for blob in bucket.list_blobs(prefix=prefix)
                 if blob.name.endswith('.json')]
    if not blob_list:
        raise RuntimeError(
            'No OCR output files found under {}'.format(gcs_destination_uri))

    print('Output files:')
    for blob in blob_list:
        print(blob.name)

    # Process the first output file from GCS. It holds the responses for the
    # first `batch_size` pages of the input file.
    output = blob_list[0]

    json_string = output.download_as_string()
    response = json.loads(json_string)

    # The actual response for the first page of the input file. A page
    # without detected text carries no 'fullTextAnnotation', so fall back to
    # empty values instead of raising KeyError.
    responses = response.get('responses', [])
    first_page_response = responses[0] if responses else {}
    annotation = first_page_response.get('fullTextAnnotation', {})

    # Here we print the full text from the first page.
    # The response contains more information:
    # annotation/pages/blocks/paragraphs/words/symbols
    # including confidence scores and bounding boxes
    print('Full text:\n')
    print(annotation.get('text', ''))

In [15]:
# Shared Cloud Storage client, created once at module level; Bucket.__init__
# calls get_bucket() on it.
storage_client = storage.Client()

class Bucket:
    """Small convenience wrapper around one Cloud Storage bucket.

    Args:
        bucket: Name of the bucket to open.
        client: Optional storage client exposing ``get_bucket``. When it is
            omitted, the module-level ``storage_client`` is used.
    """

    def __init__(self, bucket, client=None):
        self.bucket_name = bucket
        # Look up the global only when no client was injected, so the class
        # can be used (and tested) without the module-level client.
        if client is None:
            client = storage_client
        self.bucket = client.get_bucket(bucket)

    def objects_list(self):
        """Print the names of all objects in the bucket and return them.

        Returns:
            list: The object names, in the order the listing yields them.
        """
        object_files = [blob.name for blob in self.bucket.list_blobs()]
        print(object_files)
        return object_files

    def upload(self, upload_file, make_public=True):
        """Upload a local file to the bucket under its base file name.

        Args:
            upload_file: Path of the local file to upload.
            make_public: When true (the default, matching the previous
                behaviour) the uploaded object is made public and its public
                URL is printed. Pass ``False`` to keep the object private.

        Returns:
            str: The ``gs://<bucket>/<filename>`` URI of the uploaded object.
        """
        # basename() handles the platform's path separators, unlike a manual
        # split on "/".
        filename = os.path.basename(upload_file)
        create_blob = self.bucket.blob(filename)
        with open(upload_file, "rb") as f:
            create_blob.upload_from_file(f)

        # The local file is closed at this point; the remaining steps only
        # touch the remote object.
        gcs_uri = "gs://{0}/{1}".format(self.bucket_name, filename)
        if make_public:
            print("-----   upload finished and make object to public and public_url  __________")
            create_blob.make_public()
            print("Object Public Url", create_blob.public_url)
        print("File Meta Details", create_blob.self_link)
        print("Bucket URL  {0}".format(gcs_uri))

        return gcs_uri

In [16]:
# Open the bucket, show what it already contains, then upload the invoice.
# The value of the cell is the gs:// URI returned by upload().
bucket_name = "anvibucket"
upload_file = './files/Invoice_607940909.pdf'

b = Bucket(bucket=bucket_name)
b.objects_list()
b.upload(upload_file=upload_file)

['./files/Invoice_607940909.pdf', '/MY.PNG', 'Invoice_607940909.pdf', 'MY.PNG', 'SUNIL_RESUME.pdf', 'SUNIL_RESUME.pdfoutput-1-to-2.json', 'arun.PNG', 'download2.JPG', 'saini.docx']
-----   upload finished and make object to public and public_url  __________
Object Public Url https://storage.googleapis.com/anvibucket/Invoice_607940909.pdf
File Meta Details https://www.googleapis.com/storage/v1/b/anvibucket/o/Invoice_607940909.pdf
Bucket URL  gs://anvibucket/Invoice_607940909.pdf


'gs://anvibucket/Invoice_607940909.pdf'

In [17]:
# Run OCR on the uploaded invoice; result files are written under the
# destination URI, which acts as a name prefix.
async_detect_document(
    gcs_source_uri='gs://anvibucket/Invoice_607940909.pdf',
    gcs_destination_uri='gs://anvibucket/Invoice_607940909.text',
)

Waiting for the operation to finish.
Output files:
Invoice_607940909.textoutput-1-to-2.json
Full text:

aws
Account number:
360575614141
Amazon Web Services Statement
Email or talk to us about your AWS account or bill, visit aws.amazon.com/contact-us/
Statement Summary
Statement Number:
607940909
Statement Date:
December 3, 2020
TOTAL AMOUNT DUE
INR 904.86
Bill to Address:
ATTN: Anirudh Loya
103, Surabhi Shraddha, Ahuja Estates
Hyderabad, Telangana, 500001, IN
This Account Summary is for the billing period November 1 - November 30, 2020
Greetings from Amazon Internet Services Private Ltd. We're writing to provide you with an account summary of your use of AWS services. Additional
information about your bill, individual service charge details, and your account history are available on the Account Activity Page.
Summary
AWS Service Charges
Charges
$12.22
$10.36
Credits
$0.00
GST
$1.86
Total for this statement in USD
$12.22
Total for this statement (1 USD = 74.04750000 INR)"
INR 904.86
1
