In [1]:
import os, io
import json
import re
from google.cloud import vision
from google.cloud import storage

In [2]:
# Set GOOGLE_APPLICATION_CREDENTIALS (the variable the Google Cloud client
# libraries consult for Application Default Credentials) to the key file
# that is expected to sit next to this notebook.
os.environ.update({'GOOGLE_APPLICATION_CREDENTIALS': "infared-81804c88c517.json"})

In [3]:
def async_detect_document(gcs_source_uri, gcs_destination_uri,
                          mime_type='application/pdf', batch_size=2,
                          timeout=420):
    """OCR a document stored in GCS and print the text of its first page.

    Submits an asynchronous ``DOCUMENT_TEXT_DETECTION`` request to the Vision
    API, blocks until the operation finishes, lists every object found under
    the destination prefix, and prints the full text of the first page held
    in the first listed output file.

    Args:
        gcs_source_uri: ``gs://`` URI of the input document.
        gcs_destination_uri: ``gs://<bucket>/<prefix>`` URI under which the
            JSON output is written. A non-empty prefix is required.
        mime_type: MIME type of the input document.
        batch_size: Number of pages grouped into each output JSON file.
        timeout: Seconds to wait for the operation to complete.

    Returns:
        None. All results are printed.

    Raises:
        ValueError: If ``gcs_destination_uri`` does not have the form
            ``gs://<bucket>/<prefix>``.
        RuntimeError: If no output object exists under the destination
            prefix, or the first output file contains no responses.
    """
    # Validate the destination up front so a malformed URI fails immediately
    # rather than after the (up to `timeout` seconds long) OCR operation.
    match = re.match(r'gs://([^/]+)/(.+)', gcs_destination_uri)
    if match is None:
        raise ValueError(
            'gcs_destination_uri must look like gs://<bucket>/<prefix>, '
            'got {!r}'.format(gcs_destination_uri))
    bucket_name, prefix = match.groups()

    client = vision.ImageAnnotatorClient()

    feature = vision.Feature(
        type_=vision.Feature.Type.DOCUMENT_TEXT_DETECTION)

    gcs_source = vision.GcsSource(uri=gcs_source_uri)
    input_config = vision.InputConfig(
        gcs_source=gcs_source, mime_type=mime_type)

    gcs_destination = vision.GcsDestination(uri=gcs_destination_uri)
    output_config = vision.OutputConfig(
        gcs_destination=gcs_destination, batch_size=batch_size)

    async_request = vision.AsyncAnnotateFileRequest(
        features=[feature], input_config=input_config,
        output_config=output_config)

    operation = client.async_batch_annotate_files(
        requests=[async_request])

    print('Waiting for the operation to finish.')
    operation.result(timeout=timeout)

    # Once the request has completed and the output has been
    # written to GCS, we can list all the output files.
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_name)

    # Listing is by prefix only, so objects left over from earlier runs
    # that share the same prefix show up here as well.
    blob_list = list(bucket.list_blobs(prefix=prefix))
    print('Output files:')
    for blob in blob_list:
        print(blob.name)

    if not blob_list:
        raise RuntimeError(
            'No output files found under {}'.format(gcs_destination_uri))

    # Process the first output file from GCS. With the default batch_size=2
    # it holds the responses for the first two pages of the input file.
    output = blob_list[0]

    json_string = output.download_as_string()
    response = json.loads(json_string)

    responses = response.get('responses') or []
    if not responses:
        raise RuntimeError(
            'Output file {} contains no responses'.format(output.name))

    # The actual response for the first page of the input file.
    first_page_response = responses[0]
    # NOTE(review): presumably a page without detected text omits
    # 'fullTextAnnotation'; fall back to empty text instead of a KeyError.
    annotation = first_page_response.get('fullTextAnnotation', {})

    # Here we print the full text from the first page.
    # The response contains more information:
    # annotation/pages/blocks/paragraphs/words/symbols
    # including confidence scores and bounding boxes
    print('Full text:\n')
    print(annotation.get('text', ''))

In [4]:
# Module-level Cloud Storage client; Bucket.__init__ below uses it to look
# up the bucket by name.
storage_client = storage.Client()

class Bucket:
    """Small convenience wrapper around a single Cloud Storage bucket."""

    def __init__(self, bucket):
        """Look up ``bucket`` through the module-level ``storage_client``.

        Args:
            bucket: Name of the bucket to wrap.
        """
        # Kept separately so gs:// URIs can be built without another lookup.
        self.bucket_name = bucket
        self.bucket = storage_client.get_bucket(bucket)

    def objects_list(self):
        """Print the names of all objects in the bucket as one list."""
        object_files = [blob.name for blob in self.bucket.list_blobs()]
        print(object_files)

    def upload(self, upload_file, make_public=True):
        """Upload a local file to the bucket under its base file name.

        Args:
            upload_file: Path of the local file to upload.
            make_public: When true (the default, matching the previous
                behaviour) the uploaded object is made publicly readable
                and its public URL is printed. Pass ``False`` to keep the
                object private.

        Returns:
            The ``gs://<bucket>/<filename>`` URI of the uploaded object.
        """
        # basename also copes with Windows path separators, unlike a plain
        # split on "/".
        filename = os.path.basename(upload_file)
        create_blob = self.bucket.blob(filename)
        with open(upload_file, "rb") as f:
            create_blob.upload_from_file(f)

        # The local file is closed by now; everything below only talks to
        # the blob object.
        gcs_uri = "gs://{0}/{1}".format(self.bucket_name, filename)
        if make_public:
            print("-----   upload finished and make object to public and public_url  __________")
            create_blob.make_public()
            print("Object Public Url", create_blob.public_url)
        else:
            print("-----   upload finished (object left private)  __________")
        print("File Meta Details", create_blob.self_link)
        print("Bucket URL  {0}".format(gcs_uri))

        return gcs_uri

In [5]:
# Inspect the target bucket, then push the local agreement PDF into it.
upload_file = './universal-botanika-Agreement.pdf'
bucket_name = "anvibucket"

b = Bucket(bucket=bucket_name)
b.objects_list()

# Kept as the final expression so the returned gs:// URI is shown as the
# cell's output.
b.upload(upload_file=upload_file)

['2.Vehicle-Parking-S.S.-Enterprises.Textoutput-1-to-2.json', '2.Vehicle-Parking-S.S.-Enterprises.Textoutput-1-to-4.json', '2.Vehicle-Parking-S.S.-Enterprises.Textoutput-11-to-12.json', '2.Vehicle-Parking-S.S.-Enterprises.Textoutput-13-to-14.json', '2.Vehicle-Parking-S.S.-Enterprises.Textoutput-13-to-15.json', '2.Vehicle-Parking-S.S.-Enterprises.Textoutput-15-to-15.json', '2.Vehicle-Parking-S.S.-Enterprises.Textoutput-3-to-4.json', '2.Vehicle-Parking-S.S.-Enterprises.Textoutput-5-to-6.json', '2.Vehicle-Parking-S.S.-Enterprises.Textoutput-5-to-8.json', '2.Vehicle-Parking-S.S.-Enterprises.Textoutput-7-to-8.json', '2.Vehicle-Parking-S.S.-Enterprises.Textoutput-9-to-10.json', '2.Vehicle-Parking-S.S.-Enterprises.Textoutput-9-to-12.json', '2.Vehicle-Parking-S.S.-Enterprises.pdf', 'Invoice_607940909.pdf', 'Invoice_607940909.textoutput-1-to-2.json', 'SUNIL_RESUME.Textoutput-1-to-2.json', 'SUNIL_RESUME.pdf', 'West-Germany-Agreement.Textoutput-1-to-2.json', 'West-Germany-Agreement.Textoutput-1-t

'gs://anvibucket/universal-botanika-Agreement.pdf'

In [6]:
# Run OCR on the uploaded agreement; the JSON results are written under the
# ".Text" prefix in the same bucket.
async_detect_document(
    gcs_source_uri='gs://anvibucket/universal-botanika-Agreement.pdf',
    gcs_destination_uri='gs://anvibucket/universal-botanika-Agreement.Text',
)

Waiting for the operation to finish.
Output files:
universal-botanika-Agreement.Textoutput-1-to-2.json
universal-botanika-Agreement.Textoutput-1-to-4.json
universal-botanika-Agreement.Textoutput-3-to-4.json
Full text:

DATED THIS 15th DAY OF FEBRUARY, 2014
Between
UNIVERSAL REALTORS PVT LTD
(the "Employer”)
And
TEMPUS INFRA PROJECTS PVT LTD
(the “Contractor")
SUPPLEMENTAL AGREEMENT
FOR SHORT CLOSURE OF
CONTRACT



In [23]:
import json

# Copy the locally saved OCR result into a plain-text file.
# json.dumps has to be given the parsed content of the file; handing it the
# filename itself would write nothing but the quoted filename to abcd.txt.
json_data = "universal-botanika-Agreement.Textoutput-1-to-2.json"

# Read explicitly as UTF-8 so the result does not depend on the platform's
# default encoding (assumes the JSON file is UTF-8 encoded - TODO confirm).
with open(json_data, encoding='utf-8') as json_file:
    data = json.dumps(json.load(json_file))

# json.dumps escapes non-ASCII characters by default, so the text written
# here is plain ASCII.
with open('abcd.txt', 'w') as f:
    f.write(data)

   
        
        
        
        


In [8]:
# file = open('new_json.txt', 'a+') 
# with open('SUNIL_RESUME.Textoutput-1-to-2.json') as json_file:
#     for line in json_file:                                 #Iterate Each Line
#         data= json.loads(line.strip())                     #Use json.loads 
#         for item in data:
#             file.write(item)
#             print(item)

In [9]:
import json

# Load the locally saved OCR output. The context manager guarantees the file
# handle is closed, which a bare open() passed straight to json.load does not.
# Reading as UTF-8 avoids depending on the platform default encoding
# (assumes the JSON file is UTF-8 encoded - TODO confirm).
with open('universal-botanika-Agreement.Textoutput-1-to-2.json',
          encoding='utf-8') as json_file:
    data = json.load(json_file)

# Print the keys of the input configuration section.
for i in data['inputConfig']:
    print(i)
# Print every entry of the responses list in full (this output is large).
for j in data['responses']:
    print(j)
print(type(data))


gcsSource
mimeType
{'fullTextAnnotation': {'pages': [{'property': {'detectedLanguages': [{'languageCode': 'en', 'confidence': 0.83}, {'languageCode': 'la', 'confidence': 0.15}]}, 'width': 1264, 'height': 1752, 'blocks': [{'property': {'detectedLanguages': [{'languageCode': 'en', 'confidence': 1}]}, 'boundingBox': {'normalizedVertices': [{'x': 0.22943038, 'y': 0.14497717}, {'x': 0.76265824, 'y': 0.14497717}, {'x': 0.76265824, 'y': 0.1660959}, {'x': 0.22943038, 'y': 0.1660959}]}, 'paragraphs': [{'property': {'detectedLanguages': [{'languageCode': 'en', 'confidence': 1}]}, 'boundingBox': {'normalizedVertices': [{'x': 0.22943038, 'y': 0.14497717}, {'x': 0.76265824, 'y': 0.14497717}, {'x': 0.76265824, 'y': 0.1660959}, {'x': 0.22943038, 'y': 0.1660959}]}, 'words': [{'property': {'detectedLanguages': [{'languageCode': 'en'}]}, 'boundingBox': {'normalizedVertices': [{'x': 0.22943038, 'y': 0.14497717}, {'x': 0.31962025, 'y': 0.14497717}, {'x': 0.31962025, 'y': 0.1660959}, {'x': 0.22943038, 'y':