In [1]:
import os, io
import json
import re
from google.cloud import vision
from google.cloud import storage

In [2]:
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = "infared-81804c88c517.json"

In [3]:
def async_detect_document(gcs_source_uri, gcs_destination_uri):
    
    mime_type = 'application/pdf'
    batch_size = 2
    
    client = vision.ImageAnnotatorClient()
    
    feature = vision.Feature(
        type_=vision.Feature.Type.DOCUMENT_TEXT_DETECTION)

    gcs_source = vision.GcsSource(uri=gcs_source_uri)
    input_config = vision.InputConfig(
        gcs_source=gcs_source, mime_type=mime_type)

    gcs_destination = vision.GcsDestination(uri=gcs_destination_uri)
    output_config = vision.OutputConfig(
        gcs_destination=gcs_destination, batch_size=batch_size)

    async_request = vision.AsyncAnnotateFileRequest(
        features=[feature], input_config=input_config,
        output_config=output_config)

    operation = client.async_batch_annotate_files(
        requests=[async_request])

    print('Waiting for the operation to finish.')
    operation.result(timeout=420)

    # Once the request has completed and the output has been
    # written to GCS, we can list all the output files.
    storage_client = storage.Client()
    
    match = re.match(r'gs://([^/]+)/(.+)', gcs_destination_uri)
    bucket_name = match.group(1)
    prefix = match.group(2)

    bucket = storage_client.get_bucket(bucket_name)

    # List objects with the given prefix.
    blob_list = list(bucket.list_blobs(prefix=prefix))
    print('Output files:')
    for blob in blob_list:
        blob.make_public()
        print(blob.name)

    # Process the first output file from GCS.
    # Since we specified batch_size=2, the first response contains
    # the first two pages of the input file.
    output = blob_list[0]

    json_string = output.download_as_string()
    response = json.loads(json_string)

    # The actual response for the first page of the input file.
    first_page_response = response['responses'][0]
    annotation = first_page_response['fullTextAnnotation']

    # Here we print the full text from the first page.
    # The response contains more information:
    # annotation/pages/blocks/paragraphs/words/symbols
    # including confidence scores and bounding boxes
    print('Full text:\n')
    print(annotation['text'])  
    
    first_page_response = response['responses'][1]
    annotation = first_page_response['fullTextAnnotation']
    print(annotation['text'])
    
    

In [4]:
storage_client = storage.Client()

class Bucket:
    
    def __init__(self, bucket):        
        self.bucket_name = bucket
        self.bucket = storage_client.get_bucket(bucket)
    
    def objects_list(self):
        
        object_files = [ i.name for i in list(self.bucket.list_blobs())]
        print(object_files)
    
    def upload(self, upload_file):
        
        filename = '{}'.format(str(upload_file.split("/")[-1]))
        create_blob = self.bucket.blob(filename)
        with open(upload_file, "rb") as f:
            create_blob.upload_from_file(f)
            
            print("-----   upload finished and make object to public and public_url  __________")
            create_blob.make_public()
            print("Object Public Url", create_blob.public_url)
            print("File Meta Details", create_blob.self_link)
            print("Bucket URL  gs://{0}/{1}".format(self.bucket_name, filename))
        
        return "gs://{0}/{1}".format(self.bucket_name, filename)

In [5]:
bucket_name = "anvibucket"
b = Bucket(bucket_name)
b.objects_list()

upload_file = './extec21.pdf'
b.upload(upload_file)

['Invoice_607940909.pdf', 'Invoice_607940909.textoutput-1-to-2.json']
-----   upload finished and make object to public and public_url  __________
Object Public Url https://storage.googleapis.com/anvibucket/extec21.pdf
File Meta Details https://www.googleapis.com/storage/v1/b/anvibucket/o/extec21.pdf
Bucket URL  gs://anvibucket/extec21.pdf


'gs://anvibucket/extec21.pdf'

In [6]:

async_detect_document('gs://anvibucket/extec21.pdf', 'gs://anvibucket/extec21.Text')

Waiting for the operation to finish.
Output files:
extec21.Textoutput-1-to-2.json
Full text:

Jonathan Phillips, PMP
Cell 206.670.2355
jphillips103@earthlink.net
244 Blanchard Street
Seattle, WA 98102
Senior Manager, Project/Program
More than 10 years of experience applying people, process, problem-solving, and technical
skills to improve individual, team and organizational performance. Designed, developed, and
implemented technical infrastructure that led to organization being twice distinguished among
the top 100 Managed Service Providers (MSPs) in the world. Personally recognized as one of
the top 250 people in the MSP community by MSPmentor. An impassioned team leader who
mentors with purpose and understands that strong working relationships create great teams
and produce exceptional results.
Strengths
Contingency Planner... Project Evangelist & Team Builder... Research & Strategic Analysis
Provide Structure, Direction & Vision... Problem Solver... Customer Facing...
"Jonathan is a

In [8]:
# import json

# # Instantiate a Google Cloud Storage client and specify required bucket and file

# blob = bucket.blob('https://storage.googleapis.com/anvibucket/Invoice_607940909.textoutput-1-to-2.json')

# # Download the contents of the blob as a string and then parse it using json.loads() method
# data = json.loads(blob.download_as_string(client=None))

In [24]:
# Reading gcs files with gcsfs
import gcsfs
import json

gcs_file_system = gcsfs.GCSFileSystem(project="infared")
gcs_json_path = "gs://anvibucket/Invoice_607940909.textoutput-1-to-2.json"
with gcs_file_system.open(gcs_json_path) as f:
  json_dict = json.load(f)
  #print(json_dict)
    
outFile = open('output.txt', 'w')
for element in json_dict:
    tempList = json_dict
    for dict in tempList:
        print(dict['string'])
        outFile.write(dict['string']+'\n')

outFile.close()

TypeError: string indices must be integers

In [7]:
import requests
import json
 
r = requests.get("https://storage.googleapis.com/anvibucket/extec21.Textoutput-1-to-2.json")
res = r.json()
 
# Extract specific node content.
#print(res)
 
# Dump data as string
data = json.dumps(res)
print(data)

{"inputConfig": {"gcsSource": {"uri": "gs://anvibucket/extec21.pdf"}, "mimeType": "application/pdf"}, "responses": [{"fullTextAnnotation": {"pages": [{"property": {"detectedLanguages": [{"languageCode": "en", "confidence": 0.98}]}, "width": 612, "height": 792, "blocks": [{"property": {"detectedLanguages": [{"languageCode": "en", "confidence": 1}]}, "boundingBox": {"normalizedVertices": [{"x": 0.374183, "y": 0.065656565}, {"x": 0.622549, "y": 0.065656565}, {"x": 0.622549, "y": 0.08207071}, {"x": 0.374183, "y": 0.08207071}]}, "paragraphs": [{"property": {"detectedLanguages": [{"languageCode": "en", "confidence": 1}]}, "boundingBox": {"normalizedVertices": [{"x": 0.374183, "y": 0.065656565}, {"x": 0.622549, "y": 0.065656565}, {"x": 0.622549, "y": 0.08207071}, {"x": 0.374183, "y": 0.08207071}]}, "words": [{"property": {"detectedLanguages": [{"languageCode": "en"}]}, "boundingBox": {"normalizedVertices": [{"x": 0.374183, "y": 0.065656565}, {"x": 0.4738562, "y": 0.065656565}, {"x": 0.4738562

In [None]:
# import json
# file= open('abcd.txt','a')
# with open('universal-botanika-Agreement.Textoutput-1-to-2.json') as json_file:
#     data= json.load(json_file)
#     for item in data:
#         file.write(item)
#         print(item)
        

    

   
        
        
        
        


In [None]:
# file = open('new_json.txt', 'a+') 
# with open('SUNIL_RESUME.Textoutput-1-to-2.json') as json_file:
#     for line in json_file:                                 #Iterate Each Line
#         data= json.loads(line.strip())                     #Use json.loads 
#         for item in data:
#             file.write(item)
#             print(item)

In [None]:
# with open('SUNIL_RESUME.Textoutput-1-to-2.json', 'r') as f:
#     distros_dict = json.load(f)

# for distro in distros_dict:
#     print(distro)

In [None]:
# import json
# data = json.load(open('universal-botanika-Agreement.Textoutput-1-to-2.json'))

# for i in data['inputConfig']:
#     print(i)
# for j in data['responses']:
#     print(j)
# print(type(data))


In [None]:
# import urllib.request as request
# import json

# with request.urlopen('https://storage.googleapis.com/anvibucket/extec21.Textoutput-1-to-2.json') as response:
# #     source = response.read()
# #     data = json.loads(response)
#     print(response)



In [None]:
# import requests
# from requests.exceptions import HTTPError

# try:
#     response = requests.get('https://storage.googleapis.com/anvibucket/extec21.Textoutput-1-to-2.json')
#     response.raise_for_status()
#     # access JSOn content
#     jsonResponse = response.json()
#     print("Entire JSON response")
#     #print(jsonResponse)

# except HTTPError as http_err:
#     print(f'HTTP error occurred: {http_err}')
# except Exception as err:
#     print(f'Other error occurred: {err}')


In [None]:
# import requests
# r = requests.get('https://storage.googleapis.com/anvibucket/extec21.Textoutput-1-to-2.json')
# json_data = json.dumps(r)
# data = json.loads(json_data)

# with open('./xyz.txt','w') as fd:
#     fd.write(r.text)
# print("done")

In [5]:
import json 

# Opening JSON file 
with open('SUNIL_RESUME.Textoutput-1-to-2.json') as json_file: 
    data = json.load(json_file) 
    print("Type:", type(data)) 
    
    with open('sunil.txt', 'w') as f:
        for key, value in data.items():
            
            f.write(data.text)
    f.close()



Type: <class 'dict'>


AttributeError: 'dict' object has no attribute 'text'