In [6]:
import requests
import fitz  # PyMuPDF

def download_and_read_pdf(file_id, destination):
    """Download a Google Drive file to disk and return the text of the saved PDF.

    Args:
        file_id: Google Drive file ID, sent as the ``id`` query parameter.
        destination: Local path the downloaded bytes are written to.

    Returns:
        Whatever ``read_pdf_text(destination)`` returns for the saved file.

    Raises:
        requests.HTTPError: If the final download request comes back with an
            error status. Nothing is written to ``destination`` in that case.
    """
    URL = "https://drive.google.com/uc?export=download"

    # Download part
    # The context manager releases the session's pooled connections even when
    # a request or the file write fails.
    with requests.Session() as session:
        response = session.get(URL, params={'id': file_id}, stream=True)
        token = get_confirm_token(response)
        if token:
            # A download_warning cookie came back: release the first (streamed,
            # unread) response and repeat the request with the cookie value
            # passed as ``confirm``.
            response.close()
            params = {'id': file_id, 'confirm': token}
            response = session.get(URL, params=params, stream=True)
        # Fail loudly instead of saving an HTTP error page as the "PDF".
        response.raise_for_status()
        save_response_content(response, destination)

    # Reading part
    return read_pdf_text(destination)

def get_confirm_token(response):
    """Return the value of the first ``download_warning*`` cookie, or None.

    Walks ``response.cookies`` in iteration order and stops at the first
    cookie whose name begins with ``download_warning``.
    """
    return next(
        (
            cookie_value
            for cookie_name, cookie_value in response.cookies.items()
            if cookie_name.startswith('download_warning')
        ),
        None,
    )

def save_response_content(response, destination):
    """Write the response body to ``destination`` in binary mode.

    The body is pulled through ``response.iter_content`` with a requested
    chunk size of 32768 bytes; empty chunks are skipped.
    """
    chunk_bytes = 32 * 1024
    with open(destination, "wb") as out_file:
        # filter(None, ...) drops the falsy (empty) chunks before writing.
        out_file.writelines(filter(None, response.iter_content(chunk_bytes)))

def read_pdf_text(path):
    """Return the text of every page of the PDF at ``path``, concatenated.

    The document is opened with ``fitz.open`` and closed when the ``with``
    block exits; page texts are joined in page order with no separator.
    """
    with fitz.open(path) as pdf:
        page_texts = [page.get_text() for page in pdf]
    return ''.join(page_texts)

In [16]:
def read_pdf_from_url(file_id):
    """Fetch a Google Drive file into memory and return the text of the PDF.

    Nothing is written to disk: the downloaded bytes are handed straight to
    ``read_pdf_text``.

    Args:
        file_id: Google Drive file ID, sent as the ``id`` query parameter.

    Returns:
        Whatever ``read_pdf_text`` returns for the downloaded bytes.

    Raises:
        requests.HTTPError: If the final download request comes back with an
            error status.
    """
    URL = "https://drive.google.com/uc?export=download"

    # The context manager releases the session's pooled connections even when
    # a request fails.
    with requests.Session() as session:
        # stream=True defers the body download, so the first response's body is
        # never fetched when a confirm token forces a second request.
        response = session.get(URL, params={'id': file_id}, stream=True)
        token = get_confirm_token(response)
        if token:
            # A download_warning cookie came back: release the unread response
            # and repeat the request with the cookie value passed as ``confirm``.
            response.close()
            params = {'id': file_id, 'confirm': token}
            response = session.get(URL, params=params, stream=True)

        # Fail loudly instead of handing an HTTP error page to the PDF parser.
        response.raise_for_status()

        # Read the whole body while the session (and its connection) is open.
        pdf_bytes = response.content

    # Directly use the response content with fitz.open (via read_pdf_text)
    return read_pdf_text(pdf_bytes)

def get_confirm_token(response):
    """Return the value of the first cookie named ``download_warning*``, else None.

    NOTE(review): this repeats the ``get_confirm_token`` defined in the first
    cell of the notebook; one of the two definitions can probably be dropped.
    """
    warning_values = [
        value
        for name, value in response.cookies.items()
        if name.startswith('download_warning')
    ]
    if warning_values:
        return warning_values[0]
    return None

def read_pdf_text(pdf_bytes):
    """Return the concatenated text of every page of a PDF.

    This definition replaces the earlier path-based ``read_pdf_text(path)``
    in the kernel, while ``download_and_read_pdf`` still calls it with a file
    path. Both input kinds are therefore accepted.

    Args:
        pdf_bytes: Either the raw PDF content (passed to ``fitz.open`` as
            ``stream`` with ``filetype="pdf"``) or a filesystem path
            (``str`` / ``os.PathLike``) of a PDF file to open.

    Returns:
        The page texts joined in page order with no separator.
    """
    import os  # local import: only needed for the path check below

    if isinstance(pdf_bytes, (str, os.PathLike)):
        # A path was given: let fitz open the file itself.
        document = fitz.open(pdf_bytes)
    else:
        # Directly pass the bytes to fitz.open
        document = fitz.open(stream=pdf_bytes, filetype="pdf")

    with document as doc:
        return ''.join(page.get_text() for page in doc)

In [17]:
# Fetch the sample PDF from Google Drive in memory and print the extracted text.
file_id = '1PZLdGojG8zeJGfh3WYGDJtaPSBK7dj3p' # Google Drive file ID of the PDF to read; replace with your own
destination = 'test_pdf.pdf' # local save path for download_and_read_pdf; not used by read_pdf_from_url in this cell
pdf_text = read_pdf_from_url(file_id)
print(pdf_text)

Centralized & standardized data 
governance and management
Statistical and actionable insights
Predictions about future outcomes
Monetize your data to drive higher 
revenue and C-SAT with our suite of 
Data & Analytics Solutions.
Coforge BPS’s suite of Data & Analytics Solutions can help organizations drive revenue, monetize data, reduce cost, 
avoid risk and enhance customer engagement with:
Data & Analytics Solutions
Data Science 
(AI & ML)
Data Analytics & 
Predictions
Data Management & 
Governance
Data Engineering & 
Technology Adoption
Our Data & Analytics Solutions incorporate:
Strong financial services domain 
expertise
Customized solutions for financial 
services industry
Support end to end data journey from 
data management to analytics
Customized delivery model for
optimal results
Business Value
Centralize and standardize data with Data Management 
& Governance
Obtain future insights with Data Analytics & Predictions
Get statistics and actionable insights to make 
key busines

In [20]:
from google.oauth2.service_account import Credentials
from googleapiclient.discovery import build
from googleapiclient.http import MediaIoBaseUpload
from io import BytesIO

# Upload the extracted PDF text to a Google Drive folder as a plain-text file,
# authenticating with a service account.

# Convert the string to bytes (str.encode() defaults to UTF-8) and wrap it in
# an in-memory file-like object for the upload helper.
file_stream = BytesIO(pdf_text.encode())

# Path to your service account key file (relative to the notebook's working
# directory).
# NOTE(review): keep this key file out of version control; consider reading
# the path from an environment variable.
SERVICE_ACCOUNT_FILE = 'bold-circuit-392015-118933d56a23.json'
# Drive scope requested for the service-account credentials.
SCOPES = ['https://www.googleapis.com/auth/drive']

credentials = Credentials.from_service_account_file(
        SERVICE_ACCOUNT_FILE, scopes=SCOPES)

# Client for the Drive v3 API using the credentials above.
service = build('drive', 'v3', credentials=credentials)

folder_id = '1ylDka56ebKOnGMgiB9UMBvB6UzHfHM6T'  # Replace with the actual folder ID

# Name of the new file and the folder it is created in.
file_metadata = {
    'name': 'test_pdf.txt',
    'parents': [folder_id]
}

# Create the media file upload object and specify the mimetype
media = MediaIoBaseUpload(file_stream, mimetype='text/plain', resumable=True)

# Upload the file; fields='id' asks the API to return only the new file's ID.
file = service.files().create(body=file_metadata, media_body=media, fields='id').execute()

print('File ID: %s' % file.get('id'))


File ID: 1SgZyQ0iJDOoDbo5zkjou_6YPJ8VGb8Gn


In [25]:
from database_utils import *
from openai_utils import get_embedding

def save_pdf_to_DB(file_name, id, text, topic='CoForge'):
    """Insert a record describing a PDF into the MongoDB collection.

    The stored document holds the file name, the given ``id``, a fixed
    ``'filetype'`` of ``'pdf'``, the result of ``get_embedding(text)`` and a
    topic label. The text itself is not stored.

    Args:
        file_name: Value stored under ``'name'``; ``get_pdf_contents_from_DB``
            looks records up by this field.
        id: Value stored under ``'id'``; ``get_pdf_contents_from_DB`` passes it
            to ``read_pdf_from_url``. The name shadows the builtin ``id`` and
            is kept so existing keyword callers keep working.
        text: Text handed to ``get_embedding``.
        topic: Value stored under ``'topic'``. Defaults to ``'CoForge'``, the
            value that was previously hard-coded.
    """
    # Only the collection is needed; the client and db handles are ignored.
    _client, _db, collection = connect_to_mongodb()
    record = {
        'name': file_name,
        'id': id,
        'filetype': 'pdf',
        'semantic_embedding': get_embedding(text),
        'topic': topic,
    }
    collection.insert_one(record)



def get_pdf_contents_from_DB(file_name):
    """Look up a PDF record by name and return the text of the referenced file.

    Finds one document whose ``'name'`` equals ``file_name`` and passes its
    ``'id'`` field to ``read_pdf_from_url``.

    Returns:
        The result of ``read_pdf_from_url`` for the stored id, or None when no
        matching document is found.
    """
    _client, _db, articles = connect_to_mongodb()
    record = articles.find_one({'name': file_name})
    return read_pdf_from_url(record['id']) if record else None


# Connectivity check. As the last expression in the cell, the returned
# (client, db, collection) tuple is echoed in the cell output.
# NOTE(review): the echoed repr includes the cluster host names; consider
# clearing this output before sharing the notebook.
connect_to_mongodb()

Pinged your deployment. You successfully connected to MongoDB!


(MongoClient(host=['ac-ujp5wzn-shard-00-02.h8gtitv.mongodb.net:27017', 'ac-ujp5wzn-shard-00-01.h8gtitv.mongodb.net:27017', 'ac-ujp5wzn-shard-00-00.h8gtitv.mongodb.net:27017'], document_class=dict, tz_aware=False, connect=True, authsource='admin', replicaset='atlas-pvy4a8-shard-0', ssl=True),
 Database(MongoClient(host=['ac-ujp5wzn-shard-00-02.h8gtitv.mongodb.net:27017', 'ac-ujp5wzn-shard-00-01.h8gtitv.mongodb.net:27017', 'ac-ujp5wzn-shard-00-00.h8gtitv.mongodb.net:27017'], document_class=dict, tz_aware=False, connect=True, authsource='admin', replicaset='atlas-pvy4a8-shard-0', ssl=True), 'news'),
 Collection(Database(MongoClient(host=['ac-ujp5wzn-shard-00-02.h8gtitv.mongodb.net:27017', 'ac-ujp5wzn-shard-00-01.h8gtitv.mongodb.net:27017', 'ac-ujp5wzn-shard-00-00.h8gtitv.mongodb.net:27017'], document_class=dict, tz_aware=False, connect=True, authsource='admin', replicaset='atlas-pvy4a8-shard-0', ssl=True), 'news'), 'articles'))

In [26]:
# Look the PDF up by name in MongoDB and re-extract its text from Drive.
# NOTE(review): the save below is commented out, so on a fresh run this lookup
# only finds a record if one was stored earlier; get_pdf_contents_from_DB
# returns None when no record matches.
#save_pdf_to_DB('test_pdf.pdf', file_id, pdf_text)#file.get('id'), pdf_text)
get_pdf_contents_from_DB('test_pdf.pdf')

Pinged your deployment. You successfully connected to MongoDB!


"Centralized & standardized data \ngovernance and management\nStatistical and actionable insights\nPredictions about future outcomes\nMonetize your data to drive higher \nrevenue and C-SAT with our suite of \nData & Analytics Solutions.\nCoforge BPS’s suite of Data & Analytics Solutions can help organizations drive revenue, monetize data, reduce cost, \navoid risk and enhance customer engagement with:\nData & Analytics Solutions\nData Science \n(AI & ML)\nData Analytics & \nPredictions\nData Management & \nGovernance\nData Engineering & \nTechnology Adoption\nOur Data & Analytics Solutions incorporate:\nStrong financial services domain \nexpertise\nCustomized solutions for financial \nservices industry\nSupport end to end data journey from \ndata management to analytics\nCustomized delivery model for\noptimal results\nBusiness Value\nCentralize and standardize data with Data Management \n& Governance\nObtain future insights with Data Analytics & Predictions\nGet statistics and actionab

In [15]:
# get_pdf_contents_from_DB takes only file_name in this notebook. This cell
# used to pass a second argument ('test_retrieve_text.txt'), which raises
# TypeError against that definition.
# NOTE(review): this cell's execution count predates the function's defining
# cell, so it presumably ran against an older version that took an output
# path. Confirm whether writing the text to a file is still wanted.
get_pdf_contents_from_DB('test_pdf.pdf')

Pinged your deployment. You successfully connected to MongoDB!


In [18]:
from database_utils import *
# Cleanup: delete the test record, then confirm that it is gone.
client, db, collection = connect_to_mongodb()
# The return value of delete_one is kept in `doc` but not used again in this cell.
doc = collection.delete_one({'name': 'test_pdf.pdf'})
# Look the same name up again; the output of this cell prints None.
doc2 = collection.find_one({'name': 'test_pdf.pdf'})
print(doc2)

Pinged your deployment. You successfully connected to MongoDB!
None
