In [1]:
from pymilvus import connections
import os 

from langchain.text_splitter import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain.vectorstores import Milvus

from dotenv import load_dotenv

from pymilvus import CollectionSchema, FieldSchema, DataType, connections, utility, Collection
from sentence_transformers import SentenceTransformer
import re
import PyPDF2
from fastcoref import spacy_component
import spacy 
import torch.nn.functional as F

from nltk.tokenize import sent_tokenize

from gensim.models import Word2Vec,KeyedVectors

In [2]:
# Set your Milvus server host and port
MILVUS_HOST = "localhost"
MILVUS_PORT = "19530"

# Connect to Milvus
connections.connect(host=MILVUS_HOST, port=MILVUS_PORT)
print('Connected to Milvus!')

Connected to Milvus!


In [9]:
def define_collection(collection_name):
    document_id = FieldSchema(name='document_id', dtype=DataType.INT64, is_primary=True, auto_id=True)
    metadata = FieldSchema(name='metadata', dtype=DataType.VARCHAR, max_length=15000)
    metadata_page = FieldSchema(name='metadata_page', dtype=DataType.INT64)    
    embeddings = FieldSchema(name='embeddings', dtype=DataType.FLOAT_VECTOR, dim=384)
    text = FieldSchema(name='text', dtype=DataType.VARCHAR, max_length=60000)
    schema = CollectionSchema(fields=[document_id, metadata, metadata_page, embeddings, text], enable_dynamic_field=True)
    
    if not utility.has_collection(collection_name):
        collection = Collection(name=collection_name, schema=schema, using='default')
        return 'Collection created!'
    else:
        collection = Collection(collection_name)
        return 'Collection already exists.'
    return collection


In [10]:
def fast_coref_arch():
    nlp = spacy.load("en_core_web_sm")
    nlp.add_pipe(
    "fastcoref",
    config={'model_architecture': 'LingMessCoref', 'model_path': 'biu-nlp/lingmess-coref', 'device': 'cpu'}
    )
    return None

In [12]:
define_collection("aa")

'Collection created!'

In [4]:
from pymilvus import connections
import os 

from pymilvus import CollectionSchema, FieldSchema, DataType, connections, utility, Collection
from sentence_transformers import SentenceTransformer
import re
import PyPDF2
import spacy 
import torch.nn.functional as F


nlp = spacy.load("en_core_web_sm")
model = SentenceTransformer("paraphrase-MiniLM-L6-v2")

# Set your Milvus server host and port
MILVUS_HOST = "localhost"
MILVUS_PORT = "19530"

# Connect to Milvus
connections.connect(host=MILVUS_HOST, port=MILVUS_PORT)
print('Connected to Milvus!')

def fast_coref_arch():
    nlp = spacy.load("en_core_web_sm")
    nlp.add_pipe(
    "fastcoref",
    config={'model_architecture': 'LingMessCoref', 'model_path': 'biu-nlp/lingmess-coref', 'device': 'cpu'}
    )
    return None

def define_collection(collection_name):
    document_id = FieldSchema(name='document_id', dtype=DataType.INT64, is_primary=True, auto_id=True)
    metadata = FieldSchema(name='metadata', dtype=DataType.VARCHAR, max_length=15000)
    metadata_page = FieldSchema(name='metadata_page', dtype=DataType.INT64)    
    embeddings = FieldSchema(name='embeddings', dtype=DataType.FLOAT_VECTOR, dim=384)
    text = FieldSchema(name='text', dtype=DataType.VARCHAR, max_length=60000)
    schema = CollectionSchema(fields=[document_id, metadata, metadata_page, embeddings, text], enable_dynamic_field=True)
    
    if not utility.has_collection(collection_name):
        collection = Collection(name=collection_name, schema=schema, using='default')
        print('Collection created!')
    else:
        collection = Collection(collection_name)
        print('Collection already exists.')
    return collection

def process(text):
    doc = nlp(text)
    sentences = list(doc.sents)
    sentence_embeddings = model.encode(sentences, convert_to_tensor=True)
    return sentences, sentence_embeddings

def cluster_text(sentences, sentence_embeddings, threshold):
    clusters = [[0]]
    for i in range(1, len(sentences)):
        similarity = F.cosine_similarity(sentence_embeddings[i - 1:i], sentence_embeddings[i:i + 1]).item()
        if similarity < threshold:
            clusters.append([])
        clusters[-1].append(i)
    return clusters

def clean_text(text):
    return text

def extract_text_from_pdf(pdf_path):
    fast_coref_arch()
    with open(pdf_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        text_list = []
        embedding_list = []
        metadata_list = []
        metadata_page_list = []

        for page in range(len(pdf_reader.pages)):
            text = pdf_reader.pages[page].extract_text()
            try:
                doc = nlp(text, component_cfg={"fastcoref": {'resolve_text': True}})
                text = doc._.resolved_text
                text = re.sub("\n", " ", text)
                text = text.lower()

                if len(text) <= 1300:
                    metadata = f"{pdf_path}_{page}"
                    metadata_page = page
                    text_list.append(text)
                    embeddings = model.encode(text)
                    embedding_list.append(embeddings)
                    metadata_list.append(metadata)
                    metadata_page_list.append(metadata_page)
                else:
                    process_large_text(text, pdf_path, page, text_list, embedding_list, metadata_list, metadata_page_list)

            except:
                pass

    return text_list, embedding_list, metadata_list, metadata_page_list

def process_large_text(text, pdf_path, page, text_list, embedding_list, metadata_list, metadata_page_list):
    threshold = 0.3
    sentences, sentence_embeddings = process(text)
    clusters = cluster_text(sentences, sentence_embeddings, threshold)

    for cluster in clusters:
        cluster_txt = clean_text(' '.join([str(sentences[i]) for i in cluster]))
        cluster_len = len(cluster_txt)

        if cluster_len < 80:
            continue
        elif cluster_len > 1300:
            threshold = 0.6
            process_large_text(cluster_txt, pdf_path, page, text_list, embedding_list, metadata_list, metadata_page_list)
        else:
            metadata = f"{pdf_path}_{page}"
            metadata_page = page
            text_list.append(cluster_txt)
            embeddings = model.encode(cluster_txt)
            embedding_list.append(embeddings)
            metadata_list.append(metadata)
            metadata_page_list.append(metadata_page)

def main():
    folder_path = r"C:\Users\Palash Ashok Bhosale\Jupy\Projects\Bot_NLP\pdff\test_pdf\New folder (2)"
    
    # Define the collection here
    collection_name = "your_collection_name"
    collection = define_collection(collection_name)

    for file_name in os.listdir(folder_path):
        if file_name.endswith(".pdf"):
            file_path = os.path.join(folder_path, file_name)
            text_list, embedding_list, metadata_list, metadata_page_list = extract_text_from_pdf(file_path)
            collection.insert([metadata_list, metadata_page_list, embedding_list, text_list])
            
            # Create an index on the "embeddings" field
            index_params = {
                'metric_type': 'L2',
                'index_type': "HNSW",
                'efConstruction': 40,
                'M': 20
            }
            collection.create_index(field_name="embeddings", index_params=index_params)
            print('Index created.')

            collection.load()
            print("Collection loaded.")
            
            return 'Data inserted into the collection.' 

if __name__ == "__main__":
    main()



Connected to Milvus!
Collection already exists.


In [5]:
from flask import Flask, request, jsonify
from pymilvus import CollectionSchema, FieldSchema, DataType, Collection, connections
import os
import spacy
import re
import PyPDF2
from fastcoref import spacy_component
from sentence_transformers import SentenceTransformer
import torch.nn.functional as F

app = Flask(__name__)

MILVUS_HOST = "localhost"
MILVUS_PORT = "19530"

# Connect to Milvus
connections.connect(host=MILVUS_HOST, port=MILVUS_PORT)
print('Connected to Milvus!')

# Create 'upload_folder' directory if it doesn't exist
if not os.path.exists('upload_folder'):
    os.makedirs('upload_folder')

nlp = spacy.load("en_core_web_sm")
model = SentenceTransformer("paraphrase-MiniLM-L6-v2")

def fast_coref_arch():
    nlp = spacy.load("en_core_web_sm")
    nlp.add_pipe(
        "fastcoref",
        config={'model_architecture': 'LingMessCoref', 'model_path': 'biu-nlp/lingmess-coref', 'device': 'cpu'}
    )
    return None

def define_collection(collection_name):
    document_id = FieldSchema(name='document_id', dtype=DataType.INT64, is_primary=True, auto_id=True)
    metadata = FieldSchema(name='metadata', dtype=DataType.VARCHAR, max_length=15000)
    metadata_page = FieldSchema(name='metadata_page', dtype=DataType.INT64)    
    embeddings = FieldSchema(name='embeddings', dtype=DataType.FLOAT_VECTOR, dim=384)
    text = FieldSchema(name='text', dtype=DataType.VARCHAR, max_length=60000)
    schema = CollectionSchema(fields=[document_id, metadata, metadata_page, embeddings, text], enable_dynamic_field=True)
    
    if not utility.has_collection(collection_name):
        collection = Collection(name=collection_name, schema=schema, using='default')
        print('Collection created!')
    else:
        collection = Collection(collection_name)
        print('Collection already exists.')
    return collection

def process(text):
    doc = nlp(text)
    sentences = list(doc.sents)
    sentence_embeddings = model.encode(sentences, convert_to_tensor=True)
    return sentences, sentence_embeddings

def cluster_text(sentences, sentence_embeddings, threshold):
    clusters = [[0]]
    for i in range(1, len(sentences)):
        similarity = F.cosine_similarity(sentence_embeddings[i - 1:i], sentence_embeddings[i:i + 1]).item()
        if similarity < threshold:
            clusters.append([])
        clusters[-1].append(i)
    return clusters

def clean_text(text):
    return text

def extract_text_from_pdf(pdf_path):
    fast_coref_arch()
    with open(pdf_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        text_list = []
        embedding_list = []
        metadata_list = []
        metadata_page_list = []

        for page in range(len(pdf_reader.pages)):
            text = pdf_reader.pages[page].extract_text()
            try:
                doc = nlp(text, component_cfg={"fastcoref": {'resolve_text': True}})
                text = doc._.resolved_text
                text = re.sub("\n", " ", text)
                text = text.lower()

                if len(text) <= 1300:
                    metadata = f"{pdf_path}_{page}"
                    metadata_page = page
                    text_list.append(text)
                    embeddings = model.encode(text)
                    embedding_list.append(embeddings)
                    metadata_list.append(metadata)
                    metadata_page_list.append(metadata_page)
                else:
                    process_large_text(text, pdf_path, page, text_list, embedding_list, metadata_list, metadata_page_list)

            except:
                pass

    return text_list, embedding_list, metadata_list, metadata_page_list

def process_large_text(text, pdf_path, page, text_list, embedding_list, metadata_list, metadata_page_list):
    threshold = 0.3
    sentences, sentence_embeddings = process(text)
    clusters = cluster_text(sentences, sentence_embeddings, threshold)

    for cluster in clusters:
        cluster_txt = clean_text(' '.join([str(sentences[i]) for i in cluster]))
        cluster_len = len(cluster_txt)

        if cluster_len < 80:
            continue
        elif cluster_len > 1300:
            threshold = 0.6
            process_large_text(cluster_txt, pdf_path, page, text_list, embedding_list, metadata_list, metadata_page_list)
        else:
            metadata = f"{pdf_path}_{page}"
            metadata_page = page
            text_list.append(cluster_txt)
            embeddings = model.encode(cluster_txt)
            embedding_list.append(embeddings)
            metadata_list.append(metadata)
            metadata_page_list.append(metadata_page)

@app.route('/')
def index():
    return 'Welcome to the PDF Ingestion API!'

@app.route('/upload-pdf', methods=['POST'])
def upload_pdf():
    # Define the collection here
    collection_name = "aa"
    collection = define_collection(collection_name)

    if 'file' not in request.files:
        return jsonify({'error': 'No file part'}), 400

    file = request.files['file']
    if file.filename == '':
        return jsonify({'error': 'No selected file'}), 400

    if file and file.filename.endswith(".pdf"):
        # Save the uploaded PDF file
        uploaded_file_path = os.path.join("upload_folder", file.filename)
        file.save(uploaded_file_path)
        
        # Extract text and metadata from the PDF
        text_list, embedding_list, metadata_list, metadata_page_list = extract_text_from_pdf(uploaded_file_path)

        # Insert data into Milvus collection
        collection.insert([metadata_list, metadata_page_list, embedding_list, text_list])

        # Create an index on the "embeddings" field
        index_params = {
            'metric_type': 'L2',
            'index_type': "HNSW",
            'efConstruction': 40,
            'M': 20
        }
        collection.create_index(field_name="embeddings", index_params=index_params)
        print('Index created.')

        collection.load()
        print("Collection loaded.")

        return jsonify({'message': 'Data inserted into the collection.'}), 200

    return jsonify({'error': 'Invalid file format'}), 400

if __name__ == '__main__':
    app.run(debug=False)





Connected to Milvus!


01/17/2024 17:48:58 - INFO - 	 Load pretrained SentenceTransformer: paraphrase-MiniLM-L6-v2
01/17/2024 17:48:59 - INFO - 	 Use pytorch device: cpu


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
01/17/2024 17:48:59 - INFO - 	 [33mPress CTRL+C to quit[0m


## With multithreading

In [9]:
from flask import Flask, request, jsonify
from pymilvus import CollectionSchema, FieldSchema, DataType, Collection, connections, utility
import os
import spacy
import re
import PyPDF2
from fastcoref import spacy_component
from sentence_transformers import SentenceTransformer
import torch.nn.functional as F
from concurrent.futures import ProcessPoolExecutor

app = Flask(__name__)

MILVUS_HOST = "localhost"
MILVUS_PORT = "19530"

# Connect to Milvus
connections.connect(host=MILVUS_HOST, port=MILVUS_PORT)
print('Connected to Milvus!')

# Create 'upload_folder' directory if it doesn't exist
if not os.path.exists('upload_folder'):
    os.makedirs('upload_folder')

nlp = spacy.load("en_core_web_sm")
model = SentenceTransformer("paraphrase-MiniLM-L6-v2")

def define_collection(collection_name):
    document_id = FieldSchema(name='document_id', dtype=DataType.INT64, is_primary=True, auto_id=True)
    metadata = FieldSchema(name='metadata', dtype=DataType.VARCHAR, max_length=15000)
    metadata_page = FieldSchema(name='metadata_page', dtype=DataType.INT64)    
    embeddings = FieldSchema(name='embeddings', dtype=DataType.FLOAT_VECTOR, dim=384)
    text = FieldSchema(name='text', dtype=DataType.VARCHAR, max_length=60000)
    schema = CollectionSchema(fields=[document_id, metadata, metadata_page, embeddings, text], enable_dynamic_field=True)
    
    if not utility.has_collection(collection_name):
        collection = Collection(name=collection_name, schema=schema, using='default')
        print('Collection created!')
    else:
        collection = Collection(collection_name)
        print('Collection already exists.')
    return collection

def process_pdf_page(pdf_path, page, collection):
    text, embeddings, metadata, metadata_page = extract_text_from_pdf_page(pdf_path, page)
    print(text)
    collection.insert([metadata, metadata_page, embeddings, text])
    



def process(text):
    doc = nlp(text)
    sentences = list(doc.sents)
    sentence_embeddings = model.encode(sentences, convert_to_tensor=True)
    return sentences, sentence_embeddings

def cluster_text(sentences, sentence_embeddings, threshold):
    clusters = [[0]]
    for i in range(1, len(sentences)):
        similarity = F.cosine_similarity(sentence_embeddings[i - 1:i], sentence_embeddings[i:i + 1]).item()
        if similarity < threshold:
            clusters.append([])
        clusters[-1].append(i)
    return clusters

def clean_text(text):## logic for removing header and footer should be written here
    return text

# def process_pdf_page(pdf_path, page):
#     text, embeddings, metadata, metadata_page = extract_text_from_pdf_page(pdf_path, page)
#     collection.insert([metadata, metadata_page, embeddings, text])

def extract_text_from_pdf_page(pdf_path, page):
    nlp = spacy.load("en_core_web_sm")
    nlp.add_pipe(
        "fastcoref",
        config={'model_architecture': 'LingMessCoref', 'model_path': 'biu-nlp/lingmess-coref', 'device': 'cpu'}
    )                
    
    with open(pdf_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        text_list = []
        embedding_list = []
        metadata_list = []
        metadata_page_list = []

        for current_page in range(len(pdf_reader.pages)):
            text = pdf_reader.pages[current_page].extract_text()
            try:
                doc = nlp(text, component_cfg={"fastcoref": {'resolve_text': True}})
                text = doc._.resolved_text
                text = re.sub("\n", " ", text)
                text = text.lower()

                if len(text) <= 1300:
                    metadata = f"{pdf_path}_{current_page}"
                    metadata_page = current_page
                    text_list.append(text)
                    embeddings = model.encode(text)
                    embedding_list.append(embeddings)
                    metadata_list.append(metadata)
                    metadata_page_list.append(metadata_page)
                else:
                    process_large_text(text, pdf_path, current_page, text_list, embedding_list, metadata_list, metadata_page_list)

            except Exception as e:
                print(f"Error processing page {current_page}: {str(e)}")

    return text_list, embedding_list, metadata_list, metadata_page_list


def process_large_text(text, pdf_path, page, text_list, embedding_list, metadata_list, metadata_page_list):
    threshold = 0.3
    sentences, sentence_embeddings = process(text)
    clusters = cluster_text(sentences, sentence_embeddings, threshold)

    for cluster in clusters:
        cluster_txt = clean_text(' '.join([str(sentences[i]) for i in cluster]))
        cluster_len = len(cluster_txt)
        # print("*************")
        # print(cluster_len)

        if cluster_len < 80:
            continue
        elif cluster_len > 1300:
            threshold = 0.6
            process_large_text(cluster_txt, pdf_path, page, text_list, embedding_list, metadata_list, metadata_page_list)
        else:
            metadata = f"{pdf_path}_{page}"
            metadata_page = page
            text_list.append(cluster_txt)
            embeddings = model.encode(cluster_txt)
            embedding_list.append(embeddings)
            metadata_list.append(metadata)
            metadata_page_list.append(metadata_page)
        # print(text_list)

@app.route('/')
def index():
    return 'Welcome to the PDF Ingestion API!'

@app.route('/upload-pdf/<collection_name>', methods=['POST'])
def upload_pdf(collection_name):
    # Define the collection here
    collection = define_collection(collection_name)

    if 'file' not in request.files:
        return jsonify({'error': 'No file part'}), 400

    file = request.files['file']
    if file.filename == '':
        return jsonify({'error': 'No selected file'}), 400

    if file and file.filename.endswith(".pdf"):

        uploaded_file_path = os.path.join("upload_folder", file.filename)
        file.save(uploaded_file_path)

        # Use multiprocessing to parallelize page processing
        with ProcessPoolExecutor() as executor:
            pages = range(len(PyPDF2.PdfReader(uploaded_file_path).pages))
            executor.map(lambda page: process_pdf_page(uploaded_file_path, page, collection), pages)

        # Create an index on the "embeddings" field
        index_params = {
            'metric_type': 'L2',
            'index_type': "HNSW",
            'efConstruction': 40,
            'M': 20
        }
        collection.create_index(field_name="embeddings", index_params=index_params)
        print('Index created.')

        return jsonify({'message': 'Data inserted into the collection.'}), 200

    return jsonify({'error': 'Invalid file format'}), 400


if __name__ == '__main__':
    app.run(debug=False)



Connected to Milvus!


01/18/2024 12:00:02 - INFO - 	 Load pretrained SentenceTransformer: paraphrase-MiniLM-L6-v2
01/18/2024 12:00:02 - INFO - 	 Use pytorch device: cpu


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
01/18/2024 12:00:02 - INFO - 	 [33mPress CTRL+C to quit[0m


Collection already exists.


01/18/2024 12:00:06 - INFO - 	 127.0.0.1 - - [18/Jan/2024 12:00:06] "POST /upload-pdf/aa1 HTTP/1.1" 200 -


Index created.


# First API

In [3]:
from flask import Flask, request, jsonify
from pymilvus import CollectionSchema, FieldSchema, DataType, Collection, connections, utility
import os
import spacy
import re
import PyPDF2
from fastcoref import spacy_component
from sentence_transformers import SentenceTransformer
import torch.nn.functional as F

app = Flask(__name__)

MILVUS_HOST = "localhost"
MILVUS_PORT = "19530"

# Connect to Milvus
connections.connect(host=MILVUS_HOST, port=MILVUS_PORT)
print('Connected to Milvus!')

# Create 'upload_folder' directory if it doesn't exist
if not os.path.exists('upload_folder'):
    os.makedirs('upload_folder')

nlp = spacy.load("en_core_web_sm")
model = SentenceTransformer("paraphrase-MiniLM-L6-v2")

# def fast_coref_arch(txt):
    
#     return text

def define_collection(collection_name):
    document_id = FieldSchema(name='document_id', dtype=DataType.INT64, is_primary=True, auto_id=True)
    metadata = FieldSchema(name='metadata', dtype=DataType.VARCHAR, max_length=15000)
    metadata_page = FieldSchema(name='metadata_page', dtype=DataType.INT64)    
    embeddings = FieldSchema(name='embeddings', dtype=DataType.FLOAT_VECTOR, dim=384)
    text = FieldSchema(name='text', dtype=DataType.VARCHAR, max_length=60000)
    schema = CollectionSchema(fields=[document_id, metadata, metadata_page, embeddings, text], enable_dynamic_field=True)
    
    if not utility.has_collection(collection_name):
        collection = Collection(name=collection_name, schema=schema, using='default')
        print('Collection created!')
    else:
        collection = Collection(collection_name)
        print('Collection already exists.')
    return collection

def process(text):
    doc = nlp(text)
    sentences = list(doc.sents)
    sentence_embeddings = model.encode(sentences, convert_to_tensor=True)
    return sentences, sentence_embeddings

def cluster_text(sentences, sentence_embeddings, threshold):
    clusters = [[0]]
    for i in range(1, len(sentences)):
        similarity = F.cosine_similarity(sentence_embeddings[i - 1:i], sentence_embeddings[i:i + 1]).item()
        if similarity < threshold:
            clusters.append([])
        clusters[-1].append(i)
    return clusters

def clean_text(text):## logic for removing header and footer should be written here
    return text

def extract_text_from_pdf(pdf_path):
    nlp = spacy.load("en_core_web_sm")
    nlp.add_pipe(
        "fastcoref",
        config={'model_architecture': 'LingMessCoref', 'model_path': 'biu-nlp/lingmess-coref', 'device': 'cpu'}
    )                
    
    with open(pdf_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        text_list = []
        embedding_list = []
        metadata_list = []
        metadata_page_list = []

        for page in range(len(pdf_reader.pages)):
            text = pdf_reader.pages[page].extract_text()
            # print(len(text))
            try:
                doc = nlp(text, component_cfg={"fastcoref": {'resolve_text': True}})
                text = doc._.resolved_text
                text = re.sub("\n", " ", text)
                text = text.lower()
                # print(text)

                if len(text) <= 1300:
                    metadata = f"{pdf_path}_{page}"
                    metadata_page = page
                    text_list.append(text)
                    # print(text)
                    embeddings = model.encode(text)
                    embedding_list.append(embeddings)
                    metadata_list.append(metadata)
                    metadata_page_list.append(metadata_page)
                else:
                    process_large_text(text, pdf_path, page, text_list, embedding_list, metadata_list, metadata_page_list)

            except:
                pass

    return text_list, embedding_list, metadata_list, metadata_page_list

def process_large_text(text, pdf_path, page, text_list, embedding_list, metadata_list, metadata_page_list):
    threshold = 0.3
    sentences, sentence_embeddings = process(text)
    clusters = cluster_text(sentences, sentence_embeddings, threshold)

    for cluster in clusters:
        cluster_txt = clean_text(' '.join([str(sentences[i]) for i in cluster]))
        cluster_len = len(cluster_txt)
        # print("*************")
        # print(cluster_len)

        if cluster_len < 80:
            continue
        elif cluster_len > 1300:
            threshold = 0.6
            process_large_text(cluster_txt, pdf_path, page, text_list, embedding_list, metadata_list, metadata_page_list)
        else:
            metadata = f"{pdf_path}_{page}"
            metadata_page = page
            text_list.append(cluster_txt)
            embeddings = model.encode(cluster_txt)
            embedding_list.append(embeddings)
            metadata_list.append(metadata)
            metadata_page_list.append(metadata_page)
        # print(text_list)

@app.route('/')
def index():
    return 'Welcome to the PDF Ingestion API!'

@app.route('/upload-pdf/<collection_name>', methods=['POST'])
def upload_pdf(collection_name):
    # Define the collection here
    collection = define_collection(collection_name)

    if 'file' not in request.files:
        return jsonify({'error': 'No file part'}), 400

    file = request.files['file']
    if file.filename == '':
        return jsonify({'error': 'No selected file'}), 400

    if file and file.filename.endswith(".pdf"):

        uploaded_file_path = os.path.join("upload_folder", file.filename)
        file.save(uploaded_file_path)
        
        # Extract text and metadata from the PDF
        text_list, embedding_list, metadata_list, metadata_page_list = extract_text_from_pdf(uploaded_file_path)

        # Insert data into Milvus collection
        collection.insert([metadata_list, metadata_page_list, embedding_list, text_list])

        # Create an index on the "embeddings" field
        index_params = {
            'metric_type': 'L2',
            'index_type': "HNSW",
            'efConstruction': 40,
            'M': 20
        }
        collection.create_index(field_name="embeddings", index_params=index_params)
        print('Index created.')

        return jsonify({'message': 'Data inserted into the collection.'}), 200

    return jsonify({'error': 'Invalid file format'}), 400


if __name__ == '__main__':
    app.run(debug=False)



Connected to Milvus!


01/31/2024 11:00:03 - INFO - 	 Load pretrained SentenceTransformer: paraphrase-MiniLM-L6-v2
01/31/2024 11:00:03 - INFO - 	 Use pytorch device: cpu


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
01/31/2024 11:00:03 - INFO - 	 [33mPress CTRL+C to quit[0m


### Answer Api

In [4]:
from flask import Flask, request, jsonify
from pymilvus import Collection, DataType, FieldSchema, CollectionSchema, connections, utility
from sentence_transformers import SentenceTransformer
import torch
import os

app = Flask(__name__)

MILVUS_HOST = "localhost"
MILVUS_PORT = "19530"

# Connect to Milvus
connections.connect(host=MILVUS_HOST, port=MILVUS_PORT)
print('Connected to Milvus!')

# Load pre-trained model
model = SentenceTransformer("paraphrase-MiniLM-L6-v2")

def define_collection(collection_name):
    print("in define func")
    document_id = FieldSchema(name='document_id', dtype=DataType.INT64, is_primary=True, auto_id=True)
    metadata = FieldSchema(name='metadata', dtype=DataType.VARCHAR, max_length=15000)
    metadata_page = FieldSchema(name='metadata_page', dtype=DataType.INT64)    
    embeddings = FieldSchema(name='embeddings', dtype=DataType.FLOAT_VECTOR, dim=384)
    text = FieldSchema(name='text', dtype=DataType.VARCHAR, max_length=60000)
    schema = CollectionSchema(fields=[document_id, metadata, metadata_page, embeddings, text], enable_dynamic_field=True)
    
    if not utility.has_collection(collection_name):
        collection = Collection(name=collection_name, schema=schema, using='default')
        print('Collection created!')
    else:
        collection = Collection(collection_name)
        print('Collection already exists.')
    return collection

@app.route('/search-answers', methods=['POST'])
def search_answers():
    data = request.get_json()
    print(data)
    collection_name = data.get('collection_name', '')
    query = data.get('query', '')
    print(collection_name)
    print(query)
    # Define and load the Milvus collection
    collection = define_collection(collection_name)
    collection.load()
    print("Collection loaded.")

    # Encode the query
    query_encode = model.encode(query.lower())

    # Perform a search to get answers
    search_results = collection.search(data=[query_encode], anns_field="embeddings",
                                      param={"metric": "L2", "offset": 0},
                                      output_fields=["metadata", "metadata_page", "text"],
                                      limit=10, consistency_level="Strong")
    print(search_results)
    # Extract relevant information from search results
    answers_final = [search_results[0][i].entity.text for i in range(0,len(search_results[0]))]

    return jsonify({'answers_final': answers_final}), 200

if __name__ == '__main__':
    app.run(debug=False)


Connected to Milvus!
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit


In [39]:
import nltk

nltk.download('punkt')  # Download the necessary data for tokenization

from nltk.tokenize import word_tokenize

text = "thedegree ofchange undergone byacoal asitmatures from peat toanthracite is known ascoalification . coalification hasanimportant bearing oncoal's physical andchemical properties andisreferred toasthe'rank' ofthecoal. ranking isdetermined bythedegree oftransformation oftheoriginal plant material tocarbon . therank ofcoals, from those with theleast carbon tothose with themost carbon, arelignite, sub-bituminous, bituminous andanthracitecoalification "

# Tokenize the text into words
words = word_tokenize(text)

readable_sentence = ' '.join(words)

# Print the improved sentence
print(readable_sentence)


thedegree ofchange undergone byacoal asitmatures from peat toanthracite is known ascoalification . coalification hasanimportant bearing oncoal 's physical andchemical properties andisreferred toasthe'rank ' ofthecoal . ranking isdetermined bythedegree oftransformation oftheoriginal plant material tocarbon . therank ofcoals , from those with theleast carbon tothose with themost carbon , arelignite , sub-bituminous , bituminous andanthracitecoalification


[nltk_data] Downloading package punkt to C:\Users\Palash Ashok
[nltk_data]     Bhosale\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [46]:
import re

text = "ofchange undergone byacoal asitmatures peat toanthracite known ascoalification hasanimportant bearing oncoal physical andchemical properties andisreferred ofthecoal isdetermined bythedegree oftransformation oftheoriginal plant material tocarbon ofcoals theleast carbon tothose themost carbon arelignite bituminous andanthracitecoalification"


words = re.findall(r'[A-Z]?[a-z]+|[A-Z]+', text)


common_prefixes = ['of', 'by', 'to', 'and', "a", "an"]
processed_words = []
for word in words:
    for prefix in common_prefixes:
        if word.startswith(prefix):
            processed_words.extend([prefix, word[len(prefix):]])
            break
    else:
        processed_words.append(word)
readable_sentence = ' '.join(processed_words)
print(readable_sentence)


of change undergone by acoal a sitmatures peat to anthracite known a scoalification hasanimportant bearing oncoal physical and chemical properties and isreferred of thecoal isdetermined by thedegree of transformation of theoriginal plant material to carbon of coals theleast carbon to those themost carbon a relignite bituminous and anthracitecoalification


In [47]:
from nltk.corpus import stopwords

In [50]:
print("by" in [stopwords.words("English")])

False


In [26]:
import nltk

nltk.download('punkt')

from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to C:\Users\Palash Ashok
[nltk_data]     Bhosale\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Second API

In [None]:
import nltk

nltk.download('punkt')

from nltk.tokenize import word_tokenize
from flask import Flask, request, jsonify
from pymilvus import CollectionSchema, FieldSchema, DataType, Collection, connections, utility
import os
import spacy
import re
import PyPDF2
from fastcoref import spacy_component
from sentence_transformers import SentenceTransformer
import torch.nn.functional as F

app = Flask(__name__)

MILVUS_HOST = "localhost"
MILVUS_PORT = "19530"

# Connect to Milvus
connections.connect(host=MILVUS_HOST, port=MILVUS_PORT)
print('Connected to Milvus!')

# Create 'upload_folder' directory if it doesn't exist
if not os.path.exists('upload_folder'):
    os.makedirs('upload_folder')

nlp = spacy.load("en_core_web_sm")
model = SentenceTransformer("paraphrase-MiniLM-L6-v2")

# def fast_coref_arch(txt):
    
#     return text

def define_collection(collection_name):
    document_id = FieldSchema(name='document_id', dtype=DataType.INT64, is_primary=True, auto_id=True)
    metadata = FieldSchema(name='metadata', dtype=DataType.VARCHAR, max_length=15000)
    metadata_page = FieldSchema(name='metadata_page', dtype=DataType.INT64)    
    embeddings = FieldSchema(name='embeddings', dtype=DataType.FLOAT_VECTOR, dim=384)
    text = FieldSchema(name='text', dtype=DataType.VARCHAR, max_length=60000)
    schema = CollectionSchema(fields=[document_id, metadata, metadata_page, embeddings, text], enable_dynamic_field=True)
    
    if not utility.has_collection(collection_name):
        collection = Collection(name=collection_name, schema=schema, using='default')
        print('Collection created!')
    else:
        collection = Collection(collection_name)
        print('Collection already exists.')
    return collection

def process(text):
    doc = nlp(text)
    sentences = list(doc.sents)
    sentence_embeddings = model.encode(sentences, convert_to_tensor=True)
    return sentences, sentence_embeddings

def cluster_text(sentences, sentence_embeddings, threshold):
    clusters = [[0]]
    for i in range(1, len(sentences)):
        similarity = F.cosine_similarity(sentence_embeddings[i - 1:i], sentence_embeddings[i:i + 1]).item()
        if similarity < threshold:
            clusters.append([])
        clusters[-1].append(i)
    return clusters

def clean_text(text):## logic for removing header and footer should be written here
    return text

def extract_text_from_pdf(pdf_path):
    nlp = spacy.load("en_core_web_sm")
    nlp.add_pipe(
        "fastcoref",
        config={'model_architecture': 'LingMessCoref', 'model_path': 'biu-nlp/lingmess-coref', 'device': 'cpu'}
    )                
    
    with open(pdf_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        text_list = []
        embedding_list = []
        metadata_list = []
        metadata_page_list = []

        for page in range(len(pdf_reader.pages)):
            text = pdf_reader.pages[page].extract_text()
            # print(len(text))
            try:
                doc = nlp(text, component_cfg={"fastcoref": {'resolve_text': True}})
                text = doc._.resolved_text
                text = re.sub("\n", " ", text)
                text= text.lower()
                words = word_tokenize(text)

                readable_sentence = ' '.join(words)
                text=readable_sentence
                # print(text)

                if len(text) <= 1300:
                    metadata = f"{pdf_path}_{page}"
                    metadata_page = page
                    text_list.append(text)
                    # print(text)
                    embeddings = model.encode(text)
                    embedding_list.append(embeddings)
                    metadata_list.append(metadata)
                    metadata_page_list.append(metadata_page)
                else:
                    process_large_text(text, pdf_path, page, text_list, embedding_list, metadata_list, metadata_page_list)

            except:
                pass

    return text_list, embedding_list, metadata_list, metadata_page_list

def process_large_text(text, pdf_path, page, text_list, embedding_list, metadata_list, metadata_page_list):
    threshold = 0.3
    sentences, sentence_embeddings = process(text)
    clusters = cluster_text(sentences, sentence_embeddings, threshold)

    for cluster in clusters:
        cluster_txt = clean_text(' '.join([str(sentences[i]) for i in cluster]))
        cluster_len = len(cluster_txt)
        # print("*************")
        # print(cluster_len)

        if cluster_len < 80:
            continue
        elif cluster_len > 1300:
            threshold = 0.6
            process_large_text(cluster_txt, pdf_path, page, text_list, embedding_list, metadata_list, metadata_page_list)
        else:
            metadata = f"{pdf_path}_{page}"
            metadata_page = page
            text_list.append(cluster_txt)
            embeddings = model.encode(cluster_txt)
            embedding_list.append(embeddings)
            metadata_list.append(metadata)
            metadata_page_list.append(metadata_page)
        # print(text_list)

@app.route('/')
def index():
    return 'Welcome to the PDF Ingestion API!'

@app.route('/upload-pdf/<collection_name>', methods=['POST'])
def upload_pdf(collection_name):
    # Define the collection here
    collection = define_collection(collection_name)

    if 'file' not in request.files:
        return jsonify({'error': 'No file part'}), 400

    file = request.files['file']
    if file.filename == '':
        return jsonify({'error': 'No selected file'}), 400

    if file and file.filename.endswith(".pdf"):

        uploaded_file_path = os.path.join("upload_folder", file.filename)
        file.save(uploaded_file_path)
        
        # Extract text and metadata from the PDF
        text_list, embedding_list, metadata_list, metadata_page_list = extract_text_from_pdf(uploaded_file_path)

        # Insert data into Milvus collection
        collection.insert([metadata_list, metadata_page_list, embedding_list, text_list])

        # Create an index on the "embeddings" field
        index_params = {
            'metric_type': 'L2',
            'index_type': "HNSW",
            'efConstruction': 40,
            'M': 20
        }
        collection.create_index(field_name="embeddings", index_params=index_params)
        print('Index created.')

        return jsonify({'message': 'Data inserted into the collection.'}), 200
        
    return jsonify({'error': 'Invalid file format'}), 400


if __name__ == '__main__':
    app.run(debug=False)



[nltk_data] Downloading package punkt to C:\Users\Palash Ashok
[nltk_data]     Bhosale\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Connected to Milvus!


02/02/2024 11:43:04 - INFO - 	 Load pretrained SentenceTransformer: paraphrase-MiniLM-L6-v2
02/02/2024 11:43:05 - INFO - 	 Use pytorch device: cpu


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
02/02/2024 11:43:05 - INFO - 	 [33mPress CTRL+C to quit[0m


Collection already exists.


Some weights of the model checkpoint at biu-nlp/lingmess-coref were not used when initializing LingMessModel: ['longformer.embeddings.position_ids']
- This IS expected if you are initializing LingMessModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LingMessModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
02/02/2024 11:45:42 - INFO - 	 missing_keys: []
02/02/2024 11:45:42 - INFO - 	 unexpected_keys: []
02/02/2024 11:45:42 - INFO - 	 mismatched_keys: []
02/02/2024 11:45:42 - INFO - 	 error_msgs: []
02/02/2024 11:45:42 - INFO - 	 Model Parameters: 590.0M, Transformer: 434.6M, Coref head: 155.4M
02/02/2024 11:45:42 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

02/02/2024 11:45:42 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Collection already exists.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

: 

In [None]:
from pymilvus import Collection, DataType, FieldSchema, CollectionSchema, connections
from sentence_transformers import SentenceTransformer

MILVUS_HOST = "localhost"
MILVUS_PORT = "19530"

# Connect to Milvus
connections.connect(host=MILVUS_HOST, port=MILVUS_PORT)
print('Connected to Milvus!')

# Load pre-trained model
model = SentenceTransformer("paraphrase-MiniLM-L6-v2")

# Create a Milvus collection
collection_name = 'aa'
document_id = FieldSchema(name='document_id', dtype=DataType.INT64, is_primary=True, auto_id=True)
metadata = FieldSchema(name='metadata', dtype=DataType.VARCHAR, max_length=15000)
metadata_page = FieldSchema(name='metadata_page', dtype=DataType.INT64)
embeddings = FieldSchema(name='embeddings', dtype=DataType.FLOAT_VECTOR, dim=384)
text = FieldSchema(name='text', dtype=DataType.VARCHAR, max_length=60000)
schema = CollectionSchema(fields=[document_id, metadata, metadata_page, embeddings, text], enable_dynamic_field=True)

# Check if collection exists; create if not
if not utility.has_collection(collection_name):
    collection = Collection(name=collection_name, schema=schema, using='default')
    print('Collection created!')
else:
    collection = Collection(collection_name)
    print('Collection already exists.')

# Perform a simple search
query = "Who can modify the License Fee?"
query_encode = model.encode(query.lower())
search_results = collection.search(data=[query_encode], anns_field="embeddings",
                                   param={"metric": "L2", "offset": 0},
                                   output_fields=["metadata", "metadata_page", "text"],
                                   limit=10, consistency_level="Strong")

print(f"Search results: {search_results}")


01/17/2024 16:04:13 - INFO - 	 Load pretrained SentenceTransformer: paraphrase-MiniLM-L6-v2


Connected to Milvus!


01/17/2024 16:04:14 - INFO - 	 Use pytorch device: cpu


Collection already exists.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Search results: ['["id: 446812261165838582, distance: 90.8446044921875, entity: {\'metadata\': \'upload_folder\\\\\\\\tellecommunication_2_unabletoparse (1).pdf_3\', \'metadata_page\': 3, \'text\': \'\'}", "id: 446812261165838584, distance: 90.8446044921875, entity: {\'metadata\': \'upload_folder\\\\\\\\tellecommunication_2_unabletoparse (1).pdf_5\', \'metadata_page\': 5, \'text\': \'\'}", "id: 446812261165838583, distance: 90.8446044921875, entity: {\'metadata\': \'upload_folder\\\\\\\\tellecommunication_2_unabletoparse (1).pdf_4\', \'metadata_page\': 4, \'text\': \'\'}", "id: 446812261165838580, distance: 90.8446044921875, entity: {\'metadata\': \'upload_folder\\\\\\\\tellecommunication_2_unabletoparse (1).pdf_1\', \'metadata_page\': 1, \'text\': \'\'}", "id: 446812261165838585, distance: 90.8446044921875, entity: {\'metadata\': \'upload_folder\\\\\\\\tellecommunication_2_unabletoparse (1).pdf_6\', \'metadata_page\': 6, \'text\': \'\'}", "id: 446812261165838586, distance: 90.84460449

In [None]:
print(query)
print()
answers_final

What is objective of Digital literacy?



['objective is to understand \uf09bdigital literacy \uf09btypes of digital literacy \uf09bdigital application software  \uf09bexamples of digital application software \uf09bmicrosoft suite ms -word \uf09bmicrosoft suite ms -excel \uf09bmicrosoft suite ms -powerpoint',
 'digital literacy  commonly used application software ',
 'types of digital literacy \uf09bliteracy in social network services. \uf09bliteracy in digital application software. \uf09bliteracy in content and information searching. \uf09bliteracy in online frauds like phishing, e -mail frauds etc.',
 'page 20  of 20                               the primary objective of ntp -2012 is maximizing public good by  making available affordable, reliable and secure  telecommunication and broadband services across the entire  country. the main thrust of ntp -2012 is on the multiplier effect  and transformational impact of such services on the  overall  economy. ntp -2012 recognizes the role of such services in furthering the  nation

# Third API

In [51]:
from flask import Flask, request, jsonify
from pymilvus import CollectionSchema, FieldSchema, DataType, Collection, connections, utility
import os
import spacy
import re
import PyPDF2
from fastcoref import spacy_component
from sentence_transformers import SentenceTransformer
import torch.nn.functional as F

app = Flask(__name__)

MILVUS_HOST = "localhost"
MILVUS_PORT = "19530"

# Connect to Milvus
connections.connect(host=MILVUS_HOST, port=MILVUS_PORT)
print('Connected to Milvus!')

# Create 'upload_folder' directory if it doesn't exist
if not os.path.exists('upload_folder'):
    os.makedirs('upload_folder')

nlp = spacy.load("en_core_web_sm")
model = SentenceTransformer("paraphrase-MiniLM-L6-v2")

# def fast_coref_arch(txt):
    
#     return text

def define_collection(collection_name):
    document_id = FieldSchema(name='document_id', dtype=DataType.INT64, is_primary=True, auto_id=True)
    metadata = FieldSchema(name='metadata', dtype=DataType.VARCHAR, max_length=15000)
    metadata_page = FieldSchema(name='metadata_page', dtype=DataType.INT64)    
    embeddings = FieldSchema(name='embeddings', dtype=DataType.FLOAT_VECTOR, dim=384)
    text = FieldSchema(name='text', dtype=DataType.VARCHAR, max_length=60000)
    schema = CollectionSchema(fields=[document_id, metadata, metadata_page, embeddings, text], enable_dynamic_field=True)
    
    if not utility.has_collection(collection_name):
        collection = Collection(name=collection_name, schema=schema, using='default')
        print('Collection created!')
    else:
        collection = Collection(collection_name)
        print('Collection already exists.')
    return collection

def process(text):
    doc = nlp(text)
    sentences = list(doc.sents)
    sentence_embeddings = model.encode(sentences, convert_to_tensor=True)
    return sentences, sentence_embeddings

def cluster_text(sentences, sentence_embeddings, threshold):
    clusters = [[0]]
    for i in range(1, len(sentences)):
        similarity = F.cosine_similarity(sentence_embeddings[i - 1:i], sentence_embeddings[i:i + 1]).item()
        if similarity < threshold:
            clusters.append([])
        clusters[-1].append(i)
    return clusters

def clean_text(text):## logic for removing header and footer should be written here
    return text

def extract_text_from_pdf(pdf_path):
    text=" "
    nlp = spacy.load("en_core_web_sm")
    nlp.add_pipe(
        "fastcoref",
        config={'model_architecture': 'LingMessCoref', 'model_path': 'biu-nlp/lingmess-coref', 'device': 'cpu'}
    )                
    
    with open(pdf_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        text_list = []
        embedding_list = []
        metadata_list = []
        metadata_page_list = []

        for page in range(len(pdf_reader.pages)):
            text += pdf_reader.pages[page].extract_text()
            # print(len(text))
    try:
        doc = nlp(text, component_cfg={"fastcoref": {'resolve_text': True}})
        text = doc._.resolved_text
        text = re.sub("\n", " ", text)
        text = text.lower()
        # print(text)

        if len(text) <= 1300:
            metadata = f"{pdf_path}_{page}"
            metadata_page = page
            text_list.append(text)
            # print(text)
            embeddings = model.encode(text)
            embedding_list.append(embeddings)
            metadata_list.append(metadata)
            metadata_page_list.append(metadata_page)
        else:
            process_large_text(text, pdf_path, page, text_list, embedding_list, metadata_list, metadata_page_list)

    except:
        pass

    return text_list, embedding_list, metadata_list, metadata_page_list

def process_large_text(text, pdf_path, page, text_list, embedding_list, metadata_list, metadata_page_list):
    threshold = 0.3
    sentences, sentence_embeddings = process(text)
    clusters = cluster_text(sentences, sentence_embeddings, threshold)

    for cluster in clusters:
        cluster_txt = clean_text(' '.join([str(sentences[i]) for i in cluster]))
        cluster_len = len(cluster_txt)
        # print("*************")
        # print(cluster_len)

        if cluster_len < 80:
            continue
        elif cluster_len > 1300:
            threshold = 0.6
            process_large_text(cluster_txt, pdf_path, page, text_list, embedding_list, metadata_list, metadata_page_list)
        else:
            metadata = f"{pdf_path}_{page}"
            metadata_page = page
            text_list.append(cluster_txt)
            embeddings = model.encode(cluster_txt)
            embedding_list.append(embeddings)
            metadata_list.append(metadata)
            metadata_page_list.append(metadata_page)
        # print(text_list)

@app.route('/')
def index():
    return 'Welcome to the PDF Ingestion API!'

@app.route('/upload-pdf/<collection_name>', methods=['POST'])
def upload_pdf(collection_name):
    # Define the collection here
    collection = define_collection(collection_name)

    if 'file' not in request.files:
        return jsonify({'error': 'No file part'}), 400

    file = request.files['file']
    if file.filename == '':
        return jsonify({'error': 'No selected file'}), 400

    if file and file.filename.endswith(".pdf"):

        uploaded_file_path = os.path.join("upload_folder", file.filename)
        file.save(uploaded_file_path)
        
        # Extract text and metadata from the PDF
        text_list, embedding_list, metadata_list, metadata_page_list = extract_text_from_pdf(uploaded_file_path)

        # Insert data into Milvus collection
        collection.insert([metadata_list, metadata_page_list, embedding_list, text_list])

        # Create an index on the "embeddings" field
        index_params = {
            'metric_type': 'L2',
            'index_type': "HNSW",
            'efConstruction': 40,
            'M': 20
        }
        collection.create_index(field_name="embeddings", index_params=index_params)
        print('Index created.')

        return jsonify({'message': 'Data inserted into the collection.'}), 200

    return jsonify({'error': 'Invalid file format'}), 400


if __name__ == '__main__':
    app.run(debug=False)



Connected to Milvus!


01/25/2024 16:24:50 - INFO - 	 Load pretrained SentenceTransformer: paraphrase-MiniLM-L6-v2
01/25/2024 16:24:50 - INFO - 	 Use pytorch device: cpu


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
01/25/2024 16:24:50 - INFO - 	 [33mPress CTRL+C to quit[0m


Collection created!


01/25/2024 16:25:40 - INFO - 	 missing_keys: []
01/25/2024 16:25:40 - INFO - 	 unexpected_keys: []
01/25/2024 16:25:40 - INFO - 	 mismatched_keys: []
01/25/2024 16:25:40 - INFO - 	 error_msgs: []
01/25/2024 16:25:40 - INFO - 	 Model Parameters: 590.0M, Transformer: 434.6M, Coref head: 155.4M
01/25/2024 16:25:40 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

01/25/2024 16:25:41 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

01/25/2024 16:25:49 - ERROR - 	 Exception on /upload-pdf/a1b2 [POST]
Traceback (most recent call last):
  File "c:\Users\Palash Ashok Bhosale\anaconda3\envs\bot\lib\site-packages\flask\app.py", line 2190, in wsgi_app
    response = self.full_dispatch_request()
  File "c:\Users\Palash Ashok Bhosale\anaconda3\envs\bot\lib\site-packages\flask\app.py", line 1486, in full_dispatch_request
    rv = self.handle_user_exception(e)
  File "c:\Users\Palash Ashok Bhosale\anaconda3\envs\bot\lib\site-packages\flask\app.py", line 1484, in full_dispatch_request
    rv = self.dispatch_request()
  File "c:\Users\Palash Ashok Bhosale\anaconda3\envs\bot\lib\site-packages\flask\app.py", line 1469, in dispatch_request
    return self.ensure_sync(self.view_functions[rule.endpoint])(**view_args)
  File "C:\Users\Palash Ashok Bhosale\AppData\Local\Temp\ipykernel_2400\2810014999.py", line 158, in upload_pdf
    collection.insert([metadata_list, metadata_page_list, embedding_list, text_list])
  File "c:\Users\Pa

Collection already exists.


: 

# Model_384 ingestion api

In [5]:
import nltk

nltk.download('punkt')

from nltk.tokenize import word_tokenize
from flask import Flask, request, jsonify
from pymilvus import CollectionSchema, FieldSchema, DataType, Collection, connections, utility
import os
import spacy
import re
import PyPDF2
from fastcoref import spacy_component
from sentence_transformers import SentenceTransformer
import torch.nn.functional as F

app = Flask(__name__)

MILVUS_HOST = "localhost"
MILVUS_PORT = "19530"

# Connect to Milvus
connections.connect(host=MILVUS_HOST, port=MILVUS_PORT)
print('Connected to Milvus!')

# Create 'upload_folder' directory if it doesn't exist
if not os.path.exists('upload_folder'):
    os.makedirs('upload_folder')

nlp = spacy.load("en_core_web_sm")
model = SentenceTransformer("paraphrase-MiniLM-L6-v2")

# def fast_coref_arch(txt):
    
#     return text

def define_collection(collection_name):
    document_id = FieldSchema(name='document_id', dtype=DataType.INT64, is_primary=True, auto_id=True)
    metadata = FieldSchema(name='metadata', dtype=DataType.VARCHAR, max_length=15000)
    metadata_page = FieldSchema(name='metadata_page', dtype=DataType.INT64)    
    embeddings = FieldSchema(name='embeddings', dtype=DataType.FLOAT_VECTOR, dim=384)
    text = FieldSchema(name='text', dtype=DataType.VARCHAR, max_length=60000)
    schema = CollectionSchema(fields=[document_id, metadata, metadata_page, embeddings, text], enable_dynamic_field=True)
    
    if not utility.has_collection(collection_name):
        collection = Collection(name=collection_name, schema=schema, using='default')
        print('Collection created!')
    else:
        collection = Collection(collection_name)
        print('Collection already exists.')
    return collection

def process(text):
    doc = nlp(text)
    sentences = list(doc.sents)
    sentence_embeddings = model.encode(sentences, convert_to_tensor=True)
    return sentences, sentence_embeddings

def cluster_text(sentences, sentence_embeddings, threshold):
    clusters = [[0]]
    for i in range(1, len(sentences)):
        similarity = F.cosine_similarity(sentence_embeddings[i - 1:i], sentence_embeddings[i:i + 1]).item()
        if similarity < threshold:
            clusters.append([])
        clusters[-1].append(i)
    return clusters

def clean_text(text):## logic for removing header and footer should be written here
    return text

def extract_text_from_pdf(pdf_path):
    nlp = spacy.load("en_core_web_sm")
    nlp.add_pipe(
        "fastcoref",
        config={'model_architecture': 'LingMessCoref', 'model_path': 'biu-nlp/lingmess-coref', 'device': 'cpu'}
    )                
    
    with open(pdf_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        text_list = []
        embedding_list = []
        metadata_list = []
        metadata_page_list = []

        for page in range(len(pdf_reader.pages)):
            text = pdf_reader.pages[page].extract_text()
            # print(len(text))
            try:
                doc = nlp(text, component_cfg={"fastcoref": {'resolve_text': True}})
                text = doc._.resolved_text
                text = re.sub("\n", " ", text)
                text= text.lower()
                words = word_tokenize(text)

                readable_sentence = ' '.join(words)
                text=readable_sentence
                # print(text)

                if len(text) <= 1300:
                    metadata = f"{pdf_path}_{page}"
                    metadata_page = page
                    text_list.append(text)
                    # print(text)
                    embeddings = model.encode(text)
                    embedding_list.append(embeddings)
                    metadata_list.append(metadata)
                    metadata_page_list.append(metadata_page)
                else:
                    process_large_text(text, pdf_path, page, text_list, embedding_list, metadata_list, metadata_page_list)

            except:
                pass

    return text_list, embedding_list, metadata_list, metadata_page_list

def process_large_text(text, pdf_path, page, text_list, embedding_list, metadata_list, metadata_page_list):
    threshold = 0.3
    sentences, sentence_embeddings = process(text)
    clusters = cluster_text(sentences, sentence_embeddings, threshold)

    for cluster in clusters:
        cluster_txt = clean_text(' '.join([str(sentences[i]) for i in cluster]))
        cluster_len = len(cluster_txt)
        # print("*************")
        # print(cluster_len)

        if cluster_len < 80:
            continue
        elif cluster_len > 1300:
            threshold = 0.6
            process_large_text(cluster_txt, pdf_path, page, text_list, embedding_list, metadata_list, metadata_page_list)
        else:
            metadata = f"{pdf_path}_{page}"
            metadata_page = page
            text_list.append(cluster_txt)
            embeddings = model.encode(cluster_txt)
            embedding_list.append(embeddings)
            metadata_list.append(metadata)
            metadata_page_list.append(metadata_page)
        # print(text_list)

@app.route('/')
def index():
    return 'Welcome to the PDF Ingestion API!'

@app.route('/upload-pdf/<collection_name>', methods=['POST'])
def upload_pdf(collection_name):
    # Define the collection here
    collection = define_collection(collection_name)

    if 'file' not in request.files:
        return jsonify({'error': 'No file part'}), 400

    file = request.files['file']
    if file.filename == '':
        return jsonify({'error': 'No selected file'}), 400

    if file and file.filename.endswith(".pdf"):

        uploaded_file_path = os.path.join("upload_folder", file.filename)
        file.save(uploaded_file_path)
        
        # Extract text and metadata from the PDF
        text_list, embedding_list, metadata_list, metadata_page_list = extract_text_from_pdf(uploaded_file_path)

        # Insert data into Milvus collection
        collection.insert([metadata_list, metadata_page_list, embedding_list, text_list])

        # Create an index on the "embeddings" field
        index_params = {
            'metric_type': 'L2',
            'index_type': "HNSW",
            'efConstruction': 40,
            'M': 20
        }
        collection.create_index(field_name="embeddings", index_params=index_params)
        print('Index created.')

        return jsonify({'message': 'Data inserted into the collection.'}), 200
        
    return jsonify({'error': 'Invalid file format'}), 400


if __name__ == '__main__':
    app.run(debug=False)



[nltk_data] Downloading package punkt to C:\Users\Palash Ashok
[nltk_data]     Bhosale\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Connected to Milvus!


02/01/2024 14:40:12 - INFO - 	 Load pretrained SentenceTransformer: paraphrase-MiniLM-L6-v2
02/01/2024 14:40:13 - INFO - 	 Use pytorch device: cpu


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
02/01/2024 14:40:13 - INFO - 	 [33mPress CTRL+C to quit[0m
