### Importing Packages

In [None]:
!pip install sentence-transformers==2.2.2

In [None]:
import os
import glob
import pandas as pd
import numpy as np
from tqdm import tqdm
import pdfplumber
from configparser import ConfigParser
from pymilvus import connections, Collection, utility
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.vectorstores import Milvus
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import CrossEncoder
from ibm_watson_machine_learning.foundation_models import Model
from ibm_watson_machine_learning.metanames import GenTextParamsMetaNames as GenParams
from prompt import prompt_generator
from sentence_transformers import SentenceTransformer


In [None]:
!pip install ibm_watson_machine_learning

In [4]:
from langchain.embeddings import HuggingFaceInstructEmbeddings

# Instantiate the embedding model wrapper, pinned to run on the CPU.
embeddings = HuggingFaceInstructEmbeddings(
    model_kwargs=dict(device="cpu"),
    model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
)

  from tqdm.autonotebook import trange


load INSTRUCTOR_Transformer
max_seq_length  512


### RAG using Milvus Vector DB

In [None]:
class ZiraatBankQA:
    """Retrieval-augmented question answering over a folder of PDF files.

    The pipeline extracts text from PDFs, splits it into chunks, indexes the
    chunks in a Milvus collection, retrieves the chunks closest to a query,
    scores them with a cross-encoder and finally asks a watsonx.ai foundation
    model to answer the query from the retrieved context.
    """

    # Non-secret watsonx.ai defaults; both can be overridden through the
    # IBM_CLOUD_URL / PROJECT_ID environment variables.
    DEFAULT_WML_URL = "https://us-south.ml.cloud.ibm.com"
    DEFAULT_PROJECT_ID = "86cc43a6-c2f0-4e3e-a6e9-426ac8cf8f7b"

    def __init__(self, config_path):
        """Load configuration, credentials, embeddings and the text splitter.

        Args:
            config_path: Path to an INI file providing ``ModelID`` and
                ``EmbeddingsModel`` in ``[DEFAULT]`` and the connection
                settings (``Host``, ``Port``, ``User``, ``Password``,
                ``ServerPemPath``, ``ServerName``) in ``[Milvus]``.
        """
        self.config = self.load_config(config_path)

        self.creds, self.project_id = self.get_wml_creds()
        self.model_id = self.config['DEFAULT']['ModelID']
        self.embeddings = HuggingFaceInstructEmbeddings(
            model_name=self.config['DEFAULT']['EmbeddingsModel']
        )

        self.text_splitter = RecursiveCharacterTextSplitter(
            separators=['\n\n', '\n', '.', ' '],
            chunk_size=768,
            chunk_overlap=100,
            length_function=len
        )

        # The collection name is not read here: it is supplied per call to
        # create_vector_store.
        milvus_cfg = self.config['Milvus']
        self.host = milvus_cfg['Host']
        self.port = milvus_cfg['Port']
        self.user = milvus_cfg['User']
        self.password = milvus_cfg['Password']
        self.server_pem_path = milvus_cfg['ServerPemPath']
        self.server_name = milvus_cfg['ServerName']

    def load_config(self, config_path):
        """Parse the INI file at ``config_path`` and return the parser.

        Raises:
            FileNotFoundError: If the file is missing or unreadable.
                ``ConfigParser.read`` skips such files silently, which would
                otherwise surface later as an unrelated ``KeyError``.
        """
        config = ConfigParser()
        if not config.read(config_path):
            raise FileNotFoundError(
                f"Configuration file not found or unreadable: {config_path}"
            )
        return config

    def get_wml_creds(self):
        """Return ``(creds, project_id)`` for watsonx.ai.

        The API key is read from the ``API_KEY`` environment variable so that
        no secret lives in the source. ``IBM_CLOUD_URL`` and ``PROJECT_ID``
        are optional overrides of the class-level defaults.

        Returns:
            A tuple of the credentials dict (keys ``"url"`` and ``"apikey"``)
            and the project id string.

        Raises:
            RuntimeError: If any of the three values is missing or empty.
        """
        api_key = os.getenv("API_KEY")
        ibm_cloud_url = os.getenv("IBM_CLOUD_URL", self.DEFAULT_WML_URL)
        project_id = os.getenv("PROJECT_ID", self.DEFAULT_PROJECT_ID)

        if not api_key or not ibm_cloud_url or not project_id:
            # Fail loudly here instead of returning an unbound/partial result.
            raise RuntimeError(
                "Missing watsonx.ai credentials: set the API_KEY environment "
                "variable (and optionally IBM_CLOUD_URL and PROJECT_ID), "
                "e.g. by loading the .env file that you created earlier."
            )

        creds = {
            "url": ibm_cloud_url,
            "apikey": api_key,
        }
        return creds, project_id

    def send_to_watsonxai(self, prompt):
        """Generate text for ``prompt`` with the configured watsonx.ai model.

        Uses greedy decoding and produces between 1 and 200 new tokens.

        Returns:
            Whatever ``Model.generate_text`` returns for the prompt.
        """
        params = {
            GenParams.DECODING_METHOD: "greedy",
            GenParams.MIN_NEW_TOKENS: 1,
            GenParams.MAX_NEW_TOKENS: 200,
            GenParams.TEMPERATURE: 0,
        }
        model = Model(
            model_id=self.model_id,
            params=params,
            credentials=self.creds,
            project_id=self.project_id,
        )
        return model.generate_text(prompt)

    def load_documents(self, folder_path):
        """Extract and chunk the text of every ``*.pdf`` in ``folder_path``.

        Returns:
            A list of chunk documents produced by the text splitter; each
            carries the source path under ``metadata['file']``.
        """
        text_chunks = []
        files = glob.glob(os.path.join(folder_path, '*.pdf'))

        for file in tqdm(files):
            with pdfplumber.open(file) as pdf:
                # extract_text() can yield None for pages without a text
                # layer (e.g. scanned images); treat those as empty so the
                # join does not raise TypeError.
                data = ''.join(page.extract_text() or '' for page in pdf.pages)

            for chunk in self.text_splitter.create_documents([data]):
                chunk.metadata['file'] = file
                text_chunks.append(chunk)

        return text_chunks

    def create_vector_store(self, text_chunks, connection_name):
        """(Re)build a Milvus collection from ``text_chunks``.

        An existing collection with the same name is dropped first, so the
        returned store contains only the supplied chunks.

        Args:
            text_chunks: Chunk documents to embed and insert.
            connection_name: Name of the Milvus collection to create. Despite
                the parameter name, it is used as the collection name.

        Returns:
            The langchain ``Milvus`` vector store backed by that collection.
        """
        # Single source of truth for the connection settings used both by
        # pymilvus directly and by the langchain wrapper.
        connection_args = {
            "host": self.host,
            "port": self.port,
            "secure": True,
            "server_pem_path": self.server_pem_path,
            "server_name": self.server_name,
            "user": self.user,
            "password": self.password,
        }
        connections.connect("default", **connection_args)

        if utility.has_collection(connection_name):
            utility.drop_collection(connection_name)

        vector_db = Milvus.from_documents(
            text_chunks,
            self.embeddings,
            connection_args=connection_args,
            collection_name=connection_name
        )

        collection = Collection(connection_name)
        collection.load()
        return vector_db

    def perform_qa(self, df, query):
        """Answer ``query`` from the paragraphs in ``df``.

        Args:
            df: DataFrame with a ``'paragraph'`` column of context passages.
            query: The user question.

        Returns:
            ``(response, context)`` where ``context`` is the paragraphs
            joined by blank lines and ``response`` is the model output.
        """
        context = "\n\n".join(df['paragraph'])
        prompt = prompt_generator(context, query)
        response = self.send_to_watsonxai(prompt)
        return response, context

    def create_model(self, model_name):
        """Return a ``CrossEncoder`` for ``model_name`` with max length 512."""
        return CrossEncoder(model_name, max_length=512)

    def main(self, query, vector_db, model):
        """Retrieve context for ``query`` and generate an answer.

        Args:
            query: The user question.
            vector_db: Vector store exposing ``similarity_search_with_score``.
            model: Scorer exposing ``predict`` over (query, paragraph) pairs.

        Returns:
            ``(response, context)`` as produced by ``perform_qa``.
        """
        docs = vector_db.similarity_search_with_score(query, k=12, ef=7)

        _docs = pd.DataFrame(
            [(query, doc[0].page_content, doc[0].metadata.get('file'), doc[1]) for doc in docs],
            columns=['query', 'paragraph', 'document', 'relevent_score']
        )
        _docs['score'] = model.predict(_docs[['query', 'paragraph']].to_numpy())
        # NOTE(review): the cross-encoder scores are recorded but the rows
        # keep their retrieval order; confirm whether re-sorting by 'score'
        # was intended before the top rows are taken.
        df = _docs[:12]

        response, context = self.perform_qa(df, query)
        return response, context

### LLAMA3 Inference Q1

In [None]:
if __name__ == "__main__":
    # End-to-end run: index the PDFs under `folder_path`, then answer `query`.
    query = "M.Ö. 3000’lerden itibaren Smyrna’nın yerleşim yeri neresi olmuştur?"
    folder_path = 'Large'
    config_path = 'config.ini'
    ziraat_bank_qa = ZiraatBankQA(config_path)
    # create_vector_store requires the target Milvus collection name as its
    # second argument; take it from the config when present, otherwise fall
    # back to a fixed name.
    collection_name = ziraat_bank_qa.config['Milvus'].get('CollectionName', 'ziraat_bank_qa')
    text_chunks = ziraat_bank_qa.load_documents(folder_path)
    vector_db = ziraat_bank_qa.create_vector_store(text_chunks, collection_name)
    model = ziraat_bank_qa.create_model('emrecan/bert-base-turkish-cased-mean-nli-stsb-tr')
    response, context = ziraat_bank_qa.main(query, vector_db, model)
    print(response)


### LLAMA3 Inference Q2

In [None]:
query = "Tekâlîf-i fevkalâde vergisi nedir? Hangi kısımlardan oluşur? "
# main() needs the vector store AND the scoring model built in the Q1 cell.
response, context = ziraat_bank_qa.main(query, vector_db, model)
print(response)

In [None]:
query = "11. yüzyılda Smyrna bölgesinde ne olmuştur?"
# Pass the vector store and scoring model, not the PDF folder path.
response, context = ziraat_bank_qa.main(query, vector_db, model)
print(response)

In [None]:
query = "Smyrna ilk kuruluşu nerede ve ne zaman olmuştur? "
# Pass the vector store and scoring model, not the PDF folder path.
response, context = ziraat_bank_qa.main(query, vector_db, model)
print(response)

In [None]:
query = "Arzava ve Assuva'nın Smyrna açısından önemi nedir? "
# Pass the vector store and scoring model, not the PDF folder path.
response, context = ziraat_bank_qa.main(query, vector_db, model)
print(response)

In [None]:
query = "Charles Texier tarafından Smyrna konusunda hangi çalışmalar yapılmıştır?"
# Pass the vector store and scoring model, not the PDF folder path.
response, context = ziraat_bank_qa.main(query, vector_db, model)
print(response)

In [None]:
query = "Aiollerin Smyrna çevresinde kurduğu federasyonda hangi kentler bulunmaktadır?"
# Pass the vector store and scoring model, not the PDF folder path.
response, context = ziraat_bank_qa.main(query, vector_db, model)
print(response)

In [None]:
query = "İzmir iktisat kongresinde tüccar grubu tarafından belirlenen başlıca ilkeler nelerdir?"
# Pass the vector store and scoring model, not the PDF folder path.
response, context = ziraat_bank_qa.main(query, vector_db, model)
print(response)

### Validation

In [None]:
import pandas as pd

# Path to the Excel workbook holding the validation questions.
excel_file_path = 'Validation_set/validation_set_yns.xlsx'

# Read the workbook into a DataFrame using the openpyxl engine.
df = pd.read_excel(excel_file_path, engine='openpyxl')


responses = []

# Answer every question in the 'Soru' column with the already-built vector
# store and scoring model (main() takes those, not the PDF folder path).
for query in df['Soru']:
    response, context = ziraat_bank_qa.main(query, vector_db, model)
    responses.append(response)

# Attach the model responses next to their questions.
df['response'] = responses

# Save the questions together with the model responses. The `encoding`
# keyword is intentionally omitted: it was removed from DataFrame.to_excel in
# pandas 2.0 and was never needed for xlsx output, which is written as
# Unicode natively.
df.to_excel('output.xlsx', engine='openpyxl')

### 