import os
import re

def convert_notation(text):
    """Rewrite every ``X.Y:`` marker in *text* as ``over X ball Y:``.

    ``X`` (the over number) and ``Y`` (the ball number) are runs of digits
    separated by a dot and immediately followed by a colon. All other text is
    returned unchanged.
    """
    def _spell_out(found):
        # Group 1 is the over number, group 2 the ball number.
        over, ball = found.groups()
        return f"over {over} ball {ball}:"

    return re.sub(r'(\d+)\.(\d+):', _spell_out, text)

def process_file(input_filename, output_filename, encoding="utf-8"):
    """Convert the ball notation in one text file and write the result.

    Reads *input_filename* in full, passes its text through
    ``convert_notation`` and writes the converted text to *output_filename*,
    creating the file or overwriting an existing one.

    Args:
        input_filename: Path of the text file to read.
        output_filename: Path of the text file to write.
        encoding: Text encoding used for both files. Defaults to UTF-8 so the
            result does not depend on the platform's locale encoding.
    """
    # Read the whole input up front; the converter works on a single string.
    with open(input_filename, 'r', encoding=encoding) as infile:
        text = infile.read()

    converted_text = convert_notation(text)

    # Use the same encoding on the way out so the round trip is lossless.
    with open(output_filename, 'w', encoding=encoding) as outfile:
        outfile.write(converted_text)

def process_files_in_directory(input_directory, output_directory):
    """Convert every ``.txt`` file in a directory into numbered output files.

    Each regular file in *input_directory* whose name ends with ``.txt`` is
    passed to ``process_file`` and written to *output_directory* as
    ``1.txt``, ``2.txt``, ... A progress line is printed per file.
    Sub-directories are not descended into.

    Args:
        input_directory: Directory containing the source ``.txt`` files.
        output_directory: Directory for the converted files; created
            (including missing parents) if it does not exist.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_directory, exist_ok=True)

    file_number = 1

    # os.listdir returns names in arbitrary order; sort them so the
    # input -> N.txt numbering is the same on every run.
    for filename in sorted(os.listdir(input_directory)):
        input_path = os.path.join(input_directory, filename)

        # Process only regular .txt files
        if not (filename.endswith('.txt') and os.path.isfile(input_path)):
            continue

        output_filename = f"{file_number}.txt"
        output_path = os.path.join(output_directory, output_filename)

        process_file(input_path, output_path)
        print(f"Processed {filename} -> {output_filename}")

        file_number += 1

# Example usage: run the conversion over the Kaggle input dataset and write the
# numbered results into 'bye', a path relative to the current working directory.
# NOTE: these statements are at module level, so they run whenever this
# file/cell is executed.
input_directory = '/kaggle/input/ougcgutxyreztew'
output_directory = 'bye'

process_files_in_directory(input_directory, output_directory)


## RAG System Using a Hugging Face LLM (Gemma 2 9B) With LlamaIndex

In [None]:
!pip install pypdf

In [None]:
!pip install -q transformers einops accelerate langchain bitsandbytes

In [None]:
## Embedding
!pip install -q sentence_transformers

In [None]:
!pip install -q llama-index-llms-huggingface

In [None]:
!pip install -q llama-index-readers-file # Install the missing package for file readers

In [None]:
from llama_index.core import VectorStoreIndex,SimpleDirectoryReader,ServiceContext,PromptTemplate
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.core.prompts.prompts import SimpleInputPrompt

In [None]:
# Load the files found under the Kaggle dataset directory into `documents`,
# which is later passed to VectorStoreIndex.from_documents.
documents=SimpleDirectoryReader("/kaggle/input/t20-wc").load_data()

In [None]:
# System prompt handed to the LLM wrapper (passed as `system_prompt=` below).
system_prompt="""
You are an expert assistant. Answer the question based on the given context.
DON'T REFER TO THE PREVIOUS ANSWERS FOR THE CURRENT ANSWERS.
"""
## Wrap each query in <|USER|> / <|ASSISTANT|> markers before it reaches the model.
## NOTE(review): this was originally described as the Llama 2 default format, but the
## model loaded later in this notebook is google/gemma-2-9b — confirm the template suits it.
query_wrapper_prompt=SimpleInputPrompt("<|USER|>{query_str}<|ASSISTANT|>")

In [None]:
from huggingface_hub import notebook_login
# Interactive Hugging Face Hub login for this notebook session.
# NOTE(review): presumably required because the model loaded below is access-gated — confirm.
notebook_login()

# Alternative kept for reference: non-interactive login with a token.
# Never commit a real token to the notebook.
# # option 2: key login
# from huggingface_hub import login
# write_key = 'hf_' # paste token here
# login(write_key)

In [None]:
import torch

# Build the Hugging Face LLM wrapper that the service context / query engine use
# to generate answers. Model and tokenizer both come from google/gemma-2-9b.
llm = HuggingFaceLLM(
    context_window=4096,
    # NOTE(review): max_new_tokens is half of context_window, which presumably leaves
    # roughly 2048 tokens for the prompt plus retrieved context — confirm this is intended.
    max_new_tokens=2048,
    # NOTE(review): do_sample=False requests non-sampling decoding, so the temperature
    # value is presumably unused (and may trigger a warning) — confirm and drop if so.
    generate_kwargs={"temperature": 0.0, "do_sample": False},
    system_prompt=system_prompt,
    query_wrapper_prompt=query_wrapper_prompt,
    tokenizer_name="google/gemma-2-9b",
    model_name="google/gemma-2-9b",
    device_map="auto",
    # Request float16 weights with 8-bit loading (bitsandbytes is installed in an
    # earlier cell), presumably to reduce GPU memory usage.
    model_kwargs={"torch_dtype": torch.float16 , "load_in_8bit":True}
)

In [None]:
!pip install -q langchain-community langchain-core

In [None]:
!pip install -q -U llama-index-core llama-index-llms-openai llama-index-embeddings-openai

In [None]:
!pip install -q llama-index-legacy

In [None]:
from llama_index.core import ServiceContext

In [None]:
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
# from llama_index import ServiceContext
from llama_index.legacy.embeddings.langchain import LangchainEmbedding

# Embedding model for the index: the sentence-transformers all-mpnet-base-v2 model,
# created through LangChain's HuggingFaceEmbeddings and wrapped in LangchainEmbedding
# so it can be passed as `embed_model` to the service context.
embed_model=LangchainEmbedding(
    HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2"))

In [None]:
# Bundle the LLM, the embedding model and the chunk size into one object that is
# passed to the index builder.
# NOTE(review): ServiceContext is deprecated in newer llama-index releases in favour of
# `Settings`, and an earlier cell upgrades llama-index-core with `-U` — confirm this call
# still works on the installed version.
service_context=ServiceContext.from_defaults(
    chunk_size=1024,
    llm=llm,
    embed_model=embed_model
)

In [None]:
# Bare expression: shows the service context as the notebook cell output.
service_context

In [None]:
# Build the vector index from the loaded documents using the service context above.
index=VectorStoreIndex.from_documents(documents,service_context=service_context)

In [None]:
# Bare expression: shows the index object as the notebook cell output.
index

In [None]:
# Query engine over the index, created with default options; used by every query cell below.
query_engine=index.as_query_engine()

In [None]:
# First query. NOTE(review): the same question is issued again in a later cell,
# so this call looks redundant — confirm before removing.
response=query_engine.query("Who won the match between Sri Lanka vs West Indies on 2016-03-20?")

In [None]:
# Bare expression: shows the type of the response object as the cell output.
type(response)

In [None]:
response = query_engine.query("Who won the match between Sri Lanka vs West Indies on 2016-03-20?")

# The answer is read from the retrieved source chunks, not from the generated
# text: print the first line starting with "Result:" in each chunk.
source_nodes = response.source_nodes if response and hasattr(response, 'source_nodes') else []
found_result = False
for node_with_score in source_nodes:
    if hasattr(node_with_score, 'node') and hasattr(node_with_score.node, 'text'):
        text = node_with_score.node.text
        for line in text.split('\n'):
            if line.startswith('Result:'):
                print(line)
                found_result = True
                break  # only the first matching line of this chunk

# Report a miss whenever nothing was printed — including the case where chunks
# were retrieved but none of them contains a matching line.
if not found_result:
    print("No relevant response found.")


In [None]:
response = query_engine.query("Who won the match between Namibia vs India on 2021-11-08?")

# The answer is read from the retrieved source chunks, not from the generated
# text: print the first line starting with "Result:" in each chunk.
source_nodes = response.source_nodes if response and hasattr(response, 'source_nodes') else []
found_result = False
for node_with_score in source_nodes:
    if hasattr(node_with_score, 'node') and hasattr(node_with_score.node, 'text'):
        text = node_with_score.node.text
        for line in text.split('\n'):
            if line.startswith('Result:'):
                print(line)
                found_result = True
                break  # only the first matching line of this chunk

# Report a miss whenever nothing was printed — including the case where chunks
# were retrieved but none of them contains a matching line.
if not found_result:
    print("No relevant response found.")


In [None]:
response = query_engine.query("Who was the man of the match between New Zealand vs Namibia happened in 2021?")

# The answer is read from the retrieved source chunks, not from the generated
# text: print the first line containing '0.5' in each chunk.
# NOTE(review): the question asks for the man of the match, yet the filter is
# the literal '0.5' — this looks like a data-specific stand-in; confirm the
# intended marker against the dataset.
source_nodes = response.source_nodes if response and hasattr(response, 'source_nodes') else []
found_line = False
for node_with_score in source_nodes:
    if hasattr(node_with_score, 'node') and hasattr(node_with_score.node, 'text'):
        text = node_with_score.node.text
        for line in text.split('\n'):
            if '0.5' in line:
                print(line)
                found_line = True
                break  # only the first matching line of this chunk

# Report a miss whenever nothing was printed — including the case where chunks
# were retrieved but none of them contains a matching line.
if not found_line:
    print("No relevant response found.")


In [None]:
response = query_engine.query("Who won the match between India vs Bangladesh happened in 2016?")

# The answer is read from the retrieved source chunks, not from the generated
# text: print the first line starting with '9.4:' in each chunk.
# NOTE(review): the other "who won" cells look for a 'Result:' line, while this
# one looks for '9.4:' — confirm which marker is intended for this match.
source_nodes = response.source_nodes if response and hasattr(response, 'source_nodes') else []
found_result = False
for node_with_score in source_nodes:
    if hasattr(node_with_score, 'node') and hasattr(node_with_score.node, 'text'):
        text = node_with_score.node.text
        # Print the entire text for debugging purposes
        print("Full text content:\n", text)

        for line in text.split('\n'):
            # Print each line for debugging purposes
            print("Checking line:", line)
            if line.startswith('9.4:'):
                print("Match result:", line)
                found_result = True
                break  # only the first matching line of this chunk

# Report a miss whenever no match was printed — including the case where
# chunks were retrieved but none of them contains a matching line.
if not found_result:
    print("No relevant response found.")
