In [None]:
import pandas as pd
import re
import numpy as np  
from faiss import IndexFlatL2, IndexIDMap
from transformers import AutoModelForQuestionAnswering, AutoTokenizer
from sentence_transformers import SentenceTransformer

# Hard-coded financial figures used by the rule-based answers.
# Each dictionary maps a numbered label ("<Prefix> 1", "<Prefix> 2", ...) to an
# amount; labels are generated in the same order as the amounts are listed.
assets = {
    f"Assets {position}": amount
    for position, amount in enumerate(
        (800, 1200, 4000, 1500, 9000, 3000), start=1
    )
}

liabilities = {
    f"Liabilities {position}": amount
    for position, amount in enumerate((2000, 1500, 3000, 300, 1800), start=1)
}

income = {
    f"Income {position}": amount
    for position, amount in enumerate((9000, 1500, 3000), start=1)
}

expenses = {
    f"Expense {position}": amount
    for position, amount in enumerate(
        (500, 1000, 800, 1100, 1800, 1250, 1000, 3500), start=1
    )
}

class TextLoader:
    """Load a personal financial statement text file as two ``Document`` objects.

    The file is expected to contain a section introduced by a line containing
    'Personal Balance Sheet as of' and a section introduced by a line
    containing 'Personal Income Statement for the year'. Table rows inside a
    section are pipe-separated with exactly three fields; the first such row
    of a section is used as the column header.
    """

    def __init__(self, filename):
        """Remember the path of the statement file; nothing is read yet."""
        self.filename = filename

    def load(self):
        """Parse the file and return ``[balance_sheet_doc, income_statement_doc]``.

        Lines before the first section header, the header lines themselves and
        dashed divider lines are skipped. Each section is converted to a
        DataFrame and rendered with ``to_string(index=False)`` as the
        document content.

        Returns:
            A list with exactly two ``Document`` objects, balance sheet first.

        Raises:
            OSError: if the file cannot be opened.
        """
        section_lines = {'balance_sheet': [], 'income_statement': []}
        current_section = None

        with open(self.filename, 'r') as file:
            for line in file:
                if 'Personal Balance Sheet as of' in line:
                    current_section = 'balance_sheet'
                elif 'Personal Income Statement for the year' in line:
                    current_section = 'income_statement'
                elif '----------------------------------------------' in line:
                    continue
                elif current_section is not None:
                    section_lines[current_section].append(line.strip())

        balance_sheet_df = self._lines_to_dataframe(section_lines['balance_sheet'])
        income_statement_df = self._lines_to_dataframe(section_lines['income_statement'])

        return [
            Document(content=balance_sheet_df.to_string(index=False)),
            Document(content=income_statement_df.to_string(index=False)),
        ]

    def _lines_to_dataframe(self, lines):
        """Build a DataFrame from pipe-separated lines.

        Only non-empty lines that split into exactly three '|'-separated
        fields are kept; the first kept row supplies the column names and the
        remaining rows become the data. All values stay strings.

        Returns:
            The parsed DataFrame, or an empty DataFrame when no line
            qualifies (previously this case raised ``IndexError``).
        """
        rows = []
        for line in lines:
            if not line:
                continue
            parts = [part.strip() for part in line.split('|')]
            if len(parts) == 3:
                rows.append(parts)

        # A missing or differently formatted section leaves nothing to use as
        # a header row, so return an empty frame instead of indexing rows[0].
        if not rows:
            return pd.DataFrame()

        header, *body = rows
        return pd.DataFrame(body, columns=header)

class Document:
    """Minimal container for one piece of text handed to the retrieval/QA code."""

    def __init__(self, content):
        # The document text; stored as given, without any processing.
        self.content = content

    def __repr__(self):
        """Show the stored content so instances are readable when printed or logged."""
        return f"{type(self).__name__}(content={self.content!r})"

def clean_and_preprocess(text):
    """Lower-case *text*, strip punctuation and return its whitespace-separated tokens."""
    # Every character that is neither a word character nor whitespace is dropped,
    # so "1,200.50" collapses to "120050".
    without_punctuation = re.sub(r'[^\w\s]', '', text.lower())
    return without_punctuation.split()

# Parse the statement file into Document objects. The path is an absolute
# Windows path, so it must be adjusted when running on another machine.
loader = TextLoader("C:\\Users\\sulek\\OneDrive\\Desktop\\financial rag system\\Personal Financial Statement.txt")
documents = loader.load()

# The rest of the script lives in the else branch and therefore only runs
# when the loader returned a non-empty list.
if not documents:
    print("No documents loaded. Please check the input file.")
else:
    # Normalise each document (lower-case, punctuation removed) and re-join the
    # tokens with single spaces to get one cleaned string per document.
    processed_documents = [" ".join(clean_and_preprocess(doc.content)) for doc in documents]

    def create_embeddings(texts):
        """Encode *texts* with the "all-mpnet-base-v2" sentence-transformer.

        The model is constructed on the first call and cached on the function
        object, so later calls (one per user question) reuse it instead of
        loading the weights again.

        Args:
            texts: list of strings to embed.

        Returns:
            Whatever ``SentenceTransformer.encode`` returns for *texts*; the
            caller reads ``.shape[1]`` from it, so a 2-D array is expected.
        """
        model_name = "all-mpnet-base-v2"
        encoder = getattr(create_embeddings, "_model", None)
        if encoder is None:
            encoder = SentenceTransformer(model_name)
            create_embeddings._model = encoder
        return encoder.encode(texts)

    # Embed every cleaned document once and store the vectors in an exact
    # L2 index; the id of each vector is its position in `documents`.
    embeddings = create_embeddings(processed_documents)
    index = IndexIDMap(IndexFlatL2(embeddings.shape[1]))
    # FAISS ids are int64. Build them with an explicit dtype rather than the
    # platform default integer, which is int32 on Windows with NumPy < 2.
    index.add_with_ids(embeddings, np.arange(len(embeddings), dtype=np.int64))

    # Extractive question-answering model and its tokenizer.
    model_name = "distilbert-base-cased-distilled-squad"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForQuestionAnswering.from_pretrained(model_name)

    def formulate_query(question):
        """Normalise *question* with ``clean_and_preprocess`` and return the token list."""
        query_tokens = clean_and_preprocess(question)
        return query_tokens

    def generate_answer(model, tokenizer, question, document):
        """Extract from ``document.content`` the text span that best answers *question*.

        Args:
            model: question-answering model whose output exposes
                ``start_logits`` and ``end_logits``.
            tokenizer: tokenizer matching *model*.
            question: the question as plain text.
            document: object with a ``content`` string used as the context.

        Returns:
            The decoded answer span, or an empty string when the predicted
            end precedes the predicted start or the span holds only special
            tokens.
        """
        # Local import keeps this block self-contained; return_tensors='pt'
        # already requires PyTorch to be installed.
        import torch

        # Truncate only the context so question + document never exceed the
        # model's maximum input length.
        inputs = tokenizer(
            question,
            document.content,
            return_tensors='pt',
            truncation='only_second',
        )

        # Inference only: no gradients are needed.
        with torch.no_grad():
            outputs = model(**inputs)

        answer_start = int(outputs.start_logits.argmax())
        answer_end = int(outputs.end_logits.argmax())
        if answer_end < answer_start:
            return ""

        # Skip special tokens so a prediction pointing at the first token does
        # not surface as a literal "[CLS]" answer.
        return tokenizer.decode(
            inputs.input_ids[0][answer_start:answer_end + 1],
            skip_special_tokens=True,
        )

    def answer_question(question):
        """Answer a finance question from the hard-coded figures by keyword matching.

        The question is matched case-insensitively against a fixed list of
        phrases; the first phrase found decides which figure is reported.

        Args:
            question: the user's question as typed.

        Returns:
            A formatted answer string, or an apology when no phrase matches.
        """
        # NOTE(review): an earlier version embedded the question and queried the
        # FAISS index here, but no branch used the retrieved document, and the
        # lookup unpacked index.search() as (ids, distances) although FAISS
        # returns (distances, ids). The dead lookup was removed; if
        # retrieval-backed answers are wanted, add it to the fallback below.

        # Phrases are lower-case, so compare against a lower-cased copy;
        # otherwise "How much do I owe" or "Total income" would never match.
        normalized = question.lower()

        if "total income" in normalized:
            total_income = sum(income.values())
            return f"Total Income: ${total_income:.2f}"
        if "total liabilities" in normalized:
            total_liabilities = sum(liabilities.values())
            return f"Total Liabilities: ${total_liabilities:.2f}"
        if "total expenses" in normalized:
            total_expenses = sum(expenses.values())
            return f"Total Expenses: ${total_expenses:.2f}"
        if "net income" in normalized:
            net_income = sum(income.values()) - sum(expenses.values())
            return f"Net Income: ${net_income:.2f}"
        if "how much do i owe" in normalized:
            return "\n".join([f"{liability}: ${amount:.2f}" for liability, amount in liabilities.items()])
        if "highest income source" in normalized:
            highest_income_source = max(income, key=income.get)
            return f"Highest Income Source: {highest_income_source}"
        if "lowest expense category" in normalized:
            lowest_expense_category = min(expenses, key=expenses.get)
            return f"Lowest Expense Category: {lowest_expense_category}"
        if "most valuable asset" in normalized:
            most_valuable_asset = max(assets, key=assets.get)
            return f"Most Valuable Asset: {most_valuable_asset}"
        if "least valuable asset" in normalized:
            least_valuable_asset = min(assets, key=assets.get)
            return f"Least Valuable Asset: {least_valuable_asset}"

        return "Sorry, I couldn't understand your question."

    # Interactive chat loop. It ends when the user types "exit" or "quit",
    # or when input is closed or interrupted (Ctrl-D / Ctrl-C / kernel
    # interrupt), instead of running forever or dying with a traceback.
    while True:
        try:
            user_question = input("Ask me a question about your finances (e.g., what is my total income?): ")
        except (EOFError, KeyboardInterrupt):
            print()
            break

        if user_question.strip().lower() in {"exit", "quit"}:
            break

        answer = answer_question(user_question)
        print(f"Chatbot: {answer}")


Ask me a question about your finances (e.g., what is my total income?): what is the highest income source?
Chatbot: Highest Income Source: Income 1
