In [None]:
# The goal of this Jupyter notebook is to use an LLM for text classification, specifically of support cases. The idea
# is to categorize each support case and additionally provide a reason for the categorization as well as a possible
# solution, in order to help and relieve the support team.

# For the advantages and disadvantages of using an LLM to classify text, see:
# https://sarah-packowski.medium.com/when-and-why-would-you-use-an-llm-for-text-classification-94b39ddc2947

import os

from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain_community.llms import LlamaCpp

# We use Llama.cpp for running an LLM locally:
# https://python.langchain.com/docs/guides/local_llms
# and a model from Mistral AI https://mistral.ai/
# Reason: Mistral AI models are open source and rank quite well on this leaderboard:
# https://huggingface.co/spaces/lmsys/chatbot-arena-leaderboard
# See also: https://www.reddit.com/r/LocalLLaMA/comments/18hh3qm/best_local_llm_for_german/
# https://mistral.ai/news/mixtral-of-experts/ (too slow on my laptop but better than Mistral-7B)
# https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2
# https://huggingface.co/TheBloke (LLM: quantisation, fine tuning)
# https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF (we need the new GGUF format)
# The location of the GGUF model file can be overridden with the LLAMA_MODEL_PATH environment variable, so the
# notebook is not tied to one machine. When the variable is unset, the original local path is used unchanged.
llm = LlamaCpp(
    model_path=os.environ.get(
        "LLAMA_MODEL_PATH",
        R"C:\Users\WernerGaisbauer\LLMs\mistral-7b-instruct-v0.2.Q4_K_M.gguf",
    ),
    n_gpu_layers=1,
    n_batch=512,
    n_ctx=2048,
    f16_kv=True,
    # A stdout streaming handler is registered as the only callback.
    callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]),
    verbose=True,
)

#llm("The first man on the moon was ... Let's think step by step")

In [None]:
# For testing, we use a bunch of German support cases.
# https://www.kaggle.com/datasets/jordanrich/german-emails-in-xml
import pandas as pd

# Read the German e-mail export into a DataFrame, then report its dimensions and its column names
# (one per output line).
df = pd.read_csv(filepath_or_buffer="archive/German_emails.csv")
print(df.shape, df.columns.tolist(), sep="\n")

In [None]:
# Walk through all rows of the DataFrame and print one line per support e-mail.
for index, row in df.iterrows():
    # Each field is rendered as "<column>: <value>"; the fields are separated by ", ".
    print(*(f"{column}: {row[column]}" for column in ("id", "category", "text", "relevantText")), sep=", ")

In [None]:
def _load_support_email(frame, position):
    """Fetch one support e-mail from ``frame`` and echo it.

    Args:
        frame: DataFrame holding the support e-mails; it must provide a "relevantText" column.
        position: Zero-based integer row position (looked up with ``iloc``).

    Returns:
        A tuple ``(row_dict, email_text)``: the selected row converted to a dictionary and the
        value stored under its "relevantText" key.
    """
    row_dict = frame.iloc[position].to_dict()
    email_text = row_dict["relevantText"]
    # Echo the position and the text so the selected sample is visible in the cell output.
    print(position)
    print(email_text)
    return row_dict, email_text


# Pick two sample e-mails (rows 1 and 6) to try the classification on.
n = 1
row1_dict, support_email1 = _load_support_email(df, n)

n = 6
row6_dict, support_email2 = _load_support_email(df, n)

In [None]:
# https://learn.deeplearning.ai/langchain/lesson/2/models,-prompts-and-parsers
# See section Parse the LLM output string into a Python dictionary.
from langchain.output_parsers import ResponseSchema
from langchain.output_parsers import StructuredOutputParser

In [None]:
# Comma-separated list of the categories a support request may be assigned to.
kategorien = ", ".join(
    (
        "Data Warehouse",
        "Technischer Support",
        "Rechnungsprobleme",
        "Kontoverwaltung",
        "Feedback",
        "Andere Anfragen",
    )
)

In [None]:
# One ResponseSchema per field requested from the LLM; each description is the German instruction for that field.
kategorie_schema = ResponseSchema(
    name="kategorie",
    description=f"Ordne die Anfrage einer der vorgegebenen Kategorien zu: {kategorien}.",
)
begruendung_schema = ResponseSchema(
    name="begruendung",
    description="Gib für die Zuordnung eine kurze Begründung an, um die Entscheidung zu erklären.",
)
loesungsvorschlag_schema = ResponseSchema(
    name="loesungsvorschlag",
    description="Gib einen Lösungsvorschlag für die Support-Anfrage an.",
)

# Collect the schemas in the order category, reason, proposed solution.
response_schemas = [kategorie_schema, begruendung_schema, loesungsvorschlag_schema]

In [None]:
# Create the structured output parser from the list of response schemas.
output_parser = StructuredOutputParser.from_response_schemas(
    response_schemas=response_schemas,
)

In [None]:
# Ask the parser for the format instructions it generates for the configured response schemas.
format_instructions = output_parser.get_format_instructions()

In [None]:
print(format_instructions)
# Unfortunately, the generated instructions are in English, but we need German ones, so we build the
# instructions manually below.
# There seems to be no localisation available out of the box:
# https://github.com/langchain-ai/langchain/issues/5203

In [None]:
from langchain.prompts import ChatPromptTemplate

In [None]:
# Hand-written German format instructions that replace the parser's English ones.
# The backticks are written without a preceding backslash: "\`" is not a valid Python escape sequence, so the
# backslash would remain in the string (and therefore in the prompt) and Python emits a warning for it.
# Doubled braces ({{ and }}) produce literal braces inside the f-string.
format_instructions = f"""Die Ausgabe sollte ein Markdown-Code-Snippet sein, formatiert nach dem folgenden Schema, 
einschließlich der führenden und abschließenden "```json" und "```":

```json
{{
	"kategorie": string  // Ordne die Anfrage einer der vorgegebenen Kategorien zu: {kategorien}.
	"begruendung": string  // Gib für die Zuordnung eine kurze Begründung an, um die Entscheidung zu erklären.
	"loesungsvorschlag": string  // Gib einen Lösungsvorschlag für die Support-Anfrage an.
}}
```"""

In [None]:
# German prompt template; {kategorien}, {text} and {format_instructions} are placeholders filled in below.
template_string = """\
Bitte lies dir die folgende Support-Anfrage sorgfältig durch und erstelle folgende Informationen:

kategorie: Ordne die Anfrage einer der vorgegebenen Kategorien zu: {kategorien}.
begruendung: Gib für die Zuordnung eine kurze Begründung an, um die Entscheidung zu erklären.
loesungsvorschlag: Gib einen Lösungsvorschlag für die Support-Anfrage an.

Support-Anfrage: {text}

{format_instructions}
"""

# Turn the template into a chat prompt.
prompt = ChatPromptTemplate.from_template(template_string)

# Render the prompt for the first sample e-mail.
messages = prompt.format_messages(
    kategorien=kategorien,
    text=support_email1,
    format_instructions=format_instructions,
)

In [None]:
# Inspect the rendered prompt text of the first message (index 0).
print(messages[0].content)

In [None]:
# Send the rendered prompt to the local model. `invoke` is used because calling the LLM object directly
# (`llm(...)`) is deprecated in LangChain.
response = llm.invoke(messages[0].content)

In [None]:
# Show the raw model response before it is parsed.
print(response)

In [None]:
# Parse the raw model response with the structured output parser.
# NOTE(review): presumably this fails if the response does not contain the expected JSON snippet - confirm.
output_dict = output_parser.parse(response)

In [None]:
# Display the parsed result as the cell output.
output_dict

In [None]:
# Check which Python type the parser returned.
type(output_dict)

In [None]:
# Read the category assigned by the model (assuming a dict, `.get` returns None if the key is absent).
output_dict.get('kategorie')