In [1]:
from langchain_openai import ChatOpenAI
api_key="sk..."

chat_model = ChatOpenAI(model="gpt-4o-mini", api_key=api_key, max_retries=0)

In [2]:
from pydantic import BaseModel, Field

class CategoryList(BaseModel):
    """All categories from the content"""

    categories: list[str] = Field(description="The list containing all categories starting with uppercase, if useful")

In [3]:
import csv
import json

def transform(arr):
    text = ""
    for x in arr:
        if type(x) == str:
            text += x
            continue
        if type(x) == dict:
            if 'attributes' in x:
                if 'link' in x['attributes']:
                    text += f"[{x['insert']}]({x['attributes']['link']})"
                elif 'bold' in x['attributes']:
                    text += f"**{x['insert']}**"
                else:
                    text += x['insert']
            continue
        print(x)
    return text

texts = []
with open('test.csv', newline='\n') as csvfile:
    spamreader = csv.reader(csvfile, delimiter=',', quotechar='"')
    first = True
    for row in spamreader:
        if first:
            first = False
            continue
        arr = json.loads(row[0])
        texts.append(transform(arr))

In [4]:
from langchain_core.messages import SystemMessage, HumanMessage
import time

prompt = """You are an expert in categorization and theme identification. Your task is to analyze each content section and generate **one concise, clear, and meaningful category** that best represents the core idea or main topic.  

### Guidelines for the category:  
1. It must accurately capture the essence of the content.  
2. It should be precise, relevant, and unambiguous.  
3. Only **one category** is allowed per section, regardless of length or complexity.  

### Content Format:  
The sections are presented as follows:  
```
<Content Number X>  
Here goes the content.  
</Content Number X>
```  

Focus on delivering **one-word clarity** while ensuring the category reflects the section's primary theme."""

def combine(text_list):
    result = []
    current_text = ""
    for i, t in enumerate(text_list):
        text = f'<Content Number {i+1}>\n{t}\n</Content Number {i+1}>'
        if len(current_text) == 0:
            current_text = text
        elif len(current_text) + len(text) < 10000:
            current_text = current_text + '\n' + text
        else:
            result.append(current_text)
            current_text = text
    if len(current_text) > 0:
        result.append(current_text)
    return result

async def run_openai(text_list):
    model = chat_model.with_structured_output(CategoryList)
    text_list_chunked = combine(text_list)
    result = CategoryList(categories=[])
    before = time.perf_counter()
    for t in text_list_chunked:
        temp = await model.ainvoke([
            SystemMessage(prompt),
            HumanMessage(t)
        ])
        result.categories.extend(temp.categories)
    after = time.perf_counter()
    print(len(text_list_chunked), after - before)
    result.categories = list(set(result.categories))
    return result

result = await run_openai(texts)

16 31.40297425002791


In [5]:
print(len(result.categories), len(texts))

378 1289


In [6]:
prompt2 = """You are a content analysis expert specializing in identifying relationships between concepts. Based on a provided list of categories, your task is to **combine only those keywords that are naturally and meaningfully combinable** into a single, concise term or phrase.  

### Guidelines for combination:  
1. Combine categories only if their meanings naturally complement or enhance one another.  
2. Ensure the combined term is clear, relevant, and precise.  
3. Do not force combinations; leave unrelated categories as they are.  
4. Aim for brevity while maintaining accuracy and relevance.  

### Input Format:  
You will receive a list of categories in this format:  
```
Category 1  
Category 2  
Category 3  
...  
```  

Analyze and refine the categories list accordingly."""
before = time.perf_counter()
result2 = await chat_model.with_structured_output(CategoryList).ainvoke([
    SystemMessage(prompt),
    HumanMessage("\n".join(result.categories))
])
after = time.perf_counter()
print(after - before)

6.900449345994275


In [7]:
keys = set(result.categories)
print(len(result2.categories))

378


# LLMLingua

In [11]:
from llmlingua import PromptCompressor
import time
import re
llm_lingua = PromptCompressor(
    model_name="microsoft/llmlingua-2-bert-base-multilingual-cased-meetingbank",
    use_llmlingua2=True, # Whether to use llmlingua-2
)

def prepare(data):
    text = '.'.join(texts)
    text = re.sub(r"\.(\s*\.)+", ".", text, count=0, flags=re.MULTILINE)
    text = re.sub(r"\?(\s*\?)+", "?", text, count=0, flags=re.MULTILINE)
    return re.sub(r"!(\s*!)+", "!", text, count=0, flags=re.MULTILINE)
    

before = time.perf_counter()
compressed = llm_lingua.compress_prompt(prepare(texts), target_token=10_000, force_tokens = ['?', '!', '.'])
after = time.perf_counter()
print(after - before)

Token indices sequence length is longer than the specified maximum sequence length for this model (25722 > 512). Running this sequence through the model will result in indexing errors


4.999694033002015


In [12]:
from langchain_core.messages import SystemMessage, HumanMessage

prompt = """You are an AI assistant tasked with analyzing a summarized text and extracting a list of general categories based on its content. Your role is to organize the text into high-level, non-contradictory categories that accurately represent the main topics or themes. These categories should be:  

1. **General:** Broad enough to encompass subtopics within the text.  
2. **Non-Contradictory:** Do not overlap in a way that creates logical conflicts.  
3. **Relevant:** Directly related to the main ideas presented in the text.  

Provide the categories as a clear, concise list without additional explanation unless requested."""

before = time.perf_counter()
result3 = await chat_model.with_structured_output(CategoryList).ainvoke([
    SystemMessage(prompt),
    HumanMessage(compressed["compressed_prompt"])
])
after = time.perf_counter()
print(after - before)

2.305834759026766


In [14]:
len(compressed["compressed_prompt"]), len(result3.categories)
#compressed["compressed_prompt"]
result3.categories

['General Chemistry',
 'Biochemistry',
 'Chemical Reactions',
 'Data Analysis in Statistics',
 'Statistical Methods',
 'Experimental Design',
 'Mathematical Concepts',
 'Data Visualization in R',
 'Biological Systems',
 'Chemical Structures',
 'Environmental Chemistry',
 'Physical Chemistry',
 'Analytical Chemistry',
 'Laboratory Techniques',
 'Chemistry Education',
 'Statistical Inference',
 'Sampling Techniques',
 'Probability Theory',
 'Chemical Equilibrium',
 'Acid-Base Chemistry',
 'Thermodynamics']

In [8]:
from typing import Optional
from langchain_core.prompts import SystemMessagePromptTemplate

class CategorySelect(BaseModel):
    """The selected category or null/None for the content"""

    category: Optional[str] = Field(description="The selected category, if available")


prompt = SystemMessagePromptTemplate.from_template("""You are an AI assistant tasked with determining if a piece of content belongs to any category from the following predefined list:

{categories}

Your role:
1. **Match the Content:** Compare the content to each category and decide if it aligns with any of them.
2. **Output the Category:** If the content fits a category, output the most appropriate category from the list.
3. **Exclusion:** If the content does not fit any category, respond with nothing.

Provide only the selected category or nothing as the output, ensuring precision and consistency in your decisions.""")



async def run_category_select(model, categories: list[str], text_list: list[str]):
    shared = prompt.format(categories="\n".join(categories))
    return await model.with_structured_output(CategorySelect).abatch(
        [
            [
                shared,
                HumanMessage(text)
            ] for text in text_list
        ]
    )



In [10]:
from more_itertools import chunked
import time
batched_results = []
i = 0
for text_sub in chunked(texts, 16):
    print(i)
    i += 1
    batch = await run_category_select(chat_model, result2.categories, text_sub)
    batched_results.extend(batch)
    time.sleep(6)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80


In [34]:
import json
from langchain_openai import OpenAIEmbeddings
api_key="sk..."

embeddings = OpenAIEmbeddings(model="text-embedding-3-large", api_key=api_key)

with open('llmlingua.json', 'r', encoding='utf-8') as f:
    data = json.load(f)
with open('normal-category.json', 'r', encoding='utf-8') as f:
    data2 = json.load(f)

classified = 0
not_classified = 0
to_embed = []
labels = set()
for i in range(len(texts)):
    if data[i] is None:
        not_classified += 1
        labels.add(data2[i])
        continue
    classified += 1
    to_embed.append(data[i])
    to_embed.append(texts[i])

all_lengths / not_classified
labels
#embedded_vecs = embeddings.embed_documents(to_embed)

{'Acknowledgment',
 'Adjustment',
 'Audio',
 'Beauty',
 'Blood',
 'Bonus',
 'Calculation',
 'Calculator',
 'Chemische',
 'Clarification',
 'Comments',
 'Communication',
 'Comparison',
 'Comparisons',
 'Concern',
 'Concerns',
 'Confusion',
 'Connectivity',
 'Consultation',
 'Correction',
 'Dank',
 'Danke',
 'Data',
 'Decision',
 'Decrease',
 'Definition',
 'Delay',
 'Demand',
 'Diagramm',
 'Difference',
 'Discussion',
 'Display',
 'Document',
 'Elemente',
 'Elements',
 'Energien',
 'Environment',
 'Error',
 'Evaluation',
 'Exam',
 'Example',
 'Exercise',
 'Explanation',
 'Farewell',
 'Feedback',
 'File',
 'Formatting',
 'Formulas',
 'Fragen',
 'Frustration',
 'Geometry',
 'Gleichgewicht',
 'Gleichung',
 'Graphics',
 'Graphing',
 'Gratitude',
 'Greeting',
 'Greetings',
 'Health',
 'Help',
 'Homework',
 'Humor',
 'Hydration',
 'IQ',
 'Importance',
 'Independence',
 'Inequality',
 'Information',
 'Inquiry',
 'Interpretation',
 'Issue',
 'Issues',
 'Klausur',
 'Lecture',
 'Lectures',
 'Link

In [29]:
embedding_sum = 0
for i in range(0, len(embedded_vecs), 2):
    a = embedded_vecs[i]
    b = embedded_vecs[i + 1]
    length = len(a)
    vec_a_val = sum(a[i] * a[i] for i in range(length)) ** 0.5
    vec_b_val = sum(b[i] * b[i] for i in range(length)) ** 0.5
    embedding_sum += sum(a[i] * b[i] for i in range(length)) / (vec_a_val * vec_b_val)
print(classified, not_classified)

# all categories, 0.28168227774694293 1032 class, 257 not class = 0.800620636
# llmlingua, 0.2566732035409, 633 class, 656 not class = 0.491078355
embedding_sum / classified

1032 257


0.2816557796870045

In [43]:
class RelevantCategory(BaseModel):
    """Give information about a category"""
    is_acceptable: bool = Field(description="Whether the category is acceptable and not overly specific")
    explanation: str = Field(description="An explanation for the acceptance")

prompt = "Evaluate whether the following category is meaningful, general, and not overly specific. Respond with true or false and provide a brief explanation for your decision."

cats = ['General Chemistry',
 'Biochemistry',
 'Chemical Reactions',
 'Data Analysis in Statistics',
 'Statistical Methods',
 'Experimental Design',
 'Mathematical Concepts',
 'Data Visualization in R',
 'Biological Systems',
 'Chemical Structures',
 'Environmental Chemistry',
 'Physical Chemistry',
 'Analytical Chemistry',
 'Laboratory Techniques',
 'Chemistry Education',
 'Statistical Inference',
 'Sampling Techniques',
 'Probability Theory',
 'Chemical Equilibrium',
 'Acid-Base Chemistry',
 'Thermodynamics']

model = chat_model.with_structured_output(RelevantCategory)
cat_result = []
for category_sub in chunked(cats, 24):
    cat_output = await model.abatch([
        [
            SystemMessage(prompt),
            HumanMessage(category)
        ] for category in category_sub
    ])
    cat_result.extend(cat_output)
    time.sleep(6)

In [44]:
pos = 0
neg = 0
for x in cat_result:
    if x.is_acceptable:
        pos += 1
    else:
        neg += 1

print(pos / (pos + neg))

1.0
