In [1]:
import pandas as pd
import ollama
import vllm
from openai import OpenAI
from tqdm.auto import tqdm
import json
from IPython.display import display
import os
import ast
import time
import gc
import re
os.chdir(r'C:\Users\meier\OneDrive\Documents\messy_text')
df_text = pd.read_csv('df_text.csv', encoding='utf-8')

In [4]:
# Connect to local vLLM server and check available models
"""

source ~/vllm_venv/bin/activate
vllm serve hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4 --quantization awq --port 8000 --host 0.0.0.0 --max-model-len 8192

"""

client = OpenAI(
    base_url="http://localhost:8000/v1",
    api_key="dummy"
)
models = client.models.list()
for model in models.data:
    print(f"{model.id}")

model = 'hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4'

response = client.chat.completions.create(
    model=model,  # <-- string, not object
    messages=[
        {
            'role': 'user',
            'content': 'What is the result of 1+1? Return your answer as JSON with a "result" field containing the numerical answer and an "explanation" field with a brief explanation.',
        },
    ],
    temperature=0.0,
    extra_body={"guided_json": {"type": "object", "properties": {"result": {"type": "integer"}, "explanation": {"type": "string"}}, "required": ["result", "explanation"]}}
)
response_content = response.choices[0].message.content
parsed_response = json.loads(response_content)
print(json.dumps(parsed_response, indent=2))

hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4
{
  "result": 2,
  "explanation": "Basic arithmetic addition of two numbers"
}


In [6]:
# context
code_to_desc_map = {
    'vic_grupo_social': '''Is the victim a member of a distinct social group? ''',
    'amenaza_quien': '''Who carried out the threats?''',
    'captura_metodo': '''What is the method of the capture? Describe the language that the majority of the articles use to make reference to the disappearance.''',
    'captura_tipo': '''The type of place from which the victim disappeared if it is specified. Categories belonging to HURIDOCS (https://www.huridocs.org/resource/micro-thesauri/).''',
    'cautiverio_trato': '''The treatment of the victim while they were in captivity, if specified.''',
    'desenlace': '''The outcome of the disappearance, if specified.''',
    'desenlace_tipo': '''The type of place where the outcome occurred according to HURIDOCS.''',
    'perp_tipo1': '''Which of the categories the perpetrator belongs to. ''',
    'perp_tipo2': '''To which category the perpetrator belongs, if specified.''',
    'proced_contacto1': '''Who has contacted the authorities about the case.''',
    'proced_contacto2': '''Who has contacted the authorities most in the case.''',
    'proced_contactado': '''Which authority responded to the contact.''',
    'Tribunal_tipo': '''The type of tribunal or court, if it is mentioned.''',
    'proced_sent_tipo': '''The type of sentence against the perpetrators or detained individuals, if specified.''',
    'soc_civil': '''Was there a report on the involvement of civil society in this case?'''
}

label_values_map = {
    'vic_grupo_social': [
        'Professionals (Entrepreneur, Engineer, Professor, Journalist, etc)',
        'People that work in service industries (taxi driver, salesman, etc)',
        'Civil servants (Police, mayor, public worker, etc)',
        'Belonging to some sexual identity group (LGBTQ)',
        'People associated with politics',
        'Activists (political activist, human rights, etc)',
        'Organized crime',
        'Students',
        'Land Worker',
        'Other',
        'No information'
    ],
    'amenaza_quien': [
        'Perpetrator',
        'Organized crime',
        'Armed group',
        'Relative',
        'Neighbor',
        'Someone known by the victim',
        'Other',
        'No information'
    ],
    'captura_metodo': [
        'Disappearance',
        'Kidnapping',
        'Scam',
        'Plagio (kidnapping in a legal sense)',
        'Detention/arrest',
        'Military or political operation (raid)',
        'Levantón (kidnapping but pejorative use towards the victim)',
        'No information'
    ],
    'captura_tipo': [
        'Places related to the victim (house, workplace, private property)',
        'Economic, social, industrial, agricultural and service centers',
        'Authorities (government offices, military facilities)',
        'Educational and medical facilities',
        'Places for free expression, association and gatherings',
        'Unoccupied or barren public spaces',
        'Means and routes of transport and places of connection',
        'International and protected spaces',
        'Special centers and barracks for detention',
        'No information'
    ],
    'cautiverio_trato': [
        'Strangulation',
        'Torture',
        'Disappeared',
        'Witness of the torture of their relatives',
        'Identity theft',
        'Dismembered',
        'No information'
    ],
    'desenlace': [
        'Still disappeared',
        'Liberated by captors',
        'Liberated by authorities',
        'Found dead',
        'Escaped or was liberated through their own means',
        'Found alive',
        'Found, but does not specify if dead or alive',
        'No information'
    ],
    'desenlace_tipo': [
        'Places related to the victim (house, workplace, private property)',
        'Economic, social, industrial, agricultural and service centers',
        'Authorities (government offices, military facilities)',
        'Educational and medical facilities',
        'Places for free expression, association and gatherings',
        'Unoccupied or barren public spaces',
        'Means and routes of transport and places of connection',
        'International and protected spaces',
        'Centers and quarters for detention',
        'No information'
    ],
    'perp_tipo1': [
        'State agent (press article does not specify more information)',
        'Municipal police',
        'State police',
        'Federal police',
        'Army',
        'Navy',
        'Air Force',
        'Ministerial police (they depend on the PGR - Office of the Federal Attorney)',
        'Particulars (when you cannot identify their affiliation to an organized criminal group)',
        'Relatives',
        'Has or had a romantic relationship with victim',
        'Organized crime (Z)',
        'Organized crime (Caballeros Templarios)',
        'Organized crime (Cartel de Sinaloa)',
        'Organized crime (Cartel de Jalisco Nueva Generación)',
        'Organized crime (Beltrán Leyva)',
        'Organized crime (Cartel del Golfo)',
        'Organized crime (Cartel de Juárez)',
        'Organized crime (Los Rojos)',
        'Organized crime (Los Ardillos)',
        'Organized crime (La Familia Michoacana)',
        'Organized crime (name is unspecified)',
        'No information'
    ],
    'perp_tipo2': [
        'State agent (press article does not specify more information)',
        'Municipal police',
        'State police',
        'Federal police',
        'Army',
        'Navy',
        'Air Force',
        'Ministerial police (they depend on the PGR - Office of the Federal Attorney)',
        'Particulars (when belonging to an organized criminal group is not identifiable)',
        'Relatives',
        'Has or had a romantic relationship with victim',
        'Organized crime (Z)',
        'Organized crime(Caballeros Templarios)',
        'Organized crime (Cartel de Sinaloa)',
        'Organized crime (Cartel de Jalisco Nueva Generación)',
        'Organized crime (Beltrán Leyva)',
        'Organized crime (Cartel del Golfo)',
        'Organized crime(Cartel de Juárez)',
        'Organized crime (Los Rojos)',
        'Organized crime (Los Ardillos)',
        'Organized crime (La Familia Michoacana)',
        'Organized crime (name is unspecified)',
        'No information'
    ],
    'proced_contacto1': [
        'Relatives',
        'Neighbors',
        'Agents of the State (Office of the inspector general, secretariat of security and civilian protection, municipal committee, governor, mayor, district attorney)',
        'Human rights organizations',
        'Foreign government',
        'Legal representative',
        'Other'
    ],
    'proced_contacto2': [
        'Relatives',
        'Neighbors',
        'Agents of the State (Office of the inspector general, secretariat of security and civilian protection, municipal committee, governor, mayor, district attorney)',
        'Human rights organizations',
        'Foreign government',
        'Legal representative',
        'Other',
        'No information'
    ],
    'proced_contactado': [
        'Municipal police',
        'State police',
        'Federal police',
        'Army',
        'Navy',
        'Air Force',
        'Ministerial police (they are affiliated with the PGR - Office of the Federal Attorney)',
        'District attorney’s office',
        'Office of the inspector general (previous title of district attorney’s office)',
        'Prosecutor',
        'Commision on Human Rights',
        'Governor',
        'Mayor',
        'Other(s)',
        'The article mentions that THERE WAS NOT a response',
        'No information'
    ],
    'Tribunal_tipo': [
        'State',
        'Federal',
        'Military',
        'No information'
    ],
    'proced_sent_tipo': [
        'Consecutive',
        'Condemnatory',
        'Absolving',
        'No information'
    ],
    'soc_civil': [
        'Yes (explain more in the final comments section)',
        'No'
    ]
}

import sys
import pandas as pd

def show_resource(variable, variable_name: str):
    """
    A simple function to calculate and print the memory usage of a 
    variable in Megabytes (MB).
    """
    # This helper function is needed for accuracy on lists/dicts,
    # but the main interface is just show_resource().
    def get_deep_size(obj, seen=None):
        size = sys.getsizeof(obj)
        if seen is None:
            seen = set()
        obj_id = id(obj)
        if obj_id in seen: return 0
        seen.add(obj_id)
        if isinstance(obj, dict):
            size += sum([get_deep_size(v, seen) for v in obj.values()])
            size += sum([get_deep_size(k, seen) for k in obj.keys()])
        elif hasattr(obj, '__iter__') and not isinstance(obj, (str, bytes, bytearray)):
            size += sum([get_deep_size(i, seen) for i in obj])
        return size

    if isinstance(variable, pd.DataFrame):
        size_bytes = variable.memory_usage(deep=True).sum()
    else:
        # Gracefully handle if the variable doesn't exist yet
        if variable is None:
            size_bytes = 0
        else:
            size_bytes = get_deep_size(variable)
    
    size_mb = size_bytes / (1024 * 1024)
    print(f"[Memory Log] '{variable_name}' is using {size_mb:.2f} MB")

In [10]:
def process_dataframe_summary_and_classification(df, code_to_desc_map, label_values_map, vllm_client, model_name, start_index=0, early_break=None, inner_loop_break=None, show_resources=False, print_prompts=False, print_response=False, print_progress=False):
    """
    Processes a DataFrame to generate summaries and classifications for text data.

    Args:
        df (pd.DataFrame): The input DataFrame with a 'text' column.
        code_to_desc_map (dict): A dictionary mapping classification codes to their descriptions.
        label_values_map (dict): A dictionary mapping classification codes to their possible values.
        vllm_client: The vLLM client for interacting with the language model.
        model_name (str): The name of the language model to use.
        start_index (int, optional): The index to start processing from. Defaults to 0.
        early_break (int, optional): If provided, the number of rows to process before stopping. Defaults to None.
        inner_loop_break (int, optional): If provided, the number of inner loop iterations before stopping. Defaults to None.
        show_resources (bool, optional): If True, prints memory usage of variables. Defaults to False.
        print_prompts (bool, optional): If True, prints the prompts sent to the model. Defaults to False.
        print_response (bool, optional): If True, prints the responses received from the model. Defaults to False.
        print_progress (bool, optional): If True, prints detailed progress updates. Defaults to False.

    Returns:
        pd.DataFrame: The DataFrame with added columns for summaries and classifications.
    """


    df_processed = df.copy()
    
    row_counter = 0
    columns_initialized = False

    new_columns = ['summary_all_context']
    for key in code_to_desc_map.keys():
        new_columns.append(f'{key}_classification')

    results_list = []

    df_to_process = df_processed.iloc[start_index:]
    total_rows = len(df_to_process)
    if early_break is not None and early_break < total_rows:
        total_rows = early_break
    
    with tqdm(total=total_rows, desc="Summarizing", position=0, leave=True) as pbar:
        for row in df_to_process.itertuples():
            pbar.set_description(f"Summarizing (Index: {row.index})")
            current_row_results = {'index': row.Index}

            if print_progress:
                print(f"Summarizing{row.index}, extracting text)")

            if not columns_initialized:
                for col in new_columns:
                    if col not in df_processed.columns:
                        df_processed[col] = ""
                columns_initialized = True

            row_counter += 1
            text_to_summarize = re.sub(r'\s+', ' ', re.sub(r'https?://\S+|\([^)]*/[^)]*\)|[\ue000-\uf8ff]|\b\d/\d\b', '', str(row.text))).strip()

            prompt_summary = None
            response_summary = None
            prompt_classification = None
            response_classification = None

            if print_progress:
                print(f"Summarizing{row.index}, creating prompt)")

            if text_to_summarize.strip():
                # 1. Create a summary that covers all required info
                prompt_summary = str({
                    'input_text': text_to_summarize,
                    'related_context': code_to_desc_map,
                    'output_format': {
                        'info_found': '<TRUE|FALSE>',
                        'relevant_context': '<list of context keys found, or empty list>',
                        'summary': '<texto in spanish>'
                    },
                    'instructions': [
                        'If the input is an error/missing page (e.g., "Página no encontrada", "404", "no se puede encontrar esa página"), set info_found="FALSE", relevant_context=[], summary=""',
                        'Ignore navigation/site chrome (menú, buscar, categorías, compartir, ThemeGrill, WordPress, cookies, copyright)',
                        'relevant_context should list the keys from the related_context in that are found in the text (e.g., ["vic_grupo_social", "captura_metodo", "perp_tipo1"])',
                        'Extractive summary in Spanish: copy exact spans; DO NOT paraphrase; preserve modality ("soñaba ser", "quería ser", "aspiraba a")',
                        'If no relevant info, relevant_context=[] and summary=""',
                        'NO APOLOGIES, NO FILLER TEXT'
                    ],
                })

                if print_prompts:
                    print("\n ======")
                    print(f"Summarization Prompt for index {row.index}:")
                    print(json.dumps(ast.literal_eval(prompt_summary), indent=2))
                
                if print_progress:
                    print(f"Summarizing{row.index}, sending prompt to model)")

                response_summary = vllm_client.chat.completions.create(
                    model=model_name,
                    messages=[{'role': 'user', 'content': prompt_summary}],
                    temperature=0.0,
                    max_tokens=1024,
                    extra_body={"guided_json": {"type": "object", "properties": {"info_found": {"type": "string"}, "relevant_context": {"type": "array"}, "summary": {"type": "string"}}, "required": ["info_found", "relevant_context", "summary"]}}
                )
                
                if print_progress:
                    print(f"Summarizing{row.index}, extracting result)")

                try:
                    summary_output = json.loads(response_summary.choices[0].message.content).get('summary', 'No relevant information found')
                    if not summary_output:
                        summary_output = 'No relevant information found'
                    text_summarized = summary_output
                    current_row_results['summary_all_context'] = text_summarized
                except Exception as e:
                    print(e)
                    text_summarized = 'No relevant information found'
                    current_row_results['summary_all_context'] = text_summarized
                
                if print_response:
                    print("\n ======")
                    print(f"Summary Result for index {row.index}:")
                    print(json.dumps({"processed_summary": text_summarized}, indent=2))

                if print_progress:
                    print(f"Classifying{row.index})")

                if text_summarized.strip() and text_summarized != 'No relevant information found':
                    with tqdm(code_to_desc_map.items(), total=len(code_to_desc_map), desc="Classifying", leave=False, position=1) as pbar_inner:
                        row_counter_inner = 0
                        for key, desc in pbar_inner:
                            row_counter_inner += 1
                            prompt_classification = str({
                                'input_text': text_summarized,
                                'question': desc,
                                'possible_values': label_values_map.get(key, []),
                                'instructions': [
                                    'OUTPUT FORMAT: Return ONLY {"evidence":"evidence", "result": "your_classification"}',
                                    'DO NOT ECHO THE INPUT, QUESTION, OR POSSIBLE_VALUES IN YOUR RESPONSE',
                                    f'Your result MUST be one of the possible_values: {label_values_map.get(key, [])}',
                                    'If no information is found about this label, return empty string like {"evidence": "no information found about this label", "result": ""}',
                                ]
                            })
                            
                            if print_prompts:
                                print(f"Classification Prompt for index {row.index}:")
                                print(json.dumps(ast.literal_eval(prompt_classification), indent=2))
                                print("====== \n")

                            response_classification = vllm_client.chat.completions.create(
                                model=model_name,
                                messages=[{'role': 'user', 'content': prompt_classification}],
                                temperature=0.0,
                                max_tokens=1048,
                                extra_body={"guided_json": {"type": "object", "properties": {"evidence": {"type": "string"}, "result": {"type": "string"}}, "required": ["evidence", "result"]}}
                            )

                            try:
                                result_classification = response_classification.choices[0].message.content
                                parsed_result = json.loads(result_classification)
                                classification_output = parsed_result.get('result', 'No information')
                                if not classification_output:
                                    classification_output = 'No information'
                                current_row_results[f'{key}_classification'] = classification_output
                            except Exception as e:
                                print(e)
                                current_row_results[f'{key}_classification'] = 'No information'
                            if print_response:
                                print(json.dumps({
                                    "classification_key": key,
                                    "processed_result": current_row_results[f'{key}_classification']
                                }, indent=2))
                            if inner_loop_break is not None and row_counter_inner >= inner_loop_break:
                                break
            
            if show_resources:
                print(f"--- Resource Usage for Index {row.Index} ---")
                show_resource(results_list, 'results_list (cumulative)')
                show_resource(df_processed, 'df_processed')
                show_resource(prompt_summary, 'prompt_summary')
                show_resource(response_summary, 'response_summary')
                show_resource(prompt_classification, 'prompt_classification (last)')
                show_resource(response_classification, 'response_classification (last)')
                print("-" * (len(f"--- Resource Usage for Index {row.Index} ---")))


            results_list.append(current_row_results)
            pbar.update(1)

            # Clean up memory
            del prompt_summary, response_summary, prompt_classification, response_classification, text_to_summarize
            gc.collect()

            # Pause every 5 iterations
            if row_counter % 5 == 0:
                time.sleep(10)
            
            if early_break is not None and row_counter >= early_break: 
                break

    if results_list:
        results_df = pd.DataFrame(results_list).set_index('index')
        df_processed.update(results_df)
        
    return df_processed


In [None]:
# Call the function with all parameters
processed_df = process_dataframe_summary_and_classification(
    df=df_text,
    code_to_desc_map=code_to_desc_map,
    label_values_map=label_values_map,
    vllm_client=client,
    model_name=model,
    start_index=0,
    early_break=10,
    inner_loop_break=5,
    show_resources=False,
    print_prompts=False,
    print_response=False,
    print_progress=False
)

# Display results - show original annotations and new classifications for comparison
new_columns = ['summary_all_context'] + [f'{key}_classification' for key in code_to_desc_map.keys()]
original_annotation_cols = list(code_to_desc_map.keys())
display_cols = ['index'] + original_annotation_cols + new_columns
processed_df[new_columns]


Summarizing:   0%|          | 0/10 [00:00<?, ?it/s]

Classifying:   0%|          | 0/15 [00:00<?, ?it/s]

Classifying:   0%|          | 0/15 [00:00<?, ?it/s]

Classifying:   0%|          | 0/15 [00:00<?, ?it/s]

Classifying:   0%|          | 0/15 [00:00<?, ?it/s]

Classifying:   0%|          | 0/15 [00:00<?, ?it/s]

Classifying:   0%|          | 0/15 [00:00<?, ?it/s]

Unterminated string starting at: line 1 column 301 (char 300)


Classifying:   0%|          | 0/15 [00:00<?, ?it/s]

Unterminated string starting at: line 20 column 14 (char 405)


Unnamed: 0,index,vic_grupo_social,amenaza_quien,captura_metodo,captura_tipo,cautiverio_trato,desenlace,desenlace_tipo,perp_tipo1,perp_tipo2,...,desenlace_classification,desenlace_tipo_classification,perp_tipo1_classification,perp_tipo2_classification,proced_contacto1_classification,proced_contacto2_classification,proced_contactado_classification,Tribunal_tipo_classification,proced_sent_tipo_classification,soc_civil_classification


In [13]:
processed_df[new_columns]

Unnamed: 0,summary_all_context,vic_grupo_social_classification,amenaza_quien_classification,captura_metodo_classification,captura_tipo_classification,cautiverio_trato_classification,desenlace_classification,desenlace_tipo_classification,perp_tipo1_classification,perp_tipo2_classification,proced_contacto1_classification,proced_contacto2_classification,proced_contactado_classification,Tribunal_tipo_classification,proced_sent_tipo_classification,soc_civil_classification
0,No relevant information found,,,,,,,,,,,,,,,
1,Abel soñaba ser ingeniero y dejó su pueblo. Se...,Students,Policía de Chilpancingo,Kidnapping,Educational and medical facilities,No information,,,,,,,,,,
2,La policía de Chilpancingo está involucrada en...,Organized crime,Organized crime,Kidnapping,"Authorities (government offices, military faci...",Torture,,,,,,,,,,
3,"El artículo habla sobre varios temas, incluyen...",No information,No information,No information,No information,No information,,,,,,,,,,
4,La desaparición forzada de jóvenes en Chilpanc...,Students,Perpetrator,Disappearance,"Authorities (government offices, military faci...",Torture,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2224,,,,,,,,,,,,,,,,
2225,,,,,,,,,,,,,,,,
2226,,,,,,,,,,,,,,,,
2227,,,,,,,,,,,,,,,,


In [None]:
processed_df.replace(['No information', 'No relevant information found'], '', inplace=True)
processed_df.to_csv('df_text_clas.csv', index=False)