# Chamar o LLM #

In [None]:
import os
from dotenv import load_dotenv

from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langchain_openai import AzureChatOpenAI

# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this is where the AZURE_OPENAI_* variables read
# when the chat model is built come from - confirm the .env file defines them.
load_dotenv()

In [None]:
# Chat model client for the Azure OpenAI deployment.
# Endpoint, API key and deployment name are read from the environment;
# a missing variable raises KeyError right here rather than at call time.
model = AzureChatOpenAI(
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    openai_api_key=os.environ["AZURE_OPENAI_API_KEY"],
    deployment_name=os.environ["AZURE_OPENAI_CHAT_DEPLOYMENT_NAME"],
    api_version="2023-09-01-preview",
    temperature=0.0,  # zero temperature to keep answers as repeatable as possible
)

In [None]:
# Prompt text sent to the model for every section.
# `{df_data}` is the only placeholder; it is filled with the section data
# passed to `chain.invoke`. The text is runtime data: changing any character
# changes what the model receives.
# NOTE(review): the first guideline says "COMMENT" may only hold "OK"/"NOK",
# while a later one also allows "Not enough information" - confirm that the
# model handles this as intended.
prompt_template = """
    You are an expert at selecting suppliers that will provide equipments to the company you work for.
    The suppliers fill a spreadsheet with their machines' specifications and based on that, you check if the answers correspond to what your company requires to make them an official supplier.

    In this task, you will analyze this data:
    
    ```
    {df_data}
    ```

    For each supplier answer, you will check if the answer can fill the requirements according to the other fields and return a JSON.

    GUIDELINES:
    - In the dataframe, you must add another field called "COMMENT" and it should only contain "OK" or "NOK". "OK" in case the supplier's answer can fill the requirement or "NOK" in case the supplier's answer does not fill the requirement.
    - You must return a valid JSON structure in your response, without any additional commentary, only the JSON.
    - The JSON structure will be converted to a Dataframe, so return a structure that will make the conversion possible.
    - If there's not enough information to make the analysis, in the field "COMMENT" just write "Not enough information".
    - Don't evaluate the suppliers answers if you don't know if they fill the requirements.
    """

In [None]:
# Prompt with a single input variable, `df_data`.
prompt = PromptTemplate(template=prompt_template, input_variables=["df_data"])

# Turns the model's reply into Python objects by parsing it as JSON.
parser = JsonOutputParser()

# Pipeline: render the prompt, call the model, parse the reply.
chain = prompt | model | parser


In [None]:
def call_llm(df, supplier, section_name):
    """Run the evaluation chain on one section and save the answer as CSV.

    Args:
        df: Section data placed into the prompt's ``df_data`` placeholder.
            The callers in this file pass CSV text produced by
            ``DataFrame.to_csv``.
        supplier: Supplier identifier. Not used by this function; kept so
            existing callers keep working.
        section_name: Section title. It becomes the output file name, with
            every "/" replaced by a space.

    Returns:
        pd.DataFrame: The parsed model answer that was written to disk.
    """
    result = chain.invoke({"df_data": df})
    frame = pd.DataFrame(result)

    # to_csv does not create missing folders, so make sure the target exists
    # instead of failing after the (slow, billable) model call has finished.
    output_dir = 'result/filler'
    os.makedirs(output_dir, exist_ok=True)
    frame.to_csv(f'{output_dir}/{section_name.replace("/", " ")}.csv', index=False)

    return frame

# Extrair e tratar dados da planilha de respostas do fornecedor #

In [None]:
import pandas as pd
from openpyxl import load_workbook

In [None]:
def extract_info(file_path, sheet_name, column_letters):
    """
    Read selected worksheet columns into a DataFrame.

    The value found in row 15 of each requested column becomes that column's
    name (an empty cell yields the name ``None``). The data of a column is
    every cell from row 14 downwards, so row 14 and the header row 15 itself
    are part of the returned rows.

    Args:
        file_path (str): Path to the input Excel file.
        sheet_name (str): Name of the sheet to extract data from.
        column_letters (list of str): Column letters to read, e.g. ``['B', 'C']``.

    Returns:
        pd.DataFrame: One column per distinct row-15 value; if two requested
        columns share the same row-15 value, the later one replaces the earlier.
    """
    # data_only=True yields cached cell values instead of formula text
    worksheet = load_workbook(file_path, data_only=True)[sheet_name]

    headers = [worksheet[f'{letter}15'].value for letter in column_letters]

    columns = {}
    for letter, header in zip(column_letters, headers):
        # keep everything below row 13
        columns[header] = [cell.value for cell in worksheet[letter] if cell.row > 13]

    # Pad shorter columns with None so that all columns have the same length
    longest = max(len(values) for values in columns.values())
    for values in columns.values():
        values.extend([None] * (longest - len(values)))

    return pd.DataFrame(columns)



Planilhas a serem extraídas:

In [None]:
# Supplier answer workbooks (relative paths).
file_path_khs = "../files/KHS.xlsx"
file_path_krones = "../files/KRONES.xlsx"
file_path_sidel = "../files/SIDEL.xlsx"

# Worksheet to read and the columns (B through I) taken from it.
sheet_filler = "CAN Filler"
column_filler = list("BCDEFGHI")

Seções Can Filler:

In [None]:
# Section titles of the CAN Filler sheet, in sheet order. Every title maps to
# its own {"start", "end"} record; both fields start out as empty strings and
# are filled in later with row positions.
can_filler_sections = {
    title: {"start": "", "end": ""}
    for title in (
        "1.0 PERFORMANCE and WARRANTY",
        "2.0 GENERAL INFORMATION",
        "3.0 PROCESS",
        "4.0 CONSTRUCTIVE CHARACTERISTICS",
        "5.0 FILLER ACCESSORIES",
        "6.0 DIMENSIONS / WEIGHT",
        "7.0 OXYGEN ELIMINATORS DEVICES",
        "8.0 MATERIALS",
        "9.0 CHANGE OVER",
        "10.0 SAFETY",
        "11.0 ELECTRICITY, CONTROL, ALARM, AUTOMATION",
        "12.0 POWER CABINETS",
        "13.0 UTILITIES CONSUMPTION",
        "14.0 MAINTENANCE / DOCUMENTATION / TRAINING",
        "15.0 TEC CAN FILLER ACCEPTANCE",
        "16.0 ZONE DEMANDS DUE TO LOCAL REGULATIONS",
        "17.0 HISTORY OF REVISIONS",
    )
}


Identificar os índices das seções:

In [None]:
def is_section(item):
    """Return True when `item` is a string equal to one of the section titles.

    Non-string values (empty cells, numbers) are never section titles.
    """
    if not isinstance(item, str):
        return False
    return item in can_filler_sections

In [None]:
def get_section_index(df):
    """Return the index labels of the section-title rows, plus `len(df)`.

    A row counts as a section title when its 'ITEM' value passes
    `is_section`. The trailing `len(df)` marks the end of the DataFrame so
    that the last section also has a closing boundary.
    """
    title_mask = df['ITEM'].apply(is_section)
    title_rows = df.loc[title_mask].index.tolist()
    return [*title_rows, len(df)]

In [None]:
def add_indexes(df, indexes, sections_dict):
    """Store the row range of every section in `sections_dict` (in place).

    Args:
        df: DataFrame with an 'ITEM' column holding the section titles.
        indexes: Title-row labels in ascending order, followed by one final
            end marker; each consecutive pair delimits one section.
        sections_dict: Mapping of section title to a dict whose "start" and
            "end" entries are overwritten.

    Returns:
        None. A title found in `df` but missing from `sections_dict` raises
        KeyError.
    """
    for title_row, next_title_row in zip(indexes, indexes[1:]):
        bounds = sections_dict[df.loc[title_row, 'ITEM']]
        bounds["start"] = title_row + 1  # first row below the title row
        bounds["end"] = next_title_row   # next title row (or the end marker)

Separar planilhas:

In [None]:
def separate_df(df_supplier, sections, section_name, supplier='khs'):
    """Cut one section out of the supplier sheet and send it to the model.

    Steps, in order: slice the rows between the section's 'start' and 'end'
    positions, renumber them from 0, rename 'CNV' and 'UNIT', drop the first
    row of the slice and the columns that are not sent, and serialise the
    rest as CSV text.

    Args:
        df_supplier: Full supplier DataFrame.
        sections: Mapping of section title to a dict with integer 'start' and
            'end' row positions.
        section_name: Title of the section to process.
        supplier: Supplier identifier forwarded to `call_llm`. Defaults to
            'khs', the value that used to be hard-coded.

    Returns:
        Whatever `call_llm` returns for this section.
    """
    bounds = sections[section_name]
    section_csv = (
        df_supplier.iloc[bounds['start']:bounds['end']]
        .reset_index(drop=True)
        # 'Characteristic' was misspelled 'Charachteristic' here, unlike in the
        # sibling helpers; the header is part of the text the model reads.
        .rename(columns={'CNV': 'Characteristic', 'UNIT': 'Instruction/Comments'})
        # Row 0 is the first row below the section title - presumably a
        # sub-header row (TODO confirm). The None column is the one whose
        # header cell was empty.
        .drop(index=0, columns=['ITEM', 'ITEM DESCRIPTION', 'INDEX', 'Instruction / Comments', None])
        .to_csv(index=False)
    )
    return call_llm(section_csv, supplier, section_name)

In [None]:
def separate_df_performance(df_supplier, sections, section_name, supplier='khs'):
    """Cut a section out of the supplier sheet, keeping most columns, and send it to the model.

    Only the 'ITEM' column and the column named None are removed; no column
    is renamed. The first row of the slice is dropped as well.

    Args:
        df_supplier: Full supplier DataFrame.
        sections: Mapping of section title to a dict with integer 'start' and
            'end' row positions.
        section_name: Title of the section to process.
        supplier: Supplier identifier forwarded to `call_llm`. Defaults to
            'khs', the value that used to be hard-coded.

    Returns:
        Whatever `call_llm` returns for this section.
    """
    bounds = sections[section_name]
    section_csv = (
        df_supplier.iloc[bounds['start']:bounds['end']]
        .reset_index(drop=True)
        # Row 0 is the first row below the section title - presumably a
        # sub-header row (TODO confirm).
        .drop(index=[0], columns=['ITEM', None])
        .to_csv(index=False)
    )
    return call_llm(section_csv, supplier, section_name)

In [None]:
def separate_df_utilities(df_supplier, sections, section_name, supplier='khs'):
    """Cut a section out of the supplier sheet using the utilities column layout.

    The original 'Instruction / Comments' column is dropped first, and only
    then is 'INDEX' renamed to 'Instruction / Comments'; the order matters
    because both would otherwise carry the same name. Rows with no
    'Characteristic' value are removed before the data is serialised.

    Args:
        df_supplier: Full supplier DataFrame.
        sections: Mapping of section title to a dict with integer 'start' and
            'end' row positions.
        section_name: Title of the section to process.
        supplier: Supplier identifier forwarded to `call_llm`. Defaults to
            'khs', the value that used to be hard-coded.

    Returns:
        Whatever `call_llm` returns for this section.
    """
    bounds = sections[section_name]
    section_csv = (
        df_supplier.iloc[bounds['start']:bounds['end']]
        .reset_index(drop=True)
        # Row 0 is the first row below the section title - presumably a
        # sub-header row (TODO confirm).
        .drop(index=[0], columns=['ITEM', 'ITEM DESCRIPTION', 'Instruction / Comments'])
        .rename(columns={'CNV': 'Characteristic', 'INDEX': 'Instruction / Comments'})
        .dropna(subset=['Characteristic'])
        .to_csv(index=False)
    )
    return call_llm(section_csv, supplier, section_name)


In [None]:
def separate_df_maintenance(df_supplier, sections, section_name, supplier='khs'):
    """Cut a section out of the supplier sheet using the maintenance column layout.

    'CNV' becomes 'Characteristic', 'UNIT' becomes 'Instruction' and the
    column named None becomes 'Comments'. Rows with no 'Characteristic' value
    are removed before the data is serialised.

    Args:
        df_supplier: Full supplier DataFrame.
        sections: Mapping of section title to a dict with integer 'start' and
            'end' row positions.
        section_name: Title of the section to process.
        supplier: Supplier identifier forwarded to `call_llm`. Defaults to
            'khs', the value that used to be hard-coded.

    Returns:
        Whatever `call_llm` returns for this section.
    """
    bounds = sections[section_name]
    section_csv = (
        df_supplier.iloc[bounds['start']:bounds['end']]
        .reset_index(drop=True)
        # Row 0 is the first row below the section title - presumably a
        # sub-header row (TODO confirm).
        .drop(index=[0], columns=['ITEM', 'INDEX', 'ITEM DESCRIPTION', 'Instruction / Comments'])
        .rename(columns={'CNV': 'Characteristic', 'UNIT': 'Instruction', None: 'Comments'})
        .dropna(subset=['Characteristic'])
        .to_csv(index=False)
    )
    return call_llm(section_csv, supplier, section_name)

In [None]:
def separate_df_zone(df_supplier, sections, section_name, supplier='khs'):
    """Cut a section out of the supplier sheet using the zone-demands column layout.

    'CNV' becomes 'Zone', 'ITEM DESCRIPTION' becomes 'Detail',
    'Instruction / Comments' becomes 'Instruction' and the column named None
    becomes 'Comments'.

    Args:
        df_supplier: Full supplier DataFrame.
        sections: Mapping of section title to a dict with integer 'start' and
            'end' row positions.
        section_name: Title of the section to process.
        supplier: Supplier identifier forwarded to `call_llm`. Defaults to
            'khs', the value that used to be hard-coded.

    Returns:
        Whatever `call_llm` returns for this section.
    """
    bounds = sections[section_name]
    section_csv = (
        df_supplier.iloc[bounds['start']:bounds['end']]
        .reset_index(drop=True)
        # Row 0 is the first row below the section title - presumably a
        # sub-header row (TODO confirm).
        .drop(index=[0], columns=['ITEM', 'UNIT', 'INDEX'])
        .rename(columns={
            'CNV': 'Zone',
            'ITEM DESCRIPTION': 'Detail',
            'Instruction / Comments': 'Instruction',
            None: 'Comments',
        })
        .to_csv(index=False)
    )
    return call_llm(section_csv, supplier, section_name)

# KHS: # 

In [None]:
# Read the configured columns of the 'CAN Filler' sheet from the KHS workbook.
df_khs = extract_info(file_path_khs, sheet_filler, column_filler)

Índices:

In [None]:
# Locate the section-title rows of the KHS sheet.
khs_indexes = get_section_index(df_khs)
# Fill the start/end positions into the shared can_filler_sections dict (in place).
_ = add_indexes(df_khs, khs_indexes, can_filler_sections)
# Bare expression: in a notebook this displays the updated dict as the cell output.
can_filler_sections

Planilhas separadas:

In [None]:
import time

# Send the KHS sections to the model one by one. Each call writes one CSV via
# call_llm; the bound names hold whatever the helper returns.
# Most sections use separate_df; sections 1, 13, 14 and 16 have their own
# helper because their column layout differs.
# NOTE(review): "17.0 HISTORY OF REVISIONS" is not processed - presumably on
# purpose; confirm.
df_performance = separate_df_performance(df_khs, can_filler_sections, "1.0 PERFORMANCE and WARRANTY")
df_general_info = separate_df(df_khs, can_filler_sections, "2.0 GENERAL INFORMATION")
df_process = separate_df(df_khs, can_filler_sections, "3.0 PROCESS")

# Pause between batches - presumably to stay below the API rate limit (TODO confirm).
time.sleep(15)

df_characteristics = separate_df(df_khs, can_filler_sections, "4.0 CONSTRUCTIVE CHARACTERISTICS")
df_accessories = separate_df(df_khs, can_filler_sections, "5.0 FILLER ACCESSORIES")
df_dimensions = separate_df(df_khs, can_filler_sections, "6.0 DIMENSIONS / WEIGHT")

time.sleep(15)

df_oxygen = separate_df(df_khs, can_filler_sections, "7.0 OXYGEN ELIMINATORS DEVICES")
df_materials = separate_df(df_khs, can_filler_sections, "8.0 MATERIALS")
df_change = separate_df(df_khs, can_filler_sections, "9.0 CHANGE OVER")

time.sleep(15)

df_safety = separate_df(df_khs, can_filler_sections, "10.0 SAFETY")
df_electricity = separate_df(df_khs, can_filler_sections, "11.0 ELECTRICITY, CONTROL, ALARM, AUTOMATION")
df_power = separate_df(df_khs, can_filler_sections, "12.0 POWER CABINETS")

time.sleep(15)

df_utilities = separate_df_utilities(df_khs, can_filler_sections, "13.0 UTILITIES CONSUMPTION")
df_maintenance = separate_df_maintenance(df_khs, can_filler_sections, "14.0 MAINTENANCE / DOCUMENTATION / TRAINING")
df_acceptance = separate_df(df_khs, can_filler_sections, "15.0 TEC CAN FILLER ACCEPTANCE")
df_zone = separate_df_zone(df_khs, can_filler_sections, "16.0 ZONE DEMANDS DUE TO LOCAL REGULATIONS")

# Cálculo de score #

In [None]:
# Gather the last column of every section CSV into one single-column DataFrame.
files = 'result/filler'
comment_column = []

# sorted() gives a reproducible row order; os.listdir returns entries in
# arbitrary order.
for filename in sorted(os.listdir(files)):
    # scores.csv is written into this same folder once the scores have been
    # computed. Skip it, otherwise a re-run would mix the previous score table
    # into the comments and the combined column would lose its 'COMMENT' name.
    if not filename.endswith('.csv') or filename == 'scores.csv':
        continue
    filepath = os.path.join(files, filename)
    df = pd.read_csv(filepath)
    # The prompt asks the model to add "COMMENT"; it is taken to be the last column.
    col = df.iloc[:, -1]
    comment_column.append(col)

comments = pd.concat(comment_column, axis=0).reset_index(drop=True)
df_comments = pd.DataFrame(comments)

In [None]:
# Share of each verdict among all collected comment rows, saved as scores.csv.
comment_column = df_comments['COMMENT']

total_comments = len(df_comments)

# Sum the boolean masks to get plain integers. The previous
# DataFrame.count() returned one-element Series, and calling float() on such
# a Series is deprecated in pandas.
total_ok = int((comment_column == 'OK').sum())
total_nok = int((comment_column == 'NOK').sum())
total_not_enough = int((comment_column == 'Not enough information').sum())

# With no rows at all there is nothing to divide by; report 0.00% instead.
percentage_ok = (total_ok / total_comments) * 100 if total_comments else 0.0
percentage_nok = (total_nok / total_comments) * 100 if total_comments else 0.0
percentage_not_enough = (total_not_enough / total_comments) * 100 if total_comments else 0.0

scores = {
    "OK": f'{percentage_ok:.2f}%',
    "NOK": f'{percentage_nok:.2f}%',
    "NOT ENOUGH INFO": f'{percentage_not_enough:.2f}%'
}

df = pd.DataFrame(list(scores.items()), columns=['Category', 'Percentage'])
df.to_csv('result/filler/scores.csv', index=False)

# Arquivo final #

In [None]:
# Bundle every CSV of the result folder into one workbook, one sheet per file.
files = 'result/filler'

# Excel limits worksheet names to 31 characters; several section titles are
# longer, so the names are cut to fit.
# NOTE: two files whose names share the same first 31 characters would collide;
# none of the current section titles do.
max_sheet_name_length = 31

# The context manager saves and closes the workbook even if a file fails to load.
with pd.ExcelWriter('result/filler/can_filler.xlsx', engine='openpyxl') as excel_writer:
    for filename in sorted(os.listdir(files)):
        if filename.endswith('.csv'):
            file_path = os.path.join(files, filename)
            df = pd.read_csv(file_path)

            # add df to excel file as a tab named after the file (without extension)
            sheet_name = os.path.splitext(filename)[0][:max_sheet_name_length]
            df.to_excel(excel_writer, sheet_name=sheet_name, index=False)