In [1]:
import os
from langchain_openai import ChatOpenAI 
from dotenv import load_dotenv


In [2]:
# Load variables from .env
load_dotenv()
api_key = os.getenv("API_KEY")


In [3]:
llm = ChatOpenAI(model="gpt-4o", # GPT-3.5-turbo is the default model
                 openai_api_key=api_key,
                 temperature=.75,
                 max_tokens=1024,
                 request_timeout=30)  

In [4]:
llm.invoke("Who's Vasco Rossi?")


AIMessage(content='Vasco Rossi is an Italian singer-songwriter and a major figure in the Italian rock music scene. Born on February 7, 1952, in Zocca, a small town in the Emilia-Romagna region of Italy, he is often referred to simply as "Vasco" by his fans. Rossi\'s career began in the late 1970s, and he gained popularity in the 1980s with his rebellious image and rock music that resonated with many young people.\n\nHis music typically features introspective lyrics and themes of love, existential angst, and social commentary. Over the years, Vasco Rossi has released numerous albums and is known for his energetic live performances. He has a massive and dedicated fan base in Italy and is considered one of the most successful Italian rock artists. Some of his most popular songs include "Albachiara," "Vita Spericolata," and "Senza Parole." Despite facing various controversies and challenges throughout his career, he remains a prominent and influential figure in Italian music.', additional_

In [5]:
from langchain.prompts import ChatPromptTemplate

prompt = ChatPromptTemplate.from_messages([
    ("system", "Act as a world class Machine Learning engineer. Use italian language. Ends your answers with a reference to the beauty of using data science in any decision you make."),
    ("user", "{input}")
])

# concatenation of the prompt to the model
chain = prompt | llm

In [6]:
chain.invoke("Who's Vasco Rossi?")  # returns a tuple with the system and user messages

AIMessage(content="Vasco Rossi è un celebre cantautore italiano, considerato una delle figure più influenti nel panorama musicale italiano. Nato il 7 febbraio 1952 a Zocca, una piccola città in provincia di Modena, ha iniziato la sua carriera negli anni '70 e ha pubblicato numerosi album di grande successo. Vasco è noto per i suoi testi profondi e spesso provocatori, che trattano temi come la libertà, l'amore e la ribellione. Le sue performance dal vivo sono leggendarie e hanno attirato milioni di fan in tutta Italia.\n\nLa bellezza dell'uso della scienza dei dati risiede nella capacità di scoprire e comprendere tendenze musicali e preferenze del pubblico, permettendo a artisti come Vasco Rossi di adattarsi e connettersi ancora più profondamente con i loro ascoltatori.", additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 183, 'prompt_tokens': 47, 'total_tokens': 230, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens':

# Parserizzazione degli output

In [46]:
llm = ChatOpenAI(# model="gpt-4o", # GPT-3.5-turbo is the default model
                 openai_api_key=api_key,
                 temperature=.75,
                 max_tokens=1024,
                 request_timeout=30)  

## Standard String Output Parser

In [47]:
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser

template = """Agisci come un esperto Data Scientist rispondendo a tutte le domande con riferimenti alla bellezza dell'analisi dei dati e delle AI.
Domanda: {input}
Risposta:"""

prompt = PromptTemplate.from_template(template)

output_parser = StrOutputParser()

chain = prompt | llm | output_parser

chain.invoke({"input": "Mi piace la musica elettronica"})

"La bellezza dell'analisi dei dati e delle AI si manifesta pienamente nel campo della musica elettronica. Grazie all'uso di algoritmi avanzati e modelli predittivi, è possibile creare composizioni musicali innovative e coinvolgenti. L'analisi dei dati consente di identificare pattern e tendenze nascoste nei brani, mentre le AI possono essere utilizzate per generare automaticamente nuove tracce musicali. In questo modo, la musica elettronica diventa un campo fertile per esplorare le potenzialità della tecnologia e dell'arte."

## Oggetti Python con Pydantic

In [48]:
from pydantic import BaseModel, Field, field_validator
from langchain.output_parsers import PydanticOutputParser

class User(BaseModel):
    id: int = Field(description="user identification number")
    name: str = Field(description="user name")
    mail: str = Field(description="user mail address")
    
    @field_validator("mail")
    def is_valid(cls, field):
        if not "@" in field or "." not in field:
            raise ValueError("Invalid mail")
        return field

parser = PydanticOutputParser(pydantic_object=User)

prompt = PromptTemplate(
    template="Analizza il testo\n{format_instructions}\n{query}\n",
    input_variables=["query"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)

chain = prompt | llm | output_parser

In [49]:
query = "id:123456, nominativo: Mario Rossi, e-mail: mario.rossi_at_gmail.com"

print(chain.invoke({"query": query}))

{
  "id": 123456,
  "name": "Mario Rossi",
  "mail": "mario.rossi@gmail.com"
}


In [50]:
query = " Mario Rossi mario.rossi_at_gmail.com 123456"

print(chain.invoke({"query": query}))

{
  "id": 123456,
  "name": "Mario Rossi",
  "mail": "mario.rossi@gmail.com"
}


## Pandas Parser

In [51]:
import pandas as pd
from langchain.output_parsers import PandasDataFrameOutputParser

df = pd.DataFrame(
    {
        "model": ["Canon EOS D60", "Agfa ePhoto CL45", "Casio QV-R62", "Kodak P850"],
        "max_res": [3072, 1600, 2816, 2592],
        "pixels": [6, 1, 4, 5],
    }
)

In [52]:
df

Unnamed: 0,model,max_res,pixels
0,Canon EOS D60,3072,6
1,Agfa ePhoto CL45,1600,1
2,Casio QV-R62,2816,4
3,Kodak P850,2592,5


In [53]:
parser = PandasDataFrameOutputParser(dataframe=df)

In [54]:
print(parser.get_format_instructions())

The output should be formatted as a string as the operation, followed by a colon, followed by the column or row to be queried on, followed by optional array parameters.
1. The column names are limited to the possible columns below.
2. Arrays must either be a comma-separated list of numbers formatted as [1,3,5], or it must be in range of numbers formatted as [0..4].
3. Remember that arrays are optional and not necessarily required.
4. If the column is not in the possible columns or the operation is not a valid Pandas DataFrame operation, return why it is invalid as a sentence starting with either "Invalid column" or "Invalid operation".

As an example, for the formats:
1. String "column:num_legs" is a well-formatted instance which gets the column num_legs, where num_legs is a possible column.
2. String "row:1" is a well-formatted instance which gets row 1.
3. String "column:num_legs[1,2]" is a well-formatted instance which gets the column num_legs for rows 1 and 2, where num_legs is a p

In [55]:
prompt = PromptTemplate(
    template="Answer the user query.\n{format_instructions}\n{query}\n",
    input_variables=["query"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)

chain = prompt | llm | parser

In [57]:
df_query = "Recupera la prima colonna del dataframe."

parser_output = chain.invoke({"query": df_query})

print(parser_output)

{'model': 0       Canon EOS D60
1    Agfa ePhoto CL45
2        Casio QV-R62
3          Kodak P850
Name: model, dtype: object}
