<a href="https://colab.research.google.com/github/vidyasagarcrj/Codedirect/blob/main/Lang_Chain_Model_IO_V5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Packages

In [None]:
!pip install openai --quiet
!pip install langchain --quiet
!pip install langchain-cohere
!pip install langchain-community --quiet

In [None]:
import os
from getpass import getpass


def _load_api_key(env_name):
    """Make sure ``os.environ[env_name]`` holds an API key without hardcoding it.

    Resolution order:
      1. keep the value if the variable is already set (never overwrite a real key);
      2. read the secret of the same name from Colab's secret store, when running in Colab;
      3. otherwise prompt for it interactively (input is not echoed).

    Args:
        env_name: Name of the environment variable / Colab secret to populate.
    """
    if os.environ.get(env_name):
        return

    secret = None
    try:
        from google.colab import userdata  # only importable inside Google Colab
    except ImportError:
        userdata = None

    if userdata is not None:
        try:
            secret = userdata.get(env_name)
        except Exception:
            # Secret not defined or notebook access not granted: fall back to the prompt.
            secret = None

    if not secret:
        secret = getpass(f"Enter {env_name}: ")

    os.environ[env_name] = secret


for _key_name in ("OPENAI_API_KEY", "COHERE_API_KEY"):
    _load_api_key(_key_name)

In [None]:
from langchain.llms import OpenAI, Cohere
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

In [None]:
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import CSVLoader
from langchain.document_loaders import TextLoader

# Basic Document/text Loader

In [None]:
!wget https://raw.githubusercontent.com/venkatareddykonasani/Datasets/master/Mobile_Phone_Review/Mobile.md

loader = TextLoader("./Mobile.md")
loaded_text= loader.load()
print(type(loaded_text))
print(loaded_text)

In [None]:
# Choose the LLM backend; swap in the commented line to use OpenAI instead of Cohere.
#llm=OpenAI(temperature=0)
llm=Cohere()

# Prompt asking the model to extract four fields from a free-text product review.
template="""
read the following review and extract the following information:
Name of the product
Brand of the product
Price of the product
Rating of the product

review is given here : {input_review}
"""

prompt=PromptTemplate(template=template,
                       input_variables=["input_review"])

chain=LLMChain(llm=llm,
               prompt=prompt)

# `loaded_text` is the list returned by the loader, not a string. Formatting the list
# straight into the prompt would embed its repr (metadata, escaped newlines), so send
# only the text of each document.
review_text="\n\n".join(doc.page_content for doc in loaded_text)

result=chain.invoke({"input_review":review_text})
print(result['text'])

# Text Loader: Example 2

In [None]:
!wget https://raw.githubusercontent.com/venkatareddykonasani/Datasets/master/Client_Mail/Mail.txt

loader = TextLoader("./Mail.txt")
loaded_text= loader.load()
print(type(loaded_text))
print(loaded_text)

In [None]:
# Choose the LLM backend; swap in the commented line to use OpenAI instead of Cohere.
#llm=OpenAI(temperature=0)
llm=Cohere()

# Prompt asking the model to extract the sender, the main point and the ticket id.
template="""
read the following mail and extract the following information:
Name of the person
Main Point of the mail
Ticket-id
The mail is given here : {input_mail}
"""
prompt=PromptTemplate(template=template,
                       input_variables=["input_mail"])

chain=LLMChain(llm=llm,
               prompt=prompt)

# `loaded_text` is the list returned by the loader, not a string. Formatting the list
# straight into the prompt would embed its repr (metadata, escaped newlines), so send
# only the text of each document.
mail_text="\n\n".join(doc.page_content for doc in loaded_text)

result=chain.invoke({"input_mail":mail_text})
print(result['text'])

# CSV Loader

In [None]:
from langchain_community.document_loaders.csv_loader import CSVLoader
!wget https://raw.githubusercontent.com/venkatareddykonasani/Datasets/master/Leads.csv

loader = CSVLoader(file_path="./Leads.csv")
csv_file_data = loader.load()
print(type(csv_file_data))
print(csv_file_data)

In [None]:
# The previous cell already printed the full list; show only its size and a short
# preview here instead of repeating the whole dump.
print(f"{len(csv_file_data)} records loaded")
print(csv_file_data[:3])

In [None]:
# LLM backend: Cohere by default (uncomment the OpenAI line to switch).
#llm=OpenAI(temperature=0)
llm = Cohere()

# Prompt with a single placeholder, {input_data}, that receives the loaded CSV content.
template = """
read the following data and extract the following information:

What is the average promotional budget ?
What is the average Leads?

data is given here : {input_data}
"""

prompt = PromptTemplate(input_variables=["input_data"], template=template)

# Wire the prompt and the model together; the chain is invoked in the next cell.
chain = LLMChain(prompt=prompt, llm=llm)

In [None]:
# `csv_file_data` is a list of loader documents. Join their text so the prompt contains
# the data itself rather than the repr of the list (metadata, escaped newlines).
leads_text="\n\n".join(doc.page_content for doc in csv_file_data)

result=chain.invoke({"input_data":leads_text})
print(result['text'])

# Loaders: A Common Issue with Large Datasets

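**Max-tokens issue:** when a large file is loaded and its entire content is placed into a single prompt, the request is expected to exceed the model's maximum token limit, so the LLM call at the end of this section fails.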

In [None]:
from langchain_community.document_loaders.csv_loader import CSVLoader
!wget https://raw.githubusercontent.com/venkatareddykonasani/Datasets/master/AirBnB_NY/AB_NYC_2019.csv

loader = CSVLoader(file_path="./AB_NYC_2019.csv")
csv_file_data = loader.load()
print(type(csv_file_data))
print(csv_file_data)

In [None]:
# Dumping the whole large dataset a second time only floods the output; show its size
# and a short preview instead.
print(f"{len(csv_file_data)} records loaded")
print(csv_file_data[:3])

In [None]:
# LLM backend: Cohere by default (uncomment the OpenAI line to switch).
#llm=OpenAI(temperature=0)
llm = Cohere()

# Prompt with a single placeholder, {input_data}, that receives the loaded listings.
template = """
read the following data and extract the following information:

What is the average price?
What are the top 5 most expensive listings?
What are the top 5 most reviewed listings?

data is given here : {input_data}
"""

prompt = PromptTemplate(input_variables=["input_data"], template=template)

# verbose=True is passed so the chain reports what it runs; it is invoked in the next cell.
chain = LLMChain(prompt=prompt, llm=llm, verbose=True)

In [None]:
# Demonstration of the max-tokens problem: the whole dataset is sent in one prompt, which
# is expected to be rejected by the model provider. Catch and display the error so that
# "Restart & Run All" can continue past this cell instead of aborting here.
try:
    result = chain.invoke({"input_data": csv_file_data})
except Exception as exc:  # the concrete error type depends on the LLM provider
    result = None
    print(f"{type(exc).__name__}: {str(exc)[:500]}")

# Displays the chain output if the call unexpectedly succeeded; shows nothing otherwise.
result

# WebBaseLoader

In [None]:
from langchain_community.document_loaders import WebBaseLoader

In [None]:
# Fetch the product-review page through LangChain's web loader.
loader = WebBaseLoader("https://www.amazon.in/Indigenous-Unprocessed-Unfiltered-Unpasteurized-Disorders/product-reviews/B07H5PVCH7/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews")
web_file_data = loader.load()

# Show the container type first, then the loaded content.
print(type(web_file_data), web_file_data, sep="\n")

# Truncate the first document's text to its first 10,000 characters, printing the
# length before and after so the effect of the cut is visible.
first_page = web_file_data[0]
print(len(first_page.page_content))
first_page.page_content = first_page.page_content[:10000]
print(len(first_page.page_content))

In [None]:
# LLM backend: Cohere by default (uncomment the OpenAI line to switch).
#llm=OpenAI(temperature=0)
llm = Cohere()

# Summarisation prompt with a single placeholder, {input_data}.
template = """
read the following data summarize it into 4 bullet points
data is given here : {input_data}
"""

prompt = PromptTemplate(input_variables=["input_data"], template=template)

# Wire the prompt and the model together; the chain is invoked in the next cell.
chain = LLMChain(prompt=prompt, llm=llm)

In [None]:
# `web_file_data` is a list of loader documents. Join their text so the prompt contains
# the page content itself rather than the repr of the list (metadata, escaped newlines).
web_text="\n\n".join(doc.page_content for doc in web_file_data)

result=chain.invoke({"input_data":web_text})
result['text']

# Wikipedia

In [None]:
!pip install wikipedia

In [None]:
from langchain.document_loaders import WikipediaLoader

# Query Wikipedia for "Sundar Pichai", capping the result with load_max_docs=1.
loader = WikipediaLoader(load_max_docs=1, query="Sundar Pichai")
wiki_file_data = loader.load()

# Show the container type first, then the loaded content.
print(type(wiki_file_data), wiki_file_data, sep="\n")

# PyPdf Loader

In [None]:
!pip install pypdf

In [None]:
from langchain.document_loaders import PyPDFLoader

!wget https://raw.githubusercontent.com/venkatareddykonasani/Datasets/master/Regulatory_Rules_in_Credit_Risk_Models/Regulatory_Rules_in_Credit_Risk_Models.pdf
loader=PyPDFLoader(file_path="Regulatory_Rules_in_Credit_Risk_Models.pdf")
pdf_file_data=loader.load()
print(type(pdf_file_data))
print(pdf_file_data)

# HTML Loader

In [None]:
!pip install beautifulsoup4

In [None]:
from langchain.document_loaders import BSHTMLLoader

In [None]:
!wget https://raw.githubusercontent.com/venkatareddykonasani/Datasets/master/Word_bank_news_html/South_Asia_Global_Debt_Summary.html

loader=BSHTMLLoader(file_path="South_Asia_Global_Debt_Summary.html")
html_file_data=loader.load()
print(type(html_file_data))
print(html_file_data)


In [None]:
# No chain is built in this section: `chain` is the most recently defined one, i.e. the
# 4-bullet summarisation chain created in the WebBaseLoader section, so that cell must
# have been run first.
# `html_file_data` is a list of loader documents. Join their text so the prompt contains
# the article itself rather than the repr of the list (metadata, escaped newlines).
html_text="\n\n".join(doc.page_content for doc in html_file_data)

result=chain.invoke({"input_data":html_text})
print(result['text'])

# YouTube

In [None]:
!pip install --upgrade --quiet  youtube-transcript-api

In [None]:
from langchain_community.document_loaders import YoutubeLoader

# Build a loader for the given video URL and load its content.
loader = YoutubeLoader.from_youtube_url("https://www.youtube.com/watch?v=fbQvVS_8ZNI")
youtube_file_data = loader.load()

# Show the container type first, then the loaded content.
print(type(youtube_file_data), youtube_file_data, sep="\n")

In [None]:
# LLM backend: Cohere by default (uncomment the OpenAI line to switch).
#llm=OpenAI(temperature=0)
llm = Cohere()

# Summarisation prompt with a single placeholder, {input_data}.
template = """
read the following data summarize it into 4 bullet points
data is given here : {input_data}
"""

prompt = PromptTemplate(input_variables=["input_data"], template=template)

# Wire the prompt and the model together; the chain is invoked in the next cell.
chain = LLMChain(prompt=prompt, llm=llm)

In [None]:
# `youtube_file_data` is a list of loader documents. Join their text so the prompt
# contains the loaded text itself rather than the repr of the list (metadata, escaped
# newlines).
youtube_text="\n\n".join(doc.page_content for doc in youtube_file_data)

result=chain.invoke({"input_data":youtube_text})
result['text']

# Output Parsers

# CSV Parser

In [None]:
from langchain.output_parsers import CommaSeparatedListOutputParser

In [None]:
# Build the comma-separated-list parser and fetch the instruction text it supplies
# for inclusion in the prompt.
output_parser = CommaSeparatedListOutputParser()
format_instructions = output_parser.get_format_instructions()

# Display the instructions so the reader can see what will be appended to the prompt.
print(format_instructions)

In [None]:
# Prompt with two placeholders: the topic, and the parser-supplied formatting instructions.
template="""
List down the top 10 concepts to learn from the following Topic
Topic name is: {topic_name}
{format_instructions}
"""
prompt=PromptTemplate(template=template,
                      input_variables=["topic_name", "format_instructions"])

# Choose the LLM backend; swap in the commented line to use OpenAI instead of Cohere.
#llm=OpenAI(temperature=0)
llm=Cohere()
chain=LLMChain(prompt=prompt,
               llm=llm)

result=chain.invoke({"topic_name":"Machine Learning",
              "format_instructions":format_instructions })

# Raw completion text as returned by the model.
print(result["text"])

# Complete the output-parser round trip: turn the raw text into a Python list.
# A fresh parser instance is used so this step does not depend on which object the
# shared `output_parser` name currently points to (a later section rebinds it).
parsed_concepts=CommaSeparatedListOutputParser().parse(result["text"])
print(parsed_concepts)

# Datetime Parser

In [None]:
from langchain.output_parsers import DatetimeOutputParser

# Build the datetime parser and fetch the instruction text it supplies for the prompt.
# Note: this rebinds `output_parser` and `format_instructions` from the CSV parser section.
output_parser = DatetimeOutputParser()
format_instructions = output_parser.get_format_instructions()

# Display the instructions so the reader can see what will be appended to the prompt.
print(format_instructions)

In [None]:
# Sample log lines; each one writes its timestamp in a different format.
Server_Logs = [
    # ISO-style date followed by a 24-hour time
    "[2024-04-01 13:48:11] ERROR: Failed to connect to database. Retrying in 60 seconds.",
    # 12-hour time with AM marker; the opening bracket is never closed
    # NOTE(review): presumably a deliberately malformed entry — confirm before "fixing"
    "[2023-08-04 12:01:00 AM - Warning: The system is running low on disk space.",
    # Day/month written before the year, 24-hour time
    "[04-01-2024 13:55:39] CRITICAL: System temperature exceeds safe threshold. Initiating shutdown",
    # Long form with weekday and month name, 12-hour time with PM marker
    "[Monday, April 01, 2024 01:55:39 PM] DEBUG: User query executed in 0.45 seconds.",
    # Time first, then the date
    "[13:55:39 on 2024-04-01] ERROR: Unable to send email notification. SMTP server not responding.",
]

In [None]:
# Prompt with two placeholders: one log line, and the parser-supplied formatting instructions.
template = """
Read the server log text and extract the date and time
log text is: {log_text}
{format_instructions}
"""
prompt = PromptTemplate(
    input_variables=["log_text", "format_instructions"],
    template=template,
)

# LLM backend: Cohere by default (uncomment the OpenAI line to switch).
#llm=OpenAI(temperature=0)
llm = Cohere()

# Wire the prompt and the model together; the chain is invoked once per log line below.
chain = LLMChain(llm=llm, prompt=prompt)

In [None]:
# Run the chain once per log line and print the model's answer next to the original line.
for log_message in Server_Logs:
  chain_inputs = {
      "log_text": log_message,
      "format_instructions": format_instructions,
  }
  result = chain.invoke(chain_inputs)
  print(result["text"], ";", log_message)

# Custom Parser - Using Pydantic

Pydantic is a handy tool for making sure the information (or data) your Python program receives is exactly what you expect. With Pydantic, you tell your program what kind of data it should accept (like numbers, text, or dates) using simple rules. If the data matches the rules, your program works smoothly. If not, Pydantic helps by pointing out the problem, making it easier to keep your program safe and error-free.

In [None]:
from pydantic import BaseModel, ValidationError

# Minimal Pydantic model: `name` must be a string and `age` an integer.
class User(BaseModel):
    name: str
    age: int

# Correct data
user = User(name="Alice", age=30)
print(user)

# Incorrect data raises a ValidationError. Catch and print it so the cell demonstrates
# the error message without aborting a "Restart & Run All".
try:
    User(name="Bob", age="thirty")
except ValidationError as exc:
    print(exc)


In [None]:
from langchain.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field

In [None]:
# Target structure for the parsed LLM answer: three free-text fields, each carrying a
# description. (Kept as `#` comments on purpose: no class docstring is added here.)
class Scientist(BaseModel):
    name: str = Field(description="Name of the Scientist")
    dob: str = Field(description="Date of Birth of the Scientist")
    bio: str = Field(description="Biography of the Scientist")

In [None]:
# Build a parser bound to the Scientist model.
custom_output_parser = PydanticOutputParser(pydantic_object=Scientist)

# Display the formatting instructions the parser generates for the prompt.
print(custom_output_parser.get_format_instructions())

In [None]:
# Prompt with two placeholders: the scientist's name, and the parser-supplied
# formatting instructions.
template = """
Take the name of the scientist is {name} and try to fill the rest of the details
{format_instructions}
"""
prompt = PromptTemplate(
    input_variables=["name", "format_instructions"],
    template=template,
)

# LLM backend: Cohere by default (uncomment the OpenAI line to switch).
#llm=OpenAI(temperature=0)
llm = Cohere()
chain = LLMChain(llm=llm, prompt=prompt)

# Fill both placeholders and print the raw completion text.
chain_inputs = {
    "name": "Ramanujan",
    "format_instructions": custom_output_parser.get_format_instructions(),
}
result = chain.invoke(chain_inputs)
print(result["text"])


# EmailResponse App

In [None]:
import random as rand
import requests
from langchain.document_loaders import WebBaseLoader
from langchain.prompts import PromptTemplate
from langchain.llms import OpenAI
from langchain.chains import LLMChain

In [None]:
# Pick one of the sample customer e-mails (Mail1.txt to Mail5.txt) at random.
email_location = f"https://raw.githubusercontent.com/venkatareddykonasani/Datasets/master/Customer_Emails/Mail{rand.randint(1,5)}.txt"
print(email_location)

# Fetch the chosen e-mail through LangChain's web loader.
loader = WebBaseLoader(email_location)
loaded_text = loader.load()
print(type(loaded_text))

# Keep only the text of the first loaded document.
final_mail = loaded_text[0].page_content
print(final_mail)

In [None]:
from langchain.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field

# Target structure for the parsed LLM answer about one customer e-mail: five free-text
# fields, each carrying a description. (Kept as `#` comments on purpose: no class
# docstring is added here.)
class EmailResponse(BaseModel):
    Email_Language: str = Field(description="The Original Language of the Email")
    Customer_ID: str = Field(description="The Customer ID mentioned in the mail")
    English_email: str = Field(description="The email after translating to English")
    Summary: str = Field(description="A 4 bullets point summary of the email")
    Reply: str = Field(description="A polite 2 line reply to the email")


# Build a parser bound to the EmailResponse model and display the formatting
# instructions it generates for the prompt.
custom_output_parser = PydanticOutputParser(pydantic_object=EmailResponse)
print(custom_output_parser.get_format_instructions())

In [None]:
# Prompt with two placeholders: the e-mail text, and the parser-supplied formatting
# instructions.
template = """
Take the email as input. Email text is {email}
{format_instructions}
"""
prompt = PromptTemplate(
    input_variables=["email", "format_instructions"],
    template=template,
)

# LLM backend: Cohere by default (uncomment the OpenAI line to switch).
#llm=OpenAI(temperature=0)
llm = Cohere()

chain = LLMChain(llm=llm, prompt=prompt)

# Fill both placeholders and print the raw completion text.
chain_inputs = {
    "email": final_mail,
    "format_instructions": custom_output_parser.get_format_instructions(),
}
result = chain.invoke(chain_inputs)
print(result["text"])