In [1]:
from dotenv import load_dotenv
# Load environment variables from the local .env file.
# NOTE(review): presumably this supplies the OpenAI API key used by ChatOpenAI later — confirm.
load_dotenv(dotenv_path=".env")

True

- The elements of statistical learning - https://hastie.su.domains/ElemStatLearn/
- Weather Dataset - https://www.kaggle.com/datasets/guillemservera/global-daily-climate-data
- Deep Learning course (HTML pages taken from here) - https://atcold.github.io/NYU-DLSP21/
- Speech Recognition and Graph Transformer Network I - https://github.com/Atcold/NYU-DLSP21/blob/master/docs/en/week11/11-1.md

In [2]:
from langchain.document_loaders import (
    UnstructuredCSVLoader,
    UnstructuredHTMLLoader,
    UnstructuredImageLoader,
    PythonLoader,
    PyPDFLoader,
    JSONLoader,
)

from langchain.document_loaders.csv_loader import CSVLoader


# CSV Loader

In [9]:
# Locations of the weather dataset CSV files; only the cities file is loaded in this cell.
csv_weather_cities_location = "./csv_data/cities.csv"
csv_weather_countries_location = "./csv_data/countries.csv"

# Build the loader explicitly by keyword, then read the CSV into documents.
csv_weather_loader = CSVLoader(file_path=csv_weather_cities_location)
weather_data = csv_weather_loader.load()


In [11]:
# Inspect the text of the first loaded document to see how the CSV loader
# renders a row (one "column: value" pair per line).
print(weather_data[0].page_content)

station_id: 41515
city_name: Asadabad
country: Afghanistan
state: Kunar
iso2: AF
iso3: AFG
latitude: 34.8660000397
longitude: 71.1500045859


In [13]:
import pandas as pd

# Read the same cities CSV with pandas to get a tabular view of the raw data.
df = pd.read_csv(filepath_or_buffer=csv_weather_cities_location)

# Leave the frame as the last expression so the notebook renders it.
df

Unnamed: 0,station_id,city_name,country,state,iso2,iso3,latitude,longitude
0,41515,Asadabad,Afghanistan,Kunar,AF,AFG,34.866000,71.150005
1,38954,Fayzabad,Afghanistan,Badakhshan,AF,AFG,37.129761,70.579247
2,41560,Jalalabad,Afghanistan,Nangarhar,AF,AFG,34.441527,70.436103
3,38947,Kunduz,Afghanistan,Kunduz,AF,AFG,36.727951,68.872530
4,38987,Qala i Naw,Afghanistan,Badghis,AF,AFG,34.983000,63.133300
...,...,...,...,...,...,...,...,...
1240,67475,Kasama,Zambia,Northern,ZM,ZMB,-10.199598,31.179947
1241,68030,Livingstone,Zambia,Southern,ZM,ZMB,-17.860009,25.860013
1242,67633,Mongu,Zambia,Western,ZM,ZMB,-15.279598,23.120025
1243,67775,Harare,Zimbabwe,Harare,ZW,ZWE,-17.817790,31.044709


# PDF

In [15]:
# PDF of "The Elements of Statistical Learning" stored alongside the other mixed files.
file_path = "./mixed_data/ESLII_print12_toc (1).pdf"

# Load the PDF and split it with the loader's default text splitter.
sl_loader = PyPDFLoader(file_path=file_path)
sl_data = sl_loader.load_and_split()

In [17]:
# Show the first chunk followed by the total number of chunks, one per line.
print(sl_data[0], len(sl_data), sep="\n")

page_content='Springer Series in Statistics\nTrevor Hastie\nRobert TibshiraniJerome FriedmanSpringer Series in Statistics\nThe Elements of\nStatistical Learning\nData Mining, Inference, and Prediction\nThe Elements of Statistical LearningDuring the past decade there has been an explosion in computation and information tech-\nnology. With it have come vast amounts of data in a variety of fields such as medicine, biolo-gy, finance, and marketing. The challenge of understanding these data has led to the devel-opment of new tools in the field of statistics, and spawned new areas such as data mining,machine learning, and bioinformatics. Many of these tools have common underpinnings butare often expressed with different terminology. This book describes the important ideas inthese areas in a common conceptual framework. While the approach is statistical, theemphasis is on concepts rather than mathematics. Many examples are given, with a liberaluse of color graphics. It should be a valuable re

In [19]:
from langchain.text_splitter import (
    CharacterTextSplitter,
    RecursiveCharacterTextSplitter,
)

# Two splitting strategies with the same target chunk size.
# CharacterTextSplitter cuts only at its single separator, so a chunk can end
# up longer than chunk_size when that separator is rare in the text
# (the first chunk it produces from this PDF is well above 1000 characters).
splitter1 = CharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
)

# RecursiveCharacterTextSplitter tries several separators in turn, which keeps
# chunks within chunk_size; no overlap between neighbouring chunks here.
splitter2 = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=0,
)

# Parse the PDF once and reuse the pages for both splitters. Calling
# sl_loader.load_and_split(splitter) per splitter gives the same chunks but
# re-reads and re-parses the whole file each time.
sl_pages = sl_loader.load()
sl_data1 = splitter1.split_documents(sl_pages)
sl_data2 = splitter2.split_documents(sl_pages)

In [20]:
# For each splitter: number of chunks, then the length of the first chunk.
print(f"{len(sl_data1)} {len(sl_data1[0].page_content)}")
print(f"{len(sl_data2)} {len(sl_data2[0].page_content)}")

764 2474
2252 305


In [21]:
from langchain.document_loaders import DirectoryLoader

# Load the files under ./mixed_data using multiple threads and a progress bar.
mixed_loader = DirectoryLoader(
    "./mixed_data",
    show_progress=True,
    use_multithreading=True,
)

# Load everything and split it with the loader's default text splitter.
mixed_data = mixed_loader.load_and_split()


100%|██████████| 3/3 [00:03<00:00,  1.19s/it]


In [None]:
# Number of documents produced from ./mixed_data after loading and splitting.
print(len(mixed_data))

# Summarization

Issue: An LLM's context window is limited, so a long document may not fit into the model in a single call.

Solution: Summarize the document into a smaller text.

Two main strategies:

## Map-reduce strategy

Split document into chunks, summarize each chunk, then combine the summaries into a single summary.

## Refinement strategy

Split the document into chunks, and iteratively refine the summary by summarizing each chunk together with the summary produced so far.


In [22]:
from langchain.chains.summarize import load_summarize_chain
from langchain.chat_models import ChatOpenAI

# Chat model created with library defaults.
# NOTE(review): no model name or temperature is pinned, so summaries may vary
# between runs — consider setting them explicitly.
llm = ChatOpenAI()

# "stuff" chain: the documents are summarized with a single prompt.
chain = load_summarize_chain(llm, chain_type="stuff")

# Summarize only the first two chunks of the book.
chain.run(sl_data[:2])

'The book "The Elements of Statistical Learning" provides an overview of the important ideas in data mining, machine learning, and bioinformatics. The authors cover a range of topics, including neural networks, support vector machines, and classification trees. This second edition includes additional topics such as graphical models and ensemble methods. The authors are well-respected statisticians and researchers in the field.'

In [23]:
# Reuse the "stuff" chain built in the previous cell on the first two CSV-derived documents.
chain.run(weather_data[:2])

'This summary provides information about two weather stations in Afghanistan. The first station is located in Asadabad, Kunar, with a latitude of 34.8660000397 and a longitude of 71.1500045859. The second station is in Fayzabad, Badakhshan, with a latitude of 37.1297607616 and a longitude of 70.5792471913.'

# Custom prompt

In [24]:
# Show the default prompt template of the current chain (the "stuff" chain
# built earlier); {text} is where the document text is inserted.
print(chain.llm_chain.prompt.template)

Write a concise summary of the following:


"{text}"


CONCISE SUMMARY:


In [28]:
from langchain.prompts import PromptTemplate

# Custom summary prompt asking for a Spanish summary; {text} receives the document text.
spanish_template_example = (
    "\n"
    "Write concise summary of the following text in Spanish:\n"
    '"{text}"\n'
    "\n"
    "CONCISE SUMMARY IN SPANISH:\n"
)

# Wrap the custom template so the chain can fill in {text}.
spanish_prompt = PromptTemplate.from_template(spanish_template_example)

# chain_type is spelled out for consistency with the other cells; "stuff" is
# also what load_summarize_chain uses when chain_type is omitted.
chain = load_summarize_chain(llm, chain_type="stuff", prompt=spanish_prompt)

chain.run(sl_data[:2])

'Este texto describe el libro "The Elements of Statistical Learning", escrito por Trevor Hastie, Robert Tibshirani y Jerome Friedman. El libro aborda las áreas de estadística, minería de datos, aprendizaje automático y bioinformática, y presenta herramientas y conceptos importantes en estos campos. La nueva edición incluye temas adicionales como modelos gráficos, métodos de ensamble y clustering espectral. Los autores son destacados investigadores en el campo de la estadística y han desarrollado diversas técnicas y herramientas utilizadas en la modelización estadística y minería de datos.'

# Map-reduce chain

In [30]:
# Map-reduce chain with the default prompts; verbose=True prints each
# formatted prompt as the chain runs.
chain = load_summarize_chain(llm, chain_type="map_reduce", verbose=True)

# Summarize the first 20 chunks of the book.
chain.run(sl_data[:20])



[1m> Entering new MapReduceDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mWrite a concise summary of the following:


"Springer Series in Statistics
Trevor Hastie
Robert TibshiraniJerome FriedmanSpringer Series in Statistics
The Elements of
Statistical Learning
Data Mining, Inference, and Prediction
The Elements of Statistical LearningDuring the past decade there has been an explosion in computation and information tech-
nology. With it have come vast amounts of data in a variety of fields such as medicine, biolo-gy, finance, and marketing. The challenge of understanding these data has led to the devel-opment of new tools in the field of statistics, and spawned new areas such as data mining,machine learning, and bioinformatics. Many of these tools have common underpinnings butare often expressed with different terminology. This book describes the important ideas inthese areas in a common conceptual framework. While the ap

'"The Elements of Statistical Learning" is a comprehensive book that covers important concepts in data mining, machine learning, and bioinformatics. The authors provide an overview of various topics, including supervised and unsupervised learning, neural networks, support vector machines, and boosting. The second edition of the book includes new topics such as graphical models, random forests, and ensemble methods. The authors are renowned professors at Stanford University and have made significant contributions to the field. The book is dedicated to the families and parents of the authors. The preface to the second edition explains the updates and changes made to the book, while also addressing issues with colorblind readers and clarifying the origin of a quote. The book aims to explain important new ideas in learning using a statistical framework and hopes to be useful for researchers and practitioners in various fields. The book covers topics such as supervised learning, linear meth

In [35]:
# Prompt for the map step: extract the main themes from the text given in {text}.
map_template_example = (
    "\n"
    "The following is a set of documents\n"
    "\n"
    "{text}\n"
    "\n"
    "Based on this list of docs, please identify the main themes\n"
    "\n"
    "Helpful Answer:\n"
)

# Prompt for the combine step: merge the per-chunk results passed in {text}
# into one comma separated list of themes.
combine_template_example = (
    "\n"
    "The following is a set of summaries:\n"
    "\n"
    "{text}\n"
    "\n"
    "Take these and distill it into a final, consolidated list\n"
    "of the main themes.\n"
    "\n"
    "Return that list as a comma separated list.\n"
    "\n"
    "Helpful Answer:\n"
)

# Turn the two custom templates into prompt objects.
map_prompt = PromptTemplate.from_template(map_template_example)
combine_prompt = PromptTemplate.from_template(combine_template_example)

# Map-reduce chain using the custom prompts for the map and combine steps.
chain = load_summarize_chain(
    llm,
    chain_type="map_reduce",
    map_prompt=map_prompt,
    combine_prompt=combine_prompt,
    verbose=True,
)

chain.run(sl_data[:20])



[1m> Entering new MapReduceDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
The following is a set of documents

Springer Series in Statistics
Trevor Hastie
Robert TibshiraniJerome FriedmanSpringer Series in Statistics
The Elements of
Statistical Learning
Data Mining, Inference, and Prediction
The Elements of Statistical LearningDuring the past decade there has been an explosion in computation and information tech-
nology. With it have come vast amounts of data in a variety of fields such as medicine, biolo-gy, finance, and marketing. The challenge of understanding these data has led to the devel-opment of new tools in the field of statistics, and spawned new areas such as data mining,machine learning, and bioinformatics. Many of these tools have common underpinnings butare often expressed with different terminology. This book describes the important ideas inthese areas in a common conceptual framework. While the approach 

'Statistics, Data Mining and Machine Learning, Conceptual Framework, Topics and Techniques, Authors\' Background, Acknowledgement and gratitude towards parents, Acknowledgement and gratitude towards families, Familial relationships, Personal connections and relationships, Popularity and motivation for updating "The Elements of Statistical Learning", Fast pace of research in statistical learning, New chapters and updates in the second edition, Attempt to keep layout of first edition unchanged, Quote about trusting in God and bringing data, Introduction and overview of supervised learning, Linear methods for regression and classification, Basis expansions and regularization techniques, Kernel smoothing methods, Model assessment and selection, Model inference and averaging, Additive models, trees, and related methods, Boosting and additive trees, Neural networks, Support vector machines and flexible discriminants, Prototype methods and nearest-neighbors, Unsupervised learning, Random fore

# Refine chain

In [36]:
# Refine chain with the default prompts; verbose=True prints each formatted prompt.
chain = load_summarize_chain(llm, chain_type="refine", verbose=True)

# Summarize the first 20 chunks of the book.
chain.run(sl_data[:20])



[1m> Entering new RefineDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mWrite a concise summary of the following:


"Springer Series in Statistics
Trevor Hastie
Robert TibshiraniJerome FriedmanSpringer Series in Statistics
The Elements of
Statistical Learning
Data Mining, Inference, and Prediction
The Elements of Statistical LearningDuring the past decade there has been an explosion in computation and information tech-
nology. With it have come vast amounts of data in a variety of fields such as medicine, biolo-gy, finance, and marketing. The challenge of understanding these data has led to the devel-opment of new tools in the field of statistics, and spawned new areas such as data mining,machine learning, and bioinformatics. Many of these tools have common underpinnings butare often expressed with different terminology. This book describes the important ideas inthese areas in a common conceptual framework. While the appro

'The second edition of "The Elements of Statistical Learning" includes four new chapters: Random Forests, Ensemble Learning, Undirected Graphical Models, and High-Dimensional Problems. The layout has been designed for readers familiar with the first edition, and changes have been made to the color palette for colorblind readers. The name of Chapter 6 has been changed to "Kernel Smoothing Methods" to avoid confusion. The discussion of error-rate estimation has been improved. Chapters 15 and 16 should be read after Chapter 10, and Chapter 17 focuses on undirected graphical models. Chapter 18 explores learning in high-dimensional feature spaces and covers topics such as diagonal linear discriminant analysis, nearest shrunken centroids, linear classifiers with quadratic regularization, logistic regression with quadratic regularization, the support vector classifier, feature selection, linear classifiers with L1 regularization, classification when features are unavailable, high-dimensional 

In [38]:
# Show the default refine-step prompt of the current chain (the refine chain
# built in the previous cell); it takes {existing_answer} and {text}.
print(chain.refine_llm_chain.prompt.template)

Your job is to produce a final summary.
We have provided an existing summary up to a certain point: {existing_answer}
We have the opportunity to refine the existing summary (only if needed) with some more context below.
------------
{text}
------------
Given the new context, refine the original summary.
If the context isn't useful, return the original summary.


In [42]:
# Prompt for the first chunk: extract themes from the text given in {text}.
initial_template_example = (
    "\n"
    "Extract the most relevant themes from the following:\n"
    "\n"
    '"{text}"\n'
    "\n"
    "THEMES:\n"
)

# Prompt for each later chunk: update the list in {existing_answer} using the
# new chunk in {text}. Note there is no newline after the final "LIST:".
refine_template_example = (
    "\n"
    "Your job is to extract the most relevant themes\n"
    "We have provided you with the list of themes up to a certain point: {existing_answer}\n"
    "We have the opportunity to refine the existing list(only if needed) with some more context bellow:\n"
    "\n"
    "----------\n"
    "{text}\n"
    "----------\n"
    "\n"
    "Given the new context, refine the original list\n"
    "If the context isn't useful, return the original list, and ONLY the original list.\n"
    "\n"
    "Return that list as a comma separated list.\n"
    "\n"
    "LIST:"
)

# Turn the two custom templates into prompt objects.
question_prompt = PromptTemplate.from_template(initial_template_example)
refine_prompt = PromptTemplate.from_template(refine_template_example)

# Refine chain using the custom prompts for the first pass and the refinement passes.
chain = load_summarize_chain(
    llm,
    chain_type="refine",
    question_prompt=question_prompt,
    refine_prompt=refine_prompt,
    verbose=True,
)

chain.run(sl_data[:20])



[1m> Entering new RefineDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
Extract the most relevant themes from the following:

"Springer Series in Statistics
Trevor Hastie
Robert TibshiraniJerome FriedmanSpringer Series in Statistics
The Elements of
Statistical Learning
Data Mining, Inference, and Prediction
The Elements of Statistical LearningDuring the past decade there has been an explosion in computation and information tech-
nology. With it have come vast amounts of data in a variety of fields such as medicine, biolo-gy, finance, and marketing. The challenge of understanding these data has led to the devel-opment of new tools in the field of statistics, and spawned new areas such as data mining,machine learning, and bioinformatics. Many of these tools have common underpinnings butare often expressed with different terminology. This book describes the important ideas inthese areas in a common conceptual framework. Whil

'Computation and information technology, Data analysis and understanding, Statistical concepts and frameworks, Broad coverage of topics, Prominent researchers and their contributions, Updated edition, Practical applications, Use of examples and graphics'