In [1]:
from langchain.docstore.document import Document
from langchain.text_splitter import CharacterTextSplitter

In [9]:
from langchain import PromptTemplate

In [2]:
from langchain_google_genai import ChatGoogleGenerativeAI

In [12]:
llm=ChatGoogleGenerativeAI(
    model="gemini-1.5-flash",
    temperature=0.2
)

#### Convert content of url as a document object

In [3]:
from PyPDF2 import PdfReader
pdfreader=PdfReader('Data_Scientist_Resume_John_Doe.pdf')

#### Useful only when iska size <less than token limit

In [4]:
from typing_extensions import Concatenate
text=''
for i,page in enumerate(pdfreader.pages):
    content=page.extract_text()
    if content:
        text+=content

In [5]:
text

'John Doe - Data Scientist\nContact Information\nEmail: johndoe@example.com | Phone: +123-456-7890 | Location: New York, NY\nProfessional Summary\nData Scientist with 4+ years of experience in leveraging data-driven approaches to solve complex\nbusiness problems. Proficient in statistical analysis, predictive modeling, and machine learning.\nStrong expertise in Python, SQL, and data visualization with a proven track record of deriving\nactionable insights from large datasets.\nSkills\n- Programming: Python, R, SQL\n- Machine Learning: Scikit-learn, TensorFlow, Keras\n- Data Analysis: Pandas, NumPy, SciPy\n- Data Visualization: Matplotlib, Seaborn, Tableau\n- Big Data: Hadoop, Spark\n- Cloud Platforms: AWS, Azure\n- Version Control: Git\nWork Experience\nData Scientist | XYZ Tech Solutions, New York, NY | Jan 2021 - Present\n- Developed and deployed machine learning models that increased prediction accuracy by 15%.\n- Conducted exploratory data analysis on large datasets to identify tre

In [6]:
# conver this text to document
docs=[Document(page_content=text)]

#### All the text is now pushed into a single document

In [7]:
from dotenv import load_dotenv
load_dotenv()

True

In [8]:
from langchain.chains.summarize import load_summarize_chain

In [11]:
template='''
Write a concise and short summary of the followin resume
{text}
'''
prompt=PromptTemplate(
    input_variables=["text"],
    template=template
)

In [13]:
chain=load_summarize_chain(
    llm,
    chain_type='stuff',
    prompt=prompt,
    verbose=True
)

In [14]:
output_summary=chain.run(docs)

  warn_deprecated(




[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
Write a concise and short summary of the followin resume
John Doe - Data Scientist
Contact Information
Email: johndoe@example.com | Phone: +123-456-7890 | Location: New York, NY
Professional Summary
Data Scientist with 4+ years of experience in leveraging data-driven approaches to solve complex
business problems. Proficient in statistical analysis, predictive modeling, and machine learning.
Strong expertise in Python, SQL, and data visualization with a proven track record of deriving
actionable insights from large datasets.
Skills
- Programming: Python, R, SQL
- Machine Learning: Scikit-learn, TensorFlow, Keras
- Data Analysis: Pandas, NumPy, SciPy
- Data Visualization: Matplotlib, Seaborn, Tableau
- Big Data: Hadoop, Spark
- Cloud Platforms: AWS, Azure
- Version Control: Git
Work Experience
Data Scientist | XYZ Tech Solutions, New York, NY | Jan 2021 

In [15]:
output_summary

"John Doe is a data scientist with 4+ years of experience in using data to solve business problems. He is proficient in Python, SQL, and data visualization, and has a strong track record of deriving actionable insights from large datasets. He has experience in machine learning, big data, and cloud platforms, and has successfully implemented AI-driven solutions in previous roles. His skills include data analysis, predictive modeling, and ETL automation. He holds a Master's degree in Data Science and is certified in data science and machine learning. \n"

##### Map reduce technique

In [16]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [17]:
text

'John Doe - Data Scientist\nContact Information\nEmail: johndoe@example.com | Phone: +123-456-7890 | Location: New York, NY\nProfessional Summary\nData Scientist with 4+ years of experience in leveraging data-driven approaches to solve complex\nbusiness problems. Proficient in statistical analysis, predictive modeling, and machine learning.\nStrong expertise in Python, SQL, and data visualization with a proven track record of deriving\nactionable insights from large datasets.\nSkills\n- Programming: Python, R, SQL\n- Machine Learning: Scikit-learn, TensorFlow, Keras\n- Data Analysis: Pandas, NumPy, SciPy\n- Data Visualization: Matplotlib, Seaborn, Tableau\n- Big Data: Hadoop, Spark\n- Cloud Platforms: AWS, Azure\n- Version Control: Git\nWork Experience\nData Scientist | XYZ Tech Solutions, New York, NY | Jan 2021 - Present\n- Developed and deployed machine learning models that increased prediction accuracy by 15%.\n- Conducted exploratory data analysis on large datasets to identify tre

In [18]:
llm.get_num_tokens(text)

419

In [19]:
text_splitter=RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=20
)
chunks=text_splitter.create_documents([text])

In [20]:
chunks

[Document(page_content='John Doe - Data Scientist\nContact Information\nEmail: johndoe@example.com | Phone: +123-456-7890 | Location: New York, NY\nProfessional Summary\nData Scientist with 4+ years of experience in leveraging data-driven approaches to solve complex\nbusiness problems. Proficient in statistical analysis, predictive modeling, and machine learning.\nStrong expertise in Python, SQL, and data visualization with a proven track record of deriving\nactionable insights from large datasets.\nSkills\n- Programming: Python, R, SQL\n- Machine Learning: Scikit-learn, TensorFlow, Keras\n- Data Analysis: Pandas, NumPy, SciPy\n- Data Visualization: Matplotlib, Seaborn, Tableau\n- Big Data: Hadoop, Spark\n- Cloud Platforms: AWS, Azure\n- Version Control: Git\nWork Experience\nData Scientist | XYZ Tech Solutions, New York, NY | Jan 2021 - Present\n- Developed and deployed machine learning models that increased prediction accuracy by 15%.'),
 Document(page_content='- Conducted explorator

In [21]:
len(chunks)

2

##### Two chunks were created here when chunk sze of 10k used


In [22]:
chain=load_summarize_chain(
    llm,
    chain_type='map_reduce',
    verbose=True
)
summary=chain.run(chunks)



[1m> Entering new MapReduceDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mWrite a concise summary of the following:


"John Doe - Data Scientist
Contact Information
Email: johndoe@example.com | Phone: +123-456-7890 | Location: New York, NY
Professional Summary
Data Scientist with 4+ years of experience in leveraging data-driven approaches to solve complex
business problems. Proficient in statistical analysis, predictive modeling, and machine learning.
Strong expertise in Python, SQL, and data visualization with a proven track record of deriving
actionable insights from large datasets.
Skills
- Programming: Python, R, SQL
- Machine Learning: Scikit-learn, TensorFlow, Keras
- Data Analysis: Pandas, NumPy, SciPy
- Data Visualization: Matplotlib, Seaborn, Tableau
- Big Data: Hadoop, Spark
- Cloud Platforms: AWS, Azure
- Version Control: Git
Work Experience
Data Scientist | XYZ Tech Solutions, New York, NY | Jan 2021 - Present

In [23]:
summary


"John Doe is a data scientist with 4+ years of experience in using data to solve business problems. He has a Master's in Data Science and is proficient in Python, SQL, and machine learning. He has a proven track record of deriving actionable insights from data, improving prediction accuracy, and driving business outcomes. His accomplishments include a 20% increase in marketing ROI and a 25% improvement in reporting efficiency. \n"

##### Map reduce With Custom Prompts

- > We converted document to chunks and gave every chunk to the LLM Model
- > While giving we can develop our own prompt too

In [24]:
chunks_prompt="""
Please summarize the below speech:
Speech ` {text}`
Summary:
"""
map_prompt_template=PromptTemplate(
    input_variables=['text'],
    template=chunks_prompt
)

In [29]:
final_combine_prompt='''
Provide a final summary of the entire resume with these important points.
Add a Generic Resume Title,
Start the precise summary with an introduction and provide the
summary in number points for the resume.
Speech: `{text}`
'''
final_combine_prompt_template=PromptTemplate(input_variables=['text'],
                                             template=final_combine_prompt)

In [30]:
summary_chain = load_summarize_chain(
    llm=llm,
    chain_type='map_reduce',
    map_prompt=map_prompt_template,
    combine_prompt=final_combine_prompt_template,
    verbose=True
)
output = summary_chain.run(chunks)



[1m> Entering new MapReduceDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
Please summarize the below speech:
Speech ` John Doe - Data Scientist
Contact Information
Email: johndoe@example.com | Phone: +123-456-7890 | Location: New York, NY
Professional Summary
Data Scientist with 4+ years of experience in leveraging data-driven approaches to solve complex
business problems. Proficient in statistical analysis, predictive modeling, and machine learning.
Strong expertise in Python, SQL, and data visualization with a proven track record of deriving
actionable insights from large datasets.
Skills
- Programming: Python, R, SQL
- Machine Learning: Scikit-learn, TensorFlow, Keras
- Data Analysis: Pandas, NumPy, SciPy
- Data Visualization: Matplotlib, Seaborn, Tableau
- Big Data: Hadoop, Spark
- Cloud Platforms: AWS, Azure
- Version Control: Git
Work Experience
Data Scientist | XYZ Tech Solutions, New York, NY | Jan 2021 - Present

In [31]:
output

'## Data Scientist Resume\n\n**Summary:**\n\nA highly motivated and results-oriented Data Scientist with over 4 years of experience leveraging data to solve complex business challenges. Proven ability to extract actionable insights from large datasets, develop and deploy machine learning models, and drive impactful business outcomes. Expertise in:\n\n* **Data Analysis & Machine Learning:** Skilled in Python, SQL, and data visualization techniques. Developed and deployed machine learning models that improved prediction accuracy by 15%.\n* **Data Automation & AI Integration:** Experience in automating data processes and integrating AI solutions to enhance efficiency and accuracy.\n* **Data Visualization & Reporting:** Proficient in creating compelling data visualizations and reports to communicate insights effectively to stakeholders.\n* **Business Impact:** Demonstrated success in driving business outcomes, including increasing ROI by 20% and improving reporting efficiency by 25%.\n* **

- > Refine chain for summarization
- > Firstly give chunk1 uski summary get then u give chunk1, chunk2 , then cbunk 1 2 3 and so on

In [32]:
chain=load_summarize_chain(
    llm=llm,
    chain_type='refine',
    verbose=True
)
output_chain=chain.run(chunks)



[1m> Entering new RefineDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mWrite a concise summary of the following:


"John Doe - Data Scientist
Contact Information
Email: johndoe@example.com | Phone: +123-456-7890 | Location: New York, NY
Professional Summary
Data Scientist with 4+ years of experience in leveraging data-driven approaches to solve complex
business problems. Proficient in statistical analysis, predictive modeling, and machine learning.
Strong expertise in Python, SQL, and data visualization with a proven track record of deriving
actionable insights from large datasets.
Skills
- Programming: Python, R, SQL
- Machine Learning: Scikit-learn, TensorFlow, Keras
- Data Analysis: Pandas, NumPy, SciPy
- Data Visualization: Matplotlib, Seaborn, Tableau
- Big Data: Hadoop, Spark
- Cloud Platforms: AWS, Azure
- Version Control: Git
Work Experience
Data Scientist | XYZ Tech Solutions, New York, NY | Jan 2021 - Present
- 

In [33]:
output_chain

'John Doe is a data scientist with 4+ years of experience in using data to solve business problems. He is proficient in Python, SQL, and data visualization, and has a proven track record of deriving actionable insights from large datasets. He has experience with machine learning, big data, and cloud platforms, and has successfully developed and deployed models that improved prediction accuracy. \n\nJohn has a strong track record of success in analyzing and visualizing data to drive business decisions. He has conducted exploratory data analysis to identify trends and insights, automated ETL processes to improve efficiency, and collaborated with cross-functional teams to integrate AI-driven solutions. His work at ABC Data Corp resulted in a 20% increase in ROI through improved marketing strategy and a 25% improvement in reporting efficiency through streamlined data collection and processing. \n\nJohn holds a Master of Science in Data Science from the University of XYZ and a Bachelor of S