In [13]:
import os

from dotenv import load_dotenv

# Pull variables from a local .env file (if present) into the process environment.
load_dotenv()

# Gemini credential used by the chat and embedding models in later cells.
api_key = os.getenv("GEMINI_API_KEY")

# Report only whether the key was found: printing the key itself stores the
# secret in the notebook's saved output.
print("GEMINI_API_KEY loaded:", api_key is not None)

[REDACTED: API key removed from saved output]


### 1. Initialize the model

In [14]:
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings

# Gemini chat model reused by the later cells; `api_key` was read from the
# environment earlier in the notebook.
llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash", api_key=api_key)

In [15]:
# Send a one-off prompt to the chat model and print the text of its reply.
response = llm.invoke("tell me a joke")
print(response.content)

Why don't scientists trust atoms?

Because they make up everything!


### Load the doc

In [16]:
import os

from langchain_community.document_loaders import PyPDFLoader

# Location of the resume PDF. Set the RESUME_PDF_PATH environment variable to
# run the notebook on another machine; the original absolute path stays the
# default so existing behaviour is unchanged.
resume_pdf_path = os.getenv(
    "RESUME_PDF_PATH",
    r"D:\JMM Internship\M7 - Generative ai\Task 2\Flask RAG App\DATA\Zabih_Resume-6_1_1.pdf",
)

loader = PyPDFLoader(resume_pdf_path)

# Read the PDF into a list of LangChain Document objects.
documents = loader.load()

In [17]:
# Inspect the loaded Document objects (metadata plus extracted page text).
# NOTE(review): the rendered output includes the resume's contact details;
# consider clearing it before sharing the notebook.
documents

[Document(metadata={'producer': 'Microsoft® Word 2016', 'creator': 'Microsoft® Word 2016', 'creationdate': '2024-11-12T18:53:55+05:00', 'author': 'Zabih', 'moddate': '2024-11-12T18:53:55+05:00', 'source': 'D:\\JMM Internship\\M7 - Generative ai\\Task 2\\Flask RAG App\\DATA\\Zabih_Resume-6_1_1.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1'}, page_content='Zabihullah \n03190904793          Zabihullah18381@gmail.com          GitHub          LinkedIn          Kaggle          Portfolio \n \n \nEducation \n \nBachelor of Science in Software Engineering \nAbasyn University Peshawar: CGPA 3.3 \nYear of Graduation: 2024 \nSummery \n \nAI and machine learning engineer with nearly one year of hands-on experience in developing intelligent applications. Successfully \ndeveloped AI chatbots, predictive models, and web applications using advanced technologies like Langchain and Fastapi. Skilled in Python, \ndata analysis, and deploying AI-driven solutions to enhance software capabilities. \n \n

### Making chunks

In [18]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Chunking settings: chunk_size=500 with chunk_overlap=50 between neighbours.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=50,
    chunk_size=500,
)

# Break the loaded documents into retrieval-sized chunks.
docs = text_splitter.split_documents(documents)

In [19]:
# Number of chunks produced by the splitter.
len(docs)

9

### Initialize embedding model

In [20]:

# Embedding model handed to the vector store; it uses the same Gemini API key
# as the chat model.
embeddings = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001",
    google_api_key=api_key,
)

### Setting up the `Weaviate` vector DB

In [21]:
from dotenv import load_dotenv

load_dotenv()

# Weaviate Cloud credentials, read from the environment / .env file.
weaviate_api_key = os.getenv("WEAVIATE_API_KEY")
weaviate_url = os.getenv("WEAVIATE_URL")

# Fail here with a clear message instead of letting a missing value surface
# later as an obscure connection error.
_missing_weaviate_vars = [
    var_name
    for var_name, var_value in (
        ("WEAVIATE_API_KEY", weaviate_api_key),
        ("WEAVIATE_URL", weaviate_url),
    )
    if not var_value
]
if _missing_weaviate_vars:
    raise RuntimeError(
        "Missing environment variable(s): " + ", ".join(_missing_weaviate_vars)
    )

In [26]:
import weaviate
from weaviate.classes.init import Auth

# Build the API-key credentials first, then open the Weaviate Cloud connection.
# skip_init_checks=True is carried over unchanged from the original setup.
weaviate_credentials = Auth.api_key(weaviate_api_key)
client = weaviate.connect_to_weaviate_cloud(
    cluster_url=weaviate_url,
    auth_credentials=weaviate_credentials,
    skip_init_checks=True,
)


In [38]:
from langchain_weaviate.vectorstores import WeaviateVectorStore

# Embed the chunks and store them in Weaviate through the open client.
vector_db = WeaviateVectorStore.from_documents(
    documents=docs,
    embedding=embeddings,
    client=client,
)

In [None]:
# print(vector_db.similarity_search("who is Zabihullah", k=3)[0].page_content)

### Prompt template

In [29]:
from langchain_core.prompts import ChatPromptTemplate

# Prompt with two placeholders: {question} (the user's query) and {context}
# (the retrieved text). Written as adjacent string literals, one per line of
# the prompt, so each line break is explicit.
template = (
    "You are an assistant for question-answering tasks.\n"
    "Use the following pieces of retrieved context to answer the question.\n"
    "If you don't know the answer, just say that you don't know.\n"
    "Question: {question}\n"
    "Context: {context}\n"
    "Answer:\n"
)

prompt = ChatPromptTemplate.from_template(template)

In [30]:
# Import from langchain_core, matching the other langchain_core imports in this
# notebook; the langchain.schema.* paths are legacy re-exports.
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

In [31]:
# Parser placed at the end of the chain so the final result is a plain string.
output_parser=StrOutputParser()

In [32]:
# Expose the vector store as a retriever, passing k=5 as the search setting.
retriever = vector_db.as_retriever(search_kwargs=dict(k=5))


In [33]:

# Run a sample query and print every retrieved document under an
# 80-character rule.
response = retriever.invoke("who is zabih")
for retrieved_doc in response:
    print("_" * 80, retrieved_doc, sep="\n")

________________________________________________________________________________
page_content='Zabihullah 
03190904793          Zabihullah18381@gmail.com          GitHub          LinkedIn          Kaggle          Portfolio 
 
 
Education 
 
Bachelor of Science in Software Engineering 
Abasyn University Peshawar: CGPA 3.3 
Year of Graduation: 2024 
Summery 
 
AI and machine learning engineer with nearly one year of hands-on experience in developing intelligent applications. Successfully' metadata={'creator': 'Microsoft® Word 2016', 'total_pages': 1.0, 'moddate': datetime.datetime(2024, 11, 12, 18, 53, 55, tzinfo=datetime.timezone(datetime.timedelta(seconds=18000))), 'creationdate': datetime.datetime(2024, 11, 12, 18, 53, 55, tzinfo=datetime.timezone(datetime.timedelta(seconds=18000))), 'source': 'D:\\JMM Internship\\M7 - Generative ai\\Task 2\\Flask RAG App\\DATA\\Zabih_Resume-6_1_1.pdf', 'page_label': '1', 'producer': 'Microsoft® Word 2016', 'page': 0.0, 'author': 'Zabih'}
____________

In [34]:
def _format_docs(retrieved_docs):
    """Join the page text of the retrieved documents into one context string."""
    return "\n\n".join(doc.page_content for doc in retrieved_docs)


# RAG pipeline: retrieve -> format context -> fill prompt -> LLM -> plain string.
# The retriever output is piped through _format_docs so the prompt receives only
# the chunk text, not the repr of each Document (metadata, file paths, etc.).
rag_chain = (
    {"context": retriever | _format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | output_parser
)
   

In [35]:
# Run the full chain on a sample question and show the generated answer.
answer = rag_chain.invoke("Who is Zabih?")
print(answer)

Zabihullah is an AI and machine learning engineer with almost one year of experience, holding a Bachelor of Science in Software Engineering from Abasyn University Peshawar with a CGPA of 3.3, expected to graduate in 2024. He has experience as an AI Researcher at DataWars.io and as an Artificial Intelligence Intern at Kairiz Cyber Security.


In [65]:
import os

import nest_asyncio
from dotenv import load_dotenv
from llama_cloud_services import LlamaParse

# Patch asyncio so nested event loops are allowed inside the notebook.
nest_asyncio.apply()

# Read the LlamaParse key from the environment / .env file instead of embedding
# the secret in the notebook source.
load_dotenv()
llama_parser_api = os.getenv("LLAMA_CLOUD_API_KEY")
if not llama_parser_api:
    raise RuntimeError(
        "LLAMA_CLOUD_API_KEY is not set; add it to your .env file or environment."
    )

# PDF to parse. Set CURRICULUM_PDF_PATH to override; the original absolute path
# stays the default.
file_path = os.getenv(
    "CURRICULUM_PDF_PATH",
    r"C:\Users\Zabih\Downloads\New FULL STACK AI CURRICULUM.pdf",
)

# Parse the PDF with premium mode on, requesting markdown output.
parsed_documents = LlamaParse(
    api_key=llama_parser_api,
    premium_mode=True,
    result_type="markdown",
).load_data(file_path)


Started parsing the file under job_id 9d4bb8ba-0f79-4387-888a-76b3219c9374


In [67]:
# Inspect the documents returned by LlamaParse.
parsed_documents

[Document(id_='04470a6f-af26-4d9d-a27d-cdbfc9e378bc', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=MediaResource(embeddings=None, data=None, text='# FULL STACK AI CURRICULUM\n## Batch-4\n### JMM Technologies\n\n## Module 1: Python Fundamentals\n\n- Introduction to Python programming language\n- Variables, Data Types, and Operators\n- Control Flow: Conditionals and Loops\n- Functions, Lambdas, and Scope\n\n## Module 2: Fundamentals of Data Science\n\n- Introduction to Data Science and its applications\n- The Data Science Workflow\n- Exploratory Data Analysis (EDA) with Excel, Python and Pandas\n- Data Visualization with Matplotlib, Seaborn, Plotly, PowerBI,\n- Probability and Statistics for Data Science\n\n## Module 3: Machine Learning\n\n- Introduction to Machine Learning and its applications\n- Supervised Learning: Linear Regression, Logistic Reg

In [71]:
from langchain_core.documents import Document

# Wrap the text of each parsed item in a LangChain Document.
# Note: this rebinds `documents`, which earlier held the loaded resume pages.
documents = []
for parsed_doc in parsed_documents:
    documents.append(Document(page_content=parsed_doc.text))

In [72]:
# Inspect the converted LangChain Document objects.
documents

[Document(metadata={}, page_content='# FULL STACK AI CURRICULUM\n## Batch-4\n### JMM Technologies\n\n## Module 1: Python Fundamentals\n\n- Introduction to Python programming language\n- Variables, Data Types, and Operators\n- Control Flow: Conditionals and Loops\n- Functions, Lambdas, and Scope\n\n## Module 2: Fundamentals of Data Science\n\n- Introduction to Data Science and its applications\n- The Data Science Workflow\n- Exploratory Data Analysis (EDA) with Excel, Python and Pandas\n- Data Visualization with Matplotlib, Seaborn, Plotly, PowerBI,\n- Probability and Statistics for Data Science\n\n## Module 3: Machine Learning\n\n- Introduction to Machine Learning and its applications\n- Supervised Learning: Linear Regression, Logistic Regression, Decision Trees, Random Forests, K-Nearest Neighbors, Support Vector Machines\n- Unsupervised Learning: Clustering, K-Means, PCA, Dimensionality Reduction\n- Model Evaluation, Cross-Validation, Bias-Variance Tradeoff, Overfitting and Underfitt

In [77]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Separator list passed to the splitter: blank line, newline, ". ", space,
# and finally the empty string.
chunk_separators = ["\n\n", "\n", ". ", " ", ""]

text_splitter = RecursiveCharacterTextSplitter(
    separators=chunk_separators,
    chunk_overlap=50,
    chunk_size=500,
)

# Note: this rebinds `docs`, replacing the chunks built from the resume.
docs = text_splitter.split_documents(documents)

In [78]:
# Inspect the chunks produced by the splitter.
docs

[Document(metadata={}, page_content='# FULL STACK AI CURRICULUM\n## Batch-4\n### JMM Technologies\n\n## Module 1: Python Fundamentals\n\n- Introduction to Python programming language\n- Variables, Data Types, and Operators\n- Control Flow: Conditionals and Loops\n- Functions, Lambdas, and Scope\n\n## Module 2: Fundamentals of Data Science'),
 Document(metadata={}, page_content='## Module 2: Fundamentals of Data Science\n\n- Introduction to Data Science and its applications\n- The Data Science Workflow\n- Exploratory Data Analysis (EDA) with Excel, Python and Pandas\n- Data Visualization with Matplotlib, Seaborn, Plotly, PowerBI,\n- Probability and Statistics for Data Science\n\n## Module 3: Machine Learning'),
 Document(metadata={}, page_content='## Module 3: Machine Learning\n\n- Introduction to Machine Learning and its applications\n- Supervised Learning: Linear Regression, Logistic Regression, Decision Trees, Random Forests, K-Nearest Neighbors, Support Vector Machines\n- Unsupervis

## Whisper API

In [21]:
import os

from dotenv import load_dotenv
from groq import Groq

# The Groq key must come from the environment / .env file; a key written into
# the notebook source is exposed to anyone who can read the file.
load_dotenv()
if not os.getenv("GROQ_API_KEY"):
    raise RuntimeError(
        "GROQ_API_KEY is not set; add it to your .env file or environment."
    )

# Groq() is created without arguments and relies on GROQ_API_KEY being set.
# NOTE(review): this rebinds `client`, the name an earlier cell uses for the
# Weaviate connection; consider a distinct name if both are needed together.
client = Groq()



def speech_to_text(audio_path, model="whisper-large-v3"):
    """Transcribe an audio file through the module-level Groq ``client``.

    Args:
        audio_path: Path to the audio file; it is opened in binary mode.
        model: Transcription model name. Defaults to "whisper-large-v3",
            the value that was previously hard-coded.

    Returns:
        The ``text`` attribute of the transcription response.
    """
    # Keep the file open only for the duration of the upload.
    with open(audio_path, "rb") as audio_file:
        transcript = client.audio.transcriptions.create(
            model=model,
            file=audio_file,
        )

    return transcript.text


In [22]:
# Recording to transcribe; the commented-out line is an alternative input file.
audio_path = r"C:\Users\Zabih\Downloads\recorded_audio.wav"
# audio_path = r"C:\Users\Zabih\Downloads\1.mp3"
# Run the transcription and keep the resulting text.
text = speech_to_text(audio_path)

In [23]:
# Show the transcription returned for the recording.
text

" What's one thing you'd never tell me right now?"