# Installs

In [1]:
!pip install -r req.txt

Collecting cassandra-driver (from -r req.txt (line 1))
  Downloading cassandra_driver-3.29.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.9/18.9 MB[0m [31m24.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain (from -r req.txt (line 2))
  Downloading langchain-0.2.6-py3-none-any.whl (975 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m975.5/975.5 kB[0m [31m62.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting cohere (from -r req.txt (line 3))
  Downloading cohere-5.5.8-py3-none-any.whl (173 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m173.8/173.8 kB[0m [31m23.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pypdf (from -r req.txt (line 4))
  Downloading pypdf-4.2.0-py3-none-any.whl (290 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.4/290.4 kB[0m [31m38.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting cassio>=0.1.1 (

# Importing Cassandra

In [2]:
import cassandra
print (cassandra.__version__)

3.29.1


Here it is connecting to a Cassandra database hosted on Datastax Astra :


*   Cluster and PlainTextAuthProvider are classes from the cassandra library used to connect to a Cassandra database.
*   cloud_config is a dictionary containing the path to the secure connect bundle. This bundle is a ZIP file provided by Datastax Astra to securely connect to their managed Cassandra database.
*  The JSON file that contains the credentials is loaded into a dictionary called secrets.
*  The clientId and secret are extracted from the secrets dictionary, which are necessary for authentication.
*  Then we create a Cluster object with the cloud configuration and authentication provider, and then establishes a session with the Cassandra cluster.




In [3]:
# Load environment variables from .env file
from dotenv import load_dotenv
import os
load_dotenv()

True

In [4]:
CLIENT_ID = os.getenv("CLIENT_ID")
CLIENT_SECRET = os.getenv("CLIENT_SECRET")
COHERE_API_KEY = os.getenv("COHERE_API_KEY")
ASTRA_DB_TOKEN = os.getenv("ASTRA_DB_TOKEN")
ASTRA_DB_ENDPOINT = os.getenv("ASTRA_DB_ENDPOINT")

In [5]:
from cassandra.cluster import Cluster
from cassandra.auth import PlainTextAuthProvider
import json

cloud_config= {
  'secure_connect_bundle': 'secure-connect-sumeet.zip'
}

with open("sumeet-token.json") as f:
    secrets = json.load(f)

CLIENT_ID = secrets["clientId"]
CLIENT_SECRET = secrets["secret"]

auth_provider = PlainTextAuthProvider(CLIENT_ID, CLIENT_SECRET)
cluster = Cluster(cloud=cloud_config, auth_provider=auth_provider)
session = cluster.connect()

row = session.execute("select release_version from system.local").one()
if row:
  print(row[0])
else:
  print("An error occurred.")

ERROR:cassandra.connection:Closing connection <LibevConnection(137198247292080) 4a1c36e1-80dc-415c-bbba-3d05e2197013-us-east1.db.astra.datastax.com:29042:3b8afe0a-9b27-48ef-af55-d95b0aef8e45> due to protocol error: Error from server: code=000a [Protocol error] message="Beta version of the protocol used (5/v5-beta), but USE_BETA flag is unset"


4.0.11-09ec37c912ed


# Import

In [6]:
from langchain.llms import Cohere
from langchain.embeddings import CohereEmbeddings
from langchain.vectorstores.cassandra import Cassandra
from langchain.indexes import VectorstoreIndexCreator
from langchain.text_splitter import (
    CharacterTextSplitter,
    RecursiveCharacterTextSplitter,
)
from langchain.docstore.document import Document
from langchain.document_loaders import TextLoader, PyPDFLoader


In [7]:
os.environ['COHERE_API_KEY'] = COHERE_API_KEY


# Initialization

Now we set up a Cohere language model for text generation and a Cohere embeddings model for converting text into numerical vectors.

In [8]:
from langchain.llms import Cohere
from langchain.embeddings import CohereEmbeddings

llm = Cohere(model="command-r-plus", temperature=0)
cohere_embeddings = CohereEmbeddings()


  warn_deprecated(
  warn_deprecated(


Here I am basically setting up a system to index and store vectorized text data in a Cassandra database using embeddings generated by Cohere. It splits the text into chunks of 400 characters with an overlap of 30 characters, converts these chunks into embeddings, and stores them in a specified table and keyspace in Cassandra. This allows for efficient querying and retrieval of relevant text data based on vector similarity.

In [9]:
table_name = 'pdf_q_n_a_table_1'
keyspace = "default_keyspace"

index_creator = VectorstoreIndexCreator(
    vectorstore_cls = Cassandra,
    embedding = cohere_embeddings,
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size = 400,
        chunk_overlap = 30,
    ),
    vectorstore_kwargs={
        'session': session,
        'keyspace': keyspace,
        'table_name': table_name,
    },
)


# Loading PDF

In [10]:
file_name = input("Enter the PDF file name: ")
loader = PyPDFLoader(file_name)
pages = loader.load_and_split()


Enter the PDF file name: test.pdf


In [11]:
len(pages)

17

In [12]:
pages[1]

Document(metadata={'source': 'test.pdf', 'page': 1}, page_content='The EUROCALL Review,  Volume 25, No. 2, September 2017  \n \n 19 namely a research question, description of participants, data collection tools and \nanalysis. This is followe d by the presentation of the results of the study. The article \ncloses with discussion and conclusions.  \n2. Literature review  \n2.1. Autonomy in foreign/second language learning  \nThe concept of autonomy in second/foreign language learning and teaching has been \nthe focus of attention for many researchers and practitioners for more than three \ndecades. According to Benson (2001), the notion of autonomy was introduced and \npopularized in 1981 by Henri Holec in his seminal report for the Council of Europe \nentitled  Autonomy in Foreign Language Learning  in which the researcher defined \nautonomy in the context of language learning as “the ability to take charge of one’s own \nlearning” (Holec, 1981, p. 3). Holec’s idea of autonomy encompas

# Load to Index

Here I will create an index for the documents loaded by the specified loader. This index can then be used for various purposes, such as searching for specific content within the documents.

* pdf_index will be the created index that can be queried to retrieve information from the PDF documents loaded by the loader.

In [13]:
pdf_index = index_creator.from_loaders([loader])

Here I will retrieve all rows from the specified Cassandra table and prints some of the contents of each row, including the row_id, a truncated version of the embedding_vector, a truncated version of the body_blob, and the metadata_blob. This is useful for debugging or inspecting the contents of the table.

In [14]:
default_query = f'SELECT * FROM {keyspace}.{table_name}'

rows = session.execute(default_query)

for row_i, row in enumerate(rows):
    print(f'\nRow {row_i}:')
    print(f'row_id: {row.row_id}')
    print(f'embedding_vector: {str(row.vector)[:64]} ...')
    print(f'body_blob: {row.body_blob[:64]} ...')
    print(f'metadata_blob: {row.metadata_s}')

print('\n...')

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
embedding_vector: [-0.1748046875, 2.138671875, 1.8408203125, -3.240234375, 1.77636 ...
body_blob: the
r egular
meetings
pla y ed
a
crucial
r ole
in
k eeping
me
al ...
metadata_blob: {'page': '2.0', 'source': 'temp.pdf'}

Row 1568:
row_id: f371b37262c7480fad153ac8366fcf29
embedding_vector: [-1.85546875, 2.6015625, 1.7607421875, -1.4208984375, -0.1636962 ...
body_blob: interest among researchers in recent years (Byrne & Diem, 2014). ...
metadata_blob: {'page': '0.0', 'source': 'temp.pdf'}

Row 1569:
row_id: a9595b3eab4347f486b81638f2b53c16
embedding_vector: [-0.919921875, 2.09375, 0.2724609375, -0.70947265625, 1.52050781 ...
body_blob: instance, the use of chunks of spare time for language practice, ...
metadata_blob: {'page': '2.0', 'source': 'temp.pdf'}

Row 1570:
row_id: a0ff1a4c1b394372b269e3ab86a95080
embedding_vector: [-0.5400390625, 0.246337890625, 0.453857421875, -1.7216796875, 1 ...
body_blob: since they were acces

# Asking questions to PDF

In [15]:
llm.model = 'command-r-plus'
query_1 = "What is 1+1?"
pdf_index.query_with_sources(query_1, llm=llm)



{'question': 'What is 1+1?', 'answer': "I don't know.\n", 'sources': ''}

In [16]:
llm.model = 'command-r-plus'
query_2 = "What roles did Sumeet play?"
pdf_index.query_with_sources(query_2, llm=llm)



{'question': 'What roles did Sumeet play?',
 'answer': 'Sumeet played a variety of roles in the event, including helping to design the poster and LinkedIn posts, coordinating with the team to manage the crowd, ensuring there were no technical problems, making feedback forms, and helping people with doubts.\n\n',
 'sources': 'temp.pdf'}

In [17]:
llm.model = 'command-r-plus'
query_3 = "How does Benson (2011) describe the relationship between educational technologies and learner autonomy?"
pdf_index.query_with_sources(query_3, llm=llm)



{'question': 'How does Benson (2011) describe the relationship between educational technologies and learner autonomy?',
 'answer': 'Benson (2011) describes the relationship between educational technologies and learner autonomy as a connection that has always existed, with educational technologies often being intended for independent practice.\n',
 'sources': 'temp.pdf, test.pdf'}

In [18]:
llm.model = 'command-r-plus'
query_4 = "Why should language teachers equip learners with knowledge about mobile devices?"
pdf_index.query_with_sources(query_4, llm=llm)



{'question': 'Why should language teachers equip learners with knowledge about mobile devices?',
 'answer': "Language teachers should equip learners with knowledge about mobile devices because these devices are ubiquitous and substantial constituents of almost every language learner's everyday life.\n",
 'sources': 'test.pdf, temp.pdf'}

In [19]:
llm.model = 'command-r-plus'
query_5 = "What does Reinders (2011) suggest about the continuum of autonomy in language learning?"
pdf_index.query_with_sources(query_5, llm=llm)




{'question': 'What does Reinders (2011) suggest about the continuum of autonomy in language learning?',
 'answer': 'Reinders (2011) suggests that autonomy in language learning is not an either-or concept, but a continuum. This means that a learner can display varying levels of autonomy in different learning situations, and that autonomy can change over time within and between skills.\n',
 'sources': 'test.pdf, temp.pdf'}

In [20]:
llm.model = 'command-r-plus'
query_6 = "What is 1+1?"
pdf_index.query_with_sources(query_6, llm=llm)




{'question': 'What is 1+1?', 'answer': "I don't know.\n", 'sources': ''}

In [21]:
llm.model = 'command-r-plus'
query_7 = "What new features did you work on in Swipe Pages?"
pdf_index.query_with_sources(query_7, llm=llm)




{'question': 'What new features did you work on in Swipe Pages?',
 'answer': 'The new features worked on in Swipe Pages include a sliding testimonial feature, a form feature, and integration with Zoho and Razorpay.\n\n',
 'sources': 'temp.pdf, Sumeet.pdf'}

In [22]:
llm.model = 'command-r-plus'
query_8 = "Please give the summary of test.pdf?"
pdf_index.query_with_sources(query_8, llm=llm)



{'question': 'Please give the summary of test.pdf?',
 'answer': "The document is a performance appraisal self-evaluation template for an intern named Sumeet Prusty, who worked as a marketing analyst intern from January 14, 2024, to April 30, 2024. Sumeet's tasks during the internship included researching data on individuals seeking higher education, particularly those preparing for visa interviews in the USA, EU, and other countries.\n\n",
 'sources': 'temp.pdf'}

In [23]:
!pip install pymupdf
import fitz
import cohere
import time


Collecting pymupdf
  Downloading PyMuPDF-1.24.7-cp310-none-manylinux2014_x86_64.whl (3.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m29.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting PyMuPDFb==1.24.6 (from pymupdf)
  Downloading PyMuPDFb-1.24.6-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (15.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.7/15.7 MB[0m [31m77.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDFb, pymupdf
Successfully installed PyMuPDFb-1.24.6 pymupdf-1.24.7


In [25]:
api_key = COHERE_API_KEY
co = cohere.Client(api_key)

In [26]:
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = ""
    for page_num in range(doc.page_count):
        page = doc.load_page(page_num)
        text += page.get_text("text")
    return text


In [27]:
def chunk_text(text, max_chunk_size=3000):
    words = text.split()
    chunks = []
    current_chunk = []
    current_length = 0

    for word in words:
        current_length += len(word) + 1
        if current_length > max_chunk_size:
            chunks.append(" ".join(current_chunk))
            current_chunk = [word]
            current_length = len(word) + 1
        else:
            current_chunk.append(word)

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks

In [28]:
def summarize_text_with_cohere(text):
    chunks = chunk_text(text)
    summary = ""
    for chunk in chunks:
        response = co.summarize(text=chunk, length='medium')
        summary += response.summary + " "
        time.sleep(12)
    return summary.strip()


In [30]:
pdf_files = ["test.pdf"]
summaries = {}

for pdf_file in pdf_files:
    text = extract_text_from_pdf(pdf_file)
    summary = summarize_text_with_cohere(text)
    summaries[pdf_file] = summary

for pdf, summary in summaries.items():
    print(f"Summary for {pdf}:\n{summary}\n")

Summary for test.pdf:
The study explores the use of mobile devices by advanced learners of English. The data, collected from 20 students by means of a semi-structured interview, were subjected to qualitative and quantitative analysis. The results show that, while some students are aware of the benefits of mobile devices for language learning and can retrieve necessary information to adjust their learning to their personal learning styles, others use their mobile devices rather intuitively and/or ad hoc in the classroom. The paper discusses the implications for research and teaching. This study investigates the use of mobile devices (smartphones and tablets) in the context of English language learning. Specifically, it focuses on the ways in which advanced English language students use their mobile devices for their learning. The study is based on the concept of autonomy in language learning, which suggests that learners are able to take control of their own learning. The study found th