# Installing Dependencies

In [1]:
!pip install sentence-transformers PyPDF2  language-tool-python nltk

Collecting sentence-transformers
  Downloading sentence_transformers-3.3.1-py3-none-any.whl.metadata (10 kB)
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting language-tool-python
  Downloading language_tool_python-2.8.1-py3-none-any.whl.metadata (12 kB)
Downloading sentence_transformers-3.3.1-py3-none-any.whl (268 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hDownloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading language_tool_python-2.8.1-py3-none-any.whl (35 kB)
Installing collected packages: PyPDF2, language-tool-python, sentence-transformers
Successfully installed PyPDF2-3.0.1 language-tool-python-2.8.1 sentence-transformers-3.3.1


In [2]:
 # Install the nltk library if you haven't already
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
!pip install faiss-gpu
import faiss


Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2


In [4]:
import PyPDF2
from sentence_transformers import SentenceTransformer


# Data Extraction

In [5]:
def extract_text_with_metadata(pdf_paths):
    paragraphs = []
    metadata = []
    
    for name,pdf_path in pdf_paths:
        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            for page_num, page in enumerate(reader.pages):
                page_text = page.extract_text()
                if page_text:
                    lines = page_text.split('\n')
                    page_paragraphs = '\n'.join(lines) + '\n\n'
                    # Split paragraphs
                    split_paragraphs = page_paragraphs.split('\n\n')
                    split_paragraphs = [p.strip() for p in split_paragraphs if p.strip()]
                    paragraphs.extend(split_paragraphs)
                    # Store metadata for each paragraph
                    metadata.extend([(name, page_num + 2)] * len(split_paragraphs))
    
    return paragraphs, metadata


In [6]:
pdf_paths = [("OS","/kaggle/input/operating-systems/Abraham-Silberschatz-Operating-System-Concepts_removed.pdf")]

In [6]:
pdf_paths = [("Distributed Systems",
              "/kaggle/input/distributed-textbook/Distributed_Systems_trimmed.pdf")]

In [7]:
paragraphs,metadata = extract_text_with_metadata(pdf_paths)

In [8]:
len(paragraphs)

610

# Encoding

In [9]:
model = SentenceTransformer('all-MiniLM-L6-v2')

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [10]:
paragraph_embeddings = model.encode(paragraphs, show_progress_bar=True)

Batches:   0%|          | 0/20 [00:00<?, ?it/s]

In [11]:
import numpy as np
paragraph_embeddings = paragraph_embeddings / np.linalg.norm(paragraph_embeddings, axis=1, keepdims=True)

In [12]:
dimension = paragraph_embeddings.shape[1]
faiss_index = faiss.IndexFlatIP(dimension)
faiss_index.add(paragraph_embeddings)

In [13]:
def query_faiss_index(query, top_k):
    query_embedding = model.encode([query])
    distances, indices = faiss_index.search(query_embedding, top_k)
    return [paragraphs[i] for i in indices[0]]

# Loading LLM Model

In [14]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("/kaggle/input/llama-2/pytorch/7b-chat-hf/1", trust_remote_code=True)
llm_model = AutoModelForCausalLM.from_pretrained("/kaggle/input/llama-2/pytorch/7b-chat-hf/1", trust_remote_code=True,device_map="auto", offload_folder="offload")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



# Generate insight simple

In [15]:
def generate_insight_simple(query, context):
    # Construct a structured prompt with a clear marker
    combined_input = (
        f"\nINSTRUCTIONS:\n"
        f"Answer the users QUESTION concisely using the DOCUMENT text above.\n"
        f"If the DOCUMENT doesn’t contain the facts to answer the QUESTION return NONE\n"
        f"Use simple terms so that the user can understand\n"
        f"DOCUMENT:\n"
        f"{context}"
        f"\nQUESTION:\n"
        f"{query}"        
        f"\nANSWER:\n"
        
    )

    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Encode the prompt
    inputs = tokenizer.encode_plus(
        combined_input,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=4000
    ).to('cuda')

    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']

    # Generate the output using the model
    outputs = llm_model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_new_tokens=500,
        num_return_sequences=1,
        no_repeat_ngram_size=3,
        temperature=0.3,
        top_k=10,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )


    # Decode the generated text
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    answer_start = generated_text.index("ANSWER:") + len("ANSWER:")
    cleaned_output = generated_text[answer_start:].strip()                      
                                        
    


    return generated_text,cleaned_output


# Context cleaning

In [16]:
import re
def clean_context(context):
    # Remove newline characters and replace them with spaces
    cleaned_text = context.replace('\n', ' ')
    
    # Remove any sequences like "DS 4.02 downloaded by UMAG @AM.AMRITA.EDU1.2"
    cleaned_text = re.sub(r'DS \d+\.\d+ downloaded by .*?@.*?\d+\.\d+', '', cleaned_text)
    
    # Remove extra spaces
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
    
    return cleaned_text

# Evaluation Metrics

In [17]:
from sentence_transformers import CrossEncoder
import language_tool_python
# Initialize CrossEncoder models for query relevance and context fidelity
query_relevance_model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
#context_fidelity_model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



In [18]:
def calculate_weighted_context_embedding(query, top_k):
    # Retrieve top-k documents
    results = query_faiss_index(query, top_k)
    
    # Assign weights based on rank
    weights = [1 / (rank + 1) for rank in range(len(results))]  # Example: Exponential decay
    normalized_weights = [w / sum(weights) for w in weights]  # Normalize to sum to 1
    
    # Encode embeddings and compute the weighted average
    doc_embeddings = [model.encode(result, convert_to_tensor=True) for result in results]
    weighted_embedding = sum(weight * embedding for weight, embedding in zip(normalized_weights, doc_embeddings))
    
    return weighted_embedding


In [22]:
import torch
from sentence_transformers import SentenceTransformer, util
import language_tool_python
def evaluation_metrics(query, context, answer):
    # Compute query relevance score
    query_relevance_score = query_relevance_model.predict([(query, answer)])[0]
    context_embedding = calculate_weighted_context_embedding(query, top_k=3)
    answer_embedding = model.encode(answer, convert_to_tensor=True)
    
    # Compute context fidelity score
    #context_fidelity_score = context_fidelity_model.predict([(context, answer)])[0]
    similarity_score_cf = util.pytorch_cos_sim(answer_embedding, context_embedding)
    
    # Normalize scores to percentages using sigmoid
    query_relevance_percentage = torch.sigmoid(torch.tensor(query_relevance_score)).item() * 100
    #context_fidelity_percentage = torch.sigmoid(torch.tensor(context_fidelity_score)).item() * 100
    
    # Grammar checking using LanguageTool
    tool = language_tool_python.LanguageTool('en-US')  
    matches = tool.check(answer)
    
    # Calculate the number of errors
    num_errors = len(matches)
    
    # Calculate the percentage of errors per word
    num_words = len(answer.split())
    error_percentage = (num_errors / num_words) * 100 if num_words > 0 else 0

    return query_relevance_percentage, similarity_score_cf.item()*100, error_percentage


# Testing

Evaluation criteria:
Clarity (3 points): Is the answer clear, unambiguous, and free of jargon?
Conciseness (2 points): Is the answer concise, avoiding unnecessary information?
Relevance (3 points): Does the answer directly address the student’s question?
Grammar & Language (2 points): Is the grammar correct and the language simple enough for students to understand?

# OOPS

# **Q1**


In [102]:
query = "What are the main purposes of an operating system?"
results = query_faiss_index(query, top_k = 3)
para = ""
for result in results:
    para = para + result
#context = extract_relevant_sentences(query, para, similarity_threshold=0.4)
context = para
context = clean_context(context)
insight,answer= generate_insight_simple(query, context)

print("Generated Insight:")
print(insight)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated Insight:

INSTRUCTIONS:
Answer the users QUESTION concisely using the DOCUMENT text above.
If the DOCUMENT doesn’t contain the facts to answer the QUESTION return NONE
Use simple terms so that the user can understand
DOCUMENT:
4 Chapter 1 Introduction user 1user 2user 3 computer hardwareoperating systemsystem and application programscompiler assembler text editor database systemuser n… … Figure 1.1 Abstract view of the components of a computer system. 1.1 What Operating Systems Do We begin our discussion by looking at the operating system’s role in the overall computer system. A computer system can be divided roughly into four components: the hardware, theoperating system, theapplication programs, and the users (Figure 1.1). The hardware —the central processing unit (CPU),t h e memory ,a n dt h e input/output (I/O)devices —provides the basic computing resources for the system. The application programs —such as word processors, spreadsheets, compilers, and Web browsers—deﬁne t

In [107]:
qr,cf,gram = evaluation_metrics(query,context,answer)
print(f"Query relevance Score: {qr}\n"
     f"Context Fidelity Score: {cf}\n"
     f"Error Percentage: {gram:.2f}%")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Query relevance Score: 99.99539852142334
Context Fidelity Score: 74.63601231575012
Error Percentage: 6.86%


In [32]:
qr,cf,gram = evaluation_metricss(query,context,answer)
print(f"Query relevance Score: {qr}\n"
     f"Context Fidelity Score: {cf}\n"
     f"Error Percentage: {gram:.2f}%")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Query relevance Score: 85.43357849121094
Context Fidelity Score: 76.58169269561768
Error Percentage: 4.35%


Overall grade - 8.25/10

# **Q2**

In [108]:

query = "Explain the concept of multiprogramming."
results = query_faiss_index(query, top_k = 3)
para = ""
for result in results:
    para = para + result
#context = extract_relevant_sentences(query, para, similarity_threshold=0.4)
context = para
context = clean_context(context)
insight,answer= generate_insight_simple(query, context)

print("Generated Insight:")
print(insight)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated Insight:

INSTRUCTIONS:
Answer the users QUESTION concisely using the DOCUMENT text above.
If the DOCUMENT doesn’t contain the facts to answer the QUESTION return NONE
Use simple terms so that the user can understand
DOCUMENT:
3CHAPTER Processes Early computers allowed only one program to be executed at a time. This program had complete control of the system and had access to all the system’s resources. In contrast, contemporary computer systems allow multiple pro- grams to be loaded into memory and executed concurrently. This evolution required ﬁrmer control and more compartmentalization of the various pro- grams; and these needs resulted in the notion of a process , which is a program in execution. A process is the unit of work in a modern time-sharing system. The more complex the operating system is, the more it is expected to do on behalf of its users. Although its main concern is the execution of user programs, it also needs to take care of various system tasks that are 

In [109]:
qr,cf,gram = evaluation_metrics(query,context,answer)
print(f"Query relevance Score: {qr}\n"
     f"Context Fidelity Score: {cf}\n"
     f"Error Percentage: {gram:.2f}%")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Query relevance Score: 99.67774748802185
Context Fidelity Score: 63.34573030471802
Error Percentage: 4.05%


In [35]:
qr,cf,gram = evaluation_metricss(query,context,answer)
print(f"Query relevance Score: {qr}\n"
     f"Context Fidelity Score: {cf}\n"
     f"Error Percentage: {gram:.2f}%")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Query relevance Score: 67.99759268760681
Context Fidelity Score: 72.56436944007874
Error Percentage: 29.30%


Overall grade - 7.6/10

# Q3

In [110]:
query = "What is a system call?"
results = query_faiss_index(query, top_k = 3)
para = ""
for result in results:
    para = para + result
#context = extract_relevant_sentences(query, para, similarity_threshold=0.4)
context = para
context = clean_context(context)
insight,answer= generate_insight_simple(query, context)

print("Generated Insight:")
print(insight)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (2048). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.


OutOfMemoryError: CUDA out of memory. Tried to allocate 38.00 MiB. GPU 1 has a total capacity of 14.74 GiB of which 12.12 MiB is free. Process 3105 has 14.73 GiB memory in use. Of the allocated memory 13.98 GiB is allocated by PyTorch, and 638.21 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

# Cache

In [38]:
import torch
import gc
print(torch.cuda.device_count())

# Loop through all GPUs and clear their memory
for i in range(torch.cuda.device_count()):
    torch.cuda.set_device(i)
    torch.cuda.empty_cache()

# Optionally, collect garbage
gc.collect()

print("CUDA memory cache cleared for all GPUs.")


2
CUDA memory cache cleared for all GPUs.


Grading for the answer (out of 10): 9.03/10

In [37]:
import torch
#torch.cuda.set_device(0)
torch.cuda.empty_cache()


In [42]:
qr,cf,gram = evaluation_metrics(query,context,answer)
print(f"Query relevance Score: {qr}\n"
     f"Context Fidelity Score: {cf}\n"
     f"Error Percentage: {gram:.2f}%")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Query relevance Score: 88.19022178649902
Context Fidelity Score: 78.09152603149414
Error Percentage: 5.26%


# Q4

In [114]:
query = "Describe the structure and purpose of a file system."
results = query_faiss_index(query, top_k = 3)
para = ""
for result in results:
    para = para + result
#context = extract_relevant_sentences(query, para, similarity_threshold=0.4)
context = para
context = clean_context(context)
insight,answer= generate_insight_simple(query, context)

print("Generated Insight:")
print(insight)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated Insight:

INSTRUCTIONS:
Answer the users QUESTION concisely using the DOCUMENT text above.
If the DOCUMENT doesn’t contain the facts to answer the QUESTION return NONE
Use simple terms so that the user can understand
DOCUMENT:
Part Four Storage Management Since main memory is usually too small to accommodate all the data and programs permanently, the computer system must provide secondary storage to back up main memory. Modern computer systems use disksas the primary on-line storage me dium for information (both programs and data). The ﬁle system provides the mechanism for on-line storage of and access to both data and programs residing on the disks. A ﬁle is a collection of related information deﬁned by its creator. The ﬁles aremapped by the operating system onto physical devices. Files are normally organized into directories for ease of use. The devices that attach to a computer vary in many aspects. Some devices transfer a character or a bl ock of characters at a time. Som

In [116]:
qr,cf,gram = evaluation_metrics(query,context,answer)
print(f"Query relevance Score: {qr}\n"
     f"Context Fidelity Score: {cf}\n"
     f"Error Percentage: {gram:.2f}%")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Query relevance Score: 99.98220801353455
Context Fidelity Score: 77.71925926208496
Error Percentage: 1.99%


Overall grade - 8.3/10

# Q5

In [117]:
query = "Discuss the key differences between batch systems, time-sharing systems, and real-time systems."
results = query_faiss_index(query, top_k = 2)
para = ""
for result in results:
    para = para + result
#context = extract_relevant_sentences(query, para, similarity_threshold=0.4)
context = para
context = clean_context(context)
insight,answer= generate_insight_simple(query, context)

print("Generated Insight:")
print(insight)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated Insight:

INSTRUCTIONS:
Answer the users QUESTION concisely using the DOCUMENT text above.
If the DOCUMENT doesn’t contain the facts to answer the QUESTION return NONE
Use simple terms so that the user can understand
DOCUMENT:
798 Chapter 18 The Linux System difference between FCFS and round-robin scheduling is that FCFS processes continue to run until they either exit or b lock, whereas a round-robin process will be preempted after a while and will be moved to the end of the scheduling queue, so round-robin processes of equal priority will automatically time-shareamong themselves. Linux’s real-time scheduling is soft—rather than hard—real time. The scheduler offers strict guarantees about the relative priorities of real-time processes, but the kernel does not offer any guarantees about how quicklya real-time process will be scheduled once that process becomes runnable. Incontrast, a hard real-time system can guarantee a minimum latency between when a process becomes runnable

Overall grade - 9.3/10

In [118]:
qr,cf,gram = evaluation_metrics(query,context,answer)
print(f"Query relevance Score: {qr}\n"
     f"Context Fidelity Score: {cf}\n"
     f"Error Percentage: {gram:.2f}%")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Query relevance Score: 99.95736479759216
Context Fidelity Score: 78.96842956542969
Error Percentage: 5.56%


# Q6

In [119]:
query = "How does an operating system ensure memory protection and prevent a process from accessing another process’s memory?"
results = query_faiss_index(query, top_k = 3)
para = ""
for result in results:
    para = para + result
#context = extract_relevant_sentences(query, para, similarity_threshold=0.4)
context = para
context = clean_context(context)
insight,answer= generate_insight_simple(query, context)

print("Generated Insight:")
print(insight)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated Insight:

INSTRUCTIONS:
Answer the users QUESTION concisely using the DOCUMENT text above.
If the DOCUMENT doesn’t contain the facts to answer the QUESTION return NONE
Use simple terms so that the user can understand
DOCUMENT:
14CHAPTER Protection The processes in an operating system must be protected from one another’s activities. To provide such protection, we can use various mechanisms to ensure that only processes that have gained proper authorization from the operating system can operate on the ﬁles, memory segments, CPU, and other resources of a system. Protection refers to a mechanism for controlling the access of programs, processes, or users to the resources deﬁned by a computer system. This mechanism must provide a means for specifying the controls to be imposed, together with a means of enforcemen t. We distinguish between protection and security, which is a measure of conﬁdence that the integrity of a system and its data will be preserved. In this cha pter, we foc

Overall grade - 8.12/10

In [120]:
qr,cf,gram = evaluation_metrics(query,context,answer)
print(f"Query relevance Score: {qr}\n"
     f"Context Fidelity Score: {cf}\n"
     f"Error Percentage: {gram:.2f}%")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Query relevance Score: 99.98562335968018
Context Fidelity Score: 79.14291620254517
Error Percentage: 4.26%


# Q7

In [121]:
query = "Explain the role of interrupts in modern operating systems."
results = query_faiss_index(query, top_k = 3)
para = ""
for result in results:
    para = para + result
#context = extract_relevant_sentences(query, para, similarity_threshold=0.4)
context = para
context = clean_context(context)
insight,answer= generate_insight_simple(query, context)

print("Generated Insight:")
print(insight)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated Insight:

INSTRUCTIONS:
Answer the users QUESTION concisely using the DOCUMENT text above.
If the DOCUMENT doesn’t contain the facts to answer the QUESTION return NONE
Use simple terms so that the user can understand
DOCUMENT:
592 Chapter 13 I/O Systems device driver initiates I/O CPU receiving interrupt, transfers control to interrupt handler CPU resumes processing of interrupted taskCPU 1I/O controller CPU executing checks for interrupts between instructions 5 interrupt handler processes data, returns from interruptinitiates I/O 32 4 7input ready, output complete, or error generates interrupt signal 6 Figure 13.3 Interrupt-driven I/O cycle. 13.2.2 Interrupts The basic interrupt mechanism works as follows. The CPU hardware has a wire called the interrupt-request line that the CPU senses after executing every instruction. When the CPU detects that a controller has asserted a signal on the interrupt-request line, the CPU performs a state save and jumps to the interrupt-handler

Overall grade - 7.27/10

In [122]:
qr,cf,gram = evaluation_metrics(query,context,answer)
print(f"Query relevance Score: {qr}\n"
     f"Context Fidelity Score: {cf}\n"
     f"Error Percentage: {gram:.2f}%")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Query relevance Score: 99.99234676361084
Context Fidelity Score: 77.30997800827026
Error Percentage: 6.92%


# Q8

In [123]:
query = "Analyze the storage hierarchy in operating systems"
results = query_faiss_index(query, top_k = 3)
para = ""
for result in results:
    para = para + result
#context = extract_relevant_sentences(query, para, similarity_threshold=0.4)
context = para
context = clean_context(context)
insight,answer= generate_insight_simple(query, context)

print("Generated Insight:")
print(insight)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated Insight:

INSTRUCTIONS:
Answer the users QUESTION concisely using the DOCUMENT text above.
If the DOCUMENT doesn’t contain the facts to answer the QUESTION return NONE
Use simple terms so that the user can understand
DOCUMENT:
502 Chapter 10 Mass-Storage Structure are opened for reading are read sequentially in their entirety, and most seeks are short. The concept of a storage hierarchy has been studied for more than forty years. For instance, a 1970 paper by [Mattson et al. (1970)] describes amathematical approach to predicting the performance of a storage hierarchy. Bibliography [Kim et al. (2009)] J. Kim, Y. Oh, E. Kim, J. C. D. Lee, and S. Noh, “Disk schedulers for solid state drivers ”(2009), pages 295–304. [Love (2010)] R. Love, Linux Kernel Development, Third Edition, Developer’s Library (2010). [Lumb et al. (2000)] C. Lumb, J. Schindler, G. R. Ganger, D. F. Nagle, and E. Riedel, “Towards Higher Disk Head Utilization: Extracting Free Bandwidth From Busy Disk Drives ”,S

Overall grade - 8.7 / 10

In [124]:
qr,cf,gram = evaluation_metrics(query,context,answer)
print(f"Query relevance Score: {qr}\n"
     f"Context Fidelity Score: {cf}\n"
     f"Error Percentage: {gram:.2f}%")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Query relevance Score: 99.9552071094513
Context Fidelity Score: 78.62392663955688
Error Percentage: 7.79%


# Q9

In [125]:
query = "What challenges do operating systems face in distributed environments?"
results = query_faiss_index(query, top_k = 3)
para = ""
for result in results:
    para = para + result
#context = extract_relevant_sentences(query, para, similarity_threshold=0.4)
context = para
context = clean_context(context)
insight,answer= generate_insight_simple(query, context)

print("Generated Insight:")
print(insight)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated Insight:

INSTRUCTIONS:
Answer the users QUESTION concisely using the DOCUMENT text above.
If the DOCUMENT doesn’t contain the facts to answer the QUESTION return NONE
Use simple terms so that the user can understand
DOCUMENT:
48 Chapter 1 Introduction THE STUDY OF OPERATING SYSTEMS There has never been a more interesting time to study operating systems, and it has never been easier. The open-source movement has overtaken operating systems, causing many of them to be made available in both source and binary (executable) format. The list of operating systems available in both formats includes Linux, BSD UNIX , Solaris, and part of Mac OS X . The availability of source code allows us to study operating systems from the inside out. Questions that we could once answer only by looking at documentation or the behavior of an operating system we can now answer by examining the code itself. Operating systems that are no longer commercially viable have been open-sourced as well, enabli

Overall grade - 8.5/10

In [126]:
qr,cf,gram = evaluation_metrics(query,context,answer)
print(f"Query relevance Score: {qr}\n"
     f"Context Fidelity Score: {cf}\n"
     f"Error Percentage: {gram:.2f}%")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Query relevance Score: 99.99728202819824
Context Fidelity Score: 62.924814224243164
Error Percentage: 5.69%


# Q10

In [163]:
query = "what is a deadlock"
results = query_faiss_index(query, top_k = 2)
para = ""
for result in results:
    para = para + result
#context = extract_relevant_sentences(query, para, similarity_threshold=0.4)
context = para
context = clean_context(context)
insight,answer= generate_insight_simple(query, context)

print("Generated Insight:")
print(insight)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated Insight:

INSTRUCTIONS:
Answer the users QUESTION concisely using the DOCUMENT text above.
If the DOCUMENT doesn’t contain the facts to answer the QUESTION return NONE
Use simple terms so that the user can understand
DOCUMENT:
Practice Exercises 339 7.8 Summary A deadlocked state occurs when two or more processes are waiting indeﬁnitely for an event that can be caused only by one of the waiting processes. There arethree principal methods for dealing with deadlocks: •Use some protocol to prevent or avoid deadlocks, ensuring that the system will never enter a deadlocked state. •Allow the system to enter a deadlocked state, detect it, and then recover. •Ignore the problem altogether and pre tend that deadlocks never occur in the system. The third solution is the one used by most operating systems, including Linux and Windows. A deadlock can occur only if four necessary conditions hold simultaneously in the system: mutual exclusion, hold and wait, no preemption, and circularwait.

Overall grade - 9/10

In [164]:
qr,cf,gram = evaluation_metrics(query,context,answer)
print(f"Query relevance Score: {qr}\n"
     f"Context Fidelity Score: {cf}\n"
     f"Error Percentage: {gram:.2f}%")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Query relevance Score: 99.99488592147827
Context Fidelity Score: 76.21306777000427
Error Percentage: 3.16%


# Q11

In [18]:
query = "What are the key differences between symmetric multiprocessing (SMP) and asymmetric multiprocessing (AMP)?"
results = query_faiss_index(query, top_k = 3)
para = ""
for result in results:
    para = para + result
#context = extract_relevant_sentences(query, para, similarity_threshold=0.4)
context = para
context = clean_context(context)
insight,answer= generate_insight_simple(query, context)

print("Generated Insight:")
print(insight)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (2048). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.


Generated Insight:

INSTRUCTIONS:
Answer the users QUESTION concisely using the DOCUMENT text above.
If the DOCUMENT doesn’t contain the facts to answer the QUESTION return NONE
Use simple terms so that the user can understand
DOCUMENT:
1.3 Computer-System Architecture 15 is restarted. This solution is expensive, since it involves special hardware and considerable hardware duplication. The multiple-processor systems in use today are of two types. Some systems use asymmetric multiprocessing , in which each processor is assigned a speciﬁc task. A boss processor controls the system; the other processors either look to the boss for instruction or have predeﬁned tasks. This scheme deﬁnes a boss–worker relationship. The boss processor schedules and allocates work to the worker processors. The most common systems use symmetric multiprocessing (SMP ),i n which each processor performs all tasks within the operating system. SMP means that all processors are peers; no boss–worker relationship exi

Overall grade - 7.9/10

In [21]:
qr,cf,gram = evaluation_metrics(query,context,answer)
print(f"Query relevance Score: {qr}\n"
     f"Context Fidelity Score: {cf}\n"
     f"Error Percentage: {gram:.2f}%")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading LanguageTool 6.4: 100%|██████████| 246M/246M [00:02<00:00, 90.3MB/s] 


Query relevance Score: 79.13312911987305
Context Fidelity Score: 80.77019453048706
Error Percentage: 13.54%


# Q12

In [22]:
query = "Explain the concept of process synchronization and the need for mutual exclusion."
results = query_faiss_index(query, top_k = 3)
para = ""
for result in results:
    para = para + result
#context = extract_relevant_sentences(query, para, similarity_threshold=0.4)
context = para
context = clean_context(context)
insight,answer= generate_insight_simple(query, context)

print("Generated Insight:")
print(insight)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated Insight:

INSTRUCTIONS:
Answer the users QUESTION concisely using the DOCUMENT text above.
If the DOCUMENT doesn’t contain the facts to answer the QUESTION return NONE
Use simple terms so that the user can understand
DOCUMENT:
258 Chapter 5 Process Synchronization Bibliographical Notes The mutual-exclusion problem was ﬁrst discussed in a classic paper by [Dijkstra (1965)]. Dekker’s algorithm (Exercise 5.8)—the ﬁrst correct softwaresolution to the two-process mutual-exclusion problem—was developed by theDutch mathematician T. Dekker. This algorithm also was discussed by [Dijkstra (1965)]. A simpler solution to the two-process mutual-exclusion problem has since been presented by [Peterson (1981)] (Figure 5.2). The semaphore conceptwas suggested by [Dijkstra (1965)]. The classic process-coordination problems that we have described are paradigms for a large class of concurrency-control problems. The bounded- buffer problem and the dining-philosophers problem were suggested in[Dij

In [29]:
qr,cf,gram = evaluation_metrics(query,context,answer)
print(f"Query relevance Score: {qr}\n"
     f"Context Fidelity Score: {cf}\n"
     f"Error Percentage: {gram:.2f}%")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Query relevance Score: 99.95644688606262
Context Fidelity Score: 73.4193742275238
Error Percentage: 5.97%


# Q13

In [30]:
query = "Describe the structure and operation of a clustered system."
results = query_faiss_index(query, top_k = 3)
para = ""
for result in results:
    para = para + result
#context = extract_relevant_sentences(query, para, similarity_threshold=0.4)
context = para
context = clean_context(context)
insight,answer= generate_insight_simple(query, context)

print("Generated Insight:")
print(insight)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated Insight:

INSTRUCTIONS:
Answer the users QUESTION concisely using the DOCUMENT text above.
If the DOCUMENT doesn’t contain the facts to answer the QUESTION return NONE
Use simple terms so that the user can understand
DOCUMENT:
1.3 Computer-System Architecture 17 1.3.3 Clustered Systems Another type of multiprocessor system is a clustered system , which gathers together multiple CPUs. Clustered systems differ from the multiprocessor systems described in Section 1.3.2 in that they are composed of two or more individual systems—or nodes—joined together. Such systems are considered loosely coupled . Each node may be a single processor system or a multicore system. We should note that the deﬁnition of clustered is not concrete; many commercial packages wrestle to deﬁne a clustered system and why one form is better than another. The generally accepted deﬁnition is that clustered computers share storage and are closely linked via a local-area network LAN (as described in Chapter 17)

In [31]:
qr,cf,gram = evaluation_metrics(query,context,answer)
print(f"Query relevance Score: {qr}\n"
     f"Context Fidelity Score: {cf}\n"
     f"Error Percentage: {gram:.2f}%")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Query relevance Score: 99.09331798553467
Context Fidelity Score: 77.59290933609009
Error Percentage: 24.02%


# Q14

In [32]:
query = "What is the significance of system daemons in operating systems?"
results = query_faiss_index(query, top_k = 3)
para = ""
for result in results:
    para = para + result
#context = extract_relevant_sentences(query, para, similarity_threshold=0.4)
context = para
context = clean_context(context)
insight,answer= generate_insight_simple(query, context)

print("Generated Insight:")
print(insight)


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated Insight:

INSTRUCTIONS:
Answer the users QUESTION concisely using the DOCUMENT text above.
If the DOCUMENT doesn’t contain the facts to answer the QUESTION return NONE
Use simple terms so that the user can understand
DOCUMENT:
8 Chapter 1 Introduction user processexecutingCPU I/O interrupt processing I/O requesttransfer doneI/O requesttransfer doneI/O deviceidle transferring Figure 1.3 Interrupt timeline for a single process doing output. this goal, the bootstrap program must locate the operating-system kernel and load it into memory. Once the kernel is loaded and executin g, it can start providing services to the system and its users. Some services are provided outside of the kernel, bysystem programs that are loaded into memory at boot time to become system processes ,o rsystem daemons that run the entire time the kernel is running. On UNIX , the ﬁrst system process is “init, ”and it starts many other daemons. Once this phase is complete, the system is fully booted, and the

In [33]:
qr,cf,gram = evaluation_metrics(query,context,answer)
print(f"Query relevance Score: {qr}\n"
     f"Context Fidelity Score: {cf}\n"
     f"Error Percentage: {gram:.2f}%")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Query relevance Score: 99.9916672706604
Context Fidelity Score: 68.33714246749878
Error Percentage: 5.08%


# Q15

In [34]:
query = "How does the operating system handle deadlocks?"
results = query_faiss_index(query, top_k = 3)
para = ""
for result in results:
    para = para + result
#context = extract_relevant_sentences(query, para, similarity_threshold=0.4)
context = para
context = clean_context(context)
insight,answer= generate_insight_simple(query, context)

print("Generated Insight:")
print(insight)


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated Insight:

INSTRUCTIONS:
Answer the users QUESTION concisely using the DOCUMENT text above.
If the DOCUMENT doesn’t contain the facts to answer the QUESTION return NONE
Use simple terms so that the user can understand
DOCUMENT:
Practice Exercises 339 7.8 Summary A deadlocked state occurs when two or more processes are waiting indeﬁnitely for an event that can be caused only by one of the waiting processes. There arethree principal methods for dealing with deadlocks: •Use some protocol to prevent or avoid deadlocks, ensuring that the system will never enter a deadlocked state. •Allow the system to enter a deadlocked state, detect it, and then recover. •Ignore the problem altogether and pre tend that deadlocks never occur in the system. The third solution is the one used by most operating systems, including Linux and Windows. A deadlock can occur only if four necessary conditions hold simultaneously in the system: mutual exclusion, hold and wait, no preemption, and circularwait.

In [35]:
qr,cf,gram = evaluation_metrics(query,context,answer)
print(f"Query relevance Score: {qr}\n"
     f"Context Fidelity Score: {cf}\n"
     f"Error Percentage: {gram:.2f}%")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Query relevance Score: 99.99743700027466
Context Fidelity Score: 77.02907919883728
Error Percentage: 4.85%


# DS - Q1

In [43]:
query = "What are the two key technological developments that led to the rise of distributed systems?"
results = query_faiss_index(query, top_k = 3)
para = ""
for result in results:
    para = para + result
#context = extract_relevant_sentences(query, para, similarity_threshold=0.4)
context = para
context = clean_context(context)
insight,answer= generate_insight_simple(query, context)

print("Generated Insight:")
print(insight)


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated Insight:

INSTRUCTIONS:
Answer the users QUESTION concisely using the DOCUMENT text above.
If the DOCUMENT doesn’t contain the facts to answer the QUESTION return NONE
Use simple terms so that the user can understand
DOCUMENT:
1.1. FROM NETWORKED SYSTEMS TO DISTRIBUTED SYSTEMS 3 The size of a networked computer system may vary from a handful of devices, to millions of computers. The interconnection network may be wired, wireless, or a combination of both. Moreover, these systems are often highly dynamic, in the sense that computers can join and leave, with the topology and performance of the underlying network almost continuously changing. It is difficult to think of computer systems that are notnetworked. And as a matter of fact, most networked computer systems can be accessed from any place in the world because they are hooked up to the Internet. Studying to understand these systems can easily become exceedingly complex. In this chapter, we start with shedding some light on

In [44]:
qr,cf,gram = evaluation_metrics(query,context,answer)
print(f"Query relevance Score: {qr}\n"
     f"Context Fidelity Score: {cf}\n"
     f"Error Percentage: {gram:.2f}%")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Query relevance Score: 99.9895453453064
Context Fidelity Score: 63.916659355163574
Error Percentage: 0.00%


# DS-Q2

In [45]:
query = "Explain the difference between centralized, decentralized, and distributed systems."
results = query_faiss_index(query, top_k = 3)
para = ""
for result in results:
    para = para + result
#context = extract_relevant_sentences(query, para, similarity_threshold=0.4)
context = para
context = clean_context(context)
insight,answer= generate_insight_simple(query, context)

print("Generated Insight:")
print(insight)


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated Insight:

INSTRUCTIONS:
Answer the users QUESTION concisely using the DOCUMENT text above.
If the DOCUMENT doesn’t contain the facts to answer the QUESTION return NONE
Use simple terms so that the user can understand
DOCUMENT:
1.1. FROM NETWORKED SYSTEMS TO DISTRIBUTED SYSTEMS 3 The size of a networked computer system may vary from a handful of devices, to millions of computers. The interconnection network may be wired, wireless, or a combination of both. Moreover, these systems are often highly dynamic, in the sense that computers can join and leave, with the topology and performance of the underlying network almost continuously changing. It is difficult to think of computer systems that are notnetworked. And as a matter of fact, most networked computer systems can be accessed from any place in the world because they are hooked up to the Internet. Studying to understand these systems can easily become exceedingly complex. In this chapter, we start with shedding some light on

In [46]:
qr,cf,gram = evaluation_metrics(query,context,answer)
print(f"Query relevance Score: {qr}\n"
     f"Context Fidelity Score: {cf}\n"
     f"Error Percentage: {gram:.2f}%")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Query relevance Score: 99.99672174453735
Context Fidelity Score: 71.96626663208008
Error Percentage: 3.25%


# DS-Q3

In [47]:
query = "What is the role of a middleware layer in achieving distribution transparency?"
results = query_faiss_index(query, top_k = 3)
para = ""
for result in results:
    para = para + result
#context = extract_relevant_sentences(query, para, similarity_threshold=0.4)
context = para
context = clean_context(context)
insight,answer= generate_insight_simple(query, context)

print("Generated Insight:")
print(insight)


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated Insight:

INSTRUCTIONS:
Answer the users QUESTION concisely using the DOCUMENT text above.
If the DOCUMENT doesn’t contain the facts to answer the QUESTION return NONE
Use simple terms so that the user can understand
DOCUMENT:
1.2. DESIGN GOALS 11 special shared folder that is maintained by a third party somewhere on the In- ternet. Using special software, the shared folder is barely distinguishable from other folders on a user’s computer. In effect, these services replace the use of a shared directory on a local distributed file system, making data available to users independent of the organization they belong to, and independent of where they are. The service is offered for different operating systems. Where exactly data are stored is completely hidden from the end user. 1.2.2 Distribution transparency An important goal of a distributed system is to hide the fact that its processes and resources are physically distributed across multiple computers, possibly separated by lar

In [48]:
qr,cf,gram = evaluation_metrics(query,context,answer)
print(f"Query relevance Score: {qr}\n"
     f"Context Fidelity Score: {cf}\n"
     f"Error Percentage: {gram:.2f}%")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Query relevance Score: 99.91674423217773
Context Fidelity Score: 61.2017035484314
Error Percentage: 5.68%


# DS-Q4

In [49]:
query = "Define the term 'scalability' in the context of distributed systems."
results = query_faiss_index(query, top_k = 3)
para = ""
for result in results:
    para = para + result
#context = extract_relevant_sentences(query, para, similarity_threshold=0.4)
context = para
context = clean_context(context)
insight,answer= generate_insight_simple(query, context)

print("Generated Insight:")
print(insight)


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated Insight:

INSTRUCTIONS:
Answer the users QUESTION concisely using the DOCUMENT text above.
If the DOCUMENT doesn’t contain the facts to answer the QUESTION return NONE
Use simple terms so that the user can understand
DOCUMENT:
24 CHAPTER 1. INTRODUCTION 1.2.6 Scalability For many of us, worldwide connectivity through the Internet is as common as being able to send a package to anyone anywhere around the world. Moreover, where until recently, we were used to having relatively powerful desktop computers for office applications and storage, we are now witnessing that such applications and services are being placed in what has been coined “the cloud,” in turn leading to an increase of much smaller networked devices such as tablet computers or even cloud-only laptops such as Google’s Chromebook. With this in mind, scalability has become one of the most important design goals for developers of distributed systems. Scalability dimensions Scalability of a system can be measured along

In [50]:
qr,cf,gram = evaluation_metrics(query,context,answer)
print(f"Query relevance Score: {qr}\n"
     f"Context Fidelity Score: {cf}\n"
     f"Error Percentage: {gram:.2f}%")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Query relevance Score: 99.96851682662964
Context Fidelity Score: 76.64677500724792
Error Percentage: 11.96%


# DS-Q5

In [51]:
query = "What is replication in distributed systems, and why is it used?"
results = query_faiss_index(query, top_k = 3)
para = ""
for result in results:
    para = para + result
#context = extract_relevant_sentences(query, para, similarity_threshold=0.4)
context = para
context = clean_context(context)
insight,answer= generate_insight_simple(query, context)

print("Generated Insight:")
print(insight)


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated Insight:

INSTRUCTIONS:
Answer the users QUESTION concisely using the DOCUMENT text above.
If the DOCUMENT doesn’t contain the facts to answer the QUESTION return NONE
Use simple terms so that the user can understand
DOCUMENT:
7.1. INTRODUCTION 393 7.1.1 Reasons for replication There are two primary reasons for replicating data. First, data are replicated to increase the reliability of a system. If a file system has been replicated, it may be possible to continue working after one replica crashes by simply switching to one of the other replicas. Also, by maintaining multiple copies, it becomes possible to provide better protection against corrupted data. For example, imagine there are three copies of a file, and every read and write operation is performed on each copy. We can safeguard ourselves against a single, failing write operation, by considering the value that is returned by at least two copies as being the correct one. The other reason for replicating data is performa

In [52]:
qr,cf,gram = evaluation_metrics(query,context,answer)
print(f"Query relevance Score: {qr}\n"
     f"Context Fidelity Score: {cf}\n"
     f"Error Percentage: {gram:.2f}%")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Query relevance Score: 99.98722076416016
Context Fidelity Score: 83.68646502494812
Error Percentage: 1.86%


# DS-Q6

In [53]:
query = "Discuss the challenges of achieving consistency in distributed systems with replicated data."
results = query_faiss_index(query, top_k = 3)
para = ""
for result in results:
    para = para + result
#context = extract_relevant_sentences(query, para, similarity_threshold=0.4)
context = para
context = clean_context(context)
insight,answer= generate_insight_simple(query, context)

print("Generated Insight:")
print(insight)


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated Insight:

INSTRUCTIONS:
Answer the users QUESTION concisely using the DOCUMENT text above.
If the DOCUMENT doesn’t contain the facts to answer the QUESTION return NONE
Use simple terms so that the user can understand
DOCUMENT:
392 CHAPTER 7. CONSISTENCY AND REPLICATION An important issue in distributed systems is the replication of data. Data are generally replicated to enhance reliability or improve performance. One of the major problems is keeping replicas consistent. Informally, this means that when one copy is updated, we need to ensure that the other copies are updated as well; otherwise the replicas will no longer be the same. In this chapter, we take a detailed look at what consistency of replicated data actually means, and the various ways that consistency can be achieved. We start with a general introduction discussing why replication is useful and how it relates to scalability. We then continue by focusing on what consis- tency actually means. An important class of 

In [54]:
qr,cf,gram = evaluation_metrics(query,context,answer)
print(f"Query relevance Score: {qr}\n"
     f"Context Fidelity Score: {cf}\n"
     f"Error Percentage: {gram:.2f}%")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Query relevance Score: 99.99438524246216
Context Fidelity Score: 80.75077533721924
Error Percentage: 9.40%


# DS-Q7

In [55]:
query = "What are the different types of distribution transparency, and why are they important in distributed systems?"
results = query_faiss_index(query, top_k = 3)
para = ""
for result in results:
    para = para + result
#context = extract_relevant_sentences(query, para, similarity_threshold=0.4)
context = para
context = clean_context(context)
insight,answer= generate_insight_simple(query, context)

print("Generated Insight:")
print(insight)


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated Insight:

INSTRUCTIONS:
Answer the users QUESTION concisely using the DOCUMENT text above.
If the DOCUMENT doesn’t contain the facts to answer the QUESTION return NONE
Use simple terms so that the user can understand
DOCUMENT:
12 CHAPTER 1. INTRODUCTION Transparency Description Access Hide differences in data representation and how an object is accessed Location Hide where an object is located Relocation Hide that an object may be moved to another location while in use Migration Hide that an object may move to another location Replication Hide that an object is replicated Concurrency Hide that an object may be shared by several independent users Failure Hide the failure and recovery of an object Figure 1.3: Different forms of transparency in a distributed system (see ISO [1995]). An object can be a resource or a process. agreement on how data is to be represented by different machines and operat- ing systems. For example, a distributed system may have computer systems that ru

In [56]:
qr,cf,gram = evaluation_metrics(query,context,answer)
print(f"Query relevance Score: {qr}\n"
     f"Context Fidelity Score: {cf}\n"
     f"Error Percentage: {gram:.2f}%")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Query relevance Score: 99.33324456214905
Context Fidelity Score: 66.96643829345703
Error Percentage: 8.72%


# DS-Q8

In [30]:
query = "Explain the concept of fault tolerance in distributed systems." 
results = query_faiss_index(query, top_k = 3)
para = ""
for result in results:
    para = para + result
#context = extract_relevant_sentences(query, para, similarity_threshold=0.4)
context = para
context = clean_context(context)
insight,answer= generate_insight_simple(query, context)

print("Generated Insight:")
print(insight)


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (2048). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.


Generated Insight:

INSTRUCTIONS:
Answer the users QUESTION concisely using the DOCUMENT text above.
If the DOCUMENT doesn’t contain the facts to answer the QUESTION return NONE
Use simple terms so that the user can understand
DOCUMENT:
8.1. INTRODUCTION TO FAULT TOLERANCE 463 information on fault tolerance in distributed systems, see, for example Jalote [1994]; Shooman [2002] or Koren and Krishna [2007]. 8.1.1 Basic concepts To understand the role of fault tolerance in distributed systems, we first need to take a closer look at what it actually means for a distributed system to tolerate faults. Being fault tolerant is strongly related to what are called dependable systems . Dependability is a term that covers a number of useful requirements for distributed systems, including the following [Kopetz and Verissimo, 1993]: • Availability • Reliability • Safety • Maintainability Availability is defined as the property that a system is ready to be used immediately . In general, it refers to 

In [31]:
qr,cf,gram = evaluation_metrics(query,context,answer)
print(f"Query relevance Score: {qr}\n"
     f"Context Fidelity Score: {cf}\n"
     f"Error Percentage: {gram:.2f}%")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Query relevance Score: 99.97696280479431
Context Fidelity Score: 81.53613805770874
Error Percentage: 0.62%


# DS-Q9

In [80]:
query = "How do geographical and administrative scalability differ in distributed systems?" 
results = query_faiss_index(query, top_k = 3)
para = ""
for result in results:
    para = para + result
#context = extract_relevant_sentences(query, para, similarity_threshold=0.4)
context = para
context = clean_context(context)
insight,answer= generate_insight_simple(query, context)

print("Generated Insight:")
print(insight)


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated Insight:

INSTRUCTIONS:
Answer the users QUESTION concisely using the DOCUMENT text above.
If the DOCUMENT doesn’t contain the facts to answer the QUESTION return NONE
Use simple terms so that the user can understand
DOCUMENT:
24 CHAPTER 1. INTRODUCTION 1.2.6 Scalability For many of us, worldwide connectivity through the Internet is as common as being able to send a package to anyone anywhere around the world. Moreover, where until recently, we were used to having relatively powerful desktop computers for office applications and storage, we are now witnessing that such applications and services are being placed in what has been coined “the cloud,” in turn leading to an increase of much smaller networked devices such as tablet computers or even cloud-only laptops such as Google’s Chromebook. With this in mind, scalability has become one of the most important design goals for developers of distributed systems. Scalability dimensions Scalability of a system can be measured along

In [81]:
qr,cf,gram = evaluation_metrics(query,context,answer)
print(f"Query relevance Score: {qr}\n"
     f"Context Fidelity Score: {cf}\n"
     f"Error Percentage: {gram:.2f}%")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Query relevance Score: 99.97578263282776
Context Fidelity Score: 59.24699306488037
Error Percentage: 8.57%


# DS-Q10

In [32]:
query = "What is the CAP theorem, and how does it apply to distributed systems?" 
results = query_faiss_index(query, top_k = 3)
para = ""
for result in results:
    para = para + result
#context = extract_relevant_sentences(query, para, similarity_threshold=0.4)
context = para
context = clean_context(context)
insight,answer= generate_insight_simple(query, context)

print("Generated Insight:")
print(insight)


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 86.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 14.12 MiB is free. Process 2920 has 14.72 GiB memory in use. Of the allocated memory 13.87 GiB is allocated by PyTorch, and 744.04 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
qr,cf,gram = evaluation_metrics(query,context,answer)
print(f"Query relevance Score: {qr}\n"
     f"Context Fidelity Score: {cf}\n"
     f"Error Percentage: {gram:.2f}%")

# DS-Q11

#

In [97]:
query = "Evaluate the techniques used to achieve scalability in distributed systems." 
results = query_faiss_index(query, top_k = 3)
para = ""
for result in results:
    para = para + result
#context = extract_relevant_sentences(query, para, similarity_threshold=0.4)
context = para
context = clean_context(context)
insight,answer= generate_insight_simple(query, context)

print("Generated Insight:")
print(insight)


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 54.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 28.12 MiB is free. Process 3941 has 14.71 GiB memory in use. Of the allocated memory 14.25 GiB is allocated by PyTorch, and 342.90 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
qr,cf,gram = evaluation_metrics(query,context,answer)
print(f"Query relevance Score: {qr}\n"
     f"Context Fidelity Score: {cf}\n"
     f"Error Percentage: {gram:.2f}%")

# DS-Q12

In [20]:
query = "Analyze the trade-offs between achieving distribution transparency and maintaining system performance." 
results = query_faiss_index(query, top_k = 3)
para = ""
for result in results:
    para = para + result
#context = extract_relevant_sentences(query, para, similarity_threshold=0.4)
context = para
context = clean_context(context)
insight,answer= generate_insight_simple(query, context)

print("Generated Insight:")
print(insight)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]



Generated Insight:

INSTRUCTIONS:
Answer the users QUESTION concisely using the DOCUMENT text above.
If the DOCUMENT doesn’t contain the facts to answer the QUESTION return NONE
Use simple terms so that the user can understand
DOCUMENT:
1.2. DESIGN GOALS 15 it should be considered together with other issues such as performance and comprehensibility. The price for achieving full transparency may be surprisingly high. Note 1.2 (Discussion: Against distribution transparency) Several researchers have argued that hiding distribution will lead to only further complicating the development of distributed systems, exactly for the reason that full distribution transparency can never be achieved. A popular technique for achieving access transparency is to extend procedure calls to remote servers. How- ever, Waldo et al. [1997] already pointed out that attempting to hide distribution by such remote procedure calls can lead to poorly understood semantics, for the simple reason that a procedure call

In [23]:
qr,cf,gram = evaluation_metrics(query,context,answer)
print(f"Query relevance Score: {qr}\n"
     f"Context Fidelity Score: {cf}\n"
     f"Error Percentage: {gram:.2f}%")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading LanguageTool 6.4: 100%|██████████| 246M/246M [00:02<00:00, 88.7MB/s] 


Query relevance Score: 99.79459643363953
Context Fidelity Score: 63.759446144104004
Error Percentage: 8.49%


# DS-Q13

In [24]:
query = "Describe the differences between grid computing and cluster computing in high-performance distributed systems."
results = query_faiss_index(query, top_k = 3)
para = ""
for result in results:
    para = para + result
#context = extract_relevant_sentences(query, para, similarity_threshold=0.4)
context = para
context = clean_context(context)
insight,answer= generate_insight_simple(query, context)

print("Generated Insight:")
print(insight)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated Insight:

INSTRUCTIONS:
Answer the users QUESTION concisely using the DOCUMENT text above.
If the DOCUMENT doesn’t contain the facts to answer the QUESTION return NONE
Use simple terms so that the user can understand
DOCUMENT:
34 CHAPTER 1. INTRODUCTION Mimicking shared-memory systems using multicomputers eventually had to be abandoned because performance could never meet the expectations of pro- grammers, who would rather resort to far more intricate, yet better (predictably) performing message-passing models. An important side effect of exploring the hardware-software boundaries of parallel processing is a thorough understanding of consistency models, to which we return extensively in Chapter 7. Cluster computing Cluster computing systems became popular when the price/performance ratio of personal computers and workstations improved. At a certain point, it became financially and technically attractive to build a supercomputer using off-the-shelf technology by simply hooking

In [25]:
qr,cf,gram = evaluation_metrics(query,context,answer)
print(f"Query relevance Score: {qr}\n"
     f"Context Fidelity Score: {cf}\n"
     f"Error Percentage: {gram:.2f}%")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Query relevance Score: 99.98869895935059
Context Fidelity Score: 72.52702713012695
Error Percentage: 6.06%


# DS-Q14

In [26]:
query = "Discuss the implications of partial failures in distributed systems."
results = query_faiss_index(query, top_k = 3)
para = ""
for result in results:
    para = para + result
#context = extract_relevant_sentences(query, para, similarity_threshold=0.4)
context = para
context = clean_context(context)
insight,answer= generate_insight_simple(query, context)

print("Generated Insight:")
print(insight)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated Insight:

INSTRUCTIONS:
Answer the users QUESTION concisely using the DOCUMENT text above.
If the DOCUMENT doesn’t contain the facts to answer the QUESTION return NONE
Use simple terms so that the user can understand
DOCUMENT:
462 CHAPTER 8. FAULT TOLERANCE A characteristic feature of distributed systems that distinguishes them from single-machine systems is the notion of partial failure: part of the system is failing while the remaining part continues to operate, and seemingly correctly. An important goal in distributed-systems design is to construct the system in such a way that it can automatically recover from partial failures without seriously affecting the overall performance. In particular, whenever a failure occurs, the system should continue to operate in an acceptable way while repairs are being made. In other words, a distributed system is expected to be fault tolerant. In this chapter, we take a closer look at techniques to achieve fault toler- ance. After providi

In [27]:
qr,cf,gram = evaluation_metrics(query,context,answer)
print(f"Query relevance Score: {qr}\n"
     f"Context Fidelity Score: {cf}\n"
     f"Error Percentage: {gram:.2f}%")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Query relevance Score: 99.87643957138062
Context Fidelity Score: 73.08513522148132
Error Percentage: 17.27%


# DS-Q15

In [28]:
query = "How does cryptography contribute to security in distributed systems?"
results = query_faiss_index(query, top_k = 3)
para = ""
for result in results:
    para = para + result
#context = extract_relevant_sentences(query, para, similarity_threshold=0.4)
context = para
context = clean_context(context)
insight,answer= generate_insight_simple(query, context)

print("Generated Insight:")
print(insight)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated Insight:

INSTRUCTIONS:
Answer the users QUESTION concisely using the DOCUMENT text above.
If the DOCUMENT doesn’t contain the facts to answer the QUESTION return NONE
Use simple terms so that the user can understand
DOCUMENT:
10 CHAPTER 1. INTRODUCTION •As also mentioned, there is no such thing as a nonsecured distributed system. The security perspective focuses on how to ensure authorized access to resources. To that end, we need to discuss trust in distributed systems, along with authentication, namely verifying a claimed identity. The security perspective comes last, yet later in this chapter we shall discuss a few basic instruments that are needed to understand the role of security in the previous perspectives. 1.2 Design goals Just because it is possible to build distributed systems does not necessarily mean that it is a good idea. In this section, we discuss four important goals that should be met to make building a distributed system worth the effort. A distributed sy

In [29]:
qr,cf,gram = evaluation_metrics(query,context,answer)
print(f"Query relevance Score: {qr}\n"
     f"Context Fidelity Score: {cf}\n"
     f"Error Percentage: {gram:.2f}%")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Query relevance Score: 99.9263346195221
Context Fidelity Score: 53.043532371520996
Error Percentage: 3.19%


# **Unused part**

# Q11

In [None]:
query = "Assume that a class is implementing two interfaces. Both interfaces have one common method.What will happen? Justify your answer."
results = query_faiss_index(query, top_k = 3)
para = ""
for result in results:
    para = para + result
#context = extract_relevant_sentences(query, para, similarity_threshold=0.4)
context = para
context = clean_context(context)
insight,answer= generate_insight_simple(query, context)

print("Generated Insight:")
print(insight)

In [None]:
qr,cf,gram = evaluation_metrics(query,context,answer)
print(f"Query relevance Score: {qr}\n"
     f"Context Fidelity Score: {cf}\n"
     f"Error Percentage: {gram:.2f}%")

# Q12

In [None]:
query = "Differentiate between byte stream, character stream, data stream and object stream."
results = query_faiss_index(query, top_k = 3)
para = ""
for result in results:
    para = para + result
#context = extract_relevant_sentences(query, para, similarity_threshold=0.4)
context = para
context = clean_context(context)
insight,answer= generate_insight_simple(query, context)

print("Generated Insight:")
print(insight)

In [None]:
qr,cf,gram = evaluation_metrics(query,context,answer)
print(f"Query relevance Score: {qr}\n"
     f"Context Fidelity Score: {cf}\n"
     f"Error Percentage: {gram:.2f}%")

# **Q1-Simple**

In [None]:
query = "What is the role of a constructor ?"
results = query_faiss_index(query, top_k = 3)
para = ""
for result in results:
    para = para + result
#context = extract_relevant_sentences(query, para, similarity_threshold=0.4)
context = clean_context(para)
insight,answer = generate_insight_simple(query, context)

print("Generated Insight:")
print(insight)

In [None]:
query = "What is the role of a constructor ?"
results = query_faiss_index(query, top_k = 3)
para = ""
for result in results:
    para = para + result
#context = extract_relevant_sentences(query, para, similarity_threshold=0.4)
context = clean_context(para)
insight,answer = generate_insight_simple(query, context)

print("Generated Insight:")
print(insight)

In [None]:
query = "What is the essential difference between Error and Exception in Java?"
results = query_faiss_index(query, top_k = 3)
para = ""
for result in results:
    para = para + result
#context = extract_relevant_sentences(query, para, similarity_threshold=0.4)
context = para
context = clean_context(context)
insight,answer = generate_insight_short(query, context)

print("Generated Insight:")
print(insight)

In [None]:
qr,cf,gram = evaluation_metrics(query,context,answer)
print(f"Query relevance Score: {qr}\n"
     f"Context Fidelity Score: {cf}\n"
     f"Error Percentage: {gram:.2f}%")

# **Q2-short**

In [None]:
query = "What is a static variable??"
results = query_faiss_index(query, top_k = 3)
para = ""
for result in results:
    para = para + result
#context = extract_relevant_sentences(query, para, similarity_threshold=0.4)
context = para
context = clean_context(context)
insight,answer = generate_insight_short(query, context)

print("Generated Insight:")
print(insight)

In [None]:
query = "What year did the modern computer era begin?"
results = query_faiss_index(query, top_k = 3)
para = ""
for result in results:
    para = para + result
context = extract_relevant_sentences(query, para, similarity_threshold=0.4)
context = clean_context(context)
insight,answer = generate_insight_short(query, context)

print("Generated Insight:")
print(insight)

In [None]:
query = "What year did the modern computer era begin?"
results = query_faiss_index(query, top_k = 3)
para = ""
for result in results:
    para = para + result
context = extract_relevant_sentences(query, para, similarity_threshold=0.4)
context = clean_context(context)
insight,answer = generate_insight_simple(query, context)

print("Generated Insight:")
print(insight)

In [None]:
qr,cf,gram = evaluation_metrics(query,context,answer)
print(f"Query relevance Score: {qr}\n"
     f"Context Fidelity Score: {cf}\n"
     f"Error Percentage: {gram:.2f}%")

In [None]:
query = "Explain the concept of distribution transparency in distributed systems."
results = query_faiss_index(query, top_k = 3)
para = ""
for result in results:
    para = para + result
context = extract_relevant_sentences(query, para, similarity_threshold=0.4)
context = clean_context(context)
insight,answer = generate_insight_short(query, context)

print("Generated Insight:")
print(insight)

In [None]:
qr,cf,gram = evaluation_metrics(query,context,answer)
print(f"Query relevance Score: {qr}\n"
     f"Context Fidelity Score: {cf}\n"
     f"Error Percentage: {gram:.2f}%")

In [None]:
query = "Explain the concept of distribution transparency in distributed systems."
results = query_faiss_index(query, top_k = 3)
para = ""
for result in results:
    para = para + result
context = extract_relevant_sentences(query, para, similarity_threshold=0.4)
context = clean_context(context)
insight,answer = generate_insight_simple(query, context)

print("Generated Insight:")
print(insight)

In [None]:
qr,cf,gram = evaluation_metrics(query,context,answer)
print(f"Query relevance Score: {qr}\n"
     f"Context Fidelity Score: {cf}\n"
     f"Error Percentage: {gram:.2f}%")

In [None]:
similarity = calculate_Query_relevance(query, answer)
print(f"Query relevance Score: {similarity}")

In [None]:
fidelity_score = calculate_context_fidelity(context, answer)
print(f"Context Fidelity Score: {fidelity_score}")


In [None]:
num_errors, error_percentage = evaluate_grammar_accuracy(answer)
print(f"Number of Errors: {num_errors}")
print(f"Error Percentage: {error_percentage:.2f}%")

In [None]:
query = "Differentiate between decentralized and distributed systems"
results = query_faiss_index(query, top_k = 3)
para = ""
for result in results:
    para = para + result
context = extract_relevant_sentences(query, para, similarity_threshold=0.4)
context = clean_context(context)
insight = generate_insight_short(query, context)

print("Generated Insight:")
print(insight)

In [None]:
query = "Differentiate between decentralized and distributed systems"
results = query_faiss_index(query, top_k = 3)
para = ""
for result in results:
    para = para + result
context = extract_relevant_sentences(query, para, similarity_threshold=0.4)
context = clean_context(context)
insight = generate_insight_short(query, context)

print("Generated Insight:")
print(insight)

In [None]:
query = "Differentiate between decentralized and distributed systems"
results = query_faiss_index(query, top_k = 3)
para = ""
for result in results:
    para = para + result
context = extract_relevant_sentences(query, para, similarity_threshold=0.4)
context = clean_context(context)
insight = generate_insight_simple(query, context)

print("Generated Insight:")
print(insight)

In [None]:
query = "What are the challenges of maintaining consistency in a distributed system with replicated data?"
results = query_faiss_index(query, top_k = 3)
para = ""
for result in results:
    para = para + result
context = extract_relevant_sentences(query, para, similarity_threshold=0.4)
context = clean_context(context)
insight = generate_insight_short(query, context)

print("Generated Insight:")
print(insight)

In [None]:
query = "What are the challenges of maintaining consistency in a distributed system with replicated data?"
results = query_faiss_index(query, top_k = 3)
para = ""
for result in results:
    para = para + result
context = extract_relevant_sentences(query, para, similarity_threshold=0.4)
context = clean_context(context)
insight = generate_insight_simple(query, context)

print("Generated Insight:")
print(insight)

In [None]:
query = "What are the two key technological developments that led to the rise of distributed systems?"
results = query_faiss_index(query, top_k = 3)
para = ""
for result in results:
    para = para + result
context = extract_relevant_sentences(query, para, similarity_threshold=0.4)
context = clean_context(context)
insight = generate_insight_short(query, context)

print("Generated Insight:")
print(insight)

In [None]:
query = "What are the two key technological developments that led to the rise of distributed systems?"
results = query_faiss_index(query, top_k = 3)
para = ""
for result in results:
    para = para + result
context = extract_relevant_sentences(query, para, similarity_threshold=0.4)
context = clean_context(context)
insight = generate_insight_simple(query, context)

print("Generated Insight:")
print(insight)

In [None]:
query = "What are the two key technological developments that led to the rise of distributed systems?"
results = query_faiss_index(query, top_k = 3)
para = ""
for result in results:
    para = para + result
context = extract_relevant_sentences(query, para, similarity_threshold=0.4)
context = clean_context(context)
insight = generate_insight_simple(query, context)

print("Generated Insight:")
print(insight)

In [None]:
query = "Define a decentralized system."
results = query_faiss_index(query, top_k = 3)
para = ""
for result in results:
    para = para + result
context = extract_relevant_sentences(query, para, similarity_threshold=0.4)
context = clean_context(context)
insight = generate_insight_simple(query, context)

print("Generated Insight:")
print(insight)

In [None]:
query = "Define a decentralized system."
results = query_faiss_index(query, top_k = 3)
para = ""
for result in results:
    para = para + result
context = extract_relevant_sentences(query, para, similarity_threshold=0.4)
context = clean_context(context)
insight = generate_insight_short(query, context)

print("Generated Insight:")
print(insight)

In [None]:
query = "What is the primary goal of a distributed system in terms of resource sharing?"
results = query_faiss_index(query, top_k = 3)
para = ""
for result in results:
    para = para + result
context = extract_relevant_sentences(query, para, similarity_threshold=0.4)
context = clean_context(context)
insight = generate_insight_short(query, context)

print("Generated Insight:")
print(insight)

In [None]:
query = "What is the primary goal of a distributed system in terms of resource sharing?"
results = query_faiss_index(query, top_k = 3)
para = ""
for result in results:
    para = para + result
context = extract_relevant_sentences(query, para, similarity_threshold=0.4)
context = clean_context(context)
insight = generate_insight_simple(query, context)

print("Generated Insight:")
print(insight)

In [None]:
query = "Analyze the trade-offs between scalability and distribution transparency in distributed systems."
results = query_faiss_index(query, top_k = 3)
para = ""
for result in results:
    para = para + result
context = extract_relevant_sentences(query, para, similarity_threshold=0.4)
context = clean_context(context)
insight = generate_insight_simple(query, context)

print("Generated Insight:")
print(insight)

In [None]:
query = "Analyze the trade-offs between scalability and distribution transparency in distributed systems."
results = query_faiss_index(query, top_k = 3)
para = ""
for result in results:
    para = para + result
context = extract_relevant_sentences(query, para, similarity_threshold=0.4)
context = clean_context(context)
insight = generate_insight_short(query, context)

print("Generated Insight:")
print(insight)

In [None]:
query = "Analyze the trade-offs between scalability and distribution transparency in distributed systems."
generated_answer = "The trade-Off between scalabilty and distribution tranparency are:1. In geographic scalability hiding latency and bandwith restriction can be difficult2. Administrative scalability may easily conflic with goals of achieving distrubution transparancy3. Full distribution transaprency is impossible to achieve and making distribution explicit can help users understand the behavior of distributed system better."
similarity = calculate_semantic_similarity(query, generated_answer)
print(f"Semantic Similarity Score: {similarity}")

In [None]:
from sentence_transformers import SentenceTransformer, util
def calculate_fidelity_to_source(context, generated_answer):
    # Generate embeddings for the context and the generated answer
    context_embedding = model.encode(context, convert_to_tensor=True)
    answer_embedding = model.encode(generated_answer, convert_to_tensor=True)
    
    # Calculate cosine similarity between the embeddings
    similarity_score = util.pytorch_cos_sim(answer_embedding, context_embedding)
    
    return similarity_score.item()  # Convert tensor to a scalar value

# Example usage
context = ("INTRODUCTION most choices for coming to a distributed system come from the need to im- prove the performance of a single computer system in terms of, for example, reliability, scalability, and efficiency. Design goals for distributed systems include sharing resources and ensur- ing openness. Increasingly important is designing secure distributed systems. The fact that trade-offs need to be made between achieving various forms of distribution transparency is inherent to the design of distributed systems, and can easily complicate their understanding. One specific difficult design goal that does not always blend well with achieving distribution transparency is scalability. This is particularly true for geographi- cal scalability, in which case hiding latencies and bandwidth restrictions can turn out to be difficult. Likewise, administrative scalability, by which a system is designed to span multiple administrative domains, may easily conflict with goals for achieving distribution transparency. Different types of distributed systems exist which can be classified as being oriented toward supporting computations, information processing, and pervasiveness. Distributed computing systems are typically deployed for high-performance applications, often originating from the field of parallel computing. Finally, an emerging class of distributed systems is where components are small, the system is composed in an ad hoc fashion, but most of all is no longer managed through a system administrator. The price for achieving full transparency may be surprisingly high. Note 1.2 (Discussion: Against distribution transparency) Several researchers have argued that hiding distribution will lead to only further complicating the development of distributed systems, exactly for the reason that full distribution transparency can never be achieved. A popular technique for achieving access transparency is to extend procedure calls to remote servers. As an alternative, various researchers and practitioners are now arguing for less transparency, for example, by more explicitly using message-style commu- nication, or more explicitly posting requests to, and getting results from remote machines, as is done on the Web when fetching pages. 1.2.3 Openness Another important goal of distributed systems is openness. At the same time, an open distributed system itself will often consist of components that originate from elsewhere. INTRODUCTION Degree of distribution transparency Although distribution transparency is generally considered preferable for any distributed system, there are situations in which blindly attempting to hide all distribution aspects from users is not a good idea. Likewise, a wide-area distributed system that connects a process in San Francisco to a process in Amsterdam cannot be expected to hide the fact that Mother Nature will not allow it to send a message from one process to the other in less than approximately 35 milliseconds. There is also a trade-off between a high degree of transparency and the performance of a system. Finally, there are situations in which it is not at all obvious that hiding distribution is a good idea. As distributed systems are expanding to devices that people carry around and where the very notion of location and context awareness is becoming increasingly important, it may be best to actually expose distribution rather than trying to hide it. There are other arguments against distribution transparency. Recognizing that full distribution transparency is simply impossible, we should ask our- selves whether it is even wise to pretend that we can achieve it. It may be much better to make distribution explicit so that the user and application developer are never tricked into believing that there is such a thing as transparency. The result will be that users will much better understand the (sometimes unex- pected) behavior of a distributed system, and are thus much better prepared to deal with this behavior. The conclusion is that aiming for distribution transparency may be a nice goal when designing and implementing distributed systems, but that DS 4.02 downloaded by UMAG @AM.AMRITA.EDU"
)
generated_answer="The trade-Off between scalabilty and distribution tranparency are:1. In geographic scalability hiding latency and bandwith restriction can be difficult2. Administrative scalability may easily conflic with goals of achieving distrubution transparancy3. Full distribution transaprency is impossible to achieve and making distribution explicit can help users understand the behavior of distributed system better."
fidelity_score = calculate_fidelity_to_source(context, generated_answer)
print(f"Fidelity to Source Material Score: {fidelity_score}")


In [None]:
query = "Discuss the implications of fault tolerance in centralized versus decentralized systems."
results = query_faiss_index(query, top_k = 3)
para = ""
for result in results:
    para = para + result
context = extract_relevant_sentences(query, para, similarity_threshold=0.4)
context = clean_context(context)
insight = generate_insight_short(query, context)

print("Generated Insight:")
print(insight)

In [None]:
from sentence_transformers import SentenceTransformer, util
def calculate_fidelity_to_source(context, generated_answer):
    # Generate embeddings for the context and the generated answer
    context_embedding = model.encode(context, convert_to_tensor=True)
    answer_embedding = model.encode(generated_answer, convert_to_tensor=True)
    
    # Calculate cosine similarity between the embeddings
    similarity_score = util.pytorch_cos_sim(answer_embedding, context_embedding)
    
    return similarity_score.item()  # Convert tensor to a scalar value

# Example usage
context = ("FAULT TOLERANCE Essential Paxos The assumptions under which Paxos operates are rather weak: •The distributed system is partially synchronous (in fact, it may even be asynchronous). By-and-large, these are realistic assumptions for many practical distributed systems. By replicating this server, we aim at obtaining fault tolerance in the presence of crash failures. PROCESS RESILIENCE 505 It should also be noted that the schemes described so far assume that nodes are either Byzantine, or collaborative. A first step toward a solution is captured in the form of BAR fault tolerance , which stands for Byzantine, Altruism, and Rationality. BAR fault tolerance is described in Aiyer et al. Consistency, availability, and partitioning Strongly related to the conditions under which consensus can (not) be reached, is when consistency can be reached. We introduced process groups to improve fault tolerance, and, more specifically, to improve availability. Being consistent in responses while also being highly available is not an unreasonable requirement for services that are part of a distributed system. In 2000, Eric Brewer posed an important theorem which was later proven to be correct by Gilbert and Lynch [2002]: CAP Theorem : Any networked system providing shared data can provide only two of the following three properties: •C: consistency, by which a shared and replicated data item appears as a single, up-to-date copy •A: availability, by which updates will always be eventually executed •P: Tolerant to the partitioning of process group (e.g., because of a failing network). COORDINATION Virtually all algorithms suffer badly in the event of crashes. It is somewhat ironic that distributed algorithms are generally more sensitive to crashes than centralized ones. In this sense, it should not come as a surprise that, indeed, centralized mutual exclusion is widely applied: it is simple to understand the behavior, and relatively easy to increase the fault tolerance of the centralized server. 5.3.6 Example: Simple locking with ZooKeeper For many practical reasons, mutual exclusion in distributed systems is often done with a centralized coordinator, not in the least because the behavior of these solutions is much easier to understand than many other noncentralized versions. Let us briefly look at a system that is designed for coordination tasks in distributed systems, and which is by now also widely deployed. It has been designed for scalability and fault tolerance. ZooKeeper’s fault tolerance aspects are discussed in Chapter 8.")
generated_answer="In centralized systems, fault tolerant mechanisms such as replication can be used to ensure consistency and availability of data. However, in decentralize systems, achieving fault toleranc can be more challenging due to the lack of a central authority that can enforce consistency. Instead, decentralizes systems often rely on consensus protocols such as Paxo, which can tolerate faulty nodes but may not provide strong consistency guarantees. Additionally, decentrilized systems may require more complex fault toleran mechanisms, such as Byzantium fault tolerace, to ensure the system's continuity in the face of faults or failures"
fidelity_score = calculate_fidelity_to_source(context, generated_answer)
print(f"Fidelity to Source Material Score: {fidelity_score}")


In [None]:


def calculate_semantic_similarity(query, generated_answer):
    # Generate embeddings for both the query and the generated answer
    query_embedding = model.encode(query, convert_to_tensor=True)
    answer_embedding = model.encode(generated_answer, convert_to_tensor=True)
    
    # Calculate cosine similarity between the embeddings
    similarity_score = util.pytorch_cos_sim(query_embedding, answer_embedding)
    
    return similarity_score.item()  # Convert tensor to a scalar value



In [None]:
query = "Discuss the implications of fault tolerance in centralized versus decentralized systems."
generated_answer = "In centralized systems, fault tolerant mechanisms such as replication can be used to ensure consistency and availability of data. However, in decentralize systems, achieving fault toleranc can be more challenging due to the lack of a central authority that can enforce consistency. Instead, decentralizes systems often rely on consensus protocols such as Paxo, which can tolerate faulty nodes but may not provide strong consistency guarantees. Additionally, decentrilized systems may require more complex fault toleran mechanisms, such as Byzantium fault tolerace, to ensure the system's continuity in the face of faults or failures"
similarity = calculate_semantic_similarity(query, generated_answer)
print(f"Semantic Similarity Score: {similarity}")

In [None]:
query = "Discuss the implications of fault tolerance in centralized versus decentralized systems."
generated_answer = "Fault tolerance refers to the ability of a system to continue functioning correctly even when one or more of its components fail or malfunction. In centralized systems, fault tolerant mechanisms are typically designed to protect the central component from failures, while in decentralize systems, the focus is on protecting the overall system from failuers.A real-life example of fault-tolerant centralized system is a bank's core banking system. This system is critical to the bank' operations and must be available at all times. To ensure fault toleranc, the system is designed with redundant components, such as multiple servers, storage systems, a nd network connections. If one of these components fails, the other components can take over, ensuring that the system remains available.On the other hand, a decentralzed system such as a distributed ledger like blockchain is designed to be fault-toleant by design. Since there is no central component, the failure of one node does not affect the entire system. Instead, the nodes work together to achieve consensus on the state of the ledger, ensurimg that the data remains consistent across the system.In summary, centralize systems are designed to tolerate faults in the central componenet, while decentralze systems are desgned to tolerat faults across the entire sytem."
similarity = calculate_semantic_similarity(query, generated_answer)
print(f"Semantic Similarity Score: {similarity}")

In [None]:
query = "Discuss the implications of fault tolerance in centralized versus decentralized systems."
results = query_faiss_index(query, top_k = 3)
para = ""
for result in results:
    para = para + result
context = extract_relevant_sentences(query, para, similarity_threshold=0.4)
context = clean_context(context)
insight = generate_insight_simple(query, context)

print("Generated Insight:")
print(insight)

In [None]:
query = "Discuss the implications of fault tolerance in centralized versus decentralized systems."
results = query_faiss_index(query, top_k = 3)
para = ""
for result in results:
    para = para + result
context = extract_relevant_sentences(query, para, similarity_threshold=0.4)
context = clean_context(context)
insight = generate_insight_simple(query, context)

print("Generated Insight:")
print(insight)

In [None]:
query = "what is the port number of HTTP"
results = query_faiss_index(query, top_k = 3)
para = ""
for result in results:
    para = para + result
context = extract_relevant_sentences(query, para, similarity_threshold=0.4)
context = clean_context(context)
insight = generate_insight_short(query, context)

print("Generated Insight:")
print(insight)

In [None]:
query = "what is a three way handshake?"
results = query_faiss_index(query, top_k = 3)
para = ""
for result in results:
    para = para + result
context = extract_relevant_sentences(query, para, similarity_threshold=0.4)
context = clean_context(context)
insight = generate_insight_short(query, context)

print("Generated Insight:")
print(insight)

In [None]:
query = "when does data transfer happen in a three way handshake?"
results = query_faiss_index(query, top_k = 3)
para = ""
for result in results:
    para = para + result
context = extract_relevant_sentences(query, para, similarity_threshold=0.4)
context = clean_context(context)
insight = generate_insight_short(query, context)

print("Generated Insight:")
print(insight)

In [None]:
query = "what is subnetting?"
results = query_faiss_index(query, top_k = 3)
para = ""
for result in results:
    para = para + result
context = extract_relevant_sentences(query, para, similarity_threshold=0.4)
context = clean_context(context)
insight = generate_insight_short(query, context)

print("Generated Insight:")
print(insight)

In [None]:
query = "what is dijkstra algorithm and how it is relevant to computer networks?"
results = query_faiss_index(query, top_k = 3)
para = ""
for result in results:
    para = para + result
context = extract_relevant_sentences(query, para, similarity_threshold=0.4)
context = clean_context(context)
insight = generate_insight_short(query, context)

print("Generated Insight:")
print(insight)

In [None]:
def generate_insight_short(query):
    # Construct a structured prompt with a clear marker
    combined_input = (
        f"Imagine you are a teacher and i am a student who has no knowledge about the subject."
        
        f"Question: {query}\n answer the question concisely with correct spellings"
        f"\nAnswer:"
    )

    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Encode the prompt
    inputs = tokenizer.encode_plus(
        combined_input,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=4000
    ).to('cuda')

    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']

    # Generate the output using the model
    
    outputs = llm_model.generate(
        input_ids,
        attention_mask=attention_mask,
        #max_new_tokens=150,
        num_return_sequences=1,
        no_repeat_ngram_size=3,
        temperature=0.33,
        top_k=10,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )

    # Decode the generated text
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)


    return generated_text


In [None]:
query = "what is the prot number of HTTP?"
insight = generate_insight_short(query)

print("Generated Insight:")
print(insight)

In [None]:
query = "what is distribution transparency?"
insight = generate_insight_short(query)

print("Generated Insight:")
print(insight)

In [None]:
query = "what is three way handshake?"
insight = generate_insight_short(query)

print("Generated Insight:")
print(insight)

In [None]:
query = "what is computer networking"
insight = generate_insight_short(query)

print("Generated Insight:")
print(insight)

In [None]:
!pip install textstat


In [None]:
import textstat

# Function to calculate various readability scores
def calculate_readability_scores(text):
    scores = {
        "Flesch Reading Ease": textstat.flesch_reading_ease(text),
        "Flesch-Kincaid Grade Level": textstat.flesch_kincaid_grade(text),
        "Gunning Fog Index": textstat.gunning_fog(text),
        "SMOG Index": textstat.smog_index(text),
        "Automated Readability Index": textstat.automated_readability_index(text),
        "Coleman-Liau Index": textstat.coleman_liau_index(text),
        "Dale-Chall Readability Score": textstat.dale_chall_readability_score(text),
        "Text Standard": textstat.text_standard(text)
    }
    return scores

# Example generated answer
generated_answer = """
Distribution transparence is the degree to which the distribution of goods, services, or resources is openly and easily understood and accessible to all stakeholders. It refers to the ability to see how resources are allocated and how decisions are made, and to have a say in those decisions. In a transparent distribution system, information is readily available and easily accessible, and decision-making processes are open and inclusive. This can help to build trust and accountability, and can lead to more equitable and efficient distribution of resources.
"""

# Calculate readability scores
readability_scores = calculate_readability_scores(generated_answer)

# Print the readability scores
print("Readability Scores:")
for metric, score in readability_scores.items():
    print(f"{metric}: {score}")


In [None]:
pip install language-tool-python


In [None]:
import language_tool_python

def evaluate_grammar_accuracy(text):
    # Initialize the LanguageTool object for English language
    tool = language_tool_python.LanguageTool('en-US')
    
    # Check the text for grammatical errors
    matches = tool.check(text)
    
    # Calculate the number of errors
    num_errors = len(matches)
    
    # Print out the errors (optional, for detailed analysis
    # Calculate the percentage of errors per word
    num_words = len(text.split())
    error_percentage = (num_errors / num_words) * 100 if num_words > 0 else 0
    
    return num_errors, error_percentage



In [None]:
# Example generated text
generated_text = "Computer networking is the process of connecting computers and other devices together to share resources and exchange data. It involves the use of hardware and software technologies to create a network infrastructure that allows devices to communicate and exchange information. This can include local area networks (LANs), wide area networks(WANs) and the internet."
# Evaluate grammatical accuracy
num_errors, error_percentage = evaluate_grammar_accuracy(generated_text)

print(f"Number of Errors: {num_errors}")
print(f"Error Percentage: {error_percentage:.2f}%")


In [None]:
import matplotlib.pyplot as plt

# Example data
x = [1, 2, 3, 4, 5,6,7,8,9,10,11,12]
y1 = [77.356112, 74.44374561, 66.12750888,79.72038388,81.69698715,10.31930596,77.64601111,
      84.69850421,67.61313677,69.6780324,70.43831348,76.43733025]  # First line
y2 = [78.37123871,85.71727276,60.00449657,76.70654655,82.02352524,33.73324275,64.6933198,
      75.27422905,75.26009083,55.56576252,69.03122663,77.19646692]  # Second line

# Create a plot
plt.plot(x, y1, label='Line 1', marker='o')
plt.plot(x, y2, label='Line 2', marker='s')

# Adding title and labels
plt.title('Double Line Graph')
plt.xlabel('X-axis')
plt.ylabel('Y-axis')

# Adding a legend
plt.legend()

# Show the plot
plt.grid(True)
plt.show()


In [None]:
import matplotlib.pyplot as plt

# Example data
x = [1, 2, 3, 4, 5,6,7,8,9,10,11,12]
y1 = [60.51571369,54.1473031,55.15674353,59.38210487,61.00667715,19.71448809,81.62000179,
      68.90394092,45.2965498,62.93884516,76.7865181,78.50136161]  # First line
y2 = [60.0430727,52.42186785,46.0766077,59.44015384,78.53261828,31.54866993,73.2598424,
      63.62524033,45.42482793,47.96879292,74.63451028,70.87694407]  # Second line

# Create a plot
plt.plot(x, y1, label='Line 1', marker='o')
plt.plot(x, y2, label='Line 2', marker='s')

# Adding title and labels
plt.title('Double Line Graph')
plt.xlabel('X-axis')
plt.ylabel('Y-axis')

# Adding a legend
plt.legend()

# Show the plot
plt.grid(True)
plt.show()



In [None]:
import matplotlib.pyplot as plt

# Example data
x = [1, 2, 3, 4, 5,6,7,8,9,10,11,12]
y1 = [17.27,0,9.02,4.55,3.77,0,7.05,4.92,4.26,7.45,5.71,11.46]  # First line
y2 = [0,5.88,6.67,2.22,1.85,0,3.45,5.56,0,4.23,0,0]  # Second line

# Create a plot
plt.plot(x, y1, label='Line 1', marker='o')
plt.plot(x, y2, label='Line 2', marker='s')

# Adding title and labels
plt.title('Double Line Graph')
plt.xlabel('X-axis')
plt.ylabel('Y-axis')

# Adding a legend
plt.legend()

# Show the plot
plt.grid(True)
plt.show()



In [None]:
from sentence_transformers import util
from nltk.tokenize import sent_tokenize
def extract_relevant_sentences(query, paragraphs, similarity_threshold=0.5):
    # Encode the query
    query_embedding = model.encode(query, convert_to_tensor=True)
    relevant_sentences = []
    
    # Split paragraph into sentences
    sentences = sent_tokenize(paragraphs)
    sentence_embeddings = model.encode(sentences, convert_to_tensor=True)

    # Compute cosine similarity between the query and each sentence
    similarities = util.pytorch_cos_sim(query_embedding, sentence_embeddings)

    # Filter sentences based on the similarity threshold
    for idx, similarity_score in enumerate(similarities[0]):
        if similarity_score.item() >= similarity_threshold:
            relevant_sentences.append(sentences[idx])

    # Combine the relevant sentences back into a single text block
    return ' '.join(relevant_sentences)

# *Streamlit*

In [4]:
%%writefile app.py
import streamlit as st
import os
import nltk
from sentence_transformers import SentenceTransformer
import faiss
import PyPDF2
import numpy as np
import re
from transformers import AutoTokenizer, AutoModelForCausalLM
from language_tool_python import LanguageTool

st.set_page_config(page_title="Teachmate Assistant", layout="wide")

# Debugging logs
def log_debug(message):
    print(f"DEBUG: {message}")
    st.write(f"DEBUG: {message}")

# Load models using Kaggle's GPU environment
@st.cache_resource
def load_transformer_models():
    log_debug("Loading Transformer models...")
    tokenizer = AutoTokenizer.from_pretrained("/kaggle/input/llama-2/pytorch/7b-chat-hf/1", trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        "/kaggle/input/llama-2/pytorch/7b-chat-hf/1",
        trust_remote_code=True,
        device_map="auto",
        offload_folder="offload"
    )
    log_debug("Transformer models loaded.")
    return tokenizer, model

@st.cache_resource
def load_sentence_transformer():
    log_debug("Loading SentenceTransformer...")
    model = SentenceTransformer('all-MiniLM-L6-v2')
    log_debug("SentenceTransformer loaded.")
    return model

# Utility functions
def extract_text_with_metadata(pdf_paths):
    log_debug("Extracting text with metadata from PDFs...")
    paragraphs = []
    metadata = []

    for name, pdf_path in pdf_paths:
        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            for page_num, page in enumerate(reader.pages):
                page_text = page.extract_text()
                if page_text:
                    lines = page_text.split('\n')
                    page_paragraphs = '\n'.join(lines) + '\n\n'
                    split_paragraphs = [p.strip() for p in page_paragraphs.split('\n\n') if p.strip()]
                    paragraphs.extend(split_paragraphs)
                    metadata.extend([(name, page_num + 1)] * len(split_paragraphs))
    log_debug(f"Extracted {len(paragraphs)} paragraphs.")
    return paragraphs, metadata

def query_faiss_index(query, top_k, model, paragraphs):
    log_debug("Querying FAISS index...")
    embeddings = model.encode(paragraphs, show_progress_bar=False)
    embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatIP(dimension)
    index.add(embeddings)
    query_embedding = model.encode([query])
    distances, indices = index.search(query_embedding, top_k)
    log_debug(f"FAISS query returned {len(indices[0])} results.")
    return [paragraphs[i] for i in indices[0]]

def clean_context(context):
    log_debug("Cleaning context...")
    cleaned_text = context.replace('\n', ' ')
    cleaned_text = re.sub(r'DS \d+\.\d+ downloaded by .*?@.*?\d+\.\d+', '', cleaned_text)
    return re.sub(r'\s+', ' ', cleaned_text).strip()

def generate_insight_simple(query, context, tokenizer, llm_model):
    log_debug("Generating insight...")
    input_text = (
        f"INSTRUCTIONS:\n"
        f"Answer the QUESTION using the DOCUMENT text above. If DOCUMENT does not contain facts, return NONE.\n"
        f"DOCUMENT:\n{context}\n"
        f"QUESTION:\n{query}\n"
        f"ANSWER:\n"
    )
    inputs = tokenizer.encode_plus(input_text, return_tensors='pt', max_length=4000, truncation=True).to('cuda')
    outputs = llm_model.generate(inputs['input_ids'], max_new_tokens=500, no_repeat_ngram_size=3, temperature=0.3)
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    answer_start = generated_text.find("ANSWER:") + len("ANSWER:")
    log_debug("Insight generated successfully.")
    return generated_text[answer_start:].strip()

# Load models
log_debug("Initializing models...")
sentence_model = load_sentence_transformer()
tokenizer, llm_model = load_transformer_models()
log_debug("Models initialized.")

# Streamlit UI
#st.set_page_config(page_title="Teachmate Assistant", layout="wide")
st.sidebar.title("Teachmate Assistant")
st.sidebar.markdown("An intelligent assistant to query and generate insights from uploaded documents.")

page = st.sidebar.radio("Navigate", ["Upload PDF", "Query & Generate Insights"])

# Upload PDFs
if page == "Upload PDF":
    st.title("Upload PDF")
    uploaded_files = st.file_uploader("Upload your PDF files", type="pdf", accept_multiple_files=True)
    if uploaded_files:
        log_debug("PDF files uploaded.")
        pdf_paths = [(file.name, file) for file in uploaded_files]
        paragraphs, metadata = extract_text_with_metadata(pdf_paths)
        st.session_state["paragraphs"] = paragraphs
        st.success(f"{len(paragraphs)} paragraphs extracted and ready for querying!")

# Query and Insights
if page == "Query & Generate Insights":
    st.title("Query & Generate Insights")
    if "paragraphs" not in st.session_state:
        st.warning("Please upload a PDF file first!")
    else:
        query = st.text_input("Enter your query:")
        top_k = st.slider("Number of top results", 1, 10, 3)

        if st.button("Generate Insight"):
            log_debug("Generating insight...")
            paragraphs = st.session_state["paragraphs"]
            results = query_faiss_index(query, top_k, sentence_model, paragraphs)
            context = clean_context(" ".join(results))
            st.write("### Top Results")
            for i, result in enumerate(results):
                st.write(f"**Result {i + 1}:** {result}")

            if context:
                st.write("### Generating Insight...")
                insight = generate_insight_simple(query, context, tokenizer, llm_model)
                st.write(f"**Insight:** {insight}")


Writing app.py


In [None]:
%%writefile app.py
import streamlit as st
import os
import nltk
from sentence_transformers import SentenceTransformer
import faiss
import PyPDF2
import numpy as np
import re
from transformers import AutoTokenizer, AutoModelForCausalLM
from language_tool_python import LanguageTool

# Load models using Kaggle's GPU environment
@st.cache_resource
def load_transformer_models():
    tokenizer = AutoTokenizer.from_pretrained("/kaggle/input/llama-2/pytorch/7b-chat-hf/1", trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        "/kaggle/input/llama-2/pytorch/7b-chat-hf/1",
        trust_remote_code=True,
        device_map="auto",
        offload_folder="offload"
    )
    return tokenizer, model

@st.cache_resource
def load_sentence_transformer():
    return SentenceTransformer('all-MiniLM-L6-v2')

# Utility functions
def extract_text_with_metadata(pdf_paths):
    paragraphs = []
    metadata = []

    for name, pdf_path in pdf_paths:
        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            for page_num, page in enumerate(reader.pages):
                page_text = page.extract_text()
                if page_text:
                    lines = page_text.split('\n')
                    page_paragraphs = '\n'.join(lines) + '\n\n'
                    split_paragraphs = [p.strip() for p in page_paragraphs.split('\n\n') if p.strip()]
                    paragraphs.extend(split_paragraphs)
                    metadata.extend([(name, page_num + 1)] * len(split_paragraphs))
    return paragraphs, metadata

def query_faiss_index(query, top_k, model, paragraphs):
    embeddings = model.encode(paragraphs, show_progress_bar=False)
    embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatIP(dimension)
    index.add(embeddings)
    query_embedding = model.encode([query])
    distances, indices = index.search(query_embedding, top_k)
    return [paragraphs[i] for i in indices[0]]

def clean_context(context):
    cleaned_text = context.replace('\n', ' ')
    cleaned_text = re.sub(r'DS \d+\.\d+ downloaded by .*?@.*?\d+\.\d+', '', cleaned_text)
    return re.sub(r'\s+', ' ', cleaned_text).strip()

def generate_insight_simple(query, context, tokenizer, llm_model):
    input_text = (
        f"INSTRUCTIONS:\n"
        f"Answer the QUESTION using the DOCUMENT text above. If DOCUMENT does not contain facts, return NONE.\n"
        f"DOCUMENT:\n{context}\n"
        f"QUESTION:\n{query}\n"
        f"ANSWER:\n"
    )
    inputs = tokenizer.encode_plus(input_text, return_tensors='pt', max_length=4000, truncation=True).to('cuda')
    outputs = llm_model.generate(inputs['input_ids'], max_new_tokens=500, no_repeat_ngram_size=3, temperature=0.3)
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    answer_start = generated_text.find("ANSWER:") + len("ANSWER:")
    return generated_text[answer_start:].strip()

# Load models
sentence_model = load_sentence_transformer()
tokenizer, llm_model = load_transformer_models()

# Streamlit UI
st.set_page_config(page_title="Teachmate Assistant", layout="wide")
st.sidebar.title("Teachmate Assistant")
st.sidebar.markdown("An intelligent assistant to query and generate insights from uploaded documents.")

page = st.sidebar.radio("Navigate", ["Upload PDF", "Query & Generate Insights"])

# Upload PDFs
if page == "Upload PDF":
    st.title("Upload PDF")
    uploaded_files = st.file_uploader("Upload your PDF files", type="pdf", accept_multiple_files=True)
    if uploaded_files:
        pdf_paths = [(file.name, file) for file in uploaded_files]
        paragraphs, metadata = extract_text_with_metadata(pdf_paths)
        st.session_state["paragraphs"] = paragraphs
        st.success(f"{len(paragraphs)} paragraphs extracted and ready for querying!")

# Query and Insights
if page == "Query & Generate Insights":
    st.title("Query & Generate Insights")
    if "paragraphs" not in st.session_state:
        st.warning("Please upload a PDF file first!")
    else:
        query = st.text_input("Enter your query:")
        top_k = st.slider("Number of top results", 1, 10, 3)

        if st.button("Generate Insight"):
            paragraphs = st.session_state["paragraphs"]
            results = query_faiss_index(query, top_k, sentence_model, paragraphs)
            context = clean_context(" ".join(results))
            st.write("### Top Results")
            for i, result in enumerate(results):
                st.write(f"**Result {i + 1}:** {result}")

            if context:
                st.write("### Generating Insight...")
                insight = generate_insight_simple(query, context, tokenizer, llm_model)
                st.write(f"**Insight:** {insight}")


In [6]:
!pip install streamlit

Collecting streamlit
  Downloading streamlit-1.41.1-py2.py3-none-any.whl.metadata (8.5 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.41.1-py2.py3-none-any.whl (9.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m59.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m95.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hDownloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl (79 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m6.2 MB/s

In [None]:
!streamlit run app.py --server.port 8501 --server.headless true --browser.serverAddress=0.0.0.0 --browser.gatherUsageStats=false

In [5]:
!pip install pyngrok


Collecting pyngrok
  Downloading pyngrok-7.2.2-py3-none-any.whl.metadata (8.4 kB)
Downloading pyngrok-7.2.2-py3-none-any.whl (22 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.2.2


In [None]:
from pyngrok import ngrok

# Expose the Streamlit app on port 8501
public_url = ngrok.connect(8501)
print(f"Access your Streamlit app at: {public_url}")


In [7]:
!ngrok authtoken 2qYhwo7TvkByosPhNPAi863W1rk_7ETwHNM8ypsWkFDKZGiWP

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml                                


In [8]:
from pyngrok import ngrok
import subprocess

# Start the Streamlit server as a background process
subprocess.Popen(["streamlit", "run", "app.py", "--server.port", "8501", "--server.headless", "true"])

# Create an ngrok tunnel to the Streamlit port
public_url = ngrok.connect(8501)
print(f"Access your Streamlit app at: {public_url}")


Access your Streamlit app at: NgrokTunnel: "https://46c2-35-227-82-169.ngrok-free.app" -> "http://localhost:8501"

Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.


  You can now view your Streamlit app in your browser.

  Local URL: http://localhost:8501
  Network URL: http://172.19.2.2:8501
  External URL: http://35.227.82.169:8501





DEBUG: Initializing models...
DEBUG: Loading SentenceTransformer...
DEBUG: SentenceTransformer loaded.
DEBUG: Loading Transformer models...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
!streamlit run app.py --server.port 8501 --server.headless true

In [None]:
!ps -ef | grep streamlit
