In [None]:
!pip -q install -U pip
!pip -q install -U transformers accelerate langchain langchain-community sentence-transformers faiss-cpu chromadb pypdf python-docx python-pptx
# !pip -q install requests==2.32.4 --force-reinstall  # Uncomment on Colab if needed

In [None]:
import os, gc, torch, warnings
# Silence all Python warnings for the rest of the session (this also hides
# deprecation notices emitted by the libraries used below).
warnings.filterwarnings("ignore")

# Set the CUDA caching-allocator option `expandable_segments` via the environment.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Report the runtime: framework version, CUDA availability and, if present,
# the name and total memory of GPU 0.
cuda_ready = torch.cuda.is_available()
print("PyTorch:", torch.__version__)
print("CUDA available:", cuda_ready)
if cuda_ready:
    device_props = torch.cuda.get_device_properties(0)
    print("GPU:", torch.cuda.get_device_name(0))
    print("Total VRAM (GB):", round(device_props.total_memory / 1e9, 2))

PyTorch: 2.8.0+cu126
CUDA available: True
GPU: Tesla T4
Total VRAM (GB): 15.83


In [None]:
import os
from getpass import getpass

from huggingface_hub import login

# Prefer a token already exported as HF_TOKEN; otherwise prompt for one.
# getpass() is used instead of input() so the secret is not echoed into the
# cell output (and therefore not saved with the notebook).
HF_TOKEN = os.environ.get("HF_TOKEN", "").strip()
token_source = "env"
if not HF_TOKEN:
    HF_TOKEN = getpass("Enter your Hugging Face token:").strip()
    token_source = "prompt"

if HF_TOKEN:
    try:
        login(HF_TOKEN)
        # Report where the token actually came from.
        print(f"HF login successful (token from {token_source}).")
    except Exception as e:
        # Best effort: a failed login is reported and the notebook continues.
        print("HF login skipped:", e)
else:
    print("HF login skipped (no token provided).")

HF login successful (token from env).


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

# Hugging Face Hub repo id of the checkpoint to load; used as the default
# argument of load_mistral_safely below.
# NOTE(review): this looks like the base (non-instruct) checkpoint — confirm
# that is intended for the question-answering prompt used later.
MODEL_ID = "mistralai/Mistral-7B-v0.1"

def load_mistral_safely(model_id: str = MODEL_ID):
    """Load the tokenizer and causal LM, falling back to cheaper setups on failure.

    Attempts, in order:
      1. 4-bit NF4 quantized load via bitsandbytes (needs CUDA and an importable
         ``bitsandbytes``).
      2. FP16 load on the GPU (needs CUDA).
      3. FP32 load on the CPU.

    Args:
        model_id: Model identifier passed to ``from_pretrained``.

    Returns:
        ``(tokenizer, model)`` from the first attempt that succeeds.

    Raises:
        Exception: errors from loading the tokenizer or from the final CPU
            load are not caught here and propagate to the caller.
    """
    import gc  # local import keeps this function self-contained

    def _release_gpu_memory():
        # A failed load attempt can leave allocations behind; clear them so the
        # next attempt starts with as much free GPU memory as possible.
        gc.collect()
        torch.cuda.empty_cache()

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    cuda_ok = torch.cuda.is_available()

    bnb_ok = False
    if cuda_ok:
        try:
            import bitsandbytes as bnb  # noqa: F401
            bnb_ok = True
        except Exception as e:
            # Say why the 4-bit path is skipped instead of silently moving on.
            print("ℹ️ bitsandbytes unavailable, skipping 4-bit load:", e)

    if cuda_ok and bnb_ok:
        try:
            print("→ Trying 4-bit quantized load (bitsandbytes)…")
            # Native bfloat16 needs compute capability >= 8.0; older GPUs
            # (e.g. the Tesla T4 at 7.5) should compute in float16 instead.
            major, _minor = torch.cuda.get_device_capability()
            compute_dtype = torch.bfloat16 if major >= 8 else torch.float16
            bnb_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_use_double_quant=True,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_compute_dtype=compute_dtype,
            )
            # NOTE: recent transformers releases warn that `torch_dtype` is
            # deprecated in favour of `dtype`; kept for older-version compatibility.
            model = AutoModelForCausalLM.from_pretrained(
                model_id,
                quantization_config=bnb_config,
                device_map="auto",
                attn_implementation="eager",
                torch_dtype=compute_dtype,
            )
            print("✅ Loaded in 4-bit.")
            return tokenizer, model
        except Exception as e:
            print("⚠️ 4-bit load failed:", e)
        # Only reached when the 4-bit attempt failed (success returns above).
        _release_gpu_memory()

    if cuda_ok:
        try:
            print("→ Trying FP16 GPU load…")
            model = AutoModelForCausalLM.from_pretrained(
                model_id,
                device_map="auto",
                attn_implementation="eager",
                torch_dtype=torch.float16,
            )
            print("✅ Loaded in FP16 on GPU.")
            return tokenizer, model
        except Exception as e:
            print("⚠️ FP16 GPU load failed:", e)
        # Only reached when the FP16 attempt failed.
        _release_gpu_memory()

    # Last resort: full-precision weights on the CPU. Errors here propagate.
    print("→ Falling back to CPU (will be slow).")
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map={"": "cpu"},
        attn_implementation="eager",
        torch_dtype=torch.float32,
    )
    print("✅ Loaded on CPU.")
    return tokenizer, model

# Load once at notebook level; later cells read the `tokenizer` and `model` globals.
tokenizer, model = load_mistral_safely(MODEL_ID)

tokenizer_config.json:   0%|          | 0.00/996 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

→ Trying FP16 GPU load…


config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]



✅ Loaded in FP16 on GPU.


In [None]:
# `pipeline` is not imported by any earlier cell, so import it here; without
# this the cell raises NameError on a fresh kernel (Restart & Run All).
from transformers import pipeline

# Text-generation pipeline wrapping the already-loaded `model` and `tokenizer`.
# The keyword arguments below are the pipeline's default generation settings;
# individual calls may override them.
gen_pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=1024,
    temperature=0.7,          # increase creativity
    do_sample=True,           # allow sampling instead of greedy
    top_p=0.9,                # nucleus sampling
    repetition_penalty=1.05,
    eos_token_id=None,        # intent: don't stop at EOS early
    pad_token_id=tokenizer.eos_token_id  # reuse the EOS id as the padding id
)


print("Generation pipeline ready.")


Device set to use cuda:0


Generation pipeline ready.


In [None]:
from langchain.docstore.document import Document
from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader, TextLoader
from pptx import Presentation
import pathlib
def _iter_shape_text(shapes):
    """Yield text found in `shapes`, descending into grouped shapes and tables."""
    for shape in shapes:
        if hasattr(shape, "shapes"):
            # Group shape: its members carry the text, so recurse into them.
            yield from _iter_shape_text(shape.shapes)
        elif getattr(shape, "has_table", False):
            # Table: emit one line per row, joining non-empty cell texts.
            for row in shape.table.rows:
                cells = [cell.text.strip() for cell in row.cells]
                row_text = " | ".join(c for c in cells if c)
                if row_text:
                    yield row_text
        elif hasattr(shape, "text") and shape.text:
            yield shape.text


def load_pptx(path: str):
    """Read a .pptx file into one Document per slide that contains text.

    Args:
        path: Path of the presentation, passed to ``Presentation``.

    Returns:
        A list of ``Document`` objects. Each holds the newline-joined text of
        one slide, with metadata ``{"source": path, "slide": <1-based index>}``.
        Slides whose text is empty after stripping are omitted.
    """
    # Local import so the function does not rely on notebook-level import order.
    from langchain_core.documents import Document

    texts = []
    prs = Presentation(path)
    for idx, slide in enumerate(prs.slides, start=1):
        slide_text = "\n".join(_iter_shape_text(slide.shapes)).strip()
        if slide_text:
            texts.append(Document(page_content=slide_text, metadata={"source": path, "slide": idx}))
    return texts

def load_documents(paths):
    """Load every supported file in `paths` into a flat list of documents.

    The loader is chosen by the lower-cased file extension: ``.pdf``,
    ``.docx``, ``.txt``/``.md`` and ``.pptx`` are handled; anything else is
    skipped with a printed warning. A failure while loading one file is
    printed and does not stop the remaining files from being loaded.

    Args:
        paths: An iterable of file paths, or a single path (``str`` or
            ``pathlib`` path), which is treated as a one-element list.

    Returns:
        A list with the documents produced by all successfully loaded files
        (empty when nothing could be loaded).
    """
    # A lone string would otherwise be iterated character by character and
    # every character reported as an unsupported file.
    if isinstance(paths, (str, pathlib.PurePath)):
        paths = [paths]

    docs = []
    for p in paths:
        ext = pathlib.Path(p).suffix.lower()
        try:
            if ext == ".pdf":
                docs.extend(PyPDFLoader(p).load())
            elif ext == ".docx":
                docs.extend(Docx2txtLoader(p).load())
            elif ext in (".txt", ".md"):
                docs.extend(TextLoader(p, encoding="utf-8", autodetect_encoding=True).load())
            elif ext == ".pptx":
                docs.extend(load_pptx(p))
            else:
                print(f"⚠️ Skipping unsupported extension: {p}")
        except Exception as e:
            # Deliberate best effort: report the failure and keep going.
            print(f"⚠️ Failed to load {p}: {e}")
    return docs

print("Doc loader ready. Supported: PDF, DOCX, PPTX, TXT, MD")

Doc loader ready. Supported: PDF, DOCX, PPTX, TXT, MD


In [None]:
# Optional (Colab): upload files interactively instead of hard-coding paths below.
# from google.colab import files
# uploaded = files.upload()
# file_paths = [f"/content/{name}" for name in uploaded.keys()]
# Paths of the documents to load; each is dispatched by extension in load_documents.
file_paths = ["/content/project report.txt"]  # e.g., ["/content/sample.pdf", "/content/report.docx"]
docs = load_documents(file_paths)
print(f"Loaded documents: {len(docs)}")

Loaded documents: 1


In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings

# Split the loaded documents into chunks (size 800, overlap 200) for retrieval.
splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=200)
splits = splitter.split_documents(docs)
print("Total chunks:", len(splits))

# Embedding model used to index the chunks.
emb = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Both handles stay None unless an index is actually built below.
vectorstore = retriever = None

if not splits:
    print("⚠️ No splits created. Did you load any docs?")
else:
    # Build the FAISS index and expose it as a retriever returning 6 chunks per query.
    vectorstore = FAISS.from_documents(splits, embedding=emb)
    retriever = vectorstore.as_retriever(search_kwargs={"k": 6})
    print("Retriever ready.")

Total chunks: 12
Retriever ready.


In [None]:
from langchain.prompts import PromptTemplate

# Prompt used for retrieval-augmented answering. `{context}` and `{question}`
# are the two placeholders filled in by `prompt.format(...)` inside run_rag.
template = """Use the following context to answer the question in detail.
If the answer is long, explain step by step.

Context:
{context}

Question: {question}

Answer (detailed and complete, do not stop early):"""

# Wrap the raw string so it can be formatted with named variables.
prompt = PromptTemplate.from_template(template)

def format_docs(docs, max_chars: int = 2000):
    """Join document texts into one context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.
        max_chars: Maximum number of characters taken from each document
            (defaults to 2000, the previously hard-coded limit).

    Returns:
        The (truncated) texts separated by blank lines; ``""`` for no documents.
    """
    return "\n\n".join(d.page_content[:max_chars] for d in docs)

def run_rag(query: str, max_new_tokens: int = 220):
    """Answer `query` from the indexed documents (retrieve, then generate).

    Retrieves chunks with the notebook-level `retriever`, formats them into the
    prompt template, and generates greedily (``do_sample=False``) with `gen_pipe`.

    Args:
        query: The user's question.
        max_new_tokens: Cap on generated tokens (defaults to 220, the
            previously hard-coded value). Raise it if answers are cut off.

    Returns:
        The generated answer with the prompt removed, or a human-readable
        message when no retriever exists, the query is empty, the GPU runs
        out of memory, or any other error occurs.
    """
    # `retriever` is only read here, so no `global` declaration is needed.
    if retriever is None:
        return "No documents available."
    if not query or not query.strip():
        return "Please provide a non-empty question."
    try:
        # `invoke` replaces the deprecated `get_relevant_documents`.
        ctx_docs = retriever.invoke(query)
        ctx = format_docs(ctx_docs)
        full_prompt = prompt.format(context=ctx, question=query)
        out = gen_pipe(full_prompt, max_new_tokens=max_new_tokens, do_sample=False)[0]["generated_text"]
        # The pipeline output starts with the prompt; keep only the continuation.
        return out[len(full_prompt):].strip()
    except torch.cuda.OutOfMemoryError:
        torch.cuda.empty_cache()
        return "⚠️ GPU ran out of memory during generation. Try a shorter question or fewer retrieved chunks (reduce k)."
    except Exception as e:
        # Deliberate best effort: surface the error as text instead of raising.
        return f"Error during RAG: {e}"

print("RAG function ready. Call run_rag('your question').")

RAG function ready. Call run_rag('your question').


In [None]:
# Smoke test of the full pipeline. run_rag caps generation at 220 new tokens by
# default, so a long answer can end mid-sentence.
print(run_rag("What is this document about?"))

This document describes a movie recommendation system built using Python and the Scikit-learn library. The system uses a combination of TF-IDF vectorization and Truncated SVD to create a feature string representing each movie. The feature string is then used to calculate cosine similarity between the input movie and other movies in the dataset. The top 10 most similar movies are recommended to the user.

The system has been optimized to reduce RAM consumption and improve runtime efficiency. Changes include avoiding dense matrix operations, using TfidfVectorizer + TruncatedSVD instead of CountVectorizer + full cosine matrix, cleaning and parsing only essential data columns, and applying strict feature weighting to keep vector lengths meaningful and sparse.

The system has several limitations and areas for improvement. Collaborative filtering could be included to consider user preferences and ratings. The Credits Dataset could be used to include descriptive keywords or tags that represen

In [None]:
import torch
# Guard the lookup: get_device_name(0) raises when no CUDA device is present.
print(torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No CUDA device available")


Tesla T4
