# V2 inference

In [None]:
# =========================
# ✅ SETUP & DEPENDENCIES
# =========================
!pip install -q peft transformers accelerate bitsandbytes pyngrok

# =========================
# ✅ IMPORTS
# =========================
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
from huggingface_hub import login
import zipfile
import os
import gc
# =========================
# ✅ AUTHENTICATION
# =========================
# Access tokens must not be written into the notebook: anything stored here is
# readable by everyone who can open the file. The token is read from the
# HF_TOKEN environment variable; when it is unset or empty, login() receives
# None and asks for the token interactively instead.
# NOTE: any token that was ever saved in this notebook should be revoked.
login(token=os.environ.get("HF_TOKEN") or None)


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m121.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m93.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m52.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# =========================
# ✅ FILE EXTRACTION
# =========================
# Source archive (on the mounted Drive path) and the directory it is unpacked into.
zip_path = "/content/drive/MyDrive/llama3_policy_finetune_v2.5.zip"
extract_path = "/content"

# The context manager closes the archive once every member has been extracted.
with zipfile.ZipFile(zip_path, mode='r') as zip_ref:
    zip_ref.extractall(path=extract_path)

print("✅ Extraction complete!")


✅ Extraction complete!


In [None]:

# =========================
# ✅ PATHS & CONFIGS
# =========================
# Hub id of the base checkpoint, and the local directory passed to
# PeftModel.from_pretrained() when the fine-tuned weights are attached.
base_model_name = "meta-llama/Llama-3.2-3B"
fine_tuned_dir = "/content/llama3_policy_finetune_v2.5"

# 4-bit NF4 quantization with double quantization; compute dtype is float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

# =========================
# ✅ LOAD MODEL & TOKENIZER
# =========================
def load_model_and_tokenizer():
    """Load the tokenizer and the quantized base model with the fine-tuned weights attached.

    Returns:
        tuple: ``(model, tokenizer)``. ``model`` is the ``PeftModel`` built from
        ``base_model_name`` plus the weights in ``fine_tuned_dir``, switched to
        eval mode; ``tokenizer`` is loaded from ``base_model_name``.
    """
    tokenizer = AutoTokenizer.from_pretrained(base_model_name)

    # Base weights are quantized with the module-level bnb_config and placed by
    # device_map="auto".
    quantized_base = AutoModelForCausalLM.from_pretrained(
        base_model_name,
        device_map="auto",
        quantization_config=bnb_config,
    )

    peft_model = PeftModel.from_pretrained(quantized_base, fine_tuned_dir)
    peft_model.eval()
    return peft_model, tokenizer

# =========================
# ✅ INFERENCE FUNCTION
# =========================
def _get_model_and_tokenizer():
    """Return the ``(model, tokenizer)`` pair, loading it only on the first call."""
    cached = getattr(_get_model_and_tokenizer, "_cached", None)
    if cached is None:
        cached = load_model_and_tokenizer()
        _get_model_and_tokenizer._cached = cached
    return cached


def generate_response(prompt, max_new_tokens=80):
    """Sample a continuation of ``prompt`` from the fine-tuned model.

    The model and tokenizer are loaded once and reused; previously every call
    reloaded the whole quantized model.

    Args:
        prompt: Text fed to the model as-is.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        str: The decoded continuation only (prompt tokens removed, special
        tokens skipped), stripped of surrounding whitespace.
    """
    model, tokenizer = _get_model_and_tokenizer()

    encoded = tokenizer(prompt, return_tensors="pt").to(model.device)
    input_ids = encoded["input_ids"]

    # torch.manual_seed(97)
    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            # Passed explicitly: pad_token_id equals eos_token_id below, so the
            # mask cannot be inferred reliably from the ids.
            attention_mask=encoded["attention_mask"],
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.4,
            top_k=20,
            top_p=0.9,
            repetition_penalty=1.2,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.eos_token_id
        )

    # Slice off the prompt so only newly generated tokens are decoded.
    response = tokenizer.decode(outputs[0][input_ids.shape[1]:], skip_special_tokens=True)
    return response.strip()


In [None]:
# =========================
# ✅ EXAMPLE RUN
# =========================
gc.collect()
torch.cuda.empty_cache()
prompt = "You are a NACCAS policy expert. Answer using only official NACCAS documentation. can a student with ged get admission in NACCAS accredited institute?"
response = generate_response(prompt)
# Keep the text up to the third "."; add a closing period only when one is
# missing, so a reply that already ends in "." is not turned into ".." and an
# empty reply stays empty instead of becoming ".".
sentences = response.strip().split(".")
short_response = ".".join(sentences[:3]).strip()
if short_response and not short_response.endswith("."):
    short_response += "."
print("🧠 Model response:\n", short_response)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

🧠 Model response:
 A copy of the student?s high school transcript or certificate must be maintained
on file at the institution and a copy is to be sent to the student?s home address.
33. If an institution becomes subject to one or more Show/Cause Orders, the school shall
send a copy to all enrolled students.


# V2.5 inference (not useful right now)

In [None]:
# # =========================
# # ✅ SETUP & DEPENDENCIES
# # =========================
# !pip install -q peft transformers accelerate bitsandbytes streamlit pyngrok

# # =========================
# # ✅ IMPORTS
# # =========================
# import torch
# from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
# from peft import PeftModel
# from pyngrok import ngrok
# import zipfile
# import os
# from huggingface_hub import login
# # %%writefile app.py
# # import streamlit as st
# # !ngrok config add-authtoken

# # =========================
# # ✅ AUTHENTICATION
# # =========================
# # Login to Hugging Face; read the token from the environment instead of hard-coding it.
# login(token=os.environ.get("HF_TOKEN"))  # literal token removed: never commit credentials

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m75.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m63.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m112.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m90.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m50.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# # =========================
# # ✅ FILE EXTRACTION
# # =========================
# zip_path = "/content/drive/MyDrive/llama3_policy_finetune_v2.5.zip"
# extract_path = "/content"

# # Extract the zip
# with zipfile.ZipFile(zip_path, 'r') as zip_ref:
#     zip_ref.extractall(extract_path)

# print("✅ Extraction complete!")

# # =========================
# # ✅ PATHS & CONFIGS
# # =========================
# base_model_name = "meta-llama/Llama-3.2-3B"
# fine_tuned_dir = "/content/llama3_policy_finetune_v2.5"

# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_compute_dtype=torch.float16,
#     bnb_4bit_use_double_quant=True,
#     bnb_4bit_quant_type="nf4"
# )

✅ Extraction complete!


In [None]:
# # =========================
# # ✅ INFERENCE FUNCTION
# # =========================

# def generate_response(user_prompt, system_prompt, max_new_tokens=150):
#     # Load model and tokenizer
#     model, tokenizer = load_model_and_tokenizer()

#     # Create properly formatted Llama 3 prompt
#     prompt = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
#     Cutting Knowledge Date: December 2023
#     Today Date: 23 July 2024
#     {system_prompt}<|eot_id|><|start_header_id|>user<|end_header_id|>
#     {user_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>"""

#     # Tokenize and move to model's device
#     input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)

#     # Generate response
#     with torch.no_grad():
#         outputs = model.generate(
#             input_ids=input_ids,
#             max_new_tokens=max_new_tokens,
#             do_sample=True,
#             temperature=0.3,
#             top_k=20,
#             top_p=0.9,
#             repetition_penalty=1.2,
#             eos_token_id=tokenizer.eos_token_id,
#             pad_token_id=tokenizer.eos_token_id
#         )

#     # Decode only the assistant's response (skip the input prompt)
#     response = tokenizer.decode(outputs[0][input_ids.shape[1]:], skip_special_tokens=True)

#     return response.strip()

In [None]:

# # # =========================
# # # ✅ TEST PROMPT
# # # =========================
# # prompt = "What must a NACCAS-accredited institute include in its school catalog regarding its Ability to Benefit policy?"
# system_prompt = "You are a NACCAS policy expert. Answer only using official documents."
# user_prompt = "What must NACCAS accredited institute include in its school catalog regarding its ability to benifit policy?."

# response = generate_response(user_prompt,system_prompt)
# print("🧠 Model response:\n", response)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



{'default': LoraConfig(task_type='CAUSAL_LM', peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path='meta-llama/Llama-3.2-3B', revision=None, inference_mode=True, r=32, target_modules={'up_proj', 'gate_proj', 'down_proj', 'v_proj', 'k_proj', 'o_proj', 'q_proj'}, exclude_modules=None, lora_alpha=16, lora_dropout=0.05, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', trainable_token_indices=None, loftq_config={}, eva_config=None, corda_config=None, use_dora=False, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False), lora_bias=False)}
🧠 Model response:
 A copy of the current accreditation agreement between the institution and NACCCS is included in the catalog.圭圭ニニニニニニニニニニニニニニニニニニニニニニニニニニニニニニニニニニニニニニニニニニニニニニニニニニニニニニニニニニニニニニニニニニニニニニニニニニニニニニニニニニニニニニニニニニ

# Inference v2 + Streamlit and ngrok, both on Colab

In [None]:
# =========================
# ✅ FILE EXTRACTION
# =========================
import zipfile
# Source archive (on the mounted Drive path) and the directory it is unpacked into.
zip_path = "/content/drive/MyDrive/llama3_policy_finetune_v2.5.zip"
extract_path = "/content"

# Extract the zip; the context manager closes the archive afterwards.
with zipfile.ZipFile(zip_path, mode='r') as zip_ref:
    zip_ref.extractall(path=extract_path)

print("✅ Extraction complete!")

✅ Extraction complete!


In [None]:
# =========================
# ✅ SETUP & DEPENDENCIES
# =========================
!pip install -q peft transformers accelerate bitsandbytes streamlit pyngrok

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m94.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m71.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m66.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m36.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m47.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# =========================
# ✅ CREATE STREAMLIT APP FILE (in separate cell)
# =========================
%%writefile app.py
import streamlit as st
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
import gc

@st.cache_resource
def load_model_and_tokenizer():
    """Load the tokenizer and the merged fine-tuned model for the app.

    Wrapped in ``st.cache_resource`` so the returned objects are cached by
    Streamlit rather than rebuilt by every call.

    Returns:
        tuple: ``(model, tokenizer)``. The tokenizer is read from the fine-tuned
        directory with its pad token set to the EOS token; the model is the
        4-bit base model with the PEFT weights merged in, in eval mode.
    """
    base_model_name = "meta-llama/Llama-3.2-3B"
    fine_tuned_dir = "/content/llama3_policy_finetune_v2.5"

    tokenizer = AutoTokenizer.from_pretrained(fine_tuned_dir, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token

    # 4-bit NF4 quantization with double quantization; compute dtype is float16.
    quantization = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.float16,
    )

    quantized_base = AutoModelForCausalLM.from_pretrained(
        base_model_name,
        device_map="auto",
        trust_remote_code=True,
        quantization_config=quantization,
    )

    # Attach the fine-tuned weights, then fold them into the base model.
    merged_model = PeftModel.from_pretrained(quantized_base, fine_tuned_dir).merge_and_unload()
    merged_model.eval()
    return merged_model, tokenizer

def generate_response(prompt, max_new_tokens=200):
    """Sample a continuation of ``prompt`` from the cached fine-tuned model.

    Args:
        prompt: Text fed to the model as-is.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        str: The decoded continuation only (prompt tokens removed, special
        tokens skipped), stripped of surrounding whitespace.
    """
    model, tokenizer = load_model_and_tokenizer()

    encoded = tokenizer(prompt, return_tensors="pt").to(model.device)
    input_ids = encoded["input_ids"]

    # torch.manual_seed(97)
    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            # Passed explicitly: pad_token_id equals eos_token_id below, so the
            # mask cannot be inferred reliably from the ids.
            attention_mask=encoded["attention_mask"],
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.4,
            top_k=20,
            top_p=0.9,
            repetition_penalty=1.2,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.eos_token_id
        )

    # Slice off the prompt so only newly generated tokens are decoded.
    response = tokenizer.decode(outputs[0][input_ids.shape[1]:], skip_special_tokens=True)
    return response.strip()

st.title("Llama-3 Policy Assistant")
prompt = st.text_area("Ask about NACCAS policies:")
if st.button("Generate Response"):
    # A prompt made only of whitespace is treated like an empty one.
    if prompt and prompt.strip():
        gc.collect()
        torch.cuda.empty_cache()
        response = generate_response(prompt)
        # Keep the text up to the tenth "."; add a closing period only when one
        # is missing, so a reply that already ends in "." does not become "..".
        sentences = response.strip().split(".")
        short_response = ".".join(sentences[:10]).strip()
        if short_response and not short_response.endswith("."):
            short_response += "."
        st.write(short_response)
    else:
        st.warning("Please enter a question")

# COPY PASTE THIS BEFORE YOUR PROMPT. You are a NACCAS policy expert. Answer using only official NACCAS documentation.


Writing app.py


In [None]:
# =========================
# ✅ MAIN EXECUTION (in separate cell)
# =========================
import os
from huggingface_hub import login
from pyngrok import ngrok

# Authenticate with Hugging Face
login(token="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")  # Replace with your token


# Set ngrok authtoken
ngrok.set_auth_token("2wqBxne2LxuZWd50w0W38bok7w4_7uoyrn8XJW52aaEfcgjn8")  # Replace with your token

# Start Streamlit with ngrok
public_url = ngrok.connect(8501)
print("Public URL:", public_url)
!streamlit run app.py --server.port 8501

Public URL: NgrokTunnel: "https://9e86-34-105-83-124.ngrok-free.app" -> "http://localhost:8501"

Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.105.83.124:8501[0m
[0m
2025-05-21 17:37:14.544330: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747849034.799655    1596 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747849034.862042    1596 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
20

# Inference v3

Significantly better and more coherent results. It still requires a minor adjustment: the model occasionally emits stray tokens such as `</s>` or `</item>`.

In [3]:
# =========================
# ✅ SETUP & DEPENDENCIES
# =========================
!pip install -q unsloth peft transformers accelerate bitsandbytes streamlit pyngrok
!pip install PyPDF2 docx


Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting docx
  Downloading docx-0.2.4.tar.gz (54 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.9/54.9 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: docx
  Building wheel for docx (setup.py) ... [?25l[?25hdone
  Created wheel for docx: filename=docx-0.2.4-py3-none-any.whl size=53893 sha256=53c5ea0fd15e1b6c1c7bf91e8b4365dfe35dc56ed776552528388386fa5f2d7b
  Stored in directory: /root/.cache/pip/wheels/c1/3e/c3/e81c11effd0be5658a035947c66792dd993bcff317eae0e1ed
Successfully built docx
Installing collected packages: PyPDF2, docx
Successfully installed PyPDF2-3.0.1 docx-0.2.4


In [4]:
# SIMPLE STREAMLIT GUI

# %%writefile app.py
# import streamlit as st
# import torch
# from unsloth import FastLanguageModel
# from transformers import AutoTokenizer
# import gc
# import re

# @st.cache_resource
# def load_model_and_tokenizer():
#     fine_tuned_dir = "/content/drive/MyDrive/llama3.2-instruct-best"

#     # Load the fine-tuned model and tokenizer
#     model, tokenizer = FastLanguageModel.from_pretrained(
#         model_name=fine_tuned_dir,
#         load_in_4bit=True,
#         max_seq_length=2048,
#         device_map="auto",
#     )
#     model.eval()
#     return model, tokenizer

# def clean_response(response):
#     # Remove all HTML-like tags, special tokens, and normalize whitespace
#     response = re.sub(r'</?(item|s|span|INST|[^>]+)>', '', response)  # Remove tags like <item>, </item>, <s>, </s>, <span>, [INST], etc.
#     response = re.sub(r'\s+', ' ', response).strip()  # Normalize whitespace
#     # Split into sentences and take the first 10 valid sentences
#     sentences = response.split('.')
#     cleaned_sentences = [s.strip() for s in sentences if s.strip()]
#     return '. '.join(cleaned_sentences[:10]) + '.' if cleaned_sentences else ''

# def generate_response(prompt, max_new_tokens=100):
#     model, tokenizer = load_model_and_tokenizer()

# # ----------------------- Format the prompt with NACCAS context -------------------------
#     formatted_prompt = (
#         f"<s>[INST] You are a NACCAS policy expert. Answer the following question based strictly on official NACCAS documentation: {prompt} [/INST]"
#     )
#     input_ids = tokenizer(formatted_prompt, return_tensors="pt").input_ids.to(model.device)

#     with torch.no_grad():
#         outputs = model.generate(
#             input_ids=input_ids,
#             max_new_tokens=max_new_tokens,
#             do_sample=True,
#             temperature=0.3,  # Lowered for less randomness
#             top_k=10,        # Stricter top-k for coherence
#             top_p=0.85,      # Adjusted for more focused sampling
#             repetition_penalty=1.3,  # Increased to reduce repetition
#             eos_token_id=tokenizer.eos_token_id,
#             pad_token_id=tokenizer.eos_token_id
#         )

#     response = tokenizer.decode(outputs[0][input_ids.shape[1]:], skip_special_tokens=True)
#     return clean_response(response)

# st.title("NACCAS Policy Assistant")
# prompt = st.text_area("Ask about NACCAS policies:")
# if st.button("Generate Response"):
#     if prompt:
#         gc.collect()
#         torch.cuda.empty_cache()
#         response = generate_response(prompt)
#         st.write(response)
#     else:
#         st.warning("Please enter a question about NACCAS policies")


Writing app.py


In [9]:
%%writefile app.py
import streamlit as st
import PyPDF2
try:
    import docx
except ImportError:
    docx = None
import uuid
import json
import os
import torch
from unsloth import FastLanguageModel
import re

# Initialize session state
# Each key is created only when it is absent, so values already present in the
# session are left untouched.
_SESSION_DEFAULTS = (
    ('chat_sessions', dict),
    ('current_chat_id', lambda: str(uuid.uuid4())),
    ('memory', list),
)
for _state_key, _make_default in _SESSION_DEFAULTS:
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _make_default()

# Load persistent state from JSON
def load_state():
    """Restore chat sessions, memory and the active chat id from ``chat_state.json``.

    The session state is left untouched when the file is missing, cannot be
    decoded as JSON, or does not contain a JSON object. When the file has no
    ``current_chat_id`` entry, the id already held in the session is kept
    (a new UUID is generated only if the session has none).
    """
    try:
        with open("chat_state.json", "r") as f:
            state = json.load(f)
    except FileNotFoundError:
        return
    except (json.JSONDecodeError, UnicodeDecodeError):
        # A damaged state file must not take the whole app down; carry on with
        # whatever is already in the session.
        return

    if not isinstance(state, dict):
        return

    st.session_state.chat_sessions = dict(state.get("chat_sessions", {}))
    st.session_state.memory = state.get("memory", [])
    fallback_id = getattr(st.session_state, "current_chat_id", None) or str(uuid.uuid4())
    st.session_state.current_chat_id = state.get("current_chat_id", fallback_id)

# Save state to JSON
def save_state():
    """Write chat sessions, memory and the active chat id to ``chat_state.json``.

    The file in the current working directory is overwritten on every call.
    """
    with open("chat_state.json", "w") as state_file:
        session = st.session_state
        snapshot = {
            "chat_sessions": session.chat_sessions,
            "memory": session.memory,
            "current_chat_id": session.current_chat_id,
        }
        json.dump(snapshot, state_file)

# Load state at startup
# NOTE(review): this is a module-level call, so it executes every time the
# script body runs and replaces the session values with the contents of
# chat_state.json — confirm that this is intended.
load_state()

# Load Unsloth model and tokenizer
@st.cache_resource
def load_model_and_tokenizer():
    """Load the fine-tuned model and tokenizer through Unsloth.

    Wrapped in ``st.cache_resource`` so the returned objects are cached by
    Streamlit rather than rebuilt by every call.

    Returns:
        tuple: ``(model, tokenizer)`` with the model prepared by
        ``FastLanguageModel.for_inference`` and set to eval mode.
    """
    loaded_model, loaded_tokenizer = FastLanguageModel.from_pretrained(
        model_name="/content/drive/MyDrive/llama3.2-instruct-best",
        device_map="auto",
        load_in_4bit=True,
        max_seq_length=2048,
    )
    FastLanguageModel.for_inference(loaded_model)
    loaded_model.eval()
    return loaded_model, loaded_tokenizer

# Module-level handles used by generate_response().
model, tokenizer = load_model_and_tokenizer()

# Clean response function
def clean_response(response, max_sentences=10):
    """Strip markup left in a model reply and keep its first sentences.

    Sentences are split at whitespace that follows ``.``, ``!`` or ``?``, so
    decimals and section numbers such as ``3.5`` stay intact (splitting on every
    ``.`` and re-joining with ``". "`` used to turn them into ``3. 5``).

    Args:
        response: Raw decoded model output.
        max_sentences: Maximum number of sentences to keep.

    Returns:
        str: The cleaned text ending in sentence punctuation, or ``''`` when
        nothing but markup, whitespace or punctuation remains.
    """
    # Any <...> tag (e.g. <item>, </s>, <span>) and the [INST] / [/INST] markers.
    text = re.sub(r'<[^>]+>', '', response)
    text = re.sub(r'\[/?INST\]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()

    # Drop fragments that consist only of punctuation.
    sentences = [
        part for part in re.split(r'(?<=[.!?])\s+', text)
        if part.strip('.!? ')
    ]
    if not sentences:
        return ''

    result = ' '.join(sentences[:max_sentences])
    # Generation can stop mid-sentence; close the text with a period then.
    if result[-1] not in '.!?':
        result += '.'
    return result

# Generate response function
def generate_response(prompt, chat_history, max_new_tokens=100):
    """Answer ``prompt`` using the stored memory notes and earlier turns as context.

    Args:
        prompt: The new user message; it is appended to the context as the
            final ``User:`` line.
        chat_history: Iterable of ``{"role": ..., "content": ...}`` dicts for
            the turns that precede ``prompt``.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        str: The generated reply after ``clean_response`` post-processing.
    """
    # One line per memory note, then one per earlier turn, then the new message.
    context_lines = [f"[MEMORY]: {mem}" for mem in st.session_state.memory]
    context_lines.extend(
        f"{msg['role'].capitalize()}: {msg['content']}" for msg in chat_history
    )
    context_lines.append(f"User: {prompt}")
    context = "\n".join(context_lines) + "\n"

    formatted_prompt = f"<s>[INST] You are a NACCAS policy expert. Answer the following based strictly on official NACCAS documentation:\n{context} [/INST]"

    encoded = tokenizer(formatted_prompt, return_tensors="pt").to(model.device)
    input_ids = encoded["input_ids"]

    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            # Passed explicitly: pad_token_id equals eos_token_id below, so the
            # mask cannot be inferred reliably from the ids.
            attention_mask=encoded["attention_mask"],
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.3,
            top_k=10,
            top_p=0.85,
            repetition_penalty=1.3,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.eos_token_id
        )

    # Slice off the prompt so only newly generated tokens are decoded.
    response = tokenizer.decode(outputs[0][input_ids.shape[1]:], skip_special_tokens=True)
    return clean_response(response)

# Function to extract text from PDF
def extract_pdf_text(file):
    """Return the concatenated text of every page of a PDF.

    Args:
        file: Object accepted by ``PyPDF2.PdfReader``.

    Returns:
        str: Page texts joined without a separator (a page whose extraction
        yields a falsy value contributes nothing), or an
        ``"Error reading PDF: ..."`` message if anything raises.
    """
    try:
        reader = PyPDF2.PdfReader(file)
        return "".join(page.extract_text() or "" for page in reader.pages)
    except Exception as exc:
        return f"Error reading PDF: {str(exc)}"

# Function to extract text from Word document
def extract_docx_text(file):
    """Return the text of a Word document, one line per paragraph.

    Args:
        file: Object accepted by ``docx.Document``.

    Returns:
        str: Every paragraph's text followed by a newline; a fixed notice when
        the ``docx`` module could not be imported; or an
        ``"Error reading Word document: ..."`` message if anything raises.
    """
    if docx is None:
        return "Word document support unavailable (python-docx not installed)."
    try:
        paragraphs = docx.Document(file).paragraphs
        return "".join(para.text + "\n" for para in paragraphs)
    except Exception as exc:
        return f"Error reading Word document: {str(exc)}"

# Function to process uploaded file
def process_uploaded_file(uploaded_file):
    """Extract the text of an uploaded PDF or Word file.

    The extension is compared case-insensitively, so names such as
    ``REPORT.PDF`` are handled like ``report.pdf``.

    Args:
        uploaded_file: Object with a ``name`` attribute, or ``None``.

    Returns:
        str | None: ``None`` when no file was given; otherwise the extracted
        text, or a message saying the format is unsupported.
    """
    if uploaded_file is None:
        return None

    filename = uploaded_file.name.lower()
    if filename.endswith('.pdf'):
        return extract_pdf_text(uploaded_file)
    if filename.endswith('.docx'):
        return extract_docx_text(uploaded_file)
    return "Unsupported file format. Please upload a PDF or Word document."

# Function to create a new chat session
def new_chat():
    """Start an empty chat, make it the active one and persist the state.

    When more than five chats are stored afterwards, the first entry of
    ``chat_sessions`` is removed.
    """
    session = st.session_state
    fresh_id = str(uuid.uuid4())
    session.chat_sessions[fresh_id] = []
    session.current_chat_id = fresh_id

    # Cap the number of stored chats at five by dropping the first key.
    if len(session.chat_sessions) > 5:
        first_key = next(iter(session.chat_sessions))
        session.chat_sessions.pop(first_key)

    save_state()

# Sidebar for chat session management
sidebar = st.sidebar
sidebar.title("Chat Sessions")
if sidebar.button("New Chat"):
    new_chat()

# One button per stored chat (first five only); clicking it makes that chat
# the active one and persists the choice.
visible_chat_ids = list(st.session_state.chat_sessions)[:5]
for chat_id in visible_chat_ids:
    if sidebar.button(f"Chat {chat_id[:8]}", key=chat_id):
        st.session_state.current_chat_id = chat_id
        save_state()

# Main chat interface
st.title("NACCAS Policy Assistant")

# Make sure the active chat has an entry before it is rendered.
active_chat_id = st.session_state.current_chat_id
if active_chat_id not in st.session_state.chat_sessions:
    st.session_state.chat_sessions[active_chat_id] = []
    save_state()

# Display chat history
st.subheader("Chat History")
for message in st.session_state.chat_sessions[active_chat_id]:
    with st.chat_message(message["role"]):
        st.write(message["content"])

# Display memory
st.subheader("Memory")
if not st.session_state.memory:
    st.write("No memory items yet.")
else:
    for mem in st.session_state.memory:
        st.write(f"- {mem}")

# Input form to prevent infinite reruns
# The uploader and the text box sit side by side (1:4 width ratio); the form is
# cleared after each submission.
with st.form(key="input_form", clear_on_submit=True):
    upload_column, text_column = st.columns([1, 4])
    uploaded_file = upload_column.file_uploader(
        "Upload PDF/Word", type=['pdf', 'docx'], label_visibility="collapsed"
    )
    user_input = text_column.text_input("Ask about NACCAS policies...", key="user_input")
    submit_button = st.form_submit_button("Send")

# Process input and file on form submission
if submit_button and (user_input or uploaded_file):
    current_chat = st.session_state.chat_sessions[st.session_state.current_chat_id]
    torch.cuda.empty_cache()

    if uploaded_file:
        file_content = process_uploaded_file(uploaded_file)
        if file_content:
            # Snapshot the history BEFORE recording the new turn.
            # generate_response() adds the prompt to the context itself, and
            # current_chat is the list being appended to, so passing it after the
            # append would put the file text into the prompt twice.
            prior_turns = list(current_chat)
            current_chat.append({
                "role": "user",
                "content": f"Uploaded file content:\n{file_content}"
            })
            st.session_state.memory.append(f"Uploaded file: {uploaded_file.name}")
            # Memory keeps only the five most recent notes.
            if len(st.session_state.memory) > 5:
                st.session_state.memory.pop(0)
            assistant_response = generate_response(
                f"User uploaded a file: {uploaded_file.name}. Content: {file_content}",
                prior_turns,
            )
            current_chat.append({
                "role": "assistant",
                "content": assistant_response
            })
            save_state()

    if user_input:
        # Same snapshot rule as above: the question must reach the model once,
        # as the prompt, not a second time through the history.
        prior_turns = list(current_chat)
        current_chat.append({
            "role": "user",
            "content": user_input
        })
        assistant_response = generate_response(user_input, prior_turns)
        current_chat.append({
            "role": "assistant",
            "content": assistant_response
        })
        st.session_state.memory.append(f"User asked: {user_input}")
        # Memory keeps only the five most recent notes.
        if len(st.session_state.memory) > 5:
            st.session_state.memory.pop(0)
        save_state()

    # Rerun so the updated history is rendered.
    st.rerun()

Writing app.py


In [None]:

# =========================
# ✅ MAIN EXECUTION
# =========================
import os
from huggingface_hub import login
from pyngrok import ngrok

# Authenticate with Hugging Face (replace with your token)
login(token="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")

# Set ngrok authtoken (replace with your token)
ngrok.set_auth_token("2wqBxne2LxuZWd50w0W38bok7w4_7uoyrn8XJW52aaEfcgjn8")

# Start Streamlit with ngrok
public_url = ngrok.connect(8501)
print("Public URL:", public_url)
!streamlit run app.py --server.port 8501

Public URL: NgrokTunnel: "https://ea69-34-125-99-167.ngrok-free.app" -> "http://localhost:8501"

Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.125.99.167:8501[0m
[0m
