# Initial installs

In [1]:
!pip install -i https://pypi.org/simple/ bitsandbytes --upgrade --quiet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.7/69.7 MB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m30.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m22.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
!pip install accelerate --upgrade --quiet

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/342.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m342.1/342.1 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
!pip install PyMuPDF --quiet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m51.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [7]:
!pip install pyngrok --quiet

Collecting pyngrok
  Downloading pyngrok-7.2.3-py3-none-any.whl.metadata (8.7 kB)
Downloading pyngrok-7.2.3-py3-none-any.whl (23 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.2.3


# Initial imports

In [8]:
import bitsandbytes
import accelerate

In [9]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

In [10]:
from flask import Flask, request, jsonify, send_file
from pyngrok import ngrok
import requests

# HuggingFace/Drive interfacing

In [11]:
# Interactive Hugging Face Hub login; displays a login widget in the cell output.
# NOTE(review): the tokenizer and model in this notebook are loaded with
# local_files_only=True, so this login may not be needed — confirm before removing.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [12]:
# Mount Google Drive at /content/drive; the saved model directory lives under MyDrive.
# `userdata` is used further down to read the HF_TOKEN value.
from google.colab import drive, userdata
drive.mount('/content/drive')

Mounted at /content/drive


# GPU device selection

In [13]:
# Pick the compute device: CUDA when a GPU is visible, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", device)

Using device: cuda


# Free GPU Memory

In [14]:
import gc
def free_gpu_memory():
    """Run Python garbage collection, then empty PyTorch's CUDA memory cache.

    Takes no arguments and returns ``None``.
    """
    # Collect first so objects that are no longer referenced are gone before
    # the CUDA cache is emptied.
    gc.collect()
    torch.cuda.empty_cache()

# Load the model

In [15]:
# Paths/credentials used to load the tokenizer and the final trained model.
# Directory of the saved fine-tuned checkpoint on the mounted Google Drive.
model_directory = "/content/drive/MyDrive/saved_models/LLama2-7B-chat-PT1-v2"
# NOTE(review): auth_token is not referenced by any cell visible in this notebook
# (the tokenizer and model are loaded with local_files_only=True) — confirm it is still needed.
auth_token = userdata.get('HF_TOKEN')

In [16]:
# Tokenizer comes from the saved checkpoint directory; local_files_only=True
# restricts the lookup to files already on disk.
tokenizer = AutoTokenizer.from_pretrained(model_directory, local_files_only=True)

In [17]:
# 4-bit NF4 quantization with double quantization.
# NOTE(review): the quantized compute dtype below is bfloat16 while the model is
# loaded with torch_dtype=float16 — confirm the mismatch is intentional; using a
# single dtype for both is the usual setup.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the fine-tuned model
model = AutoModelForCausalLM.from_pretrained(
    model_directory,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    # The scaling factor is written as a float: the Llama config validation
    # expects a float and some transformers releases reject an int here.
    rope_scaling={"type": "dynamic", "factor": 2.0},
    local_files_only=True,
    quantization_config=quantization_config,
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

# Web interface (static assets)

In [64]:
import os
# Absolute path of the folder holding the web UI's static files; passed to
# Flask as static_folder when the app is created.
STATIC_DIR = os.path.abspath('/content/interface/static')

# Main application (Flask routes and server)

In [71]:
# Initialize the Flask app and the context history
# Static files are served from STATIC_DIR.
app = Flask("expert-bot", static_folder = STATIC_DIR)
# Module-level conversation transcript read and updated by the /interact route.
# Re-running this cell resets it to an empty list.
context_history = []

In [72]:
@app.route("/")
def home():
    """Serve the chat interface's landing page.

    Returns:
        str: The contents of ``/content/interface/index.html``, returned as
        the response body.

    Raises:
        OSError: If the HTML file cannot be opened.
    """
    html_file_path = '/content/interface/index.html'
    # Decode explicitly as UTF-8 so non-ASCII characters in the page do not
    # depend on the runtime's locale-derived default encoding.
    with open(html_file_path, 'r', encoding='utf-8') as file:
        return file.read()

In [73]:
# System prompt that guides the chatbot's responses. It is added to
# context_history and therefore becomes part of the text sent to the model.
system_prompt = """You are a helpful and informative assistant called "Assistant". Your goal is to provide accurate and relevant information to the user's queries.
Please ensure that your responses are succinct, respectful, and factual. Refrain from emoting.
If you're uncertain about a question, it's better to admit it rather than provide inaccurate information.
Respond to the User's question ONLY. Do not impersonate the User and do not include followup questions in your response unless prompted."""

In [74]:
# Seed the conversation with the system prompt exactly once. A plain `+=`
# would stack another copy of the prompt every time this cell is re-run.
if system_prompt not in context_history:
    context_history.insert(0, system_prompt)

In [75]:
@app.route("/interact", methods=["POST"])
def interact():
    """Answer one chat turn.

    Expects a JSON body of the form ``{"query": "<user message>"}``. The
    message is appended to the module-level ``context_history``, the history
    is joined into a prompt, and the model's reply is appended back to it.

    Returns:
        A JSON response ``{"answer": <assistant reply>}`` on success, or
        ``{"error": ...}`` with HTTP status 400 when the body is not a JSON
        object containing a non-empty string ``query``.

    Raises:
        Exception: Any error raised during tokenization or generation is
        re-raised after the pending user turn has been removed again.
    """
    global context_history

    max_history_turns = 10   # "User:" / "Assistant:" lines kept besides the system prompt
    max_new_tokens = 350     # budget for the reply only, independent of prompt length

    # Validate the payload up front instead of failing with a 500 on data['query'].
    data = request.get_json(silent=True)
    user_input = data.get("query") if isinstance(data, dict) else None
    if not isinstance(user_input, str) or not user_input.strip():
        return jsonify({"error": "Request body must be JSON with a non-empty 'query' string."}), 400

    # Append user input to the context and build the prompt from the whole history
    context_history.append(f"User: {user_input}")
    conversation = "\n".join(context_history)
    prompt = f"{conversation}\n Assistant: "

    try:
        inputs = tokenizer(prompt, return_tensors="pt").to(device)
        prompt_token_count = inputs["input_ids"].shape[-1]

        outputs = model.generate(
            **inputs,
            # max_length would also count the prompt tokens, leaving less and
            # less room for the reply as the history grows.
            max_new_tokens=max_new_tokens,
            # Request sampling explicitly so temperature/top_p do not depend on
            # the checkpoint's generation defaults.
            do_sample=True,
            temperature=0.5,
            top_p=0.75,
        )
    except Exception:
        # Do not leave an unanswered user turn in the shared history.
        context_history.pop()
        raise

    # Decode only the tokens produced after the prompt. Splitting the full text
    # on 'Assistant:' breaks when the user or the model writes that marker.
    generated_tokens = outputs[0][prompt_token_count:]
    assistant_response = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

    # Append the formatted response to the context
    context_history.append(f"Assistant: {assistant_response}")

    # Keep only the most recent turns, but never drop the system prompt that
    # sits at the front of the history.
    has_system_prompt = bool(context_history) and context_history[0] == system_prompt
    turns = context_history[1:] if has_system_prompt else context_history
    if len(turns) > max_history_turns:
        pinned = [system_prompt] if has_system_prompt else []
        context_history = pinned + turns[-max_history_turns:]

    # Only the Assistant's response is returned to the client, not the entire context
    return jsonify({"answer": assistant_response})

In [76]:
if __name__ == '__main__':
    # Single source of truth for the port shared by the tunnel and the server.
    port = 7000

    # Open a public ngrok tunnel that forwards to the local Flask port.
    public_url = ngrok.connect(port)

    print(f"Flask app is running at {public_url}")

    try:
        # Run the Flask app (blocks until the server is stopped)
        app.run(host='0.0.0.0', port=port)
    finally:
        # Close the tunnel when the server stops so re-running this cell does
        # not accumulate open tunnels.
        ngrok.disconnect(public_url.public_url)

Flask app is running at NgrokTunnel: "https://e7dd-34-125-74-196.ngrok-free.app" -> "http://localhost:7000"
 * Serving Flask app 'expert-bot'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:7000
 * Running on http://172.28.0.12:7000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug:127.0.0.1 - - [24/Feb/2025 02:37:13] "GET / HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [24/Feb/2025 02:37:13] "GET /static/styles.css HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [24/Feb/2025 02:37:13] "GET /static/app.js HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [24/Feb/2025 02:37:14] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -
INFO:werkzeug:127.0.0.1 - - [24/Feb/2025 02:37:33] "POST /interact HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [24/Feb/2025 02:38:07] "POST /interact HTTP/1.1" 200 -
