In [None]:
# --- STEP 1: INSTALLATION ---
# Notebook setup cell: removes and reinstalls the model-serving libraries, then
# tells the user to restart the runtime before running the application cell.
# The lines starting with `!` are IPython shell escapes, so this cell only runs
# inside a notebook environment.
import os
print("🛠️ Force-Updating Dependencies...")

# Force uninstalling to clear conflicts
# (`-y` answers pip's confirmation prompt automatically)
!pip uninstall -y bitsandbytes transformers peft accelerate

# Installing fresh versions
# (`-q` reduces pip's output, `-U` upgrades to the newest available release;
# no versions are pinned, so the installed set can differ between runs)
!pip install -q -U bitsandbytes transformers peft accelerate gradio sentence-transformers faiss-cpu

# Closing banner: restart the session, then continue with the next cell
# instead of re-running this one.
print("\n" + "="*50)
print("✅ INSTALLATION COMPLETE.")
print("⚠️ CRITICAL: Go to 'Runtime' > 'Restart Session' (or Restart Runtime).")
print("⚠️ AFTER RESTARTING, skip this cell and run Step 2 below.")
print("="*50)

🛠️ Force-Updating Dependencies...
[0mFound existing installation: transformers 4.57.3
Uninstalling transformers-4.57.3:
  Successfully uninstalled transformers-4.57.3
Found existing installation: peft 0.18.0
Uninstalling peft-0.18.0:
  Successfully uninstalled peft-0.18.0
Found existing installation: accelerate 1.12.0
Uninstalling accelerate-1.12.0:
  Successfully uninstalled accelerate-1.12.0
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m20.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m150.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m556.4/556.4 kB[0m [31m48.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m380.9/380.9 kB[0m [31m37.5 MB/s[0m eta [36m0:00:00[0m


In [None]:
# 💎 CULTUREVERSE FINAL UI: PRODUCTION-READY WITH PERFECT OUTPUT
# =================================================================================


import os
import sys
import shutil
import torch
import gc
import gradio as gr
import pickle
from google.colab import drive
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
from sentence_transformers import SentenceTransformer
import faiss

# --- 1. MOUNT & EXTRACT ---
# Mount Google Drive and unpack the CultureVerse bundle (adapter weights and
# vector DB) into the runtime's local disk, unless it was unpacked already.
print("🔌 Mounting Google Drive...")
drive.mount('/content/drive')

DRIVE_ZIP_PATH = "/content/drive/MyDrive/CultureVerse_Final.zip"
EXTRACT_PATH = "/content/CultureVerse_Final"

if os.path.exists(EXTRACT_PATH):
    print("✅ Files found. Skipping extraction.")
else:
    print("📦 Extracting files...")
    # Prefer the copy on Drive; fall back to an archive uploaded straight to
    # the runtime's /content directory.
    archive_path = next(
        (
            candidate
            for candidate in (DRIVE_ZIP_PATH, "/content/CultureVerse_Final.zip")
            if os.path.exists(candidate)
        ),
        None,
    )
    if archive_path is None:
        raise FileNotFoundError("❌ File not found! Upload CultureVerse_Final.zip")

    try:
        shutil.unpack_archive(archive_path, EXTRACT_PATH)
    except BaseException:
        # A failed or interrupted extraction would leave EXTRACT_PATH behind,
        # and the existence check above would then skip extraction on the next
        # run and load a partial tree. Remove it and re-raise unchanged.
        shutil.rmtree(EXTRACT_PATH, ignore_errors=True)
        raise
    # Reported for both archive locations (the fallback used to finish silently).
    print("✅ Extraction complete.")

# --- 2. OPTIMIZED HYBRID ENGINE ---
class CultureVerseEngine:
    def __init__(self, model_dir: str, vector_db_dir: str):
        print("⚙️ Initializing CultureVerse Engine...")

        self.embedder = SentenceTransformer('all-MiniLM-L6-v2')
        self.index = faiss.read_index(f"{vector_db_dir}/idioms.index")
        with open(f"{vector_db_dir}/metadata.pkl", 'rb') as f:
            self.metadata = pickle.load(f)

        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True
        )

        self.tokenizer = AutoTokenizer.from_pretrained(model_dir)
        self.tokenizer.pad_token = self.tokenizer.eos_token

        base_model = AutoModelForCausalLM.from_pretrained(
            "Qwen/Qwen2.5-7B-Instruct",
            quantization_config=bnb_config,
            device_map="auto",
            trust_remote_code=True,
            torch_dtype=torch.float16
        )
        self.model = PeftModel.from_pretrained(base_model, model_dir)
        self.model.eval()
        print("✅ Engine Ready!")

    def _format_output(self, text: str) -> str:
        marker = "**Cultural Tag"
        if marker in text:
            parts = text.split(marker, 1)
            tail_lines = parts[1].splitlines()
            if tail_lines:
                text = parts[0] + marker + tail_lines[0]

        lines = text.strip().split("\n")
        formatted = []

        for line in lines:
            line = line.strip()
            if not line:
                continue

            if line.startswith("**Meaning:**"):
                formatted.append(f"📖 **Meaning** {line.split(':',1)[1].strip()}")
            elif line.startswith("**Cultural Origin:**"):
                formatted.append(f"🌍 **Cultural Origin** {line.split(':',1)[1].strip()}")
            elif line.startswith("**Usage Context:**"):
                formatted.append(f"💬 **Usage Context** {line.split(':',1)[1].strip()}")
            elif line.startswith("**Emotional Tone:**"):
                formatted.append(f"🎭 **Emotional Tone** {line.split(':',1)[1].strip()}")
            elif line.startswith("**Example:**"):
                formatted.append(f"✨ **Example Scenario** {line.split(':',1)[1].strip()}")
            elif line.startswith("**Cultural Tag:**"):
                formatted.append(f"🏷️ **Cultural Tag** {line.split(':',1)[1].strip()}")
            else:
                formatted.append(line)

        return "\n\n".join(formatted)

    def explain(self, phrase: str, language: str = "English") -> str:
        import re

        if not phrase or len(phrase.strip()) < 2:
            return "⚠️ Please enter a valid phrase."

        language = language.split(" ")[0]

        prompt = (
            f"<|im_start|>system\n"
            f"You are a cultural expert. Analyze the phrase and provide a detailed structured response "
            f"using EXACTLY these headings:\n"
            f"Meaning, Cultural Origin, Usage Context, Emotional Tone, Example, Cultural Tag.\n\n"
            f"IMPORTANT INSTRUCTIONS:\n"
            f"- The Example MUST be a realistic, natural 1–2 sentence scenario or short dialogue.\n"
            f"- Do NOT write generic labels like 'workplace chat' or 'coffee shop chat'.\n"
            f"- The Example must contain actual spoken or narrative text (at least 15 words).\n"
            f"<|im_end|>\n"
            f"<|im_start|>user\n"
            f"Phrase: '{phrase}'\n"
            f"<|im_end|>\n"
            f"<|im_start|>assistant\n"
        )

        inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
        inputs = {k: v.to("cuda") for k, v in inputs.items()}

        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=350,
                temperature=0.7,
                top_p=0.9,
                repetition_penalty=1.15,
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id,
                eos_token_id=self.tokenizer.eos_token_id
            )

        input_length = inputs["input_ids"].shape[1]
        generated_tokens = outputs[0][input_length:]
        response = self.tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

        for token in ["<|im_start|>", "<|im_end|>", "<|endoftext|>"]:
            response = response.replace(token, "")

        def _example_is_weak(txt: str) -> bool:
            m = re.search(r"\*\*Example:\*\*(.*?)(?=\n\*\*|\Z)", txt, re.S)
            if not m:
                return True
            ex = m.group(1).strip().lower()
            return len(ex.split()) < 12 or ex in {
                "daily conversation",
                "workplace chat",
                "coffee shop chat",
                "social media comment"
            }

        if _example_is_weak(response):
            regen_prompt = (
                f"<|im_start|>system\n"
                f"You are a cultural expert.\n"
                f"Write ONLY a realistic example (1–2 sentences) for this phrase.\n"
                f"Do not include headings.\n"
                f"<|im_end|>\n"
                f"<|im_start|>user\n"
                f"Phrase: '{phrase}'\n"
                f"<|im_end|>\n"
                f"<|im_start|>assistant\n"
            )

            regen_inputs = self.tokenizer(regen_prompt, return_tensors="pt").to("cuda")

            with torch.no_grad():
                regen_out = self.model.generate(
                    **regen_inputs,
                    max_new_tokens=80,
                    temperature=0.85,
                    top_p=0.95,
                    repetition_penalty=1.1,
                    do_sample=True,
                    eos_token_id=self.tokenizer.eos_token_id,
                    pad_token_id=self.tokenizer.eos_token_id
                )

            regen_text = self.tokenizer.decode(
                regen_out[0][regen_inputs["input_ids"].shape[1]:],
                skip_special_tokens=True
            ).strip()

            response = re.sub(
                r"\*\*Example:\*\*.*?(?=\n\*\*|\Z)",
                f"**Example:** {regen_text}\n",
                response,
                flags=re.S
            )

        return self._format_output(response)

# --- 3. INITIALIZE ENGINE ---
# Drop any engine left over from an earlier run of this cell so its weights
# are not kept alive while the new model loads.
engine = None

# Collect garbage first so unreachable tensors hand their memory back to
# PyTorch's caching allocator; only then can empty_cache() release those
# cached blocks.
gc.collect()
torch.cuda.empty_cache()

model_path = os.path.join(EXTRACT_PATH, "model_adapter")
vector_path = os.path.join(EXTRACT_PATH, "vector_db")

engine = CultureVerseEngine(model_path, vector_path)

# --- 4. GRADIO UI ---
# Stylesheet handed to gr.Blocks(css=...) below. It styles the page container
# and the top-level heading, plus three classes used by the layout:
# `.subtitle`, `.submit-btn` and `.output-box`.
custom_css = """
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;600;700&display=swap');

.gradio-container {
    font-family: 'Inter', sans-serif !important;
    max-width: 1200px !important;
    margin: 0 auto !important;
}

h1 {
    text-align: center;
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    -webkit-background-clip: text;
    -webkit-text-fill-color: transparent;
    font-size: 2.5em !important;
    font-weight: 700 !important;
    margin-bottom: 0.2em !important;
}

.subtitle {
    text-align: center;
    color: #64748b;
    font-size: 1.1em;
    margin-bottom: 2em;
}

.submit-btn {
    background: linear-gradient(90deg, #667eea 0%, #764ba2 100%) !important;
    color: white !important;
    border: none !important;
    font-weight: 600 !important;
    padding: 12px 32px !important;
    border-radius: 8px !important;
    transition: all 0.3s ease !important;
}

.submit-btn:hover {
    transform: translateY(-2px);
    box-shadow: 0 8px 20px rgba(102, 126, 234, 0.4) !important;
}

.output-box {
    background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);
    border-radius: 12px;
    padding: 24px;
    margin-top: 20px;
    box-shadow: 0 4px 6px rgba(0,0,0,0.1);
}


.output-box * {
    color: #000000 !important;
}
"""

def process_query(phrase, language):
    """Gradio callback: return the engine's analysis of `phrase` as Markdown.

    An empty phrase yields a prompt to enter one. Any exception raised while
    explaining is converted into an error message so the UI always receives
    a string.
    """
    if phrase:
        try:
            return engine.explain(phrase, language)
        except Exception as err:
            return f"❌ Error: {err}\n\nPlease try again or use a different phrase."

    return "⚠️ Please enter a phrase to analyze."

# Create Gradio Interface
# Two-column layout: inputs and quick examples on the left, the rendered
# analysis on the right. Components appear in the order they are created
# inside each context manager, so statement order here is the layout.
# NOTE(review): the recorded cell output echoes the `with gr.Blocks(...)` line
# as the source of a warning whose text was not captured; check whether the
# installed Gradio version still accepts `theme=` / `css=` here.
with gr.Blocks(theme=gr.themes.Soft(), css=custom_css, title="CultureVerse AI") as demo:

    # Header: title plus a subtitle styled by the `.subtitle` CSS class
    gr.Markdown("# 💎 CultureVerse AI")
    gr.Markdown("<p class='subtitle'>Advanced Cultural & Linguistic Intelligence Platform</p>")

    with gr.Row():
        with gr.Column(scale=1):
            # Input Section
            gr.Markdown("### 🔍 Input Analysis")

            # Free-text phrase to analyse
            txt_phrase = gr.Textbox(
                label="Phrase / Idiom / Expression",
                placeholder="e.g., Spill the beans, Eid ka chand, Kondanta pani...",
                lines=2
            )

            # Language selector; each choice is a language name followed by an emoji
            dd_lang = gr.Dropdown(
                choices=[
                    "English 🌎",
                    "Hindi 🇮🇳",
                    "Telugu 🇮🇳",
                    "French 🇫🇷",
                    "Spanish 🇪🇸",
                    "Chinese 🇨🇳",
                    "Japanese 🇯🇵",
                    "Turkish 🇹🇷",
                    "German 🇩🇪",
                    "Arabic 🇸🇦"
                ],
                value="English 🌎",
                label="Language Context"
            )

            # `submit-btn` picks up the gradient button styling from custom_css
            btn_submit = gr.Button("✨ Analyze", elem_classes="submit-btn")

            gr.Markdown("---")
            gr.Markdown("### ⚡ Quick Examples")
            # Each example row is [phrase, language] and fills the two inputs above
            gr.Examples(
                examples=[
                    ["Spill the beans", "English 🌎"],
                    ["إذا لم تستحي ففعل ما شئت", "Arabic 🇸🇦"],
                    ["కొండంత పని కూడా చిన్న చినుకుతో ముగుస్తుంది", "Telugu 🇮🇳"],
                    ["C'est la vie", "French 🇫🇷"],
                    ["Ru xiang sui su", "Chinese 🇨🇳"],
                    ["పంది ముత్యాలు పెట్టినా అది పందే", "Telugu 🇮🇳"],
                    ["अकेला चना भाड़ नहीं फोड़ सकता", "Hindi 🇮🇳"],
                ],
                inputs=[txt_phrase, dd_lang],
                label=""
            )

        with gr.Column(scale=1):
            # Output Section: Markdown panel styled by the `.output-box` CSS class
            gr.Markdown("### 📊 Cultural Analysis")
            out_result = gr.Markdown(
                label="",
                elem_classes="output-box"
            )

    # Connect button: a click sends (phrase, language) to process_query and
    # renders the returned Markdown in the output panel
    btn_submit.click(
        fn=process_query,
        inputs=[txt_phrase, dd_lang],
        outputs=out_result
    )

    # Footer
    gr.Markdown("---")
    gr.Markdown(
        "<p style='text-align: center; color: #64748b; font-size: 0.9em;'>"
        "Powered by Qwen 2.5-7B • Fine-tuned on Multilingual Cultural Dataset"
        "</p>"
    )

# --- 5. LAUNCH ---
# Print a banner, then start the Gradio server with a public share link.
banner_rule = "=" * 60
print(f"\n{banner_rule}")
print("🚀 LAUNCHING CULTUREVERSE AI")
print(f"{banner_rule}\n")

# Listen on all interfaces at port 7860; debug=True keeps the cell attached
# to the server so its logs stay visible.
launch_options = {
    "share": True,
    "debug": True,
    "server_name": "0.0.0.0",
    "server_port": 7860,
}
demo.launch(**launch_options)

🔌 Mounting Google Drive...
Mounted at /content/drive
📦 Extracting files...
✅ Extraction complete.
⚙️ Initializing CultureVerse Engine...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/663 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/3.95G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/3.56G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/243 [00:00<?, ?B/s]

✅ Engine Ready!


  with gr.Blocks(theme=gr.themes.Soft(), css=custom_css, title="CultureVerse AI") as demo:



🚀 LAUNCHING CULTUREVERSE AI

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://c2e1f2d677bfa1c31c.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 0.0.0.0:7860 <> https://c2e1f2d677bfa1c31c.gradio.live


