## Files + GitHub + YouTube

In [None]:
import os
import json
import uuid
import numpy as np
from datetime import datetime
import fitz  # PyMuPDF for PDFs
import openpyxl  # For Excel files (.xlsx)
import docx
import re
from bs4 import BeautifulSoup  # For HTML files
from pptx import Presentation  # For PowerPoint files (.pptx)
import xml.etree.ElementTree as ET  # For XML files
from sentence_transformers import SentenceTransformer
from flask import Flask, request, jsonify, render_template
from flask_cors import CORS
from dotenv import load_dotenv
import chromadb
from chromadb.config import Settings
import google.generativeai as genai
from google.cloud import firestore, storage
import firebase_admin
from firebase_admin import credentials, firestore as firebase_firestore
import logging
from pytube import YouTube
import whisper
# Whisper "base" speech-to-text model, loaded once at import time and used by
# extract_text_from_youtube().
speech_model = whisper.load_model("base")
# Load environment variables and configure generative model
load_dotenv()
genai.configure(api_key=os.getenv("GEMINI_API_KEY"))

app = Flask(__name__)
CORS(app)
# embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
# Sentence embedding model used both for uploaded content and for chat queries.
embedding_model = SentenceTransformer('all-mpnet-base-v2')

# Configure logging
logging.basicConfig(level=logging.DEBUG)

# Initialize Firebase Admin with credentials and Firestore client
# NOTE: the import below rebinds the name `firestore`, which the top-of-file
# imports had bound to google.cloud.firestore; from here on it refers to
# firebase_admin.firestore.
import firebase_admin
from firebase_admin import credentials, firestore
credentials_path = os.getenv('FIREBASE_CREDENTIALS')
cred = credentials.Certificate(credentials_path)
storage_client = storage.Client.from_service_account_json(credentials_path)
BUCKET_NAME = os.getenv("BUCKET")
# NOTE: despite its name, `bucket_name` holds the bucket object returned by
# storage_client.bucket(), not a string; the rest of the file calls
# `bucket_name.blob(...)` on it.
bucket_name = storage_client.bucket(BUCKET_NAME)

firebase_admin.initialize_app(cred)
# Initialize the Firestore client (the Cloud Storage client is created above)
firestore_client = firestore.client()

# # Initialize Firestore and Cloud Storage clients
# firestore_client = firebase_firestore.client()
# storage_client = storage.Client.from_service_account_json('prj-inflectiv-dev-56e96d747576.json')
# bucket_name = os.getenv("BUCKET")
# bucket = storage_client.bucket(bucket_name)



# Chroma client configured with the "chroma_db" persist directory; it is not
# referenced anywhere else in the visible code.
chroma_client = chromadb.Client(Settings(persist_directory="chroma_db"))

# Dictionaries to store user-specific data and configurations.
# All of them are in-process state keyed by the sanitized user ID.
user_chat_sessions = {}      # user_id -> chat session created by /chat
user_chat_history = {}       # user_id -> recent {"user", "bot"} exchanges (trimmed to 4 by /chat)
user_vector_dbs = {}         # user_id -> vector DB choice stored by /select_vector_db
user_generation_config = {}  # user_id -> generation settings stored by /update_config
# NOTE: /chat assigns a *local* variable with this same name, so this
# module-level value is never updated by the visible code.
model_instructions = ""

# NOTE: no module-level `model` is defined here; the visible code only creates
# it inside the /set_model handler.

# Starting values that /update_config copies for a user with no stored settings.
default_generation_config = {
    "temperature": 1,
    "top_p": 0.95,
    "top_k": 64,
    "max_output_tokens": 8192,
}

@app.route('/')
def home():
    """Serve the landing page by rendering the ``advance.html`` template."""
    landing_page = render_template('advance.html')
    return landing_page

# Helper function to sanitize user IDs for storage paths
def sanitize_user_id(user_id):
    """Make a user ID safe to embed in storage paths and document IDs.

    Every ``@`` and ``.`` is replaced with ``_``.

    Args:
        user_id: Raw identifier taken from the request; may be ``None`` or
            empty when the client did not send one.

    Returns:
        The sanitized ID, or ``""`` when no ID was supplied. Returning a
        falsy value (instead of raising ``AttributeError`` on ``None``) lets
        the callers' ``if not user_id`` checks answer with a 400.
    """
    if not user_id:
        return ""
    return str(user_id).replace("@", "_").replace(".", "_")
def sanitize_filename(name):
    """Return ``name`` with every disallowed character replaced by ``_``.

    Alphanumeric characters, spaces, dots and underscores are kept as they
    are; anything else becomes an underscore.
    """
    allowed_punctuation = (' ', '.', '_')
    cleaned = []
    for ch in name:
        keep = ch.isalnum() or ch in allowed_punctuation
        cleaned.append(ch if keep else '_')
    return ''.join(cleaned)


# File and Link Processing Functions
import yt_dlp

def download_youtube_video_yt_dlp(url, output_path="/tmp"):
    """Fetch a YouTube video with yt-dlp and return the local file path.

    Args:
        url: Link to the video.
        output_path: Directory the file is written to; the file name is built
            from the video title and extension.

    Returns:
        The file name yt-dlp prepares for the downloaded video.

    Raises:
        Exception: Any download failure is logged and re-raised unchanged.
    """
    try:
        name_template = os.path.join(output_path, '%(title)s.%(ext)s')
        options = {
            'outtmpl': name_template,
            'format': 'bestvideo+bestaudio/best',
            # Ask for a single merged mp4 file.
            'merge_output_format': 'mp4',
            # 'ffmpeg_location': '/usr/bin/ffmpeg',  # Adjust path if necessary
        }
        with yt_dlp.YoutubeDL(options) as downloader:
            video_info = downloader.extract_info(url, download=True)
            saved_path = downloader.prepare_filename(video_info)
            logging.info(f"Downloaded video to: {saved_path}")
            return saved_path
    except Exception as e:
        logging.error(f"Error downloading YouTube video with yt-dlp: {e}")
        raise


import requests
import zipfile
# import os
import tempfile
import logging
def create_temp_dir():
    """Create a new temporary directory prefixed ``my_app_`` and return its path."""
    new_dir = tempfile.mkdtemp(prefix="my_app_")
    return new_dir


# Scratch directory created once, when this statement runs at import time.
temp_dir = create_temp_dir()
def download_github_repo(repo_url, output_path=None):
    """Download a GitHub repository as a ZIP archive and extract it.

    The ``main`` branch is tried first, then ``master``.

    Args:
        repo_url: Repository URL; a trailing ``/`` or ``.git`` is tolerated.
            NOTE(review): this value comes from the client and is fetched
            as-is, so restricting the allowed hosts is worth considering.
        output_path: Directory to extract into. A fresh temporary directory
            is created when omitted.

    Returns:
        The directory the archive was extracted into.

    Raises:
        Exception: If neither branch archive could be fetched, or on any
            network / extraction error (logged, then re-raised).
    """
    try:
        output_path = output_path or create_temp_dir()
        # Normalize the URL so "<repo>/" and "<repo>.git" both map to "<repo>".
        repo_url = repo_url.strip().rstrip("/")
        if repo_url.endswith(".git"):
            repo_url = repo_url[:-4]

        # Try fetching the repository archive for common branch names
        for branch in ("main", "master"):
            zip_url = f"{repo_url}/archive/refs/heads/{branch}.zip"
            zip_path = os.path.join(output_path, "repo.zip")
            # The timeout keeps a stalled connection from hanging the request;
            # the context manager releases the streamed connection even when
            # the branch does not exist.
            with requests.get(zip_url, stream=True, timeout=30) as response:
                if response.status_code != 200:
                    continue
                with open(zip_path, "wb") as zip_file:
                    for chunk in response.iter_content(chunk_size=64 * 1024):
                        if chunk:  # Filter out keep-alive new chunks
                            zip_file.write(chunk)

            with zipfile.ZipFile(zip_path, "r") as zip_ref:
                zip_ref.extractall(output_path)
            # Drop the archive so callers walking output_path only see repo files.
            os.remove(zip_path)

            logging.info(f"GitHub repository extracted to {output_path}")
            return output_path

        # If no valid branch is found
        raise Exception("Failed to fetch GitHub repo: HTTP 404 for branches ['main', 'master']")
    except Exception as e:
        logging.error(f"Error downloading GitHub repo from {repo_url}: {str(e)}")
        raise





# def process_uploaded_file(content, user_id):
#     """Process uploaded files to extract embeddings."""
#     try:
#         # content = extract_text(blob)
#         content=content
#         chunks = [content]
#         embeddings = embedding_model.encode(chunks, convert_to_tensor=True)
#         save_to_user_index(user_id, embeddings, chunks)
#     except Exception as e:
#         logging.error(f"Error processing file {content}: {str(e)}")
#         raise
        
def extract_text_from_youtube(video_path):
    """Run the Whisper speech model on a downloaded video and return its text.

    Args:
        video_path: Path of the local media file to transcribe.

    Returns:
        The ``'text'`` field of the transcription result.

    Raises:
        Exception: Transcription errors are logged and re-raised unchanged.
    """
    try:
        whisper_output = speech_model.transcribe(video_path)
        spoken_text = whisper_output['text']
        logging.info(f"Transcription complete for {video_path}")
        return spoken_text
    except Exception as e:
        logging.error(f"Error transcribing YouTube video: {e}")
        raise
        
@app.route("/upload", methods=["POST"])
def upload_file():
    """Handle file and link uploads.

    Reads ``user_id`` plus any of ``files``, ``youtube_link`` and
    ``github_link`` from the multipart form, stores the raw sources in the
    bucket, extracts their text, embeds every non-empty text and saves the
    result through ``save_to_user_index``.

    Returns:
        A JSON response: 200 on success, 400 when the user ID is missing or
        nothing usable was uploaded, 500 on failure.
    """
    import shutil  # local import: only needed to clean up downloaded repos

    try:
        # Validate before sanitizing so a missing ID gives 400, not a crash.
        raw_user_id = request.form.get("user_id")
        if not raw_user_id:
            return jsonify({"error": "User ID not provided"}), 400
        user_id = sanitize_user_id(raw_user_id)

        combined_content = []

        def add_text(text):
            # extract_text() returns "" for file types it does not handle;
            # embedding empty strings only pollutes the index.
            if text and text.strip():
                combined_content.append(text)

        # Process uploaded files
        for file in request.files.getlist("files"):
            if not (file and file.filename):
                continue
            try:
                blob = bucket_name.blob(f"{user_id}/files/{sanitize_filename(file.filename)}")
                blob.upload_from_file(file)
                add_text(extract_text(blob))
                logging.info(f"Processed file: {file.filename}")
            except Exception as e:
                logging.error(f"Error processing file {file.filename}: {e}")

        # Process YouTube link
        youtube_link = request.form.get("youtube_link")
        if youtube_link:
            video_path = None
            try:
                video_path = download_youtube_video_yt_dlp(youtube_link)
                blob = bucket_name.blob(f"{user_id}/youtube/{os.path.basename(video_path)}")
                blob.upload_from_filename(video_path)
                add_text(extract_text_from_youtube(video_path))
                logging.info(f"Processed YouTube link: {youtube_link}")
            except Exception as e:
                logging.error(f"Error processing YouTube link {youtube_link}: {e}")
            finally:
                # The video is already in the bucket; do not leave it on disk.
                if video_path and os.path.exists(video_path):
                    try:
                        os.remove(video_path)
                    except OSError as e:
                        logging.warning(f"Could not remove temporary video {video_path}: {e}")

        # Process GitHub link
        github_link = request.form.get("github_link")
        if github_link:
            repo_path = None
            try:
                repo_path = download_github_repo(github_link)
                for root, _, files in os.walk(repo_path):
                    for filename in files:
                        file_path = os.path.join(root, filename)
                        # Key blobs by their path inside the repo so files that
                        # share a basename (README.md, __init__.py, ...) do not
                        # overwrite each other.
                        relative_name = os.path.relpath(file_path, repo_path).replace(os.sep, "/")
                        try:
                            blob = bucket_name.blob(f"{user_id}/github/{relative_name}")
                            blob.upload_from_filename(file_path)
                            add_text(extract_text(blob))
                        except Exception as e:
                            # One unreadable file must not discard the rest of the repo.
                            logging.error(f"Error processing repository file {relative_name}: {e}")
                logging.info(f"Processed GitHub repository: {github_link}")
            except Exception as e:
                logging.error(f"Error processing GitHub link {github_link}: {e}")
            finally:
                if repo_path:
                    shutil.rmtree(repo_path, ignore_errors=True)

        # Combine and process embeddings if there is content
        if not combined_content:
            return jsonify({"message": "No valid content uploaded."}), 400

        embeddings = embedding_model.encode(combined_content, convert_to_tensor=False)
        result = save_to_user_index(user_id, embeddings, combined_content)
        if result["status"] == "success":
            return jsonify({"message": result["message"]}), 200
        return jsonify({"error": result["message"]}), 500

    except Exception as e:
        logging.error(f"Error in upload_file: {e}")
        return jsonify({"error": f"Exception occurred: {e}"}), 500


#         return jsonify({"error": f"Exception occurred: {str(e)}"}), 500


### Function to store selected vector database
@app.route("/select_vector_db", methods=["POST"])
def select_vector_db():
    """Store the vector database a user selected.

    Expects a JSON body with ``user_id`` and ``vector_db``. The choice is
    merged into the user's Firestore document and cached in
    ``user_vector_dbs``.

    Returns:
        A JSON response: 200 on success, 400 when a field is missing, 500 on
        failure.
    """
    try:
        # silent=True turns a missing/invalid JSON body into None instead of an
        # exception, so the validation below can answer with 400.
        payload = request.get_json(silent=True)
        data = payload if isinstance(payload, dict) else {}
        raw_user_id = data.get("user_id")
        vector_db = data.get("vector_db")

        if not raw_user_id or not vector_db:
            return jsonify({"error": "User ID and vector DB selection are required"}), 400

        user_id = sanitize_user_id(raw_user_id)
        firestore_client.collection("users").document(user_id).set({"vector_db": vector_db}, merge=True)
        user_vector_dbs[user_id] = vector_db
        return jsonify({"message": f"Vector database set to {vector_db} for user {user_id}"}), 200
    except Exception as e:
        logging.error(f"Error in select_vector_db: {str(e)}")
        return jsonify({"error": f"An error occurred: {str(e)}"}), 500

# Function to update generation configuration
@app.route("/update_config", methods=["POST"])
def update_config():
    """Update a user's generation settings.

    Expects a JSON body with ``user_id`` and any of ``temperature``,
    ``top_p``, ``top_k`` and ``max_output_tokens``. Settings that are not
    supplied keep their current value (the defaults for a new user). The
    result is cached in ``user_generation_config`` and merged into the
    user's Firestore document.

    Returns:
        A JSON response: 200 on success, 400 when the user ID is missing,
        500 on failure.
    """
    try:
        # silent=True turns a missing/invalid JSON body into None instead of an
        # exception, so the validation below can answer with 400.
        payload = request.get_json(silent=True)
        data = payload if isinstance(payload, dict) else {}
        raw_user_id = data.get("user_id")

        if not raw_user_id:
            return jsonify({"error": "User ID is required"}), 400

        user_id = sanitize_user_id(raw_user_id)
        config = user_generation_config.get(user_id, default_generation_config.copy())
        for key in ("temperature", "top_p", "top_k", "max_output_tokens"):
            config[key] = data.get(key, config[key])

        user_generation_config[user_id] = config
        firestore_client.collection("users").document(user_id).set(config, merge=True)

        return jsonify({"message": "Generation configuration updated successfully"}), 200
    except Exception as e:
        logging.error(f"Error in update_config: {str(e)}")
        return jsonify({"error": f"An error occurred: {str(e)}"}), 500


@app.route("/set_model", methods=["POST"])
def set_model():
    """Create the generative model used by the chat endpoint.

    Expects a JSON body with ``user_id``. The model name is fixed to
    ``gemini-1.5-flash`` and the instance is stored in the module-level
    ``model`` variable, so it is shared by all users.

    Returns:
        A JSON response: 200 on success, 400 when the user ID is missing,
        500 on failure.
    """
    global model
    try:
        # silent=True turns a missing/invalid JSON body into None instead of an
        # exception, so the validation below can answer with 400.
        payload = request.get_json(silent=True)
        data = payload if isinstance(payload, dict) else {}
        user_id = data.get("user_id")
        if not user_id:
            return jsonify({"error": "User ID is required"}), 400

        selected_model = "gemini-1.5-flash"
        model = genai.GenerativeModel(model_name=selected_model)
        return jsonify({"message": f"Model set to {selected_model} for user {user_id}"}), 200
    except Exception as e:
        # Log like the other endpoints do, so failures are visible server-side.
        logging.error(f"Error in set_model: {str(e)}")
        return jsonify({"error": f"An error occurred: {str(e)}"}), 500
# Set instructions in Firestore
@app.route("/set_instructions", methods=["POST"])
def set_instructions():
    """Store a user's custom model instructions in Firestore.

    Expects a JSON body with ``user_id`` and ``instructions``; the text is
    merged into the user's document under the ``instructions`` key.

    Returns:
        A JSON response: 200 on success, 400 when a field is missing, 500 on
        failure.
    """
    try:
        # silent=True turns a missing/invalid JSON body into None instead of an
        # exception, so the validation below can answer with 400.
        payload = request.get_json(silent=True)
        data = payload if isinstance(payload, dict) else {}
        raw_user_id = data.get("user_id")
        instructions = data.get("instructions")

        if not raw_user_id or not instructions:
            return jsonify({"error": "User ID and instructions are required"}), 400

        user_id = sanitize_user_id(raw_user_id)
        firestore_client.collection("users").document(user_id).set({"instructions": instructions}, merge=True)
        return jsonify({"message": "Instructions set successfully"}), 200
    except Exception as e:
        logging.error(f"Error in set_instructions: {str(e)}")
        return jsonify({"error": f"An error occurred: {str(e)}"}), 500


##############----text processing---########

def remove_source_brackets(text):
    """Clean up a model response before it is returned to the client.

    Steps, in order:
      1. Remove ``#`` characters everywhere except inside hyperlinks.
      2. Remove ``【...】`` citation markers.
      3. Remove whitespace that precedes a full stop.
      4. Remove ``*``, ``[``, ``]`` and ``)``; replace ``(`` with a space.
      5. Keep only the first occurrence of each hyperlink.
      6. Remove code-fence markers and trim surrounding whitespace.

    Args:
        text: Raw response text.

    Returns:
        The cleaned text.
    """
    # Splitting on a capturing group yields [text, url, text, url, ..., text]:
    # odd positions are hyperlinks and must keep their '#' fragments.
    segments = re.split(r"(https?://[^\s]+)", text)
    text = "".join(
        segment if position % 2 else segment.replace("#", "")
        for position, segment in enumerate(segments)
    )

    # Non-greedy match: with two citations on one line, a greedy pattern would
    # also delete the legitimate text between them.
    cleaned_text = re.sub(r'【.*?】', '', text)
    cleaned_text = re.sub(r'\s+\.', '.', cleaned_text)

    # Single pass: drop markdown punctuation, turn "(" into a space.
    markup_table = str.maketrans({"*": None, "[": None, "]": None, ")": None, "(": " "})
    cleaned_text = cleaned_text.translate(markup_table)

    # Remove link duplication: the first occurrence stays, later ones vanish.
    url_pattern = re.compile(r'https?://[^\s<>"]+')
    seen = set()

    def keep_first(match):
        url = match.group(0)
        if url in seen:
            return ''
        seen.add(url)
        return url

    cleaned_text = url_pattern.sub(keep_first, cleaned_text)
    return cleaned_text.replace("```javascript", "").replace("```", "").strip()


# Function to extract text from uploaded files
def extract_text(blob):
    """Download a storage blob to ``/tmp`` and return the text it contains.

    The handler is chosen from the file extension, compared
    case-insensitively: ``.pdf``, ``.docx``, ``.xlsx``, ``.html``/``.htm``,
    ``.pptx``, ``.txt``, ``.md`` and ``.xml``. Any other extension yields an
    empty string.

    Args:
        blob: Object exposing ``name`` and ``download_to_filename(path)``.

    Returns:
        The extracted text, or ``""`` for unsupported file types.

    Raises:
        Exception: Extraction errors are logged and re-raised unchanged.
    """
    text = ""
    file_path = f"/tmp/{blob.name.split('/')[-1]}"
    try:
        blob.download_to_filename(file_path)
        # Lower-case once so ".PDF" or ".Txt" pick the same handler as ".pdf"/".txt".
        lowered = file_path.lower()

        try:
            if lowered.endswith('.pdf'):
                with fitz.open(file_path) as pdf:
                    text = "".join(page.get_text() for page in pdf)
            elif lowered.endswith('.docx'):
                document = docx.Document(file_path)
                text = "".join(paragraph.text + "\n" for paragraph in document.paragraphs)
            elif lowered.endswith('.xlsx'):
                wb = openpyxl.load_workbook(file_path)
                lines = []
                for sheet in wb.sheetnames:
                    for row in wb[sheet].iter_rows(values_only=True):
                        lines.append(' '.join(str(cell) for cell in row if cell is not None) + "\n")
                text = "".join(lines)
            elif lowered.endswith(('.html', '.htm')):
                with open(file_path, 'r', encoding='utf-8') as f:
                    text = BeautifulSoup(f, 'html.parser').get_text()
            elif lowered.endswith('.pptx'):
                presentation = Presentation(file_path)
                text = "".join(
                    shape.text + "\n"
                    for slide in presentation.slides
                    for shape in slide.shapes
                    if hasattr(shape, "text")
                )
            elif lowered.endswith(('.txt', '.md')):
                with open(file_path, 'r', encoding='utf-8') as f:
                    text = f.read()
            elif lowered.endswith('.xml'):
                root = ET.parse(file_path).getroot()
                text = ' '.join(elem.text for elem in root.iter() if elem.text)
        except Exception as e:
            logging.error(f"Error extracting text from {file_path}: {str(e)}")
            raise
    finally:
        # Best-effort cleanup: the temporary copy is only needed while parsing.
        try:
            os.remove(file_path)
        except OSError:
            logging.debug(f"Temporary file {file_path} could not be removed")

    return text




from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import logging
from datetime import datetime, timezone

# Timezone-aware UTC timestamp in ISO-8601 form. It is captured once, when
# this statement runs, and nothing else in the visible code reads it.
current_timestamp = datetime.now(timezone.utc).isoformat()

def save_to_user_index(user_id, embeddings, chunks):
    """Save embeddings and chunks as JSON in Cloud Storage and reference them in Firestore.

    The JSON document is written to ``<user_id>/vector_store/vector_store.json``
    (replacing any previous one) and its location plus metadata is stored in
    the ``user_embeddings`` collection.

    Args:
        user_id: Sanitized user identifier used as path / document ID.
        embeddings: One vector per chunk; a single 1-D vector is treated as a
            one-row matrix.
        chunks: The texts the embeddings were computed from, in the same order.

    Returns:
        ``{"status": "success" | "error", "message": str}``. Errors are
        reported through the return value; nothing is raised.
    """
    try:
        # Ensure embeddings are a NumPy array
        if not isinstance(embeddings, np.ndarray):
            embeddings = np.array(embeddings)

        # Reshape embeddings if they are 1D
        if embeddings.ndim == 1:
            embeddings = embeddings.reshape(1, -1)

        # Retrieval indexes chunks by embedding row, so the two must line up.
        if embeddings.shape[0] != len(chunks):
            raise ValueError(
                f"Mismatch between embeddings ({embeddings.shape[0]}) "
                f"and chunks ({len(chunks)})"
            )

        location = f"{user_id}/vector_store/vector_store.json"

        # Save to Cloud Storage (lists, because arrays are not JSON serializable)
        vector_store_json = json.dumps({
            "embeddings": embeddings.tolist(),
            "chunks": chunks,
        })
        blob = bucket_name.blob(location)
        blob.upload_from_string(vector_store_json, content_type="application/json")

        # Save metadata to Firestore
        vector_data = {
            "embeddings_location": location,
            "metadata": {
                # Timezone-aware UTC, matching the timestamps in the query logs.
                "timestamp": datetime.now(timezone.utc).isoformat(),
                "source": "user_upload"
            }
        }
        firestore_client.collection("user_embeddings").document(user_id).set(vector_data)
        logging.info(f"Embeddings and chunks saved to Cloud Storage for user_id: {user_id}")

        return {"status": "success", "message": "Embeddings saved successfully."}
    except Exception as e:
        logging.error(f"Error saving embeddings for user_id {user_id}: {str(e)}")
        return {"status": "error", "message": f"Error saving embeddings: {str(e)}"}




def get_relevant_context(user_id, query_embedding, top_k=5):
    """Return the stored chunks most similar to a query embedding.

    The location of the user's vector store is looked up in the
    ``user_embeddings`` Firestore collection, the JSON document is loaded
    from the bucket, and chunks are ranked by cosine similarity.

    Args:
        user_id: Sanitized user identifier.
        query_embedding: A single vector, or a 2-D batch whose first row is used.
        top_k: Number of best-matching chunks to return.

    Returns:
        A ``(joined_text, chunk_list)`` tuple. When nothing is stored for the
        user, or any error occurs, ``("No relevant data found.", [])``.
    """
    try:
        # Where does this user's vector store live?
        record = firestore_client.collection("user_embeddings").document(user_id)
        metadata = record.get().to_dict()
        if not metadata or "embeddings_location" not in metadata:
            logging.warning(f"No embeddings location found for user_id: {user_id}")
            return "No relevant data found.", []

        # Pull the vectors and their texts out of Cloud Storage
        store_blob = bucket_name.blob(metadata["embeddings_location"])
        store = json.loads(store_blob.download_as_text())
        stored_vectors = np.array(store["embeddings"])
        stored_chunks = store["chunks"]

        if stored_vectors.ndim != 2:
            raise ValueError(f"Stored embeddings must be 2D arrays. Found: {stored_vectors.ndim}D.")

        # Reduce the query to one vector
        query_vector = np.array(query_embedding)
        if query_vector.ndim == 2:
            query_vector = query_vector[0]
        elif query_vector.ndim != 1:
            raise ValueError("Query embedding must be a single vector or a 2D batch.")

        # Both sides must share the same embedding width
        if query_vector.shape[0] != stored_vectors.shape[1]:
            raise ValueError(
                f"Query embedding dimensions do not match stored embeddings: "
                f"Query {query_vector.shape[0]}, Stored {stored_vectors.shape[1]}"
            )

        # Rank by cosine similarity, best match first
        scores = cosine_similarity([query_vector], stored_vectors)[0]
        best_first = scores.argsort()[-top_k:][::-1]

        relevant_chunks = [stored_chunks[index] for index in best_first]
        return "\n".join(relevant_chunks), relevant_chunks

    except Exception as e:
        logging.error(f"Error retrieving relevant context for user_id {user_id}: {str(e)}")
        return "No relevant data found.", []



def log_user_query(user_id, query, relevant_text, response, embeddings):
    """Record one query/response exchange in Firestore and in a bucket backup.

    The entry is added to ``user_query_logs/<user_id>/logs`` and appended to
    ``<user_id>/query_logs/query_log.json`` in the bucket. Failures are
    logged and swallowed, so this function never raises.

    Args:
        user_id: Sanitized user identifier.
        query: The user's message.
        relevant_text: Context that was retrieved for the query.
        response: The reply sent back to the user.
        embeddings: Query embedding; NumPy arrays are flattened to a list.
    """
    try:
        # Nested arrays are flattened into a plain list before storing
        flat_embeddings = (
            embeddings.flatten().tolist()
            if isinstance(embeddings, np.ndarray)
            else embeddings
        )

        entry = {
            "query": query,
            "relevant_text": relevant_text,
            "response": response,
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "embeddings": flat_embeddings
        }

        # Primary copy: Firestore sub-collection
        user_log_doc = firestore_client.collection("user_query_logs").document(user_id)
        user_log_doc.collection("logs").add(entry)

        # Backup copy: one JSON document per user in Cloud Storage
        backup_blob = bucket_name.blob(f"{user_id}/query_logs/query_log.json")
        if backup_blob.exists():
            history = json.loads(backup_blob.download_as_text())
        else:
            history = {"logs": []}

        history["logs"].append(entry)
        backup_blob.upload_from_string(json.dumps(history, indent=4), content_type="application/json")

    except Exception as e:
        logging.error(f"Error logging query for user_id {user_id}: {str(e)}")

@app.route("/chat", methods=["POST"])
def chat():
    """Chat endpoint for user queries.

    Expects a JSON body with ``user_id`` and ``message``. The message is
    embedded, the most relevant stored chunks are retrieved, and they are
    added to the user's chat session together with the user's stored
    instructions before the message is sent to the model. The exchange is
    logged and the last four exchanges are kept in ``user_chat_history``.

    Returns:
        A JSON response: 200 with ``response`` on success, 400 when a field
        is missing, 500 on failure.
    """
    global model
    try:
        # silent=True turns a missing/invalid JSON body into None instead of an
        # exception, so the validation below can answer with 400.
        payload = request.get_json(silent=True)
        data = payload if isinstance(payload, dict) else {}
        raw_user_id = data.get("user_id")
        message = data.get("message")

        if not raw_user_id or not message:
            return jsonify({"error": "User ID and message are required"}), 400

        user_id = sanitize_user_id(raw_user_id)

        # Initialize user-specific chat session if it doesn't exist
        if user_id not in user_chat_sessions:
            # `model` only exists after /set_model has been called; fall back to
            # the same model that endpoint creates instead of failing.
            if "model" not in globals():
                model = genai.GenerativeModel(model_name="gemini-1.5-flash")
            user_chat_sessions[user_id] = model.start_chat(history=[])
            user_chat_history[user_id] = []

        # Retrieve user instructions from Firestore
        user_doc = firestore_client.collection("users").document(user_id).get()
        user_data = user_doc.to_dict() if user_doc.exists else {}
        # `or ""` guards against a stored null breaking the concatenation below.
        instructions = user_data.get("instructions", "") or ""

        # Encode the user query to find relevant context
        query_embedding = embedding_model.encode([message], convert_to_tensor=False)
        context_content, relevant_chunks = get_relevant_context(user_id, query_embedding, top_k=5)

        if context_content == "No relevant data found.":
            logging.warning(f"No relevant context found for user_id: {user_id}")

        # Prepare model chat session: the retrieved context and instructions
        # are injected into the history before the message is sent.
        chat_session = user_chat_sessions[user_id]
        chat_session.history.extend([
            {"role": "user", "parts": [message]},
            {"role": "model", "parts": [context_content + "\n" + instructions]}
        ])

        # Get the model's response
        response = chat_session.send_message(message)
        bot_response = response.text.strip() if response and response.text else "Error: No response from model"
        bot_response = remove_source_brackets(bot_response)

        # Log the query and response
        log_user_query(user_id, query=message, relevant_text=context_content, response=bot_response, embeddings=query_embedding)

        # Store chat history, keeping only the last 4 exchanges
        history = user_chat_history.setdefault(user_id, [])
        history.append({"user": message, "bot": bot_response})
        user_chat_history[user_id] = history[-4:]

        return jsonify({"response": bot_response}), 200

    except Exception as e:
        logging.error(f"Error in chat endpoint: {str(e)}")
        return jsonify({"error": f"Error generating response: {str(e)}"}), 500

# Start Flask's built-in server on port 34000, but only when this file is the
# program being executed (not when it is imported).
if __name__ == "__main__":
    app.run(port=34000)

  from tqdm.autonotebook import tqdm, trange
  checkpoint = torch.load(fp, map_location=device)
INFO:chromadb.telemetry.product.posthog:Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
DEBUG:chromadb.config:Starting component System
DEBUG:chromadb.config:Starting component Posthog
DEBUG:chromadb.config:Starting component OpenTelemetryClient
DEBUG:chromadb.config:Starting component SqliteDB
DEBUG:chromadb.config:Starting component QuotaEnforcer
DEBUG:chromadb.config:Starting component LocalSegmentManager
DEBUG:chromadb.config:Starting component SegmentAPI


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:34000
INFO:werkzeug:Press CTRL+C to quit
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): us.i.posthog.com:443
DEBUG:urllib3.connectionpool:https://us.i.posthog.com:443 "POST /batch/ HTTP/1.1" 200 15
INFO:werkzeug:127.0.0.1 - - [22/Nov/2024 01:57:26] "GET / HTTP/1.1" 200 -


In [2]:
# pip install git+https://github.com/openai/whisper.git

In [None]:
# pip install yt-dlp openai-whisper sentence-transformers flask flask-cors google-cloud-storage firebase-admin

In [3]:
# pip install openai-whisper  (the PyPI package named "whisper" is a different project)

In [4]:
# pip install ffmpeg

In [5]:
# pip install --upgrade pytube