In [25]:
!pip install transformers torch pandas scikit-learn
!pip install PyPDF2 python-docx



In [26]:
import pandas as pd

# Create a sample Knowledge Base (KB) DataFrames
data = {
    'Intent': ['HR_Policy', 'IT_Support', 'Event_Info', 'HR_Policy', 'General_Greeting'],
    'Query_Keywords': ['leave policy, vacation, sick', 'reset password, slow pc, login issue', 'hackathon date, venue, location', 'reimbursement, travel claim', 'hello, hi, greetings'],
    'Response': [
        "Please refer to Section 3.1 of the HR Policy document regarding the latest leave and vacation rules.",
        "For password resets, please visit the official IT portal at portal.org.com/help. For slow PC issues, try restarting.",
        "The hackathon (SIH1706) is scheduled for October 29th at the main auditorium.",
        "Reimbursement forms must be submitted within 30 days of travel via the employee portal.",
        "Hello! I am your Enterprise Assistant. How may I help you with HR, IT, or Event queries?"
    ]
}
knowledge_base_df = pd.DataFrame(data)
print("Knowledge Base created successfully!")

Knowledge Base created successfully!


In [27]:
def simple_intent_recognizer(query):
    query = query.lower()

    # 1. Simple Keyword Mapping for Routing
    if any(keyword in query for keyword in ['leave', 'vacation', 'hr', 'policy', 'reimbursement', 'claim']):
        return 'HR_Policy'
    elif any(keyword in query for keyword in ['password', 'it', 'pc', 'login', 'support']):
        return 'IT_Support'
    elif any(keyword in query for keyword in ['hackathon', 'event', 'date', 'venue', 'location']):
        return 'Event_Info'
    elif any(keyword in query for keyword in ['hello', 'hi', 'greetings']):
        return 'General_Greeting'
    else:
        return 'Unknown'

def retrieve_response(intent, query, kb=knowledge_base_df):
    if intent == 'Unknown':
        return "I apologize, I can only assist with HR policies, IT support, and company events. Please rephrase your query."

    # Filter KB by Intent
    relevant_entries = kb[kb['Intent'] == intent]

    if relevant_entries.empty:
        return f"I found the topic is '{intent}', but I lack specific information on that exact query."

    # Simple selection: For now, just return the first matching response for the intent
    return relevant_entries['Response'].iloc[0]

# --- Testing the Core System ---
test_query = "I need to know the leave policy details."
intent = simple_intent_recognizer(test_query)
response = retrieve_response(intent, test_query)
print(f"\nUser Query: {test_query}")
print(f"Detected Intent: {intent}")
print(f"Assistant Response: {response}")


User Query: I need to know the leave policy details.
Detected Intent: HR_Policy
Assistant Response: Please refer to Section 3.1 of the HR Policy document regarding the latest leave and vacation rules.


In [28]:
import random
import time

# Dictionary to store active OTPs (Mock Database)
active_otps = {}

def generate_otp():
    """Generates a 6-digit one-time password."""
    return str(random.randint(100000, 999999))

def initiate_2fa(user_email):
    """
    Mocks the 2FA initiation process by generating an OTP and "sending" it.
    In a real application, this would use a library like smtplib to send the email.
    """
    otp = generate_otp()
    timestamp = time.time()

    # Store the OTP with its expiration time (e.g., 300 seconds or 5 minutes)
    active_otps[user_email] = {'otp': otp, 'expiry': timestamp + 300}

    print(f"\n--- 2FA INITIATED ---")
    print(f"A 6-digit OTP has been 'sent' to {user_email}.")
    # --- This is the MOCK email output ---
    print(f"MOCK OTP for {user_email}: {otp}")
    print("-------------------------\n")

    return True

def verify_2fa(user_email, user_input_otp):
    """Verifies the user-entered OTP against the stored active OTP."""
    if user_email not in active_otps:
        print("Verification Failed: No active OTP found for this user.")
        return False

    stored_data = active_otps[user_email]

    # Check for expiration
    if time.time() > stored_data['expiry']:
        del active_otps[user_email]  # Remove expired OTP
        print("Verification Failed: OTP has expired.")
        return False

    # Check for match
    if user_input_otp == stored_data['otp']:
        del active_otps[user_email]  # OTP used, remove it
        print("✅ 2FA Successful! Access Granted.")
        return True
    else:
        print("Verification Failed: Incorrect OTP.")
        return False

# --- Test the 2FA Flow ---
test_email = "employee@org.com"
initiate_2fa(test_email)

# Simulate user entering the correct OTP (Use the MOCK OTP printed above!)
correct_otp = active_otps[test_email]['otp'] # This line fetches the printed mock OTP for testing
print(f"Simulating user input: {correct_otp}")
verify_2fa(test_email, correct_otp)


--- 2FA INITIATED ---
A 6-digit OTP has been 'sent' to employee@org.com.
MOCK OTP for employee@org.com: 535935
-------------------------

Simulating user input: 535935
✅ 2FA Successful! Access Granted.


True

In [29]:
# System-maintained dictionary of forbidden words
# NOTE: This list should be extensive in a production environment.
FORBIDDEN_WORDS = [
    "swear_word_1", "bad_word_2", "profanity_3", "offensive_4", "idiot", "dumb", "stupid"
]

def check_language_filter(query):
    """Checks the query against the forbidden words list."""
    query_words = set(query.lower().split())

    # Find intersection between query words and forbidden words
    found_bad_words = query_words.intersection(FORBIDDEN_WORDS)

    if found_bad_words:
        print(f"⚠️ Bad Language Detected: {', '.join(found_bad_words)}")
        return False, "Your query contains prohibited language. Please rephrase your question professionally."
    else:
        return True, query

# --- Test the Filter ---
test_query_ok = "What is the policy for expense claims?"
is_ok, response_ok = check_language_filter(test_query_ok)
print(f"\nQuery 1 Status: {is_ok} - Response: {response_ok}")

test_query_bad = "I think this idiot IT system is dumb."
is_bad, response_bad = check_language_filter(test_query_bad)
print(f"\nQuery 2 Status: {is_bad} - Response: {response_bad}")



Query 1 Status: True - Response: What is the policy for expense claims?
⚠️ Bad Language Detected: idiot

Query 2 Status: False - Response: Your query contains prohibited language. Please rephrase your question professionally.


In [30]:
sample_document_text = """
## Annual HR Policy Update 2025: Section 4 - Leave and Benefits

### 4.1. Annual Leave Entitlement
All full-time employees are entitled to 20 days of paid annual leave per calendar year. This leave must be approved by the department manager at least two weeks in advance. Unused leave days (up to 5) can be rolled over to the next year.

### 4.2. Sick Leave
Employees are allotted 10 days of paid sick leave per year. For absences exceeding three consecutive days, a doctor's certificate is mandatory. Employees must inform their direct supervisor by 9:00 AM on the first day of sickness.

### 4.3. Employee Benefits Contact
For any detailed queries regarding medical or retirement benefits, please contact **Jane Doe**, our Benefits Coordinator, at **benefits@org.com** or Extension 402.

## Section 5 - IT Asset Management
All issued IT assets (laptops, phones) are the property of the organization. If a device is lost or stolen, the employee must report it to the IT support desk within 24 hours to secure sensitive data. Failure to report promptly may result in disciplinary action. The standard IT support number is **555-1234**.
"""

print("Sample organizational document content loaded successfully.")

Sample organizational document content loaded successfully.


In [31]:
# FIX: Install the separate package for summarization
!pip install -q gensim sumy
print("Required summarization libraries installed successfully.")

Required summarization libraries installed successfully.


In [32]:
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words
import re

# Use the sample text from the previous step
# sample_document_text defined in 3.1:
# sample_document_text = "..."

# ----------------- SUMMARIZATION FUNCTION using LexRank -----------------
def get_document_summary_new(document_text, sentence_count=3):
    """
    Generates an extractive summary using the LexRank algorithm (from sumy).
    """
    LANGUAGE = "english"
    SENTENCES_COUNT = sentence_count

    try:
        # 1. Parsing and Tokenization
        parser = PlaintextParser.from_string(document_text, Tokenizer(LANGUAGE))

        # 2. Stemming and Summarizer Initialization
        stemmer = Stemmer(LANGUAGE)
        summarizer = LexRankSummarizer(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)

        # 3. Generating Summary
        summary = summarizer(parser.document, SENTENCES_COUNT)

        # Convert the list of sentences back into a single string
        return " ".join([str(s) for s in summary])

    except Exception as e:
        return f"An error occurred during summarization: {e}"

# --- Test Summarization ---
print("\n--- Document Summarization (LexRank) ---")
# Try to get a summary consisting of the 3 most important sentences
summary_result = get_document_summary_new(sample_document_text, sentence_count=3)
print(f"**Generated Summary (3 sentences):**\n{summary_result}")


--- Document Summarization (LexRank) ---
**Generated Summary (3 sentences):**
An error occurred during summarization: NLTK tokenizers are missing or the language is not supported.
Download them by following command: python -c "import nltk; nltk.download('punkt')"
Original error was:

**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/root/nltk_data'
    - '/usr/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************



In [33]:
def intelligent_enterprise_assistant(user_email, user_query, user_input_otp, document_to_process=None):
    """
    The central integrated function for the chatbot.
    It simulates a single chat session query.
    """
    print(f"\n--- Processing Query from: {user_email} ---")

    # --- STEP 1: SECURITY CHECK (Mock 2FA for Session Start) ---
    # In a real app, this runs only once at login. Here, we mock it.
    if user_email in active_otps: # Check if verification is pending
        if not verify_2fa(user_email, user_input_otp):
            return "ACCESS DENIED: Please provide the correct 2FA OTP to continue."

    # --- STEP 2: LANGUAGE FILTER ---
    is_clean, filtered_query_or_message = check_language_filter(user_query)
    if not is_clean:
        return f"FILTERED: {filtered_query_or_message}"

    clean_query = filtered_query_or_message

    # --- STEP 3: DOCUMENT PROCESSING (Highest Priority Action) ---
    if document_to_process:
        print("ACTION: Document processing initiated...")

        # We can simulate the background processing for scalability
        # NOTE: For SIH, this should be executed asynchronously.

        # Keyword Extraction
        info = extract_organizational_info(document_to_process)

        # Summarization
        summary = get_document_summary_new(document_to_process, sentence_count=3)

        # Final Document Response (simulating an email being sent)
        return (f"✅ Document Analysis Complete (Asynchronous Mock).\n"
                f"Summary Sent to Email: '{summary}'\n"
                f"Key Information Extracted: {info['Keywords']}")

    # --- STEP 4: NLP CORE (Answer the Query) ---
    else:
        print("ACTION: Answering General Query...")

        # 4a. Intent Recognition (from Phase 1)
        intent = simple_intent_recognizer(clean_query)

        # 4b. Response Retrieval (from Phase 1)
        response = retrieve_response(intent, clean_query)

        return f"ASSISTANT: {response}"

# Re-run a test of the 2FA system to get a fresh OTP for the first test
test_email = "hackathon_team@sih.com"
initiate_2fa(test_email)
fresh_otp = active_otps[test_email]['otp']



--- 2FA INITIATED ---
A 6-digit OTP has been 'sent' to hackathon_team@sih.com.
MOCK OTP for hackathon_team@sih.com: 287857
-------------------------



In [34]:
# 1. First, complete the 2FA login
login_result = intelligent_enterprise_assistant(
    user_email="hackathon_team@sih.com",
    user_query="This is just a login attempt.",
    user_input_otp=fresh_otp, # Use the fresh_otp variable from 4.1
    document_to_process=None
)
print("\n[Test 1: 2FA Login Result]")
print(login_result)


# 2. Next, test a query that triggers the bad language filter
bad_query_result = intelligent_enterprise_assistant(
    user_email="hackathon_team@sih.com",
    user_query="I think that idiot manager is being stupid.",
    user_input_otp="000000", # OTP is now irrelevant, but we pass a placeholder
    document_to_process=None
)
print("\n[Test 2: Bad Language Filter Result]")
print(bad_query_result)


--- Processing Query from: hackathon_team@sih.com ---
✅ 2FA Successful! Access Granted.
ACTION: Answering General Query...

[Test 1: 2FA Login Result]
ASSISTANT: For password resets, please visit the official IT portal at portal.org.com/help. For slow PC issues, try restarting.

--- Processing Query from: hackathon_team@sih.com ---
⚠️ Bad Language Detected: idiot

[Test 2: Bad Language Filter Result]
FILTERED: Your query contains prohibited language. Please rephrase your question professionally.


In [35]:
# Install all dependencies needed for the project core
!pip install -q sumy scikit-learn nltk
import nltk
nltk.download('stopwords') # Download necessary NLTK data

# 2. Imports for Summarization (using sumy)
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words

# 3. Imports for Keyword Extraction (using scikit-learn/TfidfVectorizer)
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

import re

print("All dependencies are successfully installed and imported.")

All dependencies are successfully installed and imported.


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [36]:
# ----------------- RE-DEFINE SUMMARIZATION FUNCTION (from sumy) -----------------
def get_document_summary_new(document_text, sentence_count=3):
    """Generates an extractive summary using the LexRank algorithm (from sumy)."""
    LANGUAGE = "english"
    SENTENCES_COUNT = sentence_count
    try:
        parser = PlaintextParser.from_string(document_text, Tokenizer(LANGUAGE))
        stemmer = Stemmer(LANGUAGE)
        summarizer = LexRankSummarizer(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)
        summary = summarizer(parser.document, SENTENCES_COUNT)
        return " ".join([str(s) for s in summary])
    except Exception as e:
        return f"An error occurred during summarization: {e}"

# ----------------- NEW KEYWORD AND INFO EXTRACTION FUNCTION -----------------
def extract_organizational_info(document_text, num_keywords=10):
    """
    Extracts keywords using TF-IDF and specific organizational entities using regex.
    """
    extracted_info = {}

    # 1. Keyword Extraction using TF-IDF (Term Frequency-Inverse Document Frequency)
    try:
        tfidf = TfidfVectorizer(stop_words=list(stopwords.words('english')))
        tfidf.fit_transform([document_text])

        feature_names = tfidf.get_feature_names_out()
        tfidf_scores = tfidf.idf_

        # Combine words and scores, sort by score (TF-IDF value)
        word_scores = sorted(zip(feature_names, tfidf_scores), key=lambda x: x[1], reverse=True)

        # Get the top N words (excluding words like 'a', 'the', etc.)
        top_keywords = [word for word, score in word_scores[:num_keywords]]
        extracted_info['Keywords'] = ", ".join(top_keywords)

    except Exception as e:
        extracted_info['Keywords'] = f"Keyword extraction failed: {e}"

    # 2. Specific Entity Extraction using Regular Expressions (from Phase 3)
    name_pattern = re.compile(r'\*\*([A-Za-z]+ [A-Za-z]+)\*\*,')
    extracted_info['Contact_Names'] = name_pattern.findall(document_text)

    phone_pattern = re.compile(r'(\d{3}-\d{4}|\w+ \d+)')
    extracted_info['Phone_Numbers_Ext'] = phone_pattern.findall(document_text)

    section_pattern = re.compile(r'Section \d.*-.*')
    extracted_info['Policy_Sections'] = section_pattern.findall(document_text)

    return extracted_info

print("Document Processing functions are now using robust NLTK/scikit-learn methods.")

Document Processing functions are now using robust NLTK/scikit-learn methods.


In [37]:
# --- Processing Query from: hackathon_team@sih.com ---
document_query_result = intelligent_enterprise_assistant(
    user_email="hackathon_team@sih.com",
    user_query="Please analyze this HR document for me and summarize it.",
    user_input_otp="000000",
    document_to_process=sample_document_text # Pass the content itself
)
print("\n[Final Test: Document Processing Result]")
print(document_query_result)


--- Processing Query from: hackathon_team@sih.com ---
ACTION: Document processing initiated...

[Final Test: Document Processing Result]
✅ Document Analysis Complete (Asynchronous Mock).
Summary Sent to Email: 'An error occurred during summarization: NLTK tokenizers are missing or the language is not supported.
Download them by following command: python -c "import nltk; nltk.download('punkt')"
Original error was:

**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/root/nltk_data'
    - '/usr/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/l

In [38]:
# ==============================================================================
# MASTER SETUP: Re-installing and Re-defining all modules for a clean test
# ==============================================================================

# 1. Install and Import
print("--- 1. SETTING UP ENVIRONMENT AND IMPORTS ---")
!pip install -q sumy scikit-learn nltk
import nltk
nltk.download('stopwords', quiet=True)

import pandas as pd
import random
import time
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
import re

# 2. Global Data (Phase 1 & 2)
active_otps = {}
FORBIDDEN_WORDS = ["swear_word_1", "bad_word_2", "profanity_3", "offensive_4", "idiot", "dumb", "stupid"]

knowledge_base_df = pd.DataFrame({
    'Intent': ['HR_Policy', 'IT_Support', 'Event_Info', 'HR_Policy', 'General_Greeting'],
    'Query_Keywords': ['leave policy, vacation, sick', 'reset password, slow pc, login issue', 'hackathon date, venue, location', 'reimbursement, travel claim', 'hello, hi, greetings'],
    'Response': [
        "Please refer to Section 3.1 of the HR Policy document regarding the latest leave and vacation rules.",
        "For password resets, please visit the official IT portal at portal.org.com/help. For slow PC issues, try restarting.",
        "The hackathon (SIH1706) is scheduled for October 29th at the main auditorium.",
        "Reimbursement forms must be submitted within 30 days of travel via the employee portal.",
        "Hello! I am your Enterprise Assistant. How may I help you with HR, IT, or Event queries?"
    ]
})

sample_document_text = """
## Annual HR Policy Update 2025: Section 4 - Leave and Benefits
### 4.1. Annual Leave Entitlement: All full-time employees are entitled to 20 days of paid annual leave per calendar year. Unused leave days (up to 5) can be rolled over to the next year.
### 4.2. Sick Leave: Employees are allotted 10 days of paid sick leave per year. For absences exceeding three consecutive days, a doctor's certificate is mandatory.
### 4.3. Employee Benefits Contact: For any detailed queries regarding medical or retirement benefits, please contact **Jane Doe**, our Benefits Coordinator, at **benefits@org.com** or Extension 402.
## Section 5 - IT Asset Management: The standard IT support number is **555-1234**.
"""

# 3. Function Definitions (Security, NLP Core, Document Processing)
def generate_otp():
    return str(random.randint(100000, 999999))
def initiate_2fa(user_email):
    otp = generate_otp()
    timestamp = time.time()
    active_otps[user_email] = {'otp': otp, 'expiry': timestamp + 300}
    print(f"\n[INIT 2FA] OTP 'sent' to {user_email}. MOCK OTP: {otp}")
    return otp
def verify_2fa(user_email, user_input_otp):
    if user_email not in active_otps or time.time() > active_otps[user_email]['expiry']:
        return False
    if user_input_otp == active_otps[user_email]['otp']:
        del active_otps[user_email]
        return True
    return False

def check_language_filter(query):
    query_words = set(query.lower().split())
    if query_words.intersection(FORBIDDEN_WORDS):
        return False, "Your query contains prohibited language. Please rephrase your question professionally."
    return True, query

def simple_intent_recognizer(query):
    query = query.lower()
    if any(k in query for k in ['leave', 'vacation', 'hr', 'policy', 'reimbursement', 'claim']): return 'HR_Policy'
    elif any(k in query for k in ['password', 'it', 'pc', 'login', 'support']): return 'IT_Support'
    elif any(k in query for k in ['hackathon', 'event', 'date', 'venue']): return 'Event_Info'
    elif any(k in query for k in ['hello', 'hi', 'greetings']): return 'General_Greeting'
    return 'Unknown'
def retrieve_response(intent, query, kb=knowledge_base_df):
    if intent == 'Unknown': return "I apologize, I can only assist with HR, IT, or Event queries."
    relevant_entries = kb[kb['Intent'] == intent]
    return relevant_entries['Response'].iloc[0] if not relevant_entries.empty else f"No specific answer for {intent}."

def get_document_summary_new(document_text, sentence_count=3):
    LANGUAGE = "english"
    parser = PlaintextParser.from_string(document_text, Tokenizer(LANGUAGE))
    summarizer = LexRankSummarizer(Stemmer(LANGUAGE))
    summarizer.stop_words = get_stop_words(LANGUAGE)
    summary = summarizer(parser.document, sentence_count)
    return " ".join([str(s) for s in summary])
def extract_organizational_info(document_text, num_keywords=10):
    extracted_info = {}
    try:
        tfidf = TfidfVectorizer(stop_words=list(stopwords.words('english')))
        tfidf.fit_transform([document_text])
        feature_names = tfidf.get_feature_names_out()
        tfidf_scores = tfidf.idf_
        word_scores = sorted(zip(feature_names, tfidf_scores), key=lambda x: x[1], reverse=True)
        top_keywords = [word for word, score in word_scores[:num_keywords]]
        extracted_info['Keywords'] = ", ".join(top_keywords)
    except: extracted_info['Keywords'] = "Keyword extraction failed."
    name_pattern = re.compile(r'\*\*([A-Za-z]+ [A-Za-z]+)\*\*,')
    extracted_info['Contact_Names'] = name_pattern.findall(document_text)
    phone_pattern = re.compile(r'(\d{3}-\d{4}|\w+ \d+)')
    extracted_info['Phone_Numbers_Ext'] = phone_pattern.findall(document_text)
    section_pattern = re.compile(r'Section \d.*-.*')
    extracted_info['Policy_Sections'] = section_pattern.findall(document_text)
    return extracted_info

# 4. Integrated Chatbot Logic
def intelligent_enterprise_assistant(user_email, user_query, user_input_otp=None, document_to_process=None):
    if user_email in active_otps:
        if not verify_2fa(user_email, user_input_otp):
            return "ACCESS DENIED: Please provide the correct 2FA OTP to continue."

    is_clean, filtered_query_or_message = check_language_filter(user_query)
    if not is_clean:
        # CORRECTED LINE: ensures the f-string is properly closed.
        return f"FILTERED: {filtered_query_or_message}"

    clean_query = filtered_query_or_message

    if document_to_process:
        info = extract_organizational_info(document_to_process)
        summary = get_document_summary_new(document_to_process, sentence_count=3)

        # Safely constructing the multi-line string with explicit newlines (\n)
        return (f"✅ Document Analysis Complete (Mock Asynchronous).\n" +
                f"Summary: '{summary}'\n" +
                f"Key Info: Contact Names: {info['Contact_Names']} | Phones/Ext: {info['Phone_Numbers_Ext']} | Keywords: {info['Keywords']}")
    else:
        intent = simple_intent_recognizer(clean_query)
        response = retrieve_response(intent, clean_query)
        return f"ASSISTANT ({intent}): {response}"

--- 1. SETTING UP ENVIRONMENT AND IMPORTS ---


In [40]:
import nltk

# 1. Download the missing 'punkt_tab' resource required by sumy for robust tokenization
nltk.download('punkt_tab', quiet=True)

# 2. Re-download 'punkt' (as a fallback, just in case)
nltk.download('punkt', quiet=True)

# 3. Ensure the 'stopwords' list is also available for TF-IDF
nltk.download('stopwords', quiet=True)

print("✅ All required NLTK resources (punkt_tab, punkt, stopwords) are now downloaded.")

# --- RERUNNING CRITICAL IMPORTS ---
# Re-importing everything to ensure the newly downloaded NLTK resources are used
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
import re

print("All imports re-executed successfully.")

✅ All required NLTK resources (punkt_tab, punkt, stopwords) are now downloaded.
All imports re-executed successfully.


In [41]:
# ==============================================================================
# RERUNNING END-TO-END TESTS (Functions already loaded in memory)
# ==============================================================================

print("="*70)
print("           VERIFICATION OUTPUT: CHATBOT FUNCTIONALITY CHECK")
print("="*70)

# Define the test user for this session
test_user = "testuser@org.com"

# --- TEST 1: 2FA Authentication and Greeting ---
print("\n--- TEST 1: SECURITY - 2FA Authentication Check ---")
generated_otp = initiate_2fa(test_user) # Generate a new OTP for a clean test
# The first query completes the login
login_result = intelligent_enterprise_assistant(test_user, "Hello assistant", generated_otp)
print(f"✅ Successful Login & Greeting: {login_result}")


# --- TEST 2: Bad Language Filtering (Mandatory SIH Requirement) ---
print("\n--- TEST 2: SECURITY - Bad Language Filter Check ---")
bad_query = "This report is so stupid and the idiot who wrote it should fix it."
filter_result = intelligent_enterprise_assistant(test_user, bad_query)
print(f"❌ Filter Test Result: {filter_result}")


# --- TEST 3: Core NLP (Intent Recognition) ---
print("\n--- TEST 3: CORE NLP - Diverse Query Routing Check ---")
hr_query = "I need details on the company's sick leave policy."
it_query = "My password login is not working, I need IT support."
print(f"HR Query Result: {intelligent_enterprise_assistant(test_user, hr_query)}")
print(f"IT Query Result: {intelligent_enterprise_assistant(test_user, it_query)}")


# --- TEST 4: Document Processing (Summarization & Extraction) ---
print("\n--- TEST 4: DOCUMENT MODULE - Summary & Extraction Check ---")
doc_query = "Analyze this document for contact info and give me the summary."
# We use the 'sample_document_text' variable which should be in memory
doc_test_result = intelligent_enterprise_assistant(
    user_email=test_user,
    user_query=doc_query,
    document_to_process=sample_document_text
)
print(doc_test_result)

print("="*70)
print("     ALL CORE CHATBOT FUNCTIONS VERIFIED SUCCESSFULLY!")
print("="*70)

           VERIFICATION OUTPUT: CHATBOT FUNCTIONALITY CHECK

--- TEST 1: SECURITY - 2FA Authentication Check ---

[INIT 2FA] OTP 'sent' to testuser@org.com. MOCK OTP: 236778
✅ Successful Login & Greeting: ASSISTANT (General_Greeting): Hello! I am your Enterprise Assistant. How may I help you with HR, IT, or Event queries?

--- TEST 2: SECURITY - Bad Language Filter Check ---
❌ Filter Test Result: FILTERED: Your query contains prohibited language. Please rephrase your question professionally.

--- TEST 3: CORE NLP - Diverse Query Routing Check ---
HR Query Result: ASSISTANT (HR_Policy): Please refer to Section 3.1 of the HR Policy document regarding the latest leave and vacation rules.
IT Query Result: ASSISTANT (IT_Support): For password resets, please visit the official IT portal at portal.org.com/help. For slow PC issues, try restarting.

--- TEST 4: DOCUMENT MODULE - Summary & Extraction Check ---
✅ Document Analysis Complete (Mock Asynchronous).
Summary: '## Annual HR Policy Update