In [4]:
import random
import string
import numpy as np
%pip install --upgrade nltk
import nltk
nltk.download('punkt_tab')
from nltk.tokenize import sent_tokenize




[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [5]:
import pandas as pd

df = pd.read_csv("/content/extracted_entitiesab.csv")

print(df.head())

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

# Function to preprocess text (tokenization, lemmatization, punctuation removal)
def preprocess_text(text):
    # Tokenize text
    word_tokens = word_tokenize(text.lower())  # Convert to lowercase and tokenize

    # Lemmatize and remove punctuation
    cleaned_text = [lemmatizer.lemmatize(word) for word in word_tokens if word not in string.punctuation]

    return ' '.join(cleaned_text)

# Apply preprocessing to 'Description' column
df['Cleaned_Description'] = df['Description'].apply(lambda x: preprocess_text(x) if pd.notna(x) else '')

# Preview the cleaned text
print(df[['Description', 'Cleaned_Description']].head())

import spacy

# Load pre-trained spaCy model
nlp = spacy.load("en_core_web_sm")

# Function to extract named entities
def extract_entities(text):
    doc = nlp(text)
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    return entities

# Apply NER to the cleaned descriptions
df['Entities'] = df['Cleaned_Description'].apply(extract_entities)

# Preview the data with extracted entities
print(df[['Description', 'Cleaned_Description', 'Entities']].head())



                                              Entity Label  \
0  INTERNATIONAL APPLICATION PUBLISHED UNDER THE ...   ORG   
1                                                PCT   ORG   
2  World Intellectual Property Organization Inter...   ORG   
3                   International Publication Number   ORG   
4                  International Publication Date WO   ORG   

                                         Description  
0  (12) INTERNATIONAL APPLICATION PUBLISHED UNDER...  
1  (12) INTERNATIONAL APPLICATION PUBLISHED UNDER...  
2  (12) INTERNATIONAL APPLICATION PUBLISHED UNDER...  
3  (12) INTERNATIONAL APPLICATION PUBLISHED UNDER...  
4  (12) INTERNATIONAL APPLICATION PUBLISHED UNDER...  


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


                                         Description  \
0  (12) INTERNATIONAL APPLICATION PUBLISHED UNDER...   
1  (12) INTERNATIONAL APPLICATION PUBLISHED UNDER...   
2  (12) INTERNATIONAL APPLICATION PUBLISHED UNDER...   
3  (12) INTERNATIONAL APPLICATION PUBLISHED UNDER...   
4  (12) INTERNATIONAL APPLICATION PUBLISHED UNDER...   

                                 Cleaned_Description  
0  12 international application published under t...  
1  12 international application published under t...  
2  12 international application published under t...  
3  12 international application published under t...  
4  12 international application published under t...  
                                         Description  \
0  (12) INTERNATIONAL APPLICATION PUBLISHED UNDER...   
1  (12) INTERNATIONAL APPLICATION PUBLISHED UNDER...   
2  (12) INTERNATIONAL APPLICATION PUBLISHED UNDER...   
3  (12) INTERNATIONAL APPLICATION PUBLISHED UNDER...   
4  (12) INTERNATIONAL APPLICATION PUBLISHED UNDER... 

In [6]:
from textblob import TextBlob

# Function to get sentiment polarity (positive, negative, neutral)
def get_sentiment(text):
    analysis = TextBlob(text)
    return analysis.sentiment.polarity  # Sentiment score (-1 to 1)

# Apply sentiment analysis
df['Sentiment'] = df['Cleaned_Description'].apply(get_sentiment)

# Preview sentiment scores
print(df[['Description', 'Sentiment']].head())

df.to_csv("processed_entities.csv", index=False)

print("Processed data saved to 'processed_entities.csv'")

                                         Description  Sentiment
0  (12) INTERNATIONAL APPLICATION PUBLISHED UNDER...   0.037500
1  (12) INTERNATIONAL APPLICATION PUBLISHED UNDER...   0.075705
2  (12) INTERNATIONAL APPLICATION PUBLISHED UNDER...   0.037500
3  (12) INTERNATIONAL APPLICATION PUBLISHED UNDER...   0.037500
4  (12) INTERNATIONAL APPLICATION PUBLISHED UNDER...   0.037500
Processed data saved to 'processed_entities.csv'


In [7]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from heapq import nlargest

def summarize_text(text, num_sentences=2):
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(text)

    word_frequencies = {}
    for word in doc:
        if word.text.lower() not in STOP_WORDS and word.text.lower() not in punctuation:
            word_frequencies[word.text.lower()] = word_frequencies.get(word.text.lower(), 0) + 1

    max_frequency = max(word_frequencies.values())
    for word in word_frequencies:
        word_frequencies[word] = word_frequencies[word] / max_frequency

    sentence_scores = {}
    for sent in doc.sents:
        for word in sent:
            if word.text.lower() in word_frequencies:
                sentence_scores[sent] = sentence_scores.get(sent, 0) + word_frequencies[word.text.lower()]

    summary_sentences = nlargest(num_sentences, sentence_scores, key=sentence_scores.get)
    summary = " ".join([str(sent) for sent in summary_sentences])
    return summary

In [8]:
%pip install sentence-transformers



In [9]:
import pandas as pd
data = pd.read_csv('/content/processed_entities.csv')
print(data.head())

def LemTokens(tokens):
    return [lemmatizer.lemmatize(token) for token in tokens]

remove_punc_dict = dict((ord(punct), None) for punct in string.punctuation)

def LemNormalize(text):
    return LemTokens(nltk.word_tokenize(text.lower().translate(remove_punc_dict)))

data['Description'] = data['Description'].fillna('').astype(str)

from sentence_transformers import SentenceTransformer

preprocessed_sentences = [LemNormalize(sentence) for sentence in data['Description']]
preprocessed_sentences_str = [' '.join(sentence) for sentence in preprocessed_sentences]

model = SentenceTransformer('all-MiniLM-L6-v2')

def process_user_query(user_query):
    query_processed = LemNormalize(user_query)
    query_str = ' '.join(query_processed)
    query_embedding = model.encode([query_str])
    if not isinstance(query_embedding, np.ndarray):
        query_embedding = np.array(query_embedding)
    return query_embedding

user_query = input("Enter your query (or type 'exit' to quit): ")

query_embedding = process_user_query(user_query).astype(np.float32)
sentence_embeddings = model.encode(preprocessed_sentences_str).astype(np.float32)


print("Sentence embeddings:", sentence_embeddings)
print("Query Embedding:", query_embedding)


                                              Entity Label  \
0  INTERNATIONAL APPLICATION PUBLISHED UNDER THE ...   ORG   
1                                                PCT   ORG   
2  World Intellectual Property Organization Inter...   ORG   
3                   International Publication Number   ORG   
4                  International Publication Date WO   ORG   

                                         Description  \
0  (12) INTERNATIONAL APPLICATION PUBLISHED UNDER...   
1  (12) INTERNATIONAL APPLICATION PUBLISHED UNDER...   
2  (12) INTERNATIONAL APPLICATION PUBLISHED UNDER...   
3  (12) INTERNATIONAL APPLICATION PUBLISHED UNDER...   
4  (12) INTERNATIONAL APPLICATION PUBLISHED UNDER...   

                                 Cleaned_Description  \
0  12 international application published under t...   
1  12 international application published under t...   
2  12 international application published under t...   
3  12 international application published under t...   
4  12 inte

  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Enter your query (or type 'exit' to quit): uav
Sentence embeddings: [[-0.06737203  0.04754766  0.00790684 ... -0.07381594 -0.06231912
   0.00258845]
 [-0.09162535  0.11541863 -0.02089767 ...  0.00305743 -0.0275792
   0.04578074]
 [-0.06737202  0.04754765  0.00790682 ... -0.07381594 -0.06231911
   0.00258848]
 ...
 [-0.06515896  0.06558628 -0.01483229 ...  0.09415536 -0.08263117
  -0.07280099]
 [-0.06515896  0.06558628 -0.01483229 ...  0.09415536 -0.08263117
  -0.07280099]
 [-0.04509854  0.0688794   0.04400564 ... -0.06060551 -0.0402807
   0.01775094]]
Query Embedding: [[ 2.84795910e-02  2.18813345e-02 -4.45201285e-02 -4.89867991e-03
  -4.24737968e-02  8.17566738e-03  4.04779091e-02 -6.02429360e-02
  -3.79130952e-02  1.25582978e-01  6.87636733e-02  1.79092139e-02
   3.67567055e-02  3.73019017e-02 -1.30240032e-02  1.08865352e-04
  -1.22875040e-02 -9.63344425e-02 -2.84022167e-02  2.54526343e-02
  -5.06005697e-02  1.16227604e-01 -2.56736372e-02  1.65580902e-02
  -4.55573760e-02  3.79199646

In [10]:
from sklearn.metrics.pairwise import cosine_similarity

similarity_scores = cosine_similarity(query_embedding, sentence_embeddings)

top_n = 10
top_n_indices = np.argsort(similarity_scores[0])[::-1][:top_n]

print(f"Top {top_n} most relevant sentences for your query:")
for idx in top_n_indices:
    print(f"Sentence: {preprocessed_sentences_str[idx]}")
    print(f"Similarity score: {similarity_scores[0][idx]}")
    print("-" * 80)

Top 10 most relevant sentences for your query:
Sentence: the uav 100 may also be controlled in a manual mode wherein it fly according to an alternative primary route r1 ’ defined in realtime by control command re ceived via the wireless command link 115
Similarity score: 0.5411809682846069
--------------------------------------------------------------------------------
Sentence: the uav 100 may also be controlled in a manual mode wherein it fly according to an alternative primary route r1 ’ defined in realtime by control command re ceived via the wireless command link 115
Similarity score: 0.5411809682846069
--------------------------------------------------------------------------------
Sentence: the uav 100 may also be controlled in a manual mode wherein it fly according to an alternative primary route r1 ’ defined in realtime by control command re ceived via the wireless command link 115
Similarity score: 0.5411809682846069
-----------------------------------------------------------

In [14]:
%pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.5/27.5 MB[0m [31m55.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.9.0.post1


In [11]:
entity_embeddings = model.encode(data['Entity'].tolist())
entity_embeddings = np.array(entity_embeddings).astype('float32')

import faiss
index = faiss.IndexFlatL2(entity_embeddings.shape[1])
index.add(entity_embeddings)


def search_with_faiss(query_embedding, index, k=5):
    D, I = index.search(np.array(query_embedding).astype('float32'), k)
    return I, D

top_indices, distances = search_with_faiss(query_embedding, index)

for idx in top_indices[0]:
  print(f"Entity: {data.iloc[idx]['Entity']}, Label: {data.iloc[idx]['Label']}, Description: {data.iloc[idx]['Cleaned_Description']}")

Entity: UAV, Label: ORG, Description: the invention relates to remote control of an un- manned aerial vehicle uav 100 from a control station 110 by mean of a wireless command link 115 the uav 100 may be controlled in an autonomous mode wherein it fly according to a primary route r1 r1 ’ defined by a first set of predefined waypoints wp1-wp8 ip the uav 100 may also be controlled in a manual mode wherein it fly according to an alternative primary route r1 ’ defined in real-time by control command re- ceived via the wireless command link 115 flight control parameter are monitored in both mode and in case a major alarm condition occurs the uav 100 is controlled to follow an emergency route r2 ’ defined by a second set of predefined waypoints hp1-hp7 tp1-tp9 ip then the emergency route r2 ’ involves flying the uav 100 to an air space above a termination waypoint tp9 on the ground at which it is estimated that the vehicle ’ s 100 flight may be ended without injuring any personnel or causing 

In [12]:
from transformers import pipeline


# Load the summarization pipeline
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

def summarizy_text(text, num_sentences=2):
    # Summarize text using the model
    summary = summarizer(text, max_length=250, min_length=75, do_sample=False, length_penalty=2.0)

    summary_text = summary[0]['summary_text']

    # Check if the summary ends abruptly and add a conclusion if needed
    if not summary_text.endswith(('.', '!', '?')):
        summary_text += '.'

    return summary_text

distances, indices = index.search(query_embedding, k=1)

if len(indices[0]) > 0 and indices[0][0] != -1:
    idx = indices[0][0]
    relevant_text = data.iloc[idx]['Entity']
    label = data.iloc[idx]['Label']
    Desc = data.iloc[idx]['Cleaned_Description']


    summary = summarizy_text(Desc, num_sentences=2)

    print("\n--- Relevant Document ---")
    print(f"Entity: {relevant_text}")
    print(f"Label: {label}")
    print(f"Description: {Desc}")
    print("\n--- Summary ---")
    print(summary)
else:
    print("No relevant documents found.")

Your max_length is set to 250, but your input_length is only 223. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=111)



--- Relevant Document ---
Entity: UAV
Label: ORG
Description: the invention relates to remote control of an un- manned aerial vehicle uav 100 from a control station 110 by mean of a wireless command link 115 the uav 100 may be controlled in an autonomous mode wherein it fly according to a primary route r1 r1 ’ defined by a first set of predefined waypoints wp1-wp8 ip the uav 100 may also be controlled in a manual mode wherein it fly according to an alternative primary route r1 ’ defined in real-time by control command re- ceived via the wireless command link 115 flight control parameter are monitored in both mode and in case a major alarm condition occurs the uav 100 is controlled to follow an emergency route r2 ’ defined by a second set of predefined waypoints hp1-hp7 tp1-tp9 ip then the emergency route r2 ’ involves flying the uav 100 to an air space above a termination waypoint tp9 on the ground at which it is estimated that the vehicle ’ s 100 flight may be ended without injuring 

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [14]:
def response (user_response, data):
  robo1_response = ''
  descriptions = data['Cleaned_Description'].tolist()
  TfidfVec = TfidfVectorizer(tokenizer = LemNormalize, stop_words = 'english' )
  tfidf = TfidfVec.fit_transform(descriptions)
  user_query_normalized = LemNormalize(user_response)
  user_query_str = ' '.join(user_query_normalized)
  user_query_tfidf = TfidfVec.transform([user_query_str])
  vals = cosine_similarity(user_query_tfidf, tfidf)
  idx = vals.argsort ()[0][-2]
  flat = vals.flatten()
  flat. sort()
  req_tfidf = flat[-2]
  if (req_tfidf == 0):
    robo1_response = robo1_response + "I am sorry. Unable to understand you! Please be more clear."
  else:
    robo1_response += data.iloc[idx]['Entity'] + ": " + data.iloc[idx]['Cleaned_Description']
    return robo1_response

In [20]:
%pip install schedule


Collecting schedule
  Downloading schedule-1.2.2-py3-none-any.whl.metadata (3.8 kB)
Downloading schedule-1.2.2-py3-none-any.whl (12 kB)
Installing collected packages: schedule
Successfully installed schedule-1.2.2


In [15]:
print(type(query_embedding))
print(type(sentence_embeddings))
print(query_embedding.shape)
print(sentence_embeddings.shape)
query_embedding = query_embedding.reshape(1, -1)  # Ensure it's a 2D array
sentence_embeddings = np.array(sentence_embeddings)  # Ensure it's a 2D numpy array
print(type(query_embedding))
print(type(sentence_embeddings))
print(query_embedding.shape)
print(sentence_embeddings.shape)

# Ensure all embeddings are numeric and 2D
if not np.issubdtype(query_embedding.dtype, np.floating):
    raise ValueError("query_embedding contains non-numeric values")
if not np.issubdtype(sentence_embeddings.dtype, np.floating):
    raise ValueError("sentence_embeddings contains non-numeric values")

if query_embedding.ndim != 2 or sentence_embeddings.ndim != 2:
        raise ValueError("Both query_embedding and sentence_embeddings must be 2D arrays")

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(1, 384)
(2368, 384)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(1, 384)
(2368, 384)


In [None]:
import json
import queue
import threading
import schedule
import time

conversation_context = {"last_query": "", "last_response": ""}
unanswered_queries = []

def process_user_query(user_query):
    query_processed = LemNormalize(user_query)
    query_str = ' '.join(query_processed)
    query_embedding = model.encode([query_str])
    if not isinstance(query_embedding, np.ndarray):
        query_embedding = np.array(query_embedding)
    return query_embedding

def log_unanswered_query(query):
    with open("unanswered_questions.json", "a") as f:
        f.write(json.dumps({"query": query}) + "\n")

def user_feedback(response):
    print(f"Bot: {response}")
    feedback = input("Was this response helpful? (yes/no): ").strip().lower()
    return feedback

def log_feedback(query, response, feedback):
    feedback_data = {
        "query": query,
        "response": response,
        "feedback": feedback
    }
    with open("feedback.json", "a") as f:
        f.write(json.dumps(feedback_data) + "\n")

greet_inputs = ('hello', 'hey', 'hi', 'whassup', 'how are you?','Namasate','Good Morning')
def greet(user_response):
    for word in user_response.split():
      if word.lower() in greet_inputs:
        return "Hello! How can I assist you today?"
    return None

def find_relevant_document(query_embedding, sentence_embeddings, data):
    query_embedding = np.array(query_embedding).reshape(1, -1)

    similarity = cosine_similarity(query_embedding, sentence_embeddings)

    best_idx = np.argmax(similarity)
    best_score = similarity[0][best_idx]
    if best_score > 0.5:
        best_match = data.iloc[best_idx]['Cleaned_Description']
        return best_match, best_score
    else:
        return None, None



def summarizy_text(response, num_sentences=2):
    # Summarize text using the model
    summary = summarizer(response, max_length=250, min_length=75, do_sample=False, length_penalty=2.0)

    summary_text = summary[0]['summary_text']

    # Check if the summary ends abruptly and add a conclusion if needed
    if not summary_text.endswith(('.', '!', '?')):
        summary_text += '.'

    return summary_text


def chatbot(user_input_queue):
    print('Hello! I am BeeB0T. Start typing your text after greeting me. For ending the convo type bye!')

    flag = True
    while flag:
        user_response = user_input_queue.get()
        user_response = user_response.lower()

        if user_response in ['thank you', 'thanks']:
            print('Bot: You are Welcome...')
            flag = False
            break

        if user_response.startswith(("what about", "tell me more", "can you explain")):
            if conversation_context["last_response"]:
                print("--- Follow-Up Response ---")
                print("Let me expand on that...")
                print(conversation_context["last_response"])
            else:
                print("I'm not sure what you're referring to. Can you ask a specific question?")
            continue

        greeting_response = greet(user_response)
        if greeting_response:
            print('Bot:', greeting_response)
        else:
            query_embedding = process_user_query(user_response)
            response, doc_idx = find_relevant_document(query_embedding, sentence_embeddings, data)

            if response:

                feedback = user_feedback(response)
                log_feedback(user_response, response, feedback)

                conversation_context["last_query"] = user_response
                conversation_context["last_response"] = response
            else:
                print("I'm sorry, I couldn't find relevant information.")
                print("Can you clarify or provide more details?")

                unanswered_queries.append(user_response)
                log_unanswered_query(user_response)
                print("Your query has been logged for further analysis. Thank you for helping me improve!")


def review_unanswered_queries(filename='unanswered_questions.json'):
    with open(filename, 'r') as file:
        unanswered_queries = [json.loads(line) for line in file.readlines()]
    if unanswered_queries:
        for query in unanswered_queries:
            print(f"Reviewing query: {query['query']}")
            response, doc_idx = find_relevant_document(query['query'], None, None)

            if doc_idx is None:
                print("Still no relevant document found for this query.")
            else:
                summary = summarizy_text(response)
                print(f"Answer found: {response}")
                print(f"Summary: {summary}")
                query['status'] = 'answered'

        with open(filename, 'w') as file:
            for query in unanswered_queries:
                file.write(json.dumps(query) + "\n")
    else:
        print("No unanswered queries!")


def run_scheduled_tasks():
    while True:
        schedule.run_pending()
        time.sleep(1)


def main():
    user_input_queue = queue.Queue()

    chatbot_thread = threading.Thread(target=chatbot, args=(user_input_queue,))
    chatbot_thread.start()

    scheduled_thread = threading.Thread(target=run_scheduled_tasks)
    scheduled_thread.start()

    schedule.every().day.at("11:00").do(review_unanswered_queries)


    while True:
        user_input = input("Message BeeBOT: ")
        user_input_queue.put(user_input)
        if user_input.lower() in ['bye', 'exit']:
          print('Bot: Goodbye!')
          break

if __name__ == "__main__":
  main()

import os
def retrain_model(new_queries):
    # Load existing patent texts
    patent_texts = load_existing_patent_texts()

    # Extend with new queries
    patent_texts.extend(new_queries)

    # Preprocess the patent texts (normalization, tokenization, etc.)
    preprocessed_patents = preprocess_text(patent_texts)

    # Initialize the SentenceTransformer model (can be replaced with any other model if needed)
    model = SentenceTransformer('all-MiniLM-L6-v2')

    # Encode patent texts into embeddings
    patent_embeddings = model.encode(preprocessed_patents)

    # Save the retrained embeddings to a file for future use
    np.save("retrained_patent_embeddings.npy", patent_embeddings)

    # Optionally, save the updated patent texts for reference
    np.save("retrained_patent_texts.npy", patent_texts)

    print("Model retrained successfully and embeddings saved!")

def load_existing_patent_texts(file_path='/content/processed_entities.csv'):
    try:
        # Read the CSV file and convert the 'text' column to a list
        patent_texts = pd.read_csv(file_path)['Cleaned_Description'].tolist()
        return patent_texts
    except Exception as e:
        print(f"Error loading patent texts: {e}")
        return []

def LemNormalize(text):
    if not isinstance(text, str):
        text = str(text)
    return LemTokens(nltk.word_tokenize(text.lower().translate(remove_punc_dict)))

def preprocess_text(text_data):
    return [' '.join(LemNormalize(text)) for text in text_data]

def load_unanswered_queries(file_path="unanswered_questions.json"):
    try:
          with open(file_path, 'r') as file:
              unanswered_queries = json.load(file)
          return unanswered_queries
    except Exception as e:
          print(f"Error loading unanswered queries: {e}")
          return []

# If there are unanswered queries, retrain the model with them
unanswered_queries = load_unanswered_queries()
if unanswered_queries:
    retrain_model(unanswered_queries)

Hello! I am BeeB0T. Start typing your text after greeting me. For ending the convo type bye!
Message BeeBOT: hi
Bot: Hello! How can I assist you today?
Message BeeBOT: uav copter
Bot: these publication describe a drone control system
Was this response helpful? (yes/no): yes
Message BeeBOT: exit
