In [None]:
import spacy
import re
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [None]:
# Load spaCy's small English pipeline; its NER component supplies the
# `doc.ents` spans read by the cells below.
nlp = spacy.load("en_core_web_sm")
# Entity labels of interest.
# NOTE(review): presumably an allow-list of answer-bearing entity types — confirm intended use.
ner_cats = ["PERSON", "ORG", "GPE", "PRODUCT"]
# Entity labels to ignore.
# NOTE(review): presumably a deny-list for answer candidates — confirm intended use.
not_ner_cats = ["DATE"]

In [None]:
# Sample paragraph used to inspect which entities the pipeline detects.
text = '''Robert Downey Jr. is an American actor and producer.
He is best known for his roles in films such as Iron Man,
The Avengers, and Sherlock Holmes. Downey has won several awards
for his acting, including two Screen Actors Guild Awards
and a Golden Globe Award. He has also been nominated for an Academy Award.'''
# Run the spaCy pipeline over the sample text; entities are read from `doc.ents` next.
doc = nlp(text)

In [None]:
# Collect every detected entity as a (surface text, label) pair.
entities = [(span.text, span.label_) for span in doc.ents]

In [None]:
# Print each detected entity next to the label spaCy assigned to it.
for entity, category in entities:
  print(f"{entity}: {category}")

Robert Downey Jr.: PERSON
American: NORP
Iron Man: PERSON
Avengers: NORP
Sherlock Holmes: PERSON
Downey: ORG
two: CARDINAL
Screen Actors Guild Awards: ORG
an Academy Award: ORG


In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Mount Google Drive at /content/drive; later cells read and write files under that path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:


# Download the NLTK resources used by the preprocessing helpers below:
#   stopwords         -> stopwords.words('english')
#   punkt / punkt_tab -> word_tokenize (newer NLTK releases look up 'punkt_tab'
#                        instead of the pickled 'punkt' models)
#   wordnet           -> WordNetLemmatizer
# nltk.download() reports an unknown package and returns False rather than
# raising, so requesting 'punkt_tab' is harmless on older NLTK versions.
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet'):
  nltk.download(nltk_resource)

# Load the file containing sentences
def load_sentences(file_path, encoding='utf-8', skip_blank=False):
   """Read a text file and return its lines with surrounding whitespace stripped.

   Args:
      file_path: Path of the text file, one sentence per line.
      encoding: Text encoding used to decode the file. Defaults to UTF-8 so
         the result does not depend on the platform's locale setting.
      skip_blank: When True, drop lines that are empty after stripping. The
         default (False) keeps them as '' entries.

   Returns:
      list[str]: One entry per line, in file order.
   """
   with open(file_path, 'r', encoding=encoding) as file:
      stripped_lines = [line.strip() for line in file]
   if skip_blank:
      return [line for line in stripped_lines if line]
   return stripped_lines

# Preprocess the input sentence
# Built lazily on first use so the NLTK corpora are read once instead of on
# every call (the function is invoked for every candidate sentence).
_STOP_WORDS = None
_LEMMATIZER = None

def preprocess_sentence(sentence):
   """Normalise a sentence for TF-IDF comparison.

   The sentence is lowercased, tokenised with ``word_tokenize``, stripped of
   English stop words and lemmatised with ``WordNetLemmatizer``.

   Args:
      sentence: Raw sentence text.

   Returns:
      str: The remaining lemmas joined by single spaces ('' if none remain).
   """
   global _STOP_WORDS, _LEMMATIZER
   if _STOP_WORDS is None:
      _STOP_WORDS = frozenset(stopwords.words('english'))
   if _LEMMATIZER is None:
      _LEMMATIZER = WordNetLemmatizer()
   tokens = word_tokenize(sentence.lower())
   return ' '.join(
      _LEMMATIZER.lemmatize(token) for token in tokens if token not in _STOP_WORDS
   )

def get_most_similar_sentence(user_input, sentences):
   """Return the entry of ``sentences`` most similar to ``user_input``.

   Query and candidates are normalised with ``preprocess_sentence`` and
   embedded by a ``TfidfVectorizer`` fitted on all of them together. The score
   is the dot product of the query row with each candidate row (a cosine
   similarity, since TfidfVectorizer L2-normalises rows by default).

   Args:
      user_input: Query sentence.
      sentences: Sequence of candidate sentences.

   Returns:
      str: The original (un-preprocessed) candidate with the highest score;
      ties resolve to the earliest candidate. Returns '' when ``sentences``
      is empty.
   """
   if len(sentences) == 0:
      # argmax() over an empty score vector raises; there is nothing to rank.
      return ""
   documents = [preprocess_sentence(user_input)]
   documents.extend(preprocess_sentence(sentence) for sentence in sentences)
   tfidf_matrix = TfidfVectorizer().fit_transform(documents)
   # Only the query row is needed, so multiply row 0 against the candidate
   # rows instead of building the full pairwise matrix. `.toarray()` replaces
   # the deprecated `.A` shorthand on SciPy sparse matrices.
   similarity_scores = (tfidf_matrix[0] @ tfidf_matrix[1:].T).toarray()[0]
   most_similar_index = similarity_scores.argmax()
   return sentences[most_similar_index]

file_path = '/content/drive/MyDrive/OELP_BERT/BERT_SQuad/trial.txt'  # Path to the file containing sentences

# Load the candidate sentences and show what was read.
sentences = load_sentences(file_path)
print(type(sentences))
print(sentences)

# Sample query used to try out the TF-IDF ranking.
user_input = 'hello I am a women'

# Rank the loaded sentences against the query and print the best match.
most_similar_sentence = get_most_similar_sentence(user_input, sentences)
print('Most similar sentence:', most_similar_sentence)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


<class 'list'>
['this is comedy movie.', '', 'this is horror movie.', '', 'hello I am a girl.', '', 'hello I am a boy.']
Most similar sentence: hello I am a girl.


In [None]:
!wget -nc https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json


--2024-05-26 08:54:50--  https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json
Resolving rajpurkar.github.io (rajpurkar.github.io)... 185.199.108.153, 185.199.109.153, 185.199.110.153, ...
Connecting to rajpurkar.github.io (rajpurkar.github.io)|185.199.108.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 42123633 (40M) [application/json]
Saving to: ‘train-v2.0.json’


2024-05-26 08:54:51 (66.5 MB/s) - ‘train-v2.0.json’ saved [42123633/42123633]



In [None]:
import json  #importing squad
with open('train-v2.0.json', 'rb') as f1:
  raw_data = json.load(f1)
new_contexts = []
question = ""
count = 0
# Parallel lists: entry i of each describes the same SQuAD question.
sq_contexts, sq_questions, sq_answers, sq_answers_starts, sq_ids = [], [], [], [], []
# Flatten the data -> paragraphs -> qas nesting, one entry per question.
for group in raw_data['data']:
  for paragraph in group['paragraphs']:
    context = paragraph['context']
    for qa in paragraph['qas']:
      question = qa['question']
      answers = qa['answers']
      sq_questions.append(question)
      sq_contexts.append(context)
      sq_ids.append(qa['id'])
      if len(answers) == 0:
        # No answers listed: record an empty answer with start -1.
        sq_answers.append("")
        sq_answers_starts.append(-1)
        continue
      # Only the first listed answer is kept.
      first_answer = answers[0]
      sq_answers.append(first_answer['text'])
      sq_answers_starts.append(first_answer['answer_start'])

In [None]:
print(len(sq_ids))

130319


In [None]:
import json

# Load the generated-question records. The context manager closes the file
# even if parsing fails, and the encoding is explicit because JSON text is
# UTF-8 regardless of the platform locale.
with open('/content/drive/MyDrive/OELP_BERT/BERT_SQuad/QGen+Augments.json', encoding='utf-8') as f:
  data = json.load(f)

# Parallel lists: entry i of each comes from the same record of `data`.
test_ids = [record["id"] for record in data]
test_sentences = [record["sentence"] for record in data]
test_questions = [record["question"] for record in data]

In [None]:
print(len(test_ids))
print(len(test_sentences))
print(len(test_questions))

23293
23293
23293


TODO:
Find all questions with the same id, combine their contexts into dataset_context, and collect the questions into a list. Find the most similar question, find the entity in that question, and check that it is indeed the answer. Then use the question to find the most similar sentence in the context, locate the answer word in that sentence, find its position, and put answer_start into the dataset.

In [None]:
# Parallel accumulators for the generated dataset, one independent list each.
data_ids, data_contexts, data_question, data_answer, data_answer_start = [], [], [], [], []

In [None]:
print(type(sq_ids[0]))

<class 'str'>


In [None]:
# Counters for records without / with a located answer.
err_count = found_count = 0

In [None]:
# Dry run of the answer-extraction heuristic on one generated question.
# This cell only prints what it finds; it deliberately does not append to the
# data_* accumulators, so running it cannot shift the final dataset.
test_rec = 91
idx = test_rec
record_id = test_ids[idx]  # not named `id`, which would shadow the builtin
print(f"data id: {record_id}")
print(f"data q: {test_questions[idx]}")

#get all questions and contexts with the same id
indexes = [i for i, sq_id in enumerate(sq_ids) if sq_id == record_id]
temp_contexts = [sq_contexts[index] for index in indexes]
temp_questions = [sq_questions[index] for index in indexes]

#making context: concatenated, lowercased, with "é" folded to "e"
final_context = "".join(temp_contexts).lower().replace("é", "e")
print(f"data context: {final_context}")
print("all questions")
print(temp_questions)

#finding answer
# Guard: with no matching SQuAD record there is nothing to rank.
closest_question = get_most_similar_sentence(test_questions[idx], temp_questions) if temp_questions else ""
print(f"Closest_question: {closest_question}")
doc = nlp(closest_question)
entities = [(ent.text, ent.label_) for ent in doc.ents]

cleaned_sentence = re.sub(r'[^\w\s]', '', test_questions[idx])
q_words = cleaned_sentence.split()

# The answer candidate is the first non-DATE entity of the closest SQuAD
# question that does not occur in the generated question. Entities are
# compared as punctuation-stripped word sequences, so multi-word entities
# ("New York") and punctuated ones ("U.S.") are recognised when present.
temp_ans = ""
for ent_text, ent_label in entities:
  if ent_label == "DATE":
    continue
  ent_words = re.sub(r'[^\w\s]', '', ent_text).split()
  span = len(ent_words)
  in_question = span > 0 and any(
      q_words[k:k + span] == ent_words for k in range(len(q_words) - span + 1))
  if not in_question:
    temp_ans = ent_text
    break
# Apply the same normalisation as the context so the search below can match.
temp_ans = temp_ans.lower().replace("é", "e")
print(f"data ans: {temp_ans}")

#getting answer_start
best_index = -1
if temp_ans == "":
  print(f"data ans start: -1")
else:
  context_sentences = re.split(r'(?<=[.!?]) +', final_context)
  question_vector = [doc.vector]  # `doc` is already the parsed closest question
  answer_pattern = re.compile(re.escape(temp_ans))
  current_index = 0
  best_similarity = -1
  for sentence in context_sentences:
    sentence_start_index = final_context.find(sentence, current_index)
    first_match = answer_pattern.search(sentence)
    if first_match is not None:
      # Every match in a sentence shares that sentence's similarity, so the
      # sentence is parsed once and its first match is the one that can win.
      similarity = cosine_similarity(question_vector, [nlp(sentence).vector])[0][0]
      if similarity > best_similarity:
        best_similarity = similarity
        best_index = sentence_start_index + first_match.start()
    current_index = sentence_start_index + len(sentence)

  print(f"data answer start: {best_index}")

data id: 57267b57708984140094c796
data q: Who opened its first eight retail stores?
data context: chain department stores grew rapidly after 1920, and provided competition for the downtown upscale department stores, as well as local department stores in small cities. j. c. penney had four stores in 1908, 312 in 1920, and 1452 in 1930. sears, roebuck & company, a giant mail-order house, opened its first eight retail stores in 1925, and operated 338 by 1930, and 595 by 1940. the chains reached a middle-class audience, that was more interested in value than in upscale fashions. sears was a pioneer in creating department stores that catered to men as well as women, especially with lines of hardware and building materials. it deemphasized the latest fashions in favor of practicality and durability, and allowed customers to select goods without the aid of a clerk. its stores were oriented to motorists – set apart from existing business districts amid residential areas occupied by their targe

In [None]:
# Text buffer, reset to empty here.
# NOTE(review): presumably meant to collect a debug transcript in the loop below — confirm it is still needed.
file_str = ""

In [None]:
# Map every SQuAD id to the positions where it occurs, so each record is looked
# up in O(1) instead of rescanning all of sq_ids.
sq_positions_by_id = {}
for position, sq_id in enumerate(sq_ids):
  sq_positions_by_id.setdefault(sq_id, []).append(position)

# Start from empty accumulators so that re-running this cell, or any earlier
# cell that appended to these lists, cannot leave them misaligned.
data_ids, data_contexts, data_question, data_answer, data_answer_start = [], [], [], [], []
err_count = 0
found_count = 0

for idx, record_id in enumerate(test_ids):
  if idx%1000==0:
    print(idx)
  generated_question = test_questions[idx]
  data_ids.append(record_id)
  data_question.append(generated_question)

  #get all questions and contexts with the same id
  indexes = sq_positions_by_id.get(record_id, [])
  temp_questions = [sq_questions[index] for index in indexes]

  #making context
  final_context = "".join(sq_contexts[index] for index in indexes)
  data_contexts.append(final_context)

  #finding answer
  # Guard: with no matching SQuAD record there is nothing to rank.
  closest_question = get_most_similar_sentence(generated_question, temp_questions) if temp_questions else ""
  doc = nlp(closest_question)

  cleaned_sentence = re.sub(r'[^\w\s]', '', generated_question)
  q_words = cleaned_sentence.split()

  # The answer candidate is the first non-DATE entity of the closest SQuAD
  # question that does not occur in the generated question. Entities are
  # compared as punctuation-stripped word sequences, so multi-word entities
  # ("New York") and punctuated ones ("U.S.") are recognised when present.
  temp_ans = ""
  for ent in doc.ents:
    if ent.label_ == "DATE":
      continue
    ent_words = re.sub(r'[^\w\s]', '', ent.text).split()
    span = len(ent_words)
    in_question = span > 0 and any(
        q_words[k:k + span] == ent_words for k in range(len(q_words) - span + 1))
    if not in_question:
      temp_ans = ent.text
      break
  data_answer.append(temp_ans)

  #getting answer_start
  # NOTE(review): matching is case-sensitive against the unmodified context,
  # so answer_start stays a valid offset into the stored context.
  best_index = -1
  if temp_ans != "":
    context_sentences = re.split(r'(?<=[.!?]) +', final_context)
    question_vector = [doc.vector]  # `doc` is already the parsed closest question
    answer_pattern = re.compile(re.escape(temp_ans))
    current_index = 0
    best_similarity = -1
    for sentence in context_sentences:
      sentence_start_index = final_context.find(sentence, current_index)
      first_match = answer_pattern.search(sentence)
      if first_match is not None:
        # Every match in a sentence shares that sentence's similarity, so the
        # sentence is parsed once and its first match is the one that can win.
        similarity = cosine_similarity(question_vector, [nlp(sentence).vector])[0][0]
        if similarity > best_similarity:
          best_similarity = similarity
          best_index = sentence_start_index + first_match.start()
      current_index = sentence_start_index + len(sentence)

  data_answer_start.append(best_index)
  if best_index == -1:
    err_count += 1
  else:
    found_count += 1

  # for entity, category in entities:
  #   print(f"{entity}: {category}")



0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000


In [None]:
# Write the `file_str` buffer to Drive.
# NOTE(review): nothing visible in the preceding cells appends to file_str after
# it is set to "", so as written this produces an empty file — confirm this
# cell is still needed.
with open("/content/drive/MyDrive/OELP_BERT/BERT_SQuad/Context_question_w_id.txt", "w") as wf:
  wf.write(file_str)

In [None]:
txt_test = "New York City's most important economic sector lies in its role as the headquarters for the U.S.financial industry, metonymously known as Wall Street. The city's "

In [None]:
print(len(txt_test))

162


In [None]:
print(err_count)
print(found_count)

10121
13172


In [None]:
def stripped_question(question):
  """Make ``question`` safe to splice between double quotes in hand-built JSON.

  Double quotes and newlines are removed outright, backslashes are doubled,
  and any remaining ASCII control character (tab, carriage return, ...) is
  written as a JSON unicode escape, since raw control characters are not
  allowed inside JSON strings.

  Note that because quotes and newlines are removed rather than escaped,
  character offsets computed on the original text do not necessarily line up
  with the returned string.

  Args:
    question: Text to sanitise.

  Returns:
    str: The sanitised text.
  """
  question = question.replace("\"", "")
  question = question.replace("\n", "")
  question = question.replace("\\", "\\\\")
  # Must run after the backslash doubling so the escapes added here are kept single.
  question = re.sub(r'[\x00-\x1f]', lambda m: '\\u%04x' % ord(m.group()), question)
  return question

In [None]:
import json

# Serialise with json.dumps instead of concatenating strings by hand, so that
# quotes, newlines, backslashes and control characters are escaped rather than
# stripped. Keeping the context text intact matters because answer_start is a
# character offset into the unmodified context.
records = []
for i in range(len(data_contexts)):
  records.append({
      "id": data_ids[i],
      "SIno": i,
      "sentence": data_contexts[i],
      "question": data_question[i],
      "answer": data_answer[i],
      "answer_start": data_answer_start[i],
      # NOTE(review): written as a constant 0 — confirm whether consumers
      # need a real end offset.
      "answer_end": 0,
      "is_impossible": data_answer_start[i] == -1 or data_answer[i] == "",
  })
# indent=0 puts every key on its own line; the default ensure_ascii=True keeps
# the text pure ASCII, so it is safe whatever encoding the file is written with.
file_str = json.dumps(records, indent=0) + "\n"

In [None]:
# Persist the serialised dataset. The encoding is explicit so non-ASCII
# characters in the contexts are written identically on every platform.
with open("/content/drive/MyDrive/OELP_BERT/BERT_SQuad/20k_w_indexes.json", "w", encoding="utf-8") as writefile:
  writefile.write(file_str)

In [None]:
#DONE HERE

In [None]:
!pip install openai==0.28



In [None]:
import openai

In [None]:
import os

# Read the key from the environment instead of storing it in the notebook;
# falls back to '' when OPENAI_API_KEY is unset.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")

def find_exact_answer(context, question, model="gpt-3.5-turbo"):
    """Ask an OpenAI chat model for an answer and keep it only if it is verbatim in the context.

    Args:
        context: Passage the answer must be copied from.
        question: Question to answer.
        model: Chat model name passed to ``openai.ChatCompletion.create``.

    Returns:
        The reply (whitespace-stripped) if it is a substring of ``context``;
        otherwise the reply without surrounding quotation marks if that is a
        substring; otherwise None. An empty reply also yields None.
    """
    # Construct the prompt
    prompt = f"Context: {context}\n\nQuestion: {question}\n\nAnswer the question based on the context with an exact string from the context."

    # Call the OpenAI API
    response = openai.ChatCompletion.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ]
    )

    # Extract the response text
    answer = response['choices'][0]['message']['content'].strip()

    # "" is a substring of every string, so an empty reply must not count as a hit.
    if not answer:
        return None
    if answer in context:
        return answer

    # The reply may wrap the copied span in quotation marks; retry without them.
    unquoted = answer.strip('"\'“”‘’').strip()
    if unquoted and unquoted in context:
        return unquoted
    return None

# Example usage
# NOTE: this rebinds the notebook-level `context`, `question` and `answer` names.
context = """Python is a high-level, interpreted programming language. Its design philosophy emphasizes code readability with the use of significant indentation. Python is dynamically-typed and garbage-collected."""
question = "What does Python's design philosophy emphasize?"

# Performs a live OpenAI API request.
answer = find_exact_answer(context, question)
print(f"Answer: {answer}")

# Report whether a usable answer came back (None means the reply was not a
# verbatim substring of the context).
if answer:
    print(f"The answer '{answer}' is found in the context.")
else:
    print("The answer was not found in the context.")



RateLimitError: You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.

In [None]:
import re
import spacy
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Load the spaCy model
# (rebinds `nlp`; an earlier cell already loads the same "en_core_web_sm" package)
nlp = spacy.load('en_core_web_sm')

def find_best_answer_location(paragraph, question, answer):
    """Locate the occurrence of ``answer`` whose sentence is most similar to ``question``.

    The paragraph is split into sentences after '.', '!' or '?' followed by
    spaces. Each sentence containing ``answer`` (exact, case-sensitive match)
    is scored by the cosine similarity between its spaCy document vector and
    the question's.

    Args:
        paragraph: Text to search.
        question: Question used to rank the sentences.
        answer: Literal answer string to look for.

    Returns:
        tuple: ``(best_index, best_similarity, occurrences)`` where
        ``best_index`` is the character offset in ``paragraph`` of the first
        match inside the best-scoring sentence (-1 if there is no match),
        ``best_similarity`` is that sentence's score (-1 if there is no
        match), and ``occurrences`` lists ``(offset, similarity)`` for every
        match. An empty ``answer`` returns ``(-1, -1, [])``.
    """
    # An empty pattern matches at every position, which would turn every
    # character of the paragraph into a "hit".
    if not answer:
        return -1, -1, []

    # Split paragraph into sentences
    sentences = re.split(r'(?<=[.!?]) +', paragraph)

    # Process the question with spaCy
    question_vector = [nlp(question).vector]
    answer_pattern = re.compile(re.escape(answer))

    occurrences = []
    current_index = 0
    best_index = -1
    best_similarity = -1

    for sentence in sentences:
        sentence_start_index = paragraph.find(sentence, current_index)

        # Offsets of every occurrence of the answer inside this sentence
        match_offsets = [match.start() for match in answer_pattern.finditer(sentence)]
        if match_offsets:
            # All matches in a sentence share its similarity, so the sentence
            # is parsed once rather than once per match.
            similarity = cosine_similarity(
                question_vector,
                [nlp(sentence).vector]
            )[0][0]

            for offset in match_offsets:
                occurrences.append((sentence_start_index + offset, similarity))

            # Strict '>' means the earliest match of the earliest best sentence wins.
            if similarity > best_similarity:
                best_similarity = similarity
                best_index = sentence_start_index + match_offsets[0]

        current_index = sentence_start_index + len(sentence)

    return best_index, best_similarity, occurrences

# Example usage
# The search is case-sensitive, so the capitalised "The answer" in the second
# sentence is not counted as an occurrence of "the answer".
# NOTE: this rebinds the notebook-level `question`, `answer`, `best_index`,
# `best_similarity` and `occurrences` names.
paragraph = "This is a sample paragraph. The answer is here. This is another sentence with the answer. Finally, the answer is here again."
question = "Where is the answer?"
answer = "the answer"

best_index, best_similarity, occurrences = find_best_answer_location(paragraph, question, answer)
print(f"Best index: {best_index}, Best similarity: {best_similarity}")
print("All occurrences with similarities:", occurrences)


In [None]:

# First position of every SQuAD id: the same position sq_ids.index() returns,
# found in O(1) per lookup instead of a linear scan.
sq_position_by_id = {}
for position, sq_id in enumerate(sq_ids):
  sq_position_by_id.setdefault(sq_id, position)

# Build all parallel lists from scratch so that entry k of each list describes
# the same record, whatever state earlier cells left behind.
matched_ids = []
test_questions = []
test_contexts = []
test_answers = []
test_answer_starts = []

for test_id in test_ids:
  position = sq_position_by_id.get(test_id)
  if position is None:
    # id not present in SQuAD: skip the record
    continue
  matched_ids.append(test_id)
  # Index with the SQuAD position of the id, not the loop counter.
  test_questions.append(sq_questions[position])
  test_contexts.append(sq_contexts[position])
  test_answers.append(sq_answers[position])
  test_answer_starts.append(sq_answers_starts[position])

# Keep test_ids index-aligned with the lists above; the writer cell below
# indexes all of them by position in test_ids.
test_ids = matched_ids

In [None]:
len(test_questions)

15695

<class 'list'>
['this is comedy movie.', '', 'this is horror movie.', '', 'hello I am a girl.', '', 'hello I am a boy.']
Most similar sentence: hello I am a girl.


In [None]:
import json

# Serialise with the json module instead of writing the syntax by hand, so
# that quotes, newlines, backslashes and control characters are escaped rather
# than stripped. The context must stay unmodified because answer_start is a
# character offset into it.
final_records = []
for i in range(len(test_ids)):
  final_records.append({
      "id": test_ids[i],
      "sentence": test_contexts[i],
      "question": test_questions[i],
      "answer": test_answers[i],
      "answer_start": test_answer_starts[i],
      # NOTE(review): written as a constant 0 — confirm whether consumers
      # need a real end offset.
      "answer_end": 0,
      # answer_start values are integers (-1 marks "no answer"), so compare
      # against the int -1; comparing with the string "-1" is never true.
      "is_impossible": test_answer_starts[i] == -1,
  })

with open('/content/drive/My Drive/OELP_BERT/BERT_SQuad/final_15k.json', 'w', encoding='utf-8') as f:
  json.dump(final_records, f, indent=0)