In [1]:
import json
import random
import csv
from flask import Flask, render_template, request, jsonify
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import requests
from bs4 import BeautifulSoup

# Read a JSON file and draw a random sample of its questions (500 by default)
def get_random_questions_from_json(file_path, num_questions=500, seed=None):
    """Load questions from a JSON file and return a random sample of them.

    The file is expected to hold an iterable of records; a record contributes
    a question only when it is a dict whose ``'metadata'`` value is itself a
    dict containing a ``'question'`` key. Anything else is skipped.

    Args:
        file_path: Path of the UTF-8 encoded JSON file to read.
        num_questions: How many questions to sample.
        seed: Optional seed. When given, sampling uses a private
            ``random.Random(seed)`` so the result is reproducible and the
            global random state is left untouched. When ``None`` the
            module-level generator is used, as before.

    Returns:
        A list of questions. If the file holds fewer than ``num_questions``
        questions, all of them are returned in file order (not shuffled).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        ValueError: If ``num_questions`` is negative (raised by ``sample``).
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # Collect questions, tolerating malformed records (null entries, bare
    # strings, "metadata": null, ...) instead of failing on the first one.
    questions = []
    for item in data:
        metadata = item.get('metadata') if isinstance(item, dict) else None
        if isinstance(metadata, dict) and 'question' in metadata:
            questions.append(metadata['question'])

    # Not enough questions to sample from: hand back everything we found.
    if len(questions) < num_questions:
        return questions

    rng = random if seed is None else random.Random(seed)
    return rng.sample(questions, num_questions)

# Demo run: sample questions from the dataset file and echo each one.
# NOTE(review): the path looks like a Colab upload location - adjust it when
# running elsewhere.
file_path = '/content/documents.json'
random_questions = get_random_questions_from_json(file_path=file_path)
for question in random_questions:
    print(question, end="\n")


What is the major mechanism that results in acclimatization?
What are the physical effects of the illness on the child, such as poor growth and delayed puberty?
What are the morphological and molecular changes associated with cell death by ATP depletion, membrane damage, and free radical injury?
What are the age-adjusted incidence rates per 100,000 U.S. population for squamous cell (epidermoid) carcinoma, small cell carcinoma, large cell carcinoma, carcinoid, mucoepidermoid carcinoma, adenoid cystic carcinoma, sarcoma and other soft tissue tumors, and all others and unspecified carcinomas based on data from the Surveillance, Epidemiology, and End Results (SEER) Program of the National Cancer Institute for the years 1983-1987?
What is the function of the intrinsic ligament spanning from the ilium of the hip bone to the femur, on the superior-anterior aspect of the hip joint?
What are the different types of nevi mentioned in the data provided?
What are some common examples of day-to-day 

In [6]:
# Load the tokenizer and the causal language model that predict() falls back to.
# NOTE(review): tokenizer and model are loaded from different checkpoints -
# confirm that they share the same vocabulary.
TOKENIZER_CHECKPOINT = 'microsoft/DialoGPT-large'
MODEL_CHECKPOINT = 'gokcenazakyol/medical-literature-small'

tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)
model = AutoModelForCausalLM.from_pretrained(MODEL_CHECKPOINT)

def web_scraping(qs, timeout=10):
    """Try to answer ``qs`` from a Google results page, then from Wikipedia.

    The Google result page is searched for a handful of known answer-box
    containers (matched by CSS class). If none yields text and the results
    link to an English Wikipedia article, the article title and its first
    non-empty paragraph are returned instead.

    Args:
        qs: The question / search query.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        The extracted answer text, or ``""`` when nothing usable was found.

    Raises:
        requests.RequestException: On network failure or timeout.
    """
    from urllib.parse import unquote

    # Let requests build the query string so characters such as '&', '#' or
    # '+' inside the question are encoded instead of truncating the query.
    page = requests.get(
        'https://www.google.com/search', params={'q': qs}, timeout=timeout
    )
    soup = BeautifulSoup(page.content, 'html.parser')

    # Collect the redirect targets of the result links.
    all_links = []
    for link in soup.find_all("a"):
        link_href = link.get('href')
        if not link_href:
            # Anchors without an href yield None; nothing to inspect.
            continue
        if "url?q=" in link_href and "webcache" not in link_href:
            target = link_href.split("?q=")[1].split("&sa=U")[0]
            # The target is a percent-encoded query value; decode it once so
            # it is not fetched double-encoded.
            all_links.append(unquote(target))

    wiki = None
    for link in all_links:
        if 'https://en.wikipedia.org/wiki/' in link:
            wiki = link
            break

    div0 = soup.find_all('div', class_="kvKEAb")
    div1 = soup.find_all("div", class_="Ap5OSd")
    div2 = soup.find_all("div", class_="nGphre")
    div3 = soup.find_all("div", class_="BNeawe iBp4i AP7Wnd")

    if div0:
        return div0[0].text

    if div1:
        sibling = div1[0].find_next_sibling("div")
        if sibling is None:
            # No following block: return the part that is available.
            return div1[0].text
        return div1[0].text + "\n" + sibling.text

    if div2:
        span = div2[0].find_next("span")
        box = div2[0].find_next("div", class_="kCrYT")
        parts = [node.text for node in (span, box) if node is not None]
        if parts:
            return "\n".join(parts)
        # Neither piece exists: fall through to the remaining strategies.

    # The answer is taken from the second match; with a single match there is
    # nothing at index 1, so fall through instead of raising IndexError.
    if len(div3) > 1:
        return div3[1].text

    if wiki is not None:
        page2 = requests.get(wiki, timeout=timeout)
        wiki_soup = BeautifulSoup(page2.text, 'html.parser')
        heading = wiki_soup.select("#firstHeading")
        if not heading:
            # Page has no article heading; treat as "no answer".
            return ""
        title = heading[0].text

        for para in wiki_soup.select("p"):
            if para.text.strip():
                return title + "\n" + para.text
    return ""

def predict(input):
    """Answer ``input`` from the web when possible, else from the language model.

    The scraped answer is returned as soon as it is non-empty. Otherwise the
    question, terminated by the tokenizer's EOS token, is fed to the model
    and only the newly generated continuation is decoded and returned.
    """
    scraped_answer = web_scraping(input)
    if scraped_answer == "":
        # Nothing found online: let the model generate a reply. Total length
        # (prompt plus reply) is capped at 1000 tokens.
        prompt_ids = tokenizer.encode(
            input + tokenizer.eos_token,
            return_tensors='pt',
        )
        generated_ids = model.generate(
            prompt_ids,
            max_length=1000,
            pad_token_id=tokenizer.eos_token_id,
        )

        # Drop the echoed prompt so only the model's reply is decoded.
        prompt_length = prompt_ids.shape[-1]
        reply_ids = generated_ids[:, prompt_length:][0]
        return tokenizer.decode(reply_ids, skip_special_tokens=True)

    return scraped_answer

# Answer every sampled question and persist the pairs as a two-column CSV.
with open('questions_answers.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    # Header row first, then one row per question as soon as it is answered.
    writer.writerow(['Question', 'Answer'])
    for question in random_questions:
        writer.writerow([question, predict(question)])