In [1]:
import sqlite3
from dotenv import load_dotenv
import os
import random
from pydantic import BaseModel

from langchain_community.utilities.sql_database import SQLDatabase

# Load the OpenAI API key from the .env file
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")
if api_key is None:
    raise ValueError("The OPENAI_API_KEY environment variable is not set.")

from langchain.prompts import FewShotPromptTemplate, PromptTemplate
from langchain_experimental.tabular_synthetic_data.openai import (
    OPENAI_TEMPLATE,
    create_openai_data_generator,
)
from langchain_experimental.tabular_synthetic_data.prompts import (
    SYNTHETIC_FEW_SHOT_PREFIX,
    SYNTHETIC_FEW_SHOT_SUFFIX,
)
from langchain_openai import ChatOpenAI
from openai import OpenAI

from typing import Optional

# database creation

In [2]:
db = sqlite3.connect("aurora.db")
c = db.cursor()

c.execute('''CREATE TABLE languages
            (LanguageID INTEGER PRIMARY KEY,
             Language TEXT NOT NULL)''')

c.execute('''CREATE TABLE learning_style
            (LearningID INTEGER PRIMARY KEY,
             Name TEXT NOT NULL)''')

c.execute('''CREATE TABLE city
            (CityID INTEGER PRIMARY KEY,
             CityName TEXT NOT NULL,
             Country Text NOT NULL)''')

c.execute('''CREATE TABLE clients
            (UserID INTEGER PRIMARY KEY,
             CityID INTEGER ,
             Username Text UNIQUE NOT NULL,
             Name TEXT NOT NULL,
             DateOfBirth TEXT NOT NULL, 
             Password TEXT NOT NULL,
             Email TEXT UNIQUE NOT NULL,
             Gender TEXT NOT NULL,
             PhoneNumber TEXT UNIQUE,
             Streak INTEGER NOT NULL DEFAULT 0,
             PreferredTime TEXT NOT NULL,
             MinPerDay INTEGER NOT NULL,
             FOREIGN KEY (CityID) REFERENCES city(CityID))''')

c.execute('''CREATE TABLE user_language
             (UserID INTEGER,
              LanguageID INTEGER,
              PrimaryLanguage INTEGER NOT NULL DEFAULT 0,
              FOREIGN KEY (UserID) REFERENCES clients(UserID),
              FOREIGN KEY (LanguageID) REFERENCES languages(LanguageID),
              PRIMARY KEY (UserID, LanguageID))''')

c.execute('''CREATE TABLE user_learning_style
             (UserID INTEGER,
              LearningID INTEGER,
              FOREIGN KEY (UserID) REFERENCES clients(UserID),
              FOREIGN KEY (LearningID) REFERENCES learning_style(LearningID),
              PRIMARY KEY (UserID, LearningID))''')

c.execute('''CREATE TABLE course
             (CourseID INTEGER PRIMARY KEY,
              Name TEXT NOT NULL,
              Syllabus TEXT NOT NULL,
              LearningObjectives TEXT NOT NULL,
              LearningUnits TEXT NOT NULL,
              Evaluation TEXT NOT NULL,
              Credits INTEGER NOT NULL)''')

c.execute('''CREATE TABLE previous_courses
             (UserID INTEGER,
              CourseID INTEGER,
              GPA REAL,
              FOREIGN KEY (UserID) REFERENCES clients(UserID),
              FOREIGN KEY (CourseID) REFERENCES course(CourseID),
              PRIMARY KEY (UserID, CourseID))''')

c.execute('''CREATE TABLE user_courses
             (UserID INTEGER,
              CourseID INTEGER,
              FOREIGN KEY (UserID) REFERENCES clients(UserID),
              FOREIGN KEY (CourseID) REFERENCES course(CourseID),
              PRIMARY KEY (UserID, CourseID))''')

c.execute('''CREATE TABLE educational_provider
             (ProviderID INTEGER PRIMARY KEY,
              CityID INTEGER,
              Name TEXT NOT NULL,
              Type TEXT NOT NULL,
              FOREIGN KEY (CityID) REFERENCES city(CityID))''')

c.execute('''CREATE TABLE course_location
             (ProviderID INTEGER,
              CourseID INTEGER,
              FOREIGN KEY (ProviderID) REFERENCES educational_provider(ProviderID),
              FOREIGN KEY (CourseID) REFERENCES course(CourseID),
              PRIMARY KEY (ProviderID, CourseID))''')

c.execute('''CREATE TABLE topic
             (TopicID INTEGER PRIMARY KEY,
              Name TEXT NOT NULL)''')

c.execute('''CREATE TABLE papers
             (PaperID INTEGER PRIMARY KEY,
              ProviderID INTEGER NOT NULL,
              Content TEXT NOT NULL,
              Author TEXT NOT NULL,
              FOREIGN KEY (ProviderID) REFERENCES educational_provider(ProviderID))''')

c.execute('''CREATE TABLE paper_topic
             (TopicID INTEGER,
              PaperID INTEGER,
              FOREIGN KEY (TopicID) REFERENCES Topic(TopicID),
              FOREIGN KEY (PaperID) REFERENCES papers(PaperID),
              PRIMARY KEY (PaperID, TopicID))''')

c.execute('''CREATE TABLE course_topic
             (TopicID INTEGER,
              CourseID INTEGER,
              FOREIGN KEY (TopicID) REFERENCES Topic(TopicID),
              FOREIGN KEY (CourseID) REFERENCES course(CourseID),
              PRIMARY KEY (CourseID, TopicID))''')

db.commit()
db.close()

# languages table

In [3]:
languages = "Arabic Bengali Mandarin Danish Dutch English French Finnish German Greek Gujarati Hausa Hebrew\
      Hindi Hungarian Icelandic Indonesian Italian Japanese Javanese Kannada Kazakh Khmer Korean Kurdish\
          Lao Malay Malayalam Marathi Mongolian Nepali Norwegian Pashto Farsi Polish Portuguese Punjabi\
              Romanian Serbian Sinhala Spanish Swahili Swedish Tamil Telugu Thai Turkish Ukrainian Urdu Vietnamese"

languages = [l.strip() for l in languages.split()]

# Connect to the database
conn = sqlite3.connect('aurora.db')
c = conn.cursor()

# Insert sample data
for l in languages:
    c.execute("INSERT INTO languages (Language) VALUES (?)", (l,))

conn.commit()
conn.close()

# learning_style

In [4]:
styles = "Visual Auditory Kinesthetic Reading/Writing Logical Social Solitary Verbal Musical Naturalistic"

styles = [l.strip() for l in styles.split()]

# Connect to the database
conn = sqlite3.connect('aurora.db')
c = conn.cursor()

# Insert sample data
for s in styles:
    c.execute("INSERT INTO learning_style (Name) VALUES (?)", (s,))

conn.commit()
conn.close()

# cities


In [5]:
cities = """New York – United States

Los Angeles – United States

Chicago – United States

Mexico City – Mexico

Guadalajara – Mexico

Toronto – Canada

Montreal – Canada

Quebec City – Canada

London – United Kingdom

Edinburgh – United Kingdom

Cardiff – United Kingdom

Paris – France

Marseille – France

Berlin – Germany

Munich – Germany

Frankfurt – Germany

Rome – Italy

Milan – Italy

Naples – Italy

Turin – Italy

Madrid – Spain

Barcelona – Spain

Valencia – Spain

Seville – Spain

Lisbon – Portugal

Porto – Portugal

Braga – Portugal

Coimbra – Portugal

Aveiro – Portugal

Funchal – Portugal

Évora – Portugal

Amsterdam – Netherlands

Rotterdam – Netherlands

The Hague – Netherlands

Brussels – Belgium

Zurich – Switzerland

Geneva – Switzerland

Bern – Switzerland

Vienna – Austria

Athens – Greece

Thessaloniki – Greece

Cairo – Egypt

Alexandria – Egypt

Istanbul – Turkey

Ankara – Turkey

Dubai – United Arab Emirates

Abu Dhabi – United Arab Emirates

Mumbai – India

Delhi – India

Beijing – China

Shanghai – China

Tokyo – Japan

Osaka – Japan"""
cities = [pair.split(" – ") for pair in cities.split("\n") if pair!=""]
#cities, counties = [c[0] for c in cities], [c[1] for c in cities]

# Connect to the database
conn = sqlite3.connect('aurora.db')
c = conn.cursor()

# Insert sample data
for pair in cities:
    
    c.execute("INSERT INTO city (CityName, Country) VALUES (?, ?)", (pair[0], pair[1]))

conn.commit()
conn.close()

# clients
change number users final = 10000


In [6]:
def gen_date():
    years = [_ for _ in range(1939, 2012)]
    prob = [1/(abs(y-2004)+1) for y in years]
    prob = [p/sum(prob) for p in prob]

    year = random.choices(years, prob)[0]
    month = random.randint(1, 12)

    if month in [1, 3, 5, 7, 8, 10, 12]:
        day = random.randint(1, 31)
    elif month in [4, 6, 9, 11]:
        day = random.randint(1, 30)
    else:
        if year%4==0 and year%100!=0:
            day = random.randint(1, 29)
        else:
            day = random.randint(1, 28)
    return f"{year}-{month}-{day}"

class Client(BaseModel):
    Username: str
    Name: str
    Password: str
    Email: str
    Gender: str
    PhoneNumber: str
    PreferredTime: str

In [7]:
examples = [
    {
        "example": """Username: joao_the_melhor_, Name: João Padrão,
          Password: %463!__kittymio!!12, email: joaopadraowork@outlook.com, Gender: Male, PhoneNumber: +351937442765, PreferredTime: 15-20"""
    },

    {
        "example": """Username: jaaneman_35, Name: Manav Seth, 
          Password: merepyaari__3!, email: jaaneman@gmail.com, Gender: Non-Binary, PhoneNumber: +912212345678, PreferredTime: 18-45"""
    },

    {
        "example": """Username: tvoja_mamka, Name: Maxym Oshpyrko,
          Password: 20100821MAXYMKO, email: tvijmaxymko@ukrnet.com, Gender: Prefer not to say, PhoneNumber: +8380937654567, PreferredTime: 12-35"""
    },
]

In [8]:
OPENAI_TEMPLATE = PromptTemplate(input_variables=["example"], template="{example}")

prompt_template = FewShotPromptTemplate(
    prefix=SYNTHETIC_FEW_SHOT_PREFIX,
    examples=examples,
    suffix=SYNTHETIC_FEW_SHOT_SUFFIX,
    input_variables=["subject", "extra"],
    example_prompt=OPENAI_TEMPLATE,
)

llm = ChatOpenAI(api_key=api_key, model="gpt-3.5-turbo", temperature=1)

synthetic_data_generator = create_openai_data_generator(
    output_schema=Client,
    llm=llm,
    prompt=prompt_template,
)

In [9]:
conn = sqlite3.connect('aurora.db')
c = conn.cursor()

n_users = 1500

db_path = 'aurora.db'
# Create a SQLDatabase object
db = SQLDatabase.from_uri(f'sqlite:///{db_path}')

existing_usernames = []
existing_emails = []
existing_numbers = []

synthetic_results = synthetic_data_generator.generate(
        subject="client_profile",
        extra=f"the name must be chosen at random. Make it something you wouldn't normally choose.\
              username should be either based on the name of person or just something funny people would do",
        runs=n_users,
        )

# Insert sample data
for user in range(n_users):
    city_tuple = random.choice(eval(db.run("SELECT * FROM city")))
    #gender = random.choices(["Female", "Male", "Prefer Not To Say", "Non-Binary"], [0.4, 0.4, 0.15, 0.05])[0]
    bday = gen_date()
    
    synthetic_result = synthetic_results[user]

    while synthetic_result.PhoneNumber in existing_numbers:
        synthetic_result.PhoneNumber = synthetic_result.PhoneNumber[-5:]
        for _ in range(5):
            synthetic_result.PhoneNumber += str(random.randint(0, 9))

    while synthetic_result.Email in existing_emails:
        synthetic_result.Email = random.choice("qwertyuiopasdfghjklzxcvbnm") + synthetic_result.Email

    while synthetic_result.Username in existing_usernames:
        synthetic_result.Username += "_"

    c.execute("INSERT INTO clients (CityID , Username, Name, DateOfBirth, Password, Email, Gender, PhoneNumber, Streak, PreferredTime, MinPerDay) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
               (city_tuple[0], 
               synthetic_result.Username, synthetic_result.Name, bday,
               synthetic_result.Password, synthetic_result.Email, synthetic_result.Gender, synthetic_result.PhoneNumber, 
               random.randint(0, 1531), 
               synthetic_result.PreferredTime, random.choice([5, 10, 15, 20, 25, 30])))
    
    existing_usernames.append(synthetic_result.Username)
    existing_emails.append(synthetic_result.Email)
    existing_numbers.append(synthetic_result.PhoneNumber)

conn.commit()
conn.close()

# user_lang

In [10]:
conn = sqlite3.connect('aurora.db')
c = conn.cursor()

# Insert sample data
for user in range(1, n_users+1):
    n_lan = random.choices([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 
                           [0.5, 0.2, 0.12, 0.06, 0.04, 0.027, 0.025, 0.018, 0.009, 0.001])[0]
    ids = random.sample([_ for _ in range(1, len(languages)+1)], k=n_lan)
    primary = random.choice(ids)
    for i in ids:
        c.execute("INSERT INTO user_language (UserID, LanguageID, PrimaryLanguage) VALUES (?, ?, ?)", (user, i, i==primary))

conn.commit()
conn.close()

# learning style


In [11]:
conn = sqlite3.connect('aurora.db')
c = conn.cursor()

# Insert sample data
for user in range(1, n_users+1):
    n_styles = random.randint(1, 3)
    ids = random.sample([_ for _ in range(1, len(styles)+1)], k=n_styles)
    for i in ids:
        c.execute("INSERT INTO user_learning_style (UserID, LearningID) VALUES (?, ?)", (user, i))

conn.commit()
conn.close()

# course
n_course = 100

In [12]:
class Course(BaseModel):
    Name: str
    Syllabus: str
    LearningObjectives: str
    LearningUnits: str
    Evaluation: str
    Credits: int

In [13]:
examples = [
    {
        "example": """
        Name: Text Mining, 

        Syllabus: The Text Mining curricular unit aims to enable students to acquire a fundamental understanding of the science of
Text Mining in supervised and unsupervised problems, easily transferable to various real-world challenges in
Data Science. This course will cover different topics and challenges commonly associated with processing and
manipulating text data in Data Science projects, presenting the essential methodological aspects of Text Mining
and the most important and currently utilized tools, focusing on traditional Machine Learning algorithms. The
topics to be addressed include predictive and descriptive algorithms in different contexts, such as the application
of Naïve Bayes and K-Means in problems like sentiment analysis and document clustering. By the end of the
course, students will be able to utilize the acquired skills to produce a fully processed dataset compatible for the
application of machine learning models targeted towards text data, enabling the extraction of relevant knowledge
for decision-making across various contexts., 

        LearningObjectives: 1.Gain a comprehensive understanding of the various tasks required when initiating a text mining project
from scratch.
2.Understand the most important text mining methodologies and the key steps in the process.
3.Familiarize themselves with the most important exploration and visualization tools for text data.
4.Possess the capability to work with unstructured data and prepare it for subsequent stages in the Data
Science process.
5.Understand and apply a wide range of predictive models in text data.
6.Understand and apply a wide range of descriptive models in text data, 

        LearningUnits: 1.Introduction to Text Minin 2.Data Transformation and Preprocessing 3.Supervised Models 4.Unsupervised Models, 

        Evaluation: 1st Exam Period
                    Mini-Quizzes in Theoretical Classes (5%)
                    Group Project (35%)
                    Exam (60%)
                    2nd Exam Period
                    Group Project (35%)
                    Exam (65%),

          Credits: 6"""
    },
    {"example" : """
    Name: Neurobiology,
    Syllabus: The Neurobiology curricular unit aims to provide students with a thorough understanding of the biological basis of neural systems, focusing on the structure and function of the nervous system. This course will explore the fundamental principles of neurobiology, including neural signaling, sensory and motor systems, and the molecular and cellular mechanisms underlying neural function. Additionally, it will cover the latest advances in neurobiological research and their applications in fields such as medicine, biotechnology, and cognitive science. By the end of the course, students will have a strong foundation in neurobiology, enabling them to analyze and interpret the neural mechanisms that govern behavior, cognition, and homeostasis.,
    LearningObjectives: 1. Understand the basic anatomy and physiology of the nervous system.\n2. Comprehend the molecular and cellular mechanisms of neural signaling.\n3. Analyze the organization and function of sensory and motor systems.\n4. Explore the processes involved in synaptic transmission and plasticity.\n5. Study the neural basis of higher cognitive functions and behavior.\n6. Apply neurobiological concepts to current research and clinical contexts.,
    LearningUnits: 1. Introduction to Neurobiology\n2. Neural Signaling and Communication\n3. Sensory and Motor Systems\n4. Synaptic Plasticity and Neural Networks\n5. Cognitive and Behavioral Neurobiology,
    Evaluation: "1st Exam Period\nMini-Quizzes in Theoretical Classes (5%)\nGroup Project (30%)\nExam (65%)\n2nd Exam Period\nGroup Project (30%)\nExam (70%),
    Credits: 6"""
     },
     {"example": """
    Name: Applied Mathematics",
    Syllabus: The Applied Mathematics curricular unit aims to equip students with advanced mathematical tools and techniques used in solving real-world problems across various domains such as engineering, economics, biology, and data science. This course emphasizes the practical application of mathematical models, numerical methods, and statistical analysis. Students will learn to formulate, analyze, and interpret mathematical models to solve complex problems. The course covers a broad range of topics, including differential equations, linear algebra, optimization, and probabilistic modeling. By the end of the course, students will be proficient in applying mathematical methods to real-world scenarios, enhancing their problem-solving and analytical skills.,
    LearningObjectives: 1. Develop the ability to formulate mathematical models for real-world problems.\n2. Gain proficiency in solving differential equations and applying them in various contexts.\n3. Understand and apply numerical methods for approximate solutions of mathematical problems.\n4. Master concepts in linear algebra and their applications in data analysis and modeling.\n5. Apply optimization techniques to find solutions in constrained and unconstrained environments.\n6. Understand probabilistic and statistical methods for data analysis and decision-making.,
    LearningUnits: 1. Introduction to Applied Mathematics\n2. Differential Equations and Modeling\n3. Linear Algebra and Matrix Computations\n4. Optimization Techniques\n5. Probability and Statistical Methods,
    Evaluation: 1st Exam Period\nMini-Quizzes in Theoretical Classes (5%)\nGroup Project (25%)\nExam (70%)\n2nd Exam Period\nGroup Project (25%)\nExam (75%),
    Credits: 6"""
     },
     {"example" : """
    Name: Introduction to Law,
    Syllabus": The Introduction to Law curricular unit provides students with a foundational understanding of legal principles, institutions, and processes. This course explores the role of law in society, the sources of law, and the structure of legal systems. It emphasizes the development of legal reasoning, critical thinking, and analytical skills necessary to interpret and apply legal rules. Students will study key legal concepts such as justice, rights, obligations, and the rule of law, alongside specific branches like constitutional, civil, and criminal law. By the end of the course, students will have a solid grasp of basic legal concepts and their applications in real-world scenarios.,
    LearningObjectives: "1. Understand the fundamental principles and concepts of law.\n2. Analyze the structure and functions of legal systems.\n3. Develop skills in legal reasoning and critical thinking.\n4. Apply legal principles to hypothetical and real-world situations.\n5. Gain insight into different branches of law, including constitutional, civil, and criminal law.\n6. Explore the role of law in promoting justice and resolving disputes.,
    LearningUnits: "1. Foundations of Law and Legal Systems\n2. Sources of Law: Statutes, Case Law, and Custom\n3. Constitutional Law and the Rule of Law\n4. Civil Law: Contracts, Torts, and Obligations\n5. Criminal Law: Principles and Procedures,
    Evaluation: "1st Exam Period\nClass Participation and Attendance (10%)\nWritten Assignment or Case Study (20%)\nFinal Exam (70%)\n2nd Exam Period\nWritten Assignment or Case Study (25%)\nFinal Exam (75%),
    Credits: 6"""
     }
]

In [14]:
OPENAI_TEMPLATE = PromptTemplate(input_variables=["example"], template="{example}")

prompt_template = FewShotPromptTemplate(
    prefix=SYNTHETIC_FEW_SHOT_PREFIX,
    examples=examples,
    suffix=SYNTHETIC_FEW_SHOT_SUFFIX,
    input_variables=["subject", "extra"],
    example_prompt=OPENAI_TEMPLATE,
)

llm = ChatOpenAI(api_key=api_key, model="gpt-3.5-turbo", temperature=1)

synthetic_data_generator = create_openai_data_generator(
    output_schema=Course,
    llm=llm,
    prompt=prompt_template,
)

In [15]:
n_cources = 100

conn = sqlite3.connect('aurora.db')
c = conn.cursor()

synthetic_results = synthetic_data_generator.generate(
        subject="univresity course unit",
        extra=f"try to represent as much deifferent sciences as possible try representing a lot of math, biology, engineering, law, etc",
        runs=n_cources)

# Insert sample data
for course in synthetic_results:
    c.execute("INSERT INTO course (Name, Syllabus, LearningObjectives, LearningUnits, Evaluation, Credits) VALUES (?, ?, ?, ?, ?, ?)", 
              (course.Name, course.Syllabus, course.LearningObjectives, course.LearningUnits, course.Evaluation, course.Credits))

conn.commit()
conn.close()

# previous_courses

In [16]:
conn = sqlite3.connect('aurora.db')
c = conn.cursor()

# Insert sample data
for user in range(1, n_users+1):
    if random.random() <= 0.8:
        num = random.randint(1, 15)
        cous = random.sample([_ for _ in range(1, n_cources+1)], num)
        for cou in cous:
            c.execute("INSERT INTO previous_courses (UserID, CourseID, GPA) VALUES (?, ?, ?)", 
                      (user, cou, random.randint(0, 3)+round(random.random(), 2)))

conn.commit()
conn.close()

# user_courses

In [17]:
conn = sqlite3.connect('aurora.db')
c = conn.cursor()

# Insert sample data
for user in range(1, n_users+1):
    if random.random() <= 0.95:
        num = random.randint(1, 15)
        try:
            aval = set([_ for _ in range(1, n_cources+1)]) - set([el[0] for el in eval(db.run(f"SELECT CourseID from previous_courses where UserID = {user}"))])
        except SyntaxError:
            aval = [_ for _ in range(1, n_cources+1)]
        cous = random.sample(list(aval), num)
        for cou in cous:
            c.execute("INSERT INTO user_courses (UserID, CourseID) VALUES (?, ?)", (user, cou))

conn.commit()
conn.close()

# educational_provider

In [18]:
conn = sqlite3.connect('aurora.db')
c = conn.cursor()

# Insert sample data
for city_info in eval(db.run("select * from city")):
        c.execute("INSERT INTO educational_provider (Name, Type, CityID) VALUES (?, ?, ?)", 
                 ("Online Courses in "+city_info[1], "online", city_info[0]))

        c.execute("INSERT INTO educational_provider (Name, Type, CityID) VALUES (?, ?, ?)", 
                 ("University of "+city_info[1], "InPerson", city_info[0]))
        
        c.execute("INSERT INTO educational_provider (Name, Type, CityID) VALUES (?, ?, ?)", 
                (city_info[1]+" Research Center", "InPerson", city_info[0]))

conn.commit()
conn.close()

# course_location

In [19]:
conn = sqlite3.connect('aurora.db')
c = conn.cursor()

# Insert sample data
for course in range(1, n_cources+1):
    n_provider = random.randint(1, 40)
    curr = [el[0] for el in eval(db.run("select ProviderID from educational_provider"))]
    providers = random.sample(curr, n_provider)
    for p in providers:
        c.execute("INSERT INTO course_location (ProviderID, CourseID) VALUES (?, ?)", (p, course))

conn.commit()
conn.close()

# Topic

In [20]:
# Create an OpenAI API client
client = OpenAI()

def get_completion(prompt, model='gpt-3.5-turbo', **kwargs):
    messages = [{"role": "user", "content": prompt}]
    response = client.chat.completions.create(
        model=model,
        messages=messages,
        **kwargs,# this is the degree of randomness of the model's output
    )
    return response.choices[0].message.content

In [21]:
course_topic = []
total_topic = set()

for course_name, info in eval(db.run("select Name, Syllabus from course")):
    prompt = f"""
    I'm interested to classify some subjects by topics(sciences) in format '[A-Z]w+' or a few in those in comma so for example 'Physics, Math'.
          For example for neurophisology I would say Medicine or Biology can you tell me the topic for this subject and nothing else {info}
    """
    answ = get_completion(prompt, model="gpt-3.5-turbo", top_p=0.2)
    answ = answ.split(", ")
    course_topic.append(answ)
    total_topic = total_topic.union(set(answ))
total_topic = list(total_topic)


In [22]:
conn = sqlite3.connect('aurora.db')
c = conn.cursor()

# Insert sample data
for topic in total_topic:
    c.execute("INSERT INTO topic (Name) VALUES (?)", (topic,))

conn.commit()
conn.close()

# course_topic

In [23]:
conn = sqlite3.connect('aurora.db')
c = conn.cursor()

for i, tops in enumerate(course_topic):
    for top in tops:
        top_id = eval(db.run(f"select TopicID from Topic where Name = '{top}'"))[0][0]
        c.execute("INSERT INTO course_topic (TopicID, CourseID) VALUES (?, ?)", (top_id, i+1))

conn.commit()
conn.close()

# papers

n_paper_per_topic = 3 + luck

In [24]:
class Paper(BaseModel):
    Author: str
    Content: str

In [25]:
examples = [
    {"example" : """
    Author: Carina Albuquerque,
    Content: CarinaAlbuquerque*, Roberto Henriques & Mauro Castelli Polyp detection through colonoscopy is a widely used method to prevent colorectal cancer. The automation of this process aided by artifcial intelligence allows faster and improved detection of polyps that can be missed during a standard colonoscopy. In this work, we propose to implement various object detection algorithms for polyp detection. To improve the mean average precision (mAP) of the detection, we combine the baseline models through a stacking approach. The experiments demonstrate the potential of this new methodology, which can reduce the workload for oncologists and increase the precision of the localization of polyps. Our proposal achieves a mAP of 0.86, translated into an improvement of 34.9% compared to the best baseline model and 28.8% with respect to the weighted boxes fusion ensemble technique. In the United States, colorectal cancer (CRC) stands as the third leading cause of cancer-related deaths and it is expected to cause more than 50.000 fatalities by 2022. Additionally, recent studies show that CRC incidence in adults younger than 50 years old has nearly doubled since the early 1990s2 . Colonoscopy is considered the most efective procedure to detect colon polyps and cancer3  and is of paramount importance for efective prevention and reduced risk of death from CRC. Evidence suggests that having a colonoscopy was associated with a decrease of 67% in the risk of death from CRC4  and a 70% reduction in the incidence of late-stage CRCs5 . However, research has shown that in patients undergoing colonoscopy, 25% of polyps are missed6 . Reasons behind the oversight include overloaded healthcare systems, the presence of fat and small-sized polyps, or workers’ lack of experience7–9. With the rise of artifcial intelligence, signifcant technological advances have occurred in the medical and healthcare feld10. Deep learning (DL) is widely used as a computer vision tool to classify and detect lesions and many diseases by efciently addressing the unique challenges of medical data11. In polyp detection, evidence shows that using convolutional neural networks (CNNs) to detect polyps automatically under colonoscopy can improve the detection rate. Qadir et al.12 proposed a single-shot feed-forward fully convolutional neural network to develop a real-time polyp detection model using two-dimensional Gaussian masks. Li et al.13 used an adaptive training sample to select high-quality training samples to improve generalizability on the accurate segmentation of polyps. Taş et al.14 proposed implementing Faster R-CNN with a preprocessing approach based on a super-resolution method to improve the model’s performance in detecting colon polyps. Tang et al.15 also used Faster R-CNN with transfer learning to improve polyp detection. Te YOLO algorithm has also been proposed to improve the efciency of polyp detection. Guo et al.16 proposed an automatic polyp detection framework based on Yolov3 and active learning to reduce the rate of false positive polyp detection. Pacal et al.17 considered Yolov4 for real-time polyp detection, and Wan et al.18 used YOLOv5 for the same purpose. Jha et al.19 applied EfcientDet, RetinaNet, Faster R-CNN, and YOLOv4 to compare their performance on polyp segmentation. Wu et al.20 compared UNet, Faster R-CNN, R-FCN, RetinaNet, Yolov3, FCOS, and PraNet and presented a spatial–temporal feature transformation to detect and localize polyps in endoscopy videos automatically. Ensemble techniques were also considered to improve the polyp detection task. Sharma et al.21 applied a voting ensemble technique combining the results of ResNet101, GoogLeNet, and Xception for polyp classifcation. Younas et al.22 proposed a similar approach by implementing a weighted ensemble of GoogleNet and ResNet50, among others, to improve the accuracy of the polyp class identifcation. In segmentation, DivergentNets23 combines fve models, and masks are averaged to make the fnal segmentation mask. In object detection, Hong et al.24 and Polat et al.25 used weighted boxes fusion methods as an ensemble technique to combine predictions from diferent models. Te purpose of our study was to analyse the efcacy of implementing a stacking approach to combine the predictions of distinct object detection techniques with the goal of improving the precision in polyp detection."""
     },
     {"example": """
    Author: Tetiana Zajchyk",
    Content: Simple Summary: Bees play an essential role in maintaining biodiversity, as they are crucial for the
pollination provisions of agricultural crops and for plants in general. In recent years, a decline in bee
populations has been noted, especially due to climatic and anthropometric factors and challenges. It
is important to develop (or redevelop) approaches that can help preserve these important insects.
Improving our knowledge on bees’ microbiology can be one of the ways to better understand
ecological relations and the life cycle of these beneficial tiny workers. One of the possible alternatives
to overcome the decline in bee populations, while improving their health and productivity, is to study
their microbiota, and that of their related products, and identify beneficial microorganisms that may
be useful for use as probiotics. The microorganisms present in bees’ microbiota make an essential
contribution to their health by being involved in metabolism and providing food supplies, helping
to digest and preserve food, and protecting them from the diseases. In this review, we highlight
some of the main bacterial representatives of the microbiota of different species of bees and their
by-products, with the focus on microorganisms with characteristics improving health, and other
possible applications in the food industry. At the end, “One Health” is not just a book concept, but a
real scientific strategy.
Bees are one of the best-known and, at the same time, perhaps the most enigmatic insects
on our planet, known for their organization and social structure, being essential for the pollination
of agricultural crops and several other plants, playing an essential role in food production and the
balance of ecosystems, being associated with the production of high-value-added inputs, and a
unique universe in relation to bees’ microbiota. In this review, we summarize information regarding
on different varieties of bees, with emphasis on their specificity related to microbial variations.
Noteworthy are fructophilic bacteria, a lesser-known bacterial group, which use fructose fermentation
as their main source of energy, with some strains being closely related to bees’ health status. The
beneficial properties of fructophilic bacteria may be extendable to humans and other animals as
probiotics. In addition, their biotechnological potential may ease the development of new-generation
antimicrobials with applications in biopreservation. The concept of “One Health” brings together
fundamental and applied research with the aim of clarifying that the connections between the
different components of ecosystems must be considered part of a mega-structure, with bees being an
iconic example in that the healthy functionality of their microbiota is directly and indirectly related
to agricultural production, bee health, quality of bee products, and the functional prosperity for
humans and other animals. In fact, good health of bees is clearly related to the stable functionality of
ecosystems and indirectly relates to humans’ wellbeing, a concept of the “One Health”."""
     },
     {"example" : """
    Author: Hans de Vries,
    Content": Engaging with art can move individuals through a myriad of emotions, provoke reflective
thoughts, and lead to new ideas. Could art also influence interpersonal outcomes pertaining
to the ways we interact with others and navigate the social world, that is, our suite of social
cognitive skills? Here, we focus on visual art to explore the effect of art engagement on personal aesthetic experience and social cognitive skills. Across two studies, using veridical
paintings and matched non-art photos, we examined the effect of art engagement on emotional (e.g., awe, being moved) and eudaimonic experiences (e.g., reflective thoughts), as
well as social cognitive skills pertaining to Theory of Mind (ToM) and recognition of other’s
emotions. Further, we varied the depth with which participants engaged with the experiences of the characters in the artworks, to assess whether deep social information processing could boost the effect of art engagement on social cognitive skills. Our findings showed
that art engagement altered personal aesthetic experience through changes in emotional
and eudaimonic outcomes. However, we did not find any support for the effect of art
engagement on social cognitive skills: Neither engaging with art, nor art in combination with
deep social information processing, influenced performance on social cognitive skills of
ToM and emotion recognition. The effect of art engagement on personal aesthetic experience and the absence of effect on social cognitive skills highlight the nuanced nature of individuals’ interactions with art. We discuss these results considering the varied ways of
engagement with different artforms and in relation to different operationalizations of social
cognitive skills.
      
      “Any form of art is a form of power; it has impact, it can affect change–it can not only move
us, it makes us move.” [1]
Many, like the actor and activist Ossie Davis, believed that art cannot only move individuals
through unique emotional experiences, but can also change the way we see the world and others. While empirical evidence suggested that art holds the emotional end of this bargain, evoking a myriad of strong emotions and thoughts in the viewer [2–6], less attention was given to
      its potential impact beyond such personal experiences. Could art also influence interpersonal
outcomes pertaining to the ways we interact with other people and navigate the social world?
Previous research suggests that reading literary fiction boosts social cognitive skills by
triggering social information processing mechanisms, such as simulation of characters’
experiences and perspective taking [7, but see 8, 9]. One-time, brief sessions of reading have
been shown to enhance Theory of Mind (ToM), understanding of others’ mental states, in
adults, suggesting “ToM may be influenced by engagement with works of art” [10, p. 377].
While there is empirical support for this effect for reading literary fiction and attending theatre [11], other artforms, such as visual art, remain untested. Although shared elements and
modes of engagement across the arts suggest the possibility of similar outcomes for various
artforms, distinctive properties, such as the brevity of engagement that characterizes visual
art experiences, or the ambiguous nature of visual narratives, may require specific
consideration.
In the present studies, we focus on visual art, specifically paintings, to explore whether art
engagement affects not only our personal aesthetic experience but also our social cognitive
skills. We examined the effect of visual art engagement on emotional and eudaimonic experiences, and tested whether art engagement facilitated Theory of Mind (ToM) and the recognition of other people’s emotions. Moreover, we varied the depth in which participants engaged
with the characters’ experiences in the artworks, to test whether the depth of social information
processing boosts the effect of art engagement on social cognitive skills and influences personal
aesthetic experience.
In what follows, we review how visual art engagement influences personal aesthetic experience. Then, we discuss research on the relationship between art engagement and social cognitive skills. In light of the literature, we discuss the potential for visual art, specifically
engagement with paintings, to improve social cognitive skills. Finally, we present an overview
of the current studies."""
     }
]

In [26]:
OPENAI_TEMPLATE = PromptTemplate(input_variables=["example"], template="{example}")

prompt_template = FewShotPromptTemplate(
    prefix=SYNTHETIC_FEW_SHOT_PREFIX,
    examples=examples,
    suffix=SYNTHETIC_FEW_SHOT_SUFFIX,
    input_variables=["subject", "extra"],
    example_prompt=OPENAI_TEMPLATE,
)

llm = ChatOpenAI(api_key=api_key, model="gpt-3.5-turbo", temperature=1)

synthetic_data_generator = create_openai_data_generator(
    output_schema=Paper,
    llm=llm,
    prompt=prompt_template,
)

In [27]:
n_papers = 3
topic_of_paper = []
paper_id = 1

for tp_id, tp in eval(db.run("select * from topic")):
    for pp_i in range(n_papers):
        provider_id, provider_name = random.choice(eval(db.run("select ProviderID, Name from educational_provider")))

        if random.random() >= 0.5:
            randomised_topic_id, randomised_topic = random.choice(eval(db.run("select * from topic")))
            while randomised_topic_id == tp_id:
                randomised_topic_id, randomised_topic = random.choice(eval(db.run("select * from topic")))
        else:
            randomised_topic_id, randomised_topic = None, None

        synthetic_result = synthetic_data_generator.generate(
                subject="a very short scientific paper",
                extra=f"a paper made in {provider_name} on topic of {topic, randomised_topic}",
                runs=1)[0]
        
        conn = sqlite3.connect('aurora.db')
        c = conn.cursor()

        c.execute("INSERT INTO papers (ProviderID, Content, Author) VALUES (?, ?, ?)", 
        (provider_id, synthetic_result.Content, synthetic_result.Author))
        
        conn.commit()
        conn.close()
        paper_id += 1

        topic_of_paper.append((paper_id, tp_id))
        if randomised_topic_id is not None:
            topic_of_paper.append((paper_id, randomised_topic_id))

# paper_topic

In [28]:
conn = sqlite3.connect('aurora.db')
c = conn.cursor()

for i, tp_i in topic_of_paper:
    c.execute("INSERT INTO paper_topic (TopicID, PaperID) VALUES (?, ?)", (tp_i, i))
        
conn.commit()
conn.close()

# CHECKS


In [29]:
db_path = 'aurora.db'

# Create a SQLDatabase object
db = SQLDatabase.from_uri(f'sqlite:///{db_path}')

for p in eval(db.run("SELECT * from clients")):
    print(p)
    print("\n")

(1, 38, 'funny_banana_lover', 'Leticia Garcia', '1999-9-9', 'Banana!3456', 'leticiagarcia@example.com', 'Female', '+15555555555', 883, '10-16', 10)


(2, 10, 'fluffyunicorn_27', 'Haruto Takahashi', '2004-3-22', 'Rainbow1234!', 'harutotakahashi@example.com', 'Male', '+819876543210', 499, '14-20', 30)


(3, 38, 'whos_ur_daddy', 'Akshay Gupta', '2004-10-17', 'ImTheBoss@123', 'akshaygupta@example.com', 'Male', '+918765432109', 215, '18-22', 10)


(4, 40, 'bananarama_lover', 'Svetlana Ivanov', '2004-8-16', 'Pineapple!9876', 'svetlanai@example.com', 'Female', '+79999999999', 357, '12-18', 5)


(5, 34, 'loves_pizza_123', 'Eleanor Roosevelt', '2004-8-29', 'PizzaIsLife@123', 'eleanorroosevelt@example.com', 'Female', '+15551234567', 685, '16-21', 20)


(6, 43, 'funny_pizza_lover', 'Zephyr Johnson', '1947-5-2', 'Summ3rTime@2022', 'zephyrjohnson@example.com', 'Male', '+12345678901', 520, '14-20', 5)


(7, 44, 'pizza_holic_007', 'Daviona Brown', '2004-2-10', 'CheesePlease!2023', 'davionabrown@examp