# Generate a dataset using Google Gemini API

## Initialize the gemini client

In [5]:
import google.generativeai as genai
from dotenv import load_dotenv
import os

load_dotenv()

GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')

genai.configure(api_key=GOOGLE_API_KEY)

## Set the model to Gemini-pro

In [7]:
model = genai.GenerativeModel('gemini-pro')

## Sample function to generate a single entry 

In [9]:
def generate_response(content:str):
    response = model.generate_content(f"""
{SYSTEM_PROMPT}+{content}
""")
    return response

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 3.81 µs


## Sample system prompt to generate the dataset

In [8]:
SYSTEM_PROMPT = """
You are supposed to create  a dataset so refrain from outputting anything unnecessary .
You are given pages of lecture slides from a curated collection. Ignore page numbers and personal information, extract the context.
Generate  logical questions, correct answer and three incorrect answers suitable for mcq from the context text. 
Do not generate questions that go like from the lecture? in this lecture? etc.
Go step by step.  final output should be in json format. json schema for the output is 
{
    "Output": {
        "type": "object",
        "properties": {
            "question": {
                "type": "string"
            },
            "correct_answer": {
                "type": "string"
            },
            "incorrect_answers": {
                "type": "array",
                "items": {
                    "type": "string"
                }
            }
        },
        "required": ["question", "correct_answer", "incorrect_answers"]
    }
}


"""

## Get a list of files that needs to be extracted

In [12]:
folder_path = './dataset/source/'
pdfs = os.listdir(folder_path)
pdfs

['while_loops.pdf',
 '5COSC010C.Lecture-02.Network-B-Protocol Stacks(1).pdf',
 'CSF_Lecture_1_Part_2 of 2.pdf',
 'CSF_IP Addressing.pdf',
 'Week07.pdf',
 'Lecture 1 - Introduction -2023.pdf',
 'Lecture_Week05_WebServices.pdf',
 'Lists_Tuples_20_21.pdf',
 'CSF_Lecture_Operating Systems.pdf',
 'CSF_Lecture2.pdf',
 'CSF_Lecture_1_Part_1 of 2.pdf',
 '5COSC010C.Lecture-03-B-Network.Layer-IP-Address.pdf',
 'Week09 - Tagged.pdf',
 'files_21-22.pdf',
 'Week10 - Tagged.pdf',
 '5COSC022C.Lecture.5.d.WebServices.Communication.pdf',
 'string slices.pdf',
 'Binary Operations.pdf',
 '2 Character Codes .pdf',
 '5COSC022C.Lecture.5.c.WebServices.ProtocolsAndModels.pdf',
 '5COSC010C.Lecture.4.a.ApplicationLayer-DNS.pdf',
 'Week04_lecture.pdf',
 'Lecture_Week07_REST_API_JAXRS.pdf',
 'ML2021_lecture2_VK (1).pdf',
 '5COSC010C.Lecture-03-C-TransportLayer-.pdf',
 'Lecture_Week03.pdf',
 '01 Introduction.ClientServerConcepts.pdf',
 'Lecture.6.j.ICT1.Review.WebServices.Introduction.pdf',
 'DiskSector 0.pdf',
 

## Use pandas and pypdf2 to load the pdf and extract text and then load it to a csv file

In [13]:

import PyPDF2
import pandas as pd

def pdf_to_pandas_df(folder_path, pdfs):
    all_pdfs = []
    for pdf in pdfs:
        with open(folder_path+pdf, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            data = {'Page':[page.extract_text() for page in reader.pages]}
            df = pd.DataFrame(data)
            all_pdfs.append(df)
    df = pd.concat(all_pdfs, ignore_index=True)
    return df


df = pdf_to_pandas_df(folder_path, pdfs)
df.to_csv('./df.csv', index=False)

In [14]:
df.head()

## Load the dataset as a pandas dataframe

In [100]:
df = pd.read_csv('./dataset/filtered_dataset.csv')

In [17]:
df = pd.read_csv('df_pruned.csv', header=None)

In [22]:

df.head()
df.to_csv('./df_pruned.csv', index=False)

## Sample function to generate the dataset at batch

In [29]:
import pandas as pd

def process_row(row):
    response = generate_response(row[0])
    return response.text

def gen_dataset(df, csv_path):
    # Create a new column for output, initially filled with None
    df['Output'] = None

    # Initialize a counter
    counter = 0

    # Iterate over the DataFrame rows
    for index, row in df.iterrows():
        # Process each row and update the 'Output' column
        df.at[index, 'Output'] = process_row(row)

        # Increment the counter
        counter += 1

        # Save the updated DataFrame to CSV after processing each row
        df.to_csv(csv_path, index=False)

        # Optionally, print the counter to track progress
        print(f"Processed {counter} rows")

    return df

# Usage example
csv_path = './dataset_v2_other.csv'  # Specify the path to your CSV file
df1 = gen_dataset(df, csv_path)

  response = generate_response(row[0])


Processed 1 rows
Processed 2 rows
Processed 3 rows
Processed 4 rows
Processed 5 rows
Processed 6 rows
Processed 7 rows
Processed 8 rows
Processed 9 rows
Processed 10 rows
Processed 11 rows
Processed 12 rows
Processed 13 rows
Processed 14 rows
Processed 15 rows
Processed 16 rows
Processed 17 rows
Processed 18 rows
Processed 19 rows
Processed 20 rows
Processed 21 rows
Processed 22 rows
Processed 23 rows
Processed 24 rows
Processed 25 rows
Processed 26 rows
Processed 27 rows
Processed 28 rows
Processed 29 rows
Processed 30 rows
Processed 31 rows
Processed 32 rows
Processed 33 rows
Processed 34 rows
Processed 35 rows
Processed 36 rows
Processed 37 rows
Processed 38 rows
Processed 39 rows
Processed 40 rows
Processed 41 rows
Processed 42 rows
Processed 43 rows
Processed 44 rows
Processed 45 rows
Processed 46 rows
Processed 47 rows
Processed 48 rows
Processed 49 rows
Processed 50 rows
Processed 51 rows
Processed 52 rows
Processed 53 rows
Processed 54 rows
Processed 55 rows
Processed 56 rows
P

## Optional: inspect the dataset

In [22]:
import json
import pandas as pd

# Assuming you have a DataFrame named 'df'
for index, row in df[:50].iterrows():
    # Access the values of each column in the current row
    # column1_value = row['Source']
    column2_value = row['Output']
    json_data = json.loads(column2_value)
    
    # Perform operations on the values or use them in your code
    # ...
    
    # Print the values if needed
    print(json_data)


{'Output': {'question': 'In the given program, what is the final value in b ?', 'correct_answer': '6', 'incorrect_answers': ['5', '4', '7']}}
{'Output': {'question': 'What is printed by the following?\nalist = [4, 2, 8, 6, 5]\nalist.pop(2)\nalist.pop()\nprint(alist)', 'correct_answer': '[4, 2, 5]', 'incorrect_answers': ['[4, 8, 6, 5]', '[4, 2, 8, 5]', '[4, 2, 8]']}}
{'Output': {'question': 'Which node is pointed to by current?', 'correct_answer': 'c', 'incorrect_answers': ['a', 'b', 'd']}}
{'Output': {'question': 'What is the main goal of social graph partitioning and clustering?', 'correct_answer': 'Identify groups within Facebook friends', 'incorrect_answers': ['Identify users with similar interests', 'Recommend friends to users', "Improve the performance of Facebook's newsfeed"]}}
{'Output': {'question': 'What is the syntax for the Point constructor in Java?', 'correct_answer': 'Point(int x,int y)', 'incorrect_answers': ['Point()', 'Point(x,y)', 'Point(int)']}}
{'Output': {'question

JSONDecodeError: Extra data: line 12 column 1 (char 379)

## Optional: verify if the output text is indeed json

In [26]:
import ijson

def parse_json_objects(json_string):
    objects = []
    parser = ijson.parse(json_string)
    for prefix, event, value in parser:
        if (prefix, event) == ('', 'map_key'):
            # Start of a new object
            builder = ijson.common.ObjectBuilder()
        builder.event(event, value)
        if (prefix, event) == ('', 'end_map'):
            # End of the object
            objects.append(builder.value)
    return objects

In [27]:
json_string = str('{\n    "Output": {\n        "question": "What is the role of a service producer in service-oriented architecture (SOA)?",\n        "correct_answer": "Provides and manages a service.",\n        "incorrect_answers": [\n            "Discovers and invokes services.",\n            "Registers and advertises services.",\n            "Maintains contracts between services."\n        ]\n    }\n}\n{\n    "Output": {\n        "question": "What is the purpose of a service registry in SOA?",\n        "correct_answer": "Stores and manages information about available services.",\n        "incorrect_answers": [\n            "Executes service requests.",\n            "Discovers and invokes services.",\n            "Provides and manages services."\n        ]\n    }\n}\n{\n    "Output": {\n        "question": "Which component is responsible for discovering and invoking services in SOA?",\n        "correct_answer": "Service consumer.",\n        "incorrect_answers": [\n            "Service producer.",\n            "Service registry.",\n            "Service contract."\n        ]\n    }\n}')
json_objects = parse_json_objects(json_string)
for obj in json_objects:
    print(obj)

IncompleteJSONError: parse error: trailing garbage
          services."         ]     } } {     "Output": {         "ques
                     (right here) ------^


## Create a jsonl file with formatting the datas into the needed structure for fine tuning

In [3]:
import json

path_to_file = 'dataset_mistral_instruct1.jsonl'

with open (path_to_file, 'w') as f:
    for index, row in df.iterrows():
        prompt = f"<s>[INST]### Instruction : You are given a sequence of text. Read the whole text and summarize its content. Then generate a logical question, correct answer and three incorrect answers suitable for mcq from the summary.The output should be only a json formatted in the below format." + "{\"type\": \"object\", \"properties\": {\"Output\": {\"type\": \"object\", \"properties\": {\"question\": {\"type\": \"string\"}, \"correct_answer\": {\"type\": \"string\"}, \"incorrect_answers\": {\"type\": \"array\", \"items\": {\"type\": \"string\"}}}, \"required\": [\"question\", \"correct_answer\", \"incorrect_answers\"]}}, \"required\": [\"Output\"]}" + f" ### Input : {row['Input']}[/INST]"
        completion = f"### Output : {row['Output']} ### End</s>"
        json_object = {
            "prompt": prompt,
            "completion": completion
        }
        f.write(json.dumps(json_object)+"\n")
        