## Load Packages


In [None]:
!pip install openai
!pip install langchain
!pip install PyPDF2
!pip install openai chromadb
!pip install tiktoken
!pip install python-pptx
!pip install pathlib
!pip install transformers

In [None]:
import csv
import json
import openai
import signal
import datetime
import time
import os
import pandas as pd
import subprocess
import langchain
import PyPDF2
from pptx import Presentation
from transformers import AutoTokenizer

## Prepare Data

In [None]:
# Move the kernel into the sibling 'TuningGPT' folder; the relative paths used
# by the later cells (e.g. '../Material') are resolved from there.

current_directory = os.getcwd()
target_directory = os.path.join(current_directory, '../TuningGPT')
os.chdir(target_directory)

In [None]:
# Convert everything from a folder to .txt

def pdf_to_txt(pdf_file_path, txt_file_path):
    """Extract the text of every page of a PDF and save it as a UTF-8 text file.

    Args:
        pdf_file_path: Path of the PDF file to read.
        txt_file_path: Path of the text file to write (overwritten if present).

    Returns:
        None. Success or failure is reported with ``print``; any exception is
        caught and printed instead of being raised, so one unreadable file does
        not abort a batch conversion.
    """
    try:
        with open(pdf_file_path, 'rb') as pdf_file:
            pdf_reader = PyPDF2.PdfReader(pdf_file)
            # Extract everything before touching the output file, so a PDF that
            # fails part-way through does not leave a truncated .txt behind.
            # `or ''` keeps a page that yields no text from breaking the join.
            page_texts = [page.extract_text() or '' for page in pdf_reader.pages]

        with open(txt_file_path, 'w', encoding='utf-8') as txt_file:
            # Separate pages with a newline so the last word of one page is not
            # glued to the first word of the next page.
            txt_file.write('\n'.join(page_texts))

        print(f"Successfully converted '{pdf_file_path}' to '{txt_file_path}'.")
    except Exception as e:
        print(f"Error occurred while converting '{pdf_file_path}': {e}")

def pptx_to_txt(pptx_file_path, txt_file_path):
    """Write the text of a PowerPoint deck's shapes to a UTF-8 text file.

    Every shape on every slide that exposes a ``text`` attribute contributes
    one entry; entries are joined with newlines in slide/shape order.

    Args:
        pptx_file_path: Path of the .pptx file to read.
        txt_file_path: Path of the text file to write (overwritten if present).

    Returns:
        None. The outcome is reported with ``print``; exceptions are caught and
        printed rather than raised.
    """
    try:
        deck = Presentation(pptx_file_path)
        # Shapes without a ``text`` attribute are skipped.
        shape_texts = [
            shape.text
            for slide in deck.slides
            for shape in slide.shapes
            if hasattr(shape, "text")
        ]

        with open(txt_file_path, 'w', encoding='utf-8') as out_file:
            out_file.write('\n'.join(shape_texts))

        print(f"Successfully converted '{pptx_file_path}' to '{txt_file_path}'.")
    except Exception as err:
        print(f"Error occurred while converting '{pptx_file_path}': {err}")

def convert_non_txt_to_txt(folder_path):
    """Convert every PDF and PPTX directly inside a folder to a .txt file.

    Each supported file ``name.ext`` is converted to ``name.txt`` in the same
    folder. Existing .txt files are skipped and any other entry is reported as
    unsupported. Extensions are matched case-insensitively, so ``.PDF`` and
    ``.Pptx`` are handled like ``.pdf`` and ``.pptx``.

    Note: two inputs that share a stem (``a.pdf`` and ``a.pptx``) write to the
    same ``a.txt``; entries are processed in sorted order, so the later one wins.

    Args:
        folder_path: Directory to scan (not recursive).

    Raises:
        OSError: If ``folder_path`` cannot be listed.
    """
    # sorted(): os.listdir returns entries in arbitrary order.
    for filename in sorted(os.listdir(folder_path)):
        lowered_name = filename.lower()
        if lowered_name.endswith(".txt"):
            continue  # Skip txt files

        old_file_path = os.path.join(folder_path, filename)
        new_file_path = os.path.join(folder_path, os.path.splitext(filename)[0] + ".txt")

        if lowered_name.endswith(".pdf"):
            pdf_to_txt(old_file_path, new_file_path)
        elif lowered_name.endswith(".pptx"):
            pptx_to_txt(old_file_path, new_file_path)
        else:
            print(f"Unsupported file format: '{filename}'")

# Convert the PDF / PPTX files found in ../Material to .txt files next to them.
# The path is relative to the working directory selected in the earlier cell.
folder_path = "../Material"
convert_non_txt_to_txt(folder_path)


In [None]:
# Remove empty lines from all .txt files
# NOTE: this cell is currently disabled -- everything below is wrapped in a
# triple-quoted string literal, so none of it executes when the cell is run.
# NOTE(review): the string uses "/../Material" (with a leading slash) whereas the
# other cells use '../Material' -- check the path before re-enabling this code.
"""
from pathlib import Path

def remove_empty_lines(file_path):
    with open(file_path, 'r') as file:
        lines = file.readlines()

    # Filter out empty lines
    non_empty_lines = [line.strip() for line in lines if line.strip()]

    with open(file_path, 'w') as file:
        file.write('\n'.join(non_empty_lines))

def remove_empty_lines_from_files(directory_path):
    path = Path(directory_path)
    txt_files = path.glob("*.txt")

    for file in txt_files:
        remove_empty_lines(file)

if __name__ == "__main__":
    directory_path = "/../Material"
    remove_empty_lines_from_files(directory_path)
"""

In [None]:
# Merge every .txt file in ../Material into a single file, ../Material/merge.txt.
directory_path = '../Material'
merged_file_path = '../Material/merge.txt'

# The merged file is written into the very folder being scanned, so it has to
# be excluded; otherwise re-running this cell would fold the previous merge.txt
# back into the new one and duplicate the content on every run.
merged_file_name = os.path.basename(merged_file_path)

file_contents = []

# sorted(): os.listdir returns entries in arbitrary order; sorting keeps the
# merged file identical from run to run.
for filename in sorted(os.listdir(directory_path)):
    if filename.endswith('.txt') and filename != merged_file_name:
        file_path = os.path.join(directory_path, filename)
        # The conversion functions in this notebook write UTF-8, so read it back
        # explicitly instead of relying on the platform's default encoding.
        with open(file_path, 'r', encoding='utf-8') as file:
            file_contents.append(file.read())

merged_content = '\n'.join(file_contents)

with open(merged_file_path, 'w', encoding='utf-8') as merged_file:
    merged_file.write(merged_content)

## Fine tuning Conditional Generation


In [None]:
import os
import constants

# The API key is read from a local `constants.py` module, which must contain the
# line: APIKEY = 'Your_API_Key'
# It is exported as the OPENAI_API_KEY environment variable for the cells below.
# Recommendation: keep constants.py out of version control so the key is not shared.
os.environ["OPENAI_API_KEY"] = constants.APIKEY

In [None]:
from langchain.document_loaders import TextLoader

# Load the merged text file written by the merge cell above.
loader = TextLoader('../Material/merge.txt')
# NOTE(review): `data` is not referenced by any later cell shown in this notebook;
# the index below is built from `loader` directly.
data = loader.load()

In [None]:
from langchain.indexes import VectorstoreIndexCreator
# Build a queryable index from the merged-text loader defined in the previous cell.
# NOTE(review): presumably this calls the OpenAI API using the key set above -- confirm.
index = VectorstoreIndexCreator().from_loaders([loader])

In [None]:
from langchain.chat_models import ChatOpenAI
# Smoke test: ask a single question against the index, answered with a
# default-configured ChatOpenAI model.
print(index.query('who is the instructor',llm = ChatOpenAI()))

The instructor of the course is Ryan Baker.


In [None]:
def process_dataframe(source, file_path):
    """Answer every prompt in a CSV file by querying an index built from ``source``.

    Args:
        source: Path of the text file to load and index.
        file_path: Path of a CSV file that has a "prompt" column.

    Returns:
        list: One entry per CSV row, in row order, each being whatever
        ``index.query`` returned for that row's prompt. Empty if the CSV has
        no rows.

    Raises:
        KeyError: If the CSV has no "prompt" column.
    """
    df = pd.read_csv(file_path)
    prompts = df["prompt"].tolist()
    if not prompts:
        return []  # nothing to ask, so do not build an index at all

    # The loader, index and chat model do not depend on the row, so they are
    # created once instead of being rebuilt for every prompt.
    loader = TextLoader(source)
    vector_index = VectorstoreIndexCreator().from_loaders([loader])
    llm = ChatOpenAI()

    return [vector_index.query(prompt, llm=llm) for prompt in prompts]

# Answer every prompt in the test CSV.
# Note: the source indexed here is syllabus.txt, not the merged merge.txt that
# the earlier cells load and query.
source = "../Material/syllabus.txt"
file_path = "../Test_NaturalQuestion.csv"
responses = process_dataframe(source, file_path)
# Bare last expression: displays the list of responses as the cell output.
responses

In [None]:
# Re-read the prompt file so the responses can be stored next to their prompts.
df = pd.read_csv(file_path)

# Make sure the number of items in the list matches the number of rows in the DataFrame
if len(responses) != len(df):
    print("Number of responses doesn't match the number of rows in the file.")
else:
    # Fill in the "response" column in one assignment. Writing cell by cell
    # with df.at can put strings into a column that read_csv parsed as numeric
    # (e.g. an empty column), which newer pandas versions warn about.
    df["response"] = responses

    # Save the updated DataFrame back to the CSV file
    df.to_csv(file_path, index=False)

    print("Responses successfully filled in the file.")

Responses successfully filled in the file.
