## Generate Text versions of Submission files

You will need to run this notebook cell to convert the submissions into a format usable by the GraderGPT_SP and GraderGPT_MP scripts.

The `rootSubmissionFolder` should contain the unzipped submission folders that are exported from Canvas.

The `textSubmissionsFolder` should normally point to `data/Converted Text Submissions`; the converted files are written there in the format and directory structure that the scripts expect.

In [None]:
import os
import pypandoc
from pdf2docx  import parse

# Convert every exported submission (DOCX or PDF) into a plain-text file at
# <textSubmissionsFolder>/<assignmentID>/<userID>.txt.
# Files that already have a converted counterpart are left untouched, so the
# cell can be re-run safely after adding new submissions.

# Input: one sub-folder per assignment, as unzipped from the Canvas export.
rootSubmissionFolder = os.path.join('data', 'Submissions')
# Output: one sub-folder per assignment ID holding the converted .txt files.
textSubmissionsFolder = os.path.join('data', 'Converted Text Submissions')
# Scratch folder for the intermediate DOCX produced while converting PDFs.
tempFolder = 'temp'
os.makedirs(tempFolder, exist_ok=True)

supportedFormats = ('docx', 'pdf')

for assignmentFolder in os.listdir(rootSubmissionFolder):
    if '.DS_Store' in assignmentFolder:
        print('Skipping Mac DS_Store folder.')
        continue

    assignmentFolderPath = os.path.join(rootSubmissionFolder, assignmentFolder)
    # Any other stray file in the root folder cannot be listed as a directory.
    if not os.path.isdir(assignmentFolderPath):
        continue

    # The assignment ID is the last underscore-separated token of the folder name.
    assignmentID = assignmentFolder.split('_')[-1]
    # makedirs also creates the parent output folder on a first run.
    os.makedirs(os.path.join(textSubmissionsFolder, assignmentID), exist_ok=True)

    for submissionFile in os.listdir(assignmentFolderPath):
        submissionFilePath = os.path.join(assignmentFolderPath, submissionFile)
        fileFormat = submissionFile.split('.')[-1].lower()

        # Without this guard an unsupported file would fall through both
        # conversion branches and reuse `output` from the previous submission,
        # saving another student's text under this student's ID.
        if fileFormat not in supportedFormats:
            print(f'Unsupported file format, skipping: {submissionFilePath}')
            continue

        # The user ID is the 2nd token, or the 3rd when a 'LATE' marker token
        # sits in 2nd position. Matching the token itself (rather than the
        # substring anywhere in the name) avoids misreading file names that
        # merely contain "LATE", e.g. "..._TEMPLATE.docx".
        nameParts = submissionFile.split('_')
        userIDIndex = 2 if len(nameParts) > 1 and nameParts[1] == 'LATE' else 1
        if len(nameParts) <= userIDIndex:
            print(f'Cannot determine user ID from file name, skipping: {submissionFilePath}')
            continue
        userID = nameParts[userIDIndex]

        savedFileName = userID+'.txt'
        savedFilePath = os.path.join(textSubmissionsFolder, assignmentID, savedFileName)
        if os.path.exists(savedFilePath):
            continue
        print(submissionFilePath)

        # Best-effort conversion: one bad file must not abort the whole run.
        try:
            if fileFormat=='docx':
                output = pypandoc.convert_file(submissionFilePath, 'plain')
            else:
                # PDFs go through an intermediate DOCX that pandoc can read.
                tempFilePath = os.path.join(tempFolder, 'tempFile.docx')
                # Remove the previous intermediate file so a PDF that produces
                # no DOCX cannot silently pick up the prior student's content.
                if os.path.exists(tempFilePath):
                    os.remove(tempFilePath)
                parse(submissionFilePath, tempFilePath)
                output = pypandoc.convert_file(tempFilePath, 'plain')

            # Heuristic: very short outputs usually mean the text extraction
            # failed (e.g. a scanned PDF), so they are not saved.
            if len(output.split('\n')) < 16:
                print('File seems to have no text content in it. Skipping file.')
            else:
                with open(savedFilePath, 'w') as textFile:
                    textFile.write(output)
        except Exception as e:
            print(f'Error in conversion: {e}')
            print('Skipping file.')
            continue

#### Check length of converted submissions to see if any could have been converted incorrectly.

In [None]:
# List converted submissions that are suspiciously short (fewer than 16 lines),
# which may indicate that the conversion did not extract the text correctly.
textSubmissionsFolder = os.path.join('data', 'Converted Text Submissions')
tempFolder = 'temp'
for assignmentID in os.listdir(textSubmissionsFolder):
    assignmentFolderPath = os.path.join(textSubmissionsFolder, assignmentID)
    # Stray files in the output folder (e.g. macOS .DS_Store) are not
    # assignment folders; listing them would raise NotADirectoryError.
    if not os.path.isdir(assignmentFolderPath):
        continue

    for submissionFile in os.listdir(assignmentFolderPath):
        submissionFilePath = os.path.join(assignmentFolderPath, submissionFile)
        with open(submissionFilePath) as textFile:
            submission = textFile.readlines()

        # Print the path so the file can be inspected manually.
        if len(submission) < 16:
            print(submissionFilePath)

## Token counting and checking

In [None]:
from helper import *
import tiktoken

encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")


def num_tokens_from_string(string: str, encoding_name='cl100k_base') -> int:
    """Count the tokens that `string` encodes to.

    The tokenizer is looked up by `encoding_name` on every call; the
    module-level `encoding` object is not used here.
    """
    tokenizer = tiktoken.get_encoding(encoding_name)
    return len(tokenizer.encode(string))

# Build each full prompt and display the rows whose prompt is longer than
# 8192 tokens.
config = Config()
config.setFromEnv()

# Option to use custom variables here.
# versionControl = 'V1'
# promptVersion = 'P1'
# courseShorthand = 'MOVESCI'

# customConfigParams = {
#                 'Save Name':f'{courseShorthand}-{versionControl}-{promptVersion}', 
#                 'Overwrite Saves': False, 
#                 'Use Custom Desc.': True
#                 }
# config.setSaveDetails(customConfigParams)

config.setSaveDetails()

gradeRubricAssignmentDF = getGRAData(config)

for _, row in gradeRubricAssignmentDF.iterrows():
    fullPrompt = processTokenCount(row, config)
    # Rows for which no prompt text is returned are skipped.
    if not fullPrompt:
        continue
    if num_tokens_from_string(fullPrompt) > 8192:
        display(row)


## View & Save Full Prompt to File

In [None]:
from helper import *

# Print the first non-empty filled prompt and save it as an example file
# under the prompt-files output folder.
config = Config()
config.setFromEnv()

# Option to use custom variables here.
versionControl = 'V4'
promptVersion = 'P2'
courseShorthand = 'MOVESCI'

customConfigParams = {
    'Save Name': f'{courseShorthand}-{versionControl}-{promptVersion}',
    'Overwrite Saves': False,
    'Use Custom Desc.': True,
}
config.setSaveDetails(customConfigParams)

# NOTE(review): this second, argument-less call runs right after the custom
# one above; it may reset the custom save details - confirm against helper.
config.setSaveDetails()

config.saveTemplatePrompt()

gradeRubricAssignmentDF = getGRAData(config)

for _, row in gradeRubricAssignmentDF.iterrows():
    fullPrompt = processTokenCount(row, config)
    if not fullPrompt:
        continue

    print(fullPrompt)
    examplePromptPath = os.path.join(
        config.baseOutputFolder,
        config.outputFolders['PROMPT_FILES'],
        config.fullName,
        f'{config.fullName}_exampleFilledPrompt.txt',
    )
    with open(examplePromptPath, 'w') as textFile:
        textFile.write(fullPrompt)
    # Only one example is needed, so stop after the first usable prompt.
    break

## Build custom file for manual criterion descriptions

In [None]:
from helper import *

# Flatten every rubric into one row per criterion and save the result as a
# CSV template with an empty 'custom_description' column to fill in by hand.
config = Config()
config.setFromEnv()

gradeRubricAssignmentDF = getGRAData(config)

# One row per (assignment, rubric) pair, ordered by assignment.
rubricColumns = ['assignment_id', 'rubric_id', 'assignment_title', 'data_rubric']
rubricData = gradeRubricAssignmentDF[rubricColumns]
rubricData = rubricData.drop_duplicates(subset=['assignment_id', 'rubric_id'])
rubricData = rubricData.sort_values('assignment_id').reset_index(drop=True)

identifyingFields = ['assignment_id', 'rubric_id', 'assignment_title']
criterionFields = ['id', 'points', 'ratings', 'description', 'long_description']

criterionList = []
for _, rubricRow in rubricData.iterrows():
    rubricDict = {field: rubricRow[field] for field in identifyingFields}
    # Each criterion row repeats the rubric's identifying fields, followed by
    # the criterion's own fields.
    criterionList.extend(
        {**rubricDict, **{field: criterion[field] for field in criterionFields}}
        for criterion in rubricRow['data_rubric']
    )

customRubricTemplateDF = pd.DataFrame(criterionList)
customRubricTemplateDF['custom_description'] = None

filePath = os.path.join(
    config.baseDataFolder,
    config.dataFolders['CSV_DATA'],
    f'{config.courseName}criterion.csv',
)

# An existing file may already hold manual edits, so it is never overwritten.
if not os.path.exists(filePath):
    customRubricTemplateDF.to_csv(filePath, index=False)
    print(f'Saving criterion CSV to {filePath}')
else:
    print('File already exists. Not overwriting. Change file path to save elsewhere.')

## Copy Prompts to prompts folder
This copies the prompts from the nested output directories into a root 'prompts' folder for easier access.

In [None]:
import shutil
from helper import *

# Mirror the prompt-files output folder into a top-level 'prompts' folder,
# replacing whatever that folder contained before.
config = Config()

sourceFolder = os.path.join(config.baseOutputFolder, config.outputFolders['PROMPT_FILES'])
destinationFolder = 'prompts' # config.promptFolder

# Check the source BEFORE deleting the destination: otherwise a missing source
# would wipe the existing 'prompts' folder and then fail, leaving nothing.
if not os.path.isdir(sourceFolder):
    raise FileNotFoundError(f'Prompt source folder not found: {sourceFolder}')

# copytree requires that the destination does not exist yet.
if os.path.exists(destinationFolder):
    shutil.rmtree(destinationFolder)
shutil.copytree(sourceFolder, destinationFolder)