### Install packages

In [70]:
%pip install -q pypandoc docx2txt PyPDF2 openpyxl python-dotenv openai num2words matplotlib plotly scipy scikit-learn pandas tiktoken ipywidgets
%load_ext dotenv
%dotenv

Note: you may need to restart the kernel to use updated packages.
The dotenv extension is already loaded. To reload it, use:
  %reload_ext dotenv


Extract all submissions to a tmp folder


In [25]:
# Import the zipfile module
from zipfile import ZipFile
# Create a zip file object using ZipFile class
with ZipFile("data/submission.zip", "r") as zip_obj:
    # Extract all the files into a directory
    zip_obj.extractall("tmp/") 

In [26]:
# Import the os module
import os
import pandas as pd

# Define the path to list
temp_path = "tmp/submission/"

def is_folder_contains_file(folder_path, extension): 
    # Get a list of all files and directories in the path 
    names = os.listdir(folder_path) 
    for name in names: 
        if name.endswith(extension): 
            return True 
    return False   
    
# Get a list of all files and directories in the path
def get_submissions_df(path):
    assignment_folders = []
    names = os.listdir(path)
    # Loop through the list
    for name in names:
        # Join the path and the name
        full_path = os.path.join(path, name)
        # Check if it is a directory
        if os.path.isdir(full_path):
            # Print the directory name
            assignment_folders.append({
                "Student": name.split("_")[0],
                "Path": full_path,
                "ContainsDocxFile": is_folder_contains_file(full_path, ".docx"),                
                "ContainsPdfFile": is_folder_contains_file(full_path, ".pdf"),
                "ContainsZipFile": is_folder_contains_file(full_path, ".zip")
                })
    df = pd.DataFrame([p for p in assignment_folders])
    return df
df = get_submissions_df(temp_path)

In [27]:
df

Unnamed: 0,Student,Path,ContainsDocxFile,ContainsPdfFile,ContainsZipFile
0,NG Man Hei,tmp/submission/NG Man Hei_2461662_assignsubmis...,False,True,False
1,LEUNG Chun Wa,tmp/submission/LEUNG Chun Wa_2461652_assignsub...,True,False,False
2,CHEUNG Chi Him,tmp/submission/CHEUNG Chi Him_2461641_assignsu...,True,False,False
3,NG Man Chun,tmp/submission/NG Man Chun_2461585_assignsubmi...,True,False,False
4,CHAN Hiu Po,tmp/submission/CHAN Hiu Po_2461655_assignsubmi...,True,False,False
...,...,...,...,...,...
67,HUI Man Chun,tmp/submission/HUI Man Chun_2461600_assignsubm...,True,False,False
68,CHEUNG Ka Kwai,tmp/submission/CHEUNG Ka Kwai_2461642_assignsu...,True,False,False
69,LAU Yuen Ting,tmp/submission/LAU Yuen Ting_2461612_assignsub...,True,False,False
70,LI Hoi Yung,tmp/submission/LI Hoi Yung_2461659_assignsubmi...,True,False,False


### Check any invalid submission files

In [28]:
def filter_df_by_not_contains_any_expected_files(df):
    return df[(df["ContainsDocxFile"] == False) & (df["ContainsPdfFile"] == False) & (df["ContainsZipFile"] == False)]
filter_df_by_not_contains_any_expected_files(df)

Unnamed: 0,Student,Path,ContainsDocxFile,ContainsPdfFile,ContainsZipFile


Handle zip file.

In [29]:
import os
import shutil

def flatten(directory):
    for dirpath, _, filenames in os.walk(directory, topdown=False):
        for filename in filenames:
            i = 0
            source = os.path.join(dirpath, filename)
            target = os.path.join(directory, filename)

            while os.path.exists(target):
                i += 1
                file_parts = os.path.splitext(os.path.basename(filename))

                target = os.path.join(
                    directory,
                    file_parts[0] + "_" + str(i) + file_parts[1],
                )

            shutil.move(source, target)

            print("Moved ", source, " to ", target)

        if dirpath != directory:
            os.rmdir(dirpath)
            print("Deleted ", dirpath)

def get_first_file_path(path, ext):
    names = os.listdir(path)
    for name in names:
        if name.endswith(ext):
            return os.path.join(path, name)

def extract_zip_file_in_place(path):
    zip_path = get_first_file_path(path, ".zip")
    print(zip_path)
    import zipfile
    # Create a zip file object using ZipFile class
    with zipfile.ZipFile(zip_path, "r") as zip_obj:
        # Extract all the files into a directory
        zip_obj.extractall(path)
    flatten(path) 


def filter_df_by_contains_zip_file(df):
    return df[(df["ContainsZipFile"] == True)]

paths = filter_df_by_contains_zip_file(df)["Path"].values
for path in paths:
    extract_zip_file_in_place(path)

tmp/submission/KAN Nin Chun_2461630_assignsubmission_file_/EA3_KanNinChun_210226456.zip
Moved  tmp/submission/KAN Nin Chun_2461630_assignsubmission_file_/EA3_210226456_Writing_1.docx  to  tmp/submission/KAN Nin Chun_2461630_assignsubmission_file_/EA3_210226456_Writing_1_2.docx
Moved  tmp/submission/KAN Nin Chun_2461630_assignsubmission_file_/EA3_KanNinChun_210226456.zip  to  tmp/submission/KAN Nin Chun_2461630_assignsubmission_file_/EA3_KanNinChun_210226456_1.zip
Moved  tmp/submission/KAN Nin Chun_2461630_assignsubmission_file_/EA3_210226456_VideoLink_1.docx  to  tmp/submission/KAN Nin Chun_2461630_assignsubmission_file_/EA3_210226456_VideoLink_1_2.docx
Moved  tmp/submission/KAN Nin Chun_2461630_assignsubmission_file_/EA3_210226456_VideoLink_1_1.docx  to  tmp/submission/KAN Nin Chun_2461630_assignsubmission_file_/EA3_210226456_VideoLink_1_1_1.docx
Moved  tmp/submission/KAN Nin Chun_2461630_assignsubmission_file_/EA3_210226456_Writing_1_2_1.docx  to  tmp/submission/KAN Nin Chun_2461630_

In [30]:
df = get_submissions_df(temp_path)
## check all rows contains Docx or PDF file
def filter_df_by_contains_docx_or_pdf_file(df):
    return df[(df["ContainsDocxFile"] == True) | (df["ContainsPdfFile"] == True)]

filter_df_by_contains_docx_or_pdf_file(df)

Unnamed: 0,Student,Path,ContainsDocxFile,ContainsPdfFile,ContainsZipFile
0,NG Man Hei,tmp/submission/NG Man Hei_2461662_assignsubmis...,False,True,False
1,LEUNG Chun Wa,tmp/submission/LEUNG Chun Wa_2461652_assignsub...,True,False,False
2,CHEUNG Chi Him,tmp/submission/CHEUNG Chi Him_2461641_assignsu...,True,False,False
3,NG Man Chun,tmp/submission/NG Man Chun_2461585_assignsubmi...,True,False,False
4,CHAN Hiu Po,tmp/submission/CHAN Hiu Po_2461655_assignsubmi...,True,False,False
...,...,...,...,...,...
67,HUI Man Chun,tmp/submission/HUI Man Chun_2461600_assignsubm...,True,False,False
68,CHEUNG Ka Kwai,tmp/submission/CHEUNG Ka Kwai_2461642_assignsu...,True,False,False
69,LAU Yuen Ting,tmp/submission/LAU Yuen Ting_2461612_assignsub...,True,False,False
70,LI Hoi Yung,tmp/submission/LI Hoi Yung_2461659_assignsubmi...,True,False,False


## Processing Docx files

In [31]:
def filter_df_by_contains_docx(df):
    return df[(df["ContainsDocxFile"] == True)]
words_df = filter_df_by_contains_docx(df)
paths = words_df["Path"].values

def get_all_docx_files(path):
    import glob
    return glob.glob(path + "/*.docx")

import docx2txt
from functools import reduce

students_words_files = list(map(get_all_docx_files, paths)) # List of lists of word files

file_contents =[];
for word_files in students_words_files:  
    file_contents.append(reduce(lambda x, y: x + y, map(lambda f: docx2txt.process(f), word_files), "\n\n"))
# reduce(map(lambda f: docx2txt.process(f), word_files), lambda x, y: x + y, "")
words_df.loc[:, "Sources"] = students_words_files
words_df.loc[:, "Answers"] = file_contents


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  words_df.loc[:, "Sources"] = students_words_files
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  words_df.loc[:, "Answers"] = file_contents


In [32]:
def filter_df_by_contains_pdf(df):
    return df[(df["ContainsPdfFile"] == True)]
pdfs_df = filter_df_by_contains_pdf(df)
paths = pdfs_df["Path"].values

def get_add_pdf_files(path):
    import glob
    return glob.glob(path + "/*.pdf")

import PyPDF2
from functools import reduce

def convert_pdf_all_pages_to_txt(path):
    pdfFileObj = open(path, 'rb')
    reader = PyPDF2.PdfReader(pdfFileObj)
    num_pages = len(reader.pages)
    count = 0
    text = ""
    while count < num_pages:
        pageObj = reader.pages[count]
        count += 1
        text += pageObj.extract_text()
        text += "\n\n"
    return text

students_pdf_files = list(map(get_add_pdf_files, paths)) # List of lists of word files

file_contents =[];
for pdf_files in students_pdf_files:
    file_contents.append(reduce(lambda x, y: x + y, map(convert_pdf_all_pages_to_txt, pdf_files), "\n\n"))

pdfs_df.loc[:, "Sources"] = students_pdf_files
pdfs_df.loc[:, "Answers"] = file_contents
pdfs_df


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pdfs_df.loc[:, "Sources"] = students_pdf_files
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pdfs_df.loc[:, "Answers"] = file_contents


Unnamed: 0,Student,Path,ContainsDocxFile,ContainsPdfFile,ContainsZipFile,Sources,Answers
0,NG Man Hei,tmp/submission/NG Man Hei_2461662_assignsubmis...,False,True,False,[tmp/submission/NG Man Hei_2461662_assignsubmi...,"\n\nAs a system engineer, my job responsibilit..."
8,FU Kit Lun,tmp/submission/FU Kit Lun_2461636_assignsubmis...,False,True,False,[tmp/submission/FU Kit Lun_2461636_assignsubmi...,\n\nI T P 4 1 2 1\nC l o u d\na n d\nD a t a\n...
19,KWAN Wan Loi,tmp/submission/KWAN Wan Loi_2461627_assignsubm...,True,True,False,[tmp/submission/KWAN Wan Loi_2461627_assignsub...,\n\nName: Kwan Wan Loi\nStudent id:210092959\n...
20,LO Fu Hon,tmp/submission/LO Fu Hon_2461619_assignsubmiss...,True,True,False,[tmp/submission/LO Fu Hon_2461619_assignsubmis...,\n\nJob Responsibility and Roles \nI am respo...
31,NAWAZ Raheem,tmp/submission/NAWAZ Raheem_2461598_assignsubm...,False,True,False,[tmp/submission/NAWAZ Raheem_2461598_assignsub...,"\n\nAs an intern at GTI, my role as a System A..."
45,CHONG Ka Fai,tmp/submission/CHONG Ka Fai_2461660_assignsubm...,False,True,False,[tmp/submission/CHONG Ka Fai_2461660_assignsub...,\n\nI will focus on the following items for th...
49,ZENG Haoxuan,tmp/submission/ZENG Haoxuan_2461633_assignsubm...,True,True,False,[tmp/submission/ZENG Haoxuan_2461633_assignsub...,\n\nITP4121 -Assignment -EA3 ...
54,LEE Kai,tmp/submission/LEE Kai_2461644_assignsubmissio...,False,True,False,[tmp/submission/LEE Kai_2461644_assignsubmissi...,\n\n\n\n\n\n\n\n\n\n


In [33]:
# combine two dataframes into one and export to excel
df_answers = pd.concat([words_df, pdfs_df])
df_answers.to_excel("data/answers.xlsx", index=False)

In [34]:
df_answers.head(1)["Answers"].values[0]

'\n\nITP4121 Workplace Reflective Journal\n\nLeung Chun Wa   /   210163465\n\n\n\nTable of contents\n\nJob Responsibility and Roles\n\nWorkplace experience\n\nLearning in Workplace\n\nComments on Workplace and Suggestions\n\nAbout WLA\n\nEvaluation\n\nCareer Path\n\n\n\nJob Responsibility and Roles\n\nAs an IT support team intern at Master Concept, my job responsibilities include the following.\n\n\n\nMaintaining IT equipment\n\nI am responsible for setting up and configuring new equipment, like computers, monitors, and other devices in the office. Also include setting up the equipment needed for some project, for example, HKEAA DSE related works.\n\n\n\nManaging user accounts\n\nCreating and managing user accounts for projects, which involves setting up new accounts, disabling old accounts, and managing access permissions. I assisted in this area for OpenData Hackathon 2022.\n\n\n\nManaging help desk services\n\nI need to call our own outsourced help desk, ensuring that the service qu

## Using Azure OpenAI ChatGPT to grade student answers

In [49]:
import os
import json
import openai
openai.api_type = "azure"
openai.api_base = os.getenv("AZURE_OPENAI_ENDPOINT") 
openai.api_version = "2023-03-15-preview"
openai.api_key = os.getenv("AZURE_OPENAI_KEY")

# read text file and return the content
def read_text_file(path):
    with open(path, 'r') as file:
        data = file.read().replace('\n', '')
    return data



def grade_answer(student_answer, marking_scheme):
    prompt=marking_scheme.replace("<ANSWER></ANSWER>", student_answer)
    try:
        response = openai.ChatCompletion.create(
            engine="gpt-35-turbo", # engine = "deployment_name".
            messages=[
                {"role": "system", "content": "You are a teaching assistant."},
                {"role": "user", "content": prompt},      
            ]
        )
        print(response)
        print(response['choices'][0]['message']['content'])        
        tokens = response['usage']['total_tokens']
        content = json.loads(response['choices'][0]['message']['content'])
        marks = content['marks']
        comments = content['comments']
        details = content['details']
        copyFromInternet = content['copyFromInternet']
        generativeAI = content['generativeAI']
        return marks, comments, details, copyFromInternet, generativeAI, tokens, False
    except Exception as e:
        print(e)
        return 0, "Error", [], 0, 0, 0, True



def grade_answers(df_answers, marking_scheme):
    for index, row in df_answers.iterrows():      
        student = row["Student"]
        print(student)
        answer = row["Answers"]
        marks, comments, details, copyFromInternet, generativeAI, tokens, error = grade_answer(answer, marking_scheme)
        df_answers.loc[index, "Marks"] = marks
        df_answers.loc[index, "Comments"] = comments
        df_answers.loc[index, "Details"] = "\n".join(list(map(lambda x: x["task"] + ": " + str(x["marks"]) +" marks\n"+ x["comments"], details)))
        df_answers.loc[index, "CopyFromInternet"] = copyFromInternet
        df_answers.loc[index, "GenerativeAI"] = generativeAI
        df_answers.loc[index, "ChatGptTokens"] = tokens
        df_answers.loc[index, "Error"] = error
    return df_answers

marking_scheme = read_text_file("marking_scheme.txt")
# get second row answer for df_answers


# student = df_answers.iloc[[2]]["Student"].values[0]
# student_answer = df_answers.iloc[[2]]["Answers"].values[0]
# print(student_answer)
# grade_answer(student_answer, marking_scheme)


df_marked = grade_answers(df_answers.head(2), marking_scheme)
df_marked.to_excel("data/marks.xlsx", index=False)

LEUNG Chun Wa
{
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "message": {
        "content": "{ \n    \"marks\": 65, \n    \"comments\": \"You have provided a well-organized reflective journal. Your response answers most of the questions asked. However, your answers seemed a bit brief and could use more details to support your claims. You have a clear understanding of your job responsibilities and have provided relevant examples to showcase your skills. Further, you have also highlighted the skills you have gained and how they contribute to your future career goals. In addition to this, you have shared your experience working at the company and have given suggestions to improve the workplace. Overall, keep up the good work! Next time, try to elaborate more on your experiences and provide specific examples to support your claims.\", \n    \"details\":[ \n        { \n            \"task\": \"1. Job Responsibility and Roles\", \n            \"marks\": 7, \n  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_answers.loc[index, "Marks"] = marks
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_answers.loc[index, "Comments"] = comments
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_answers.loc[index, "Details"] = "\n".join(list(map(lambda x: x["task"] + ": " + str(x["marks"]) +" marks\n"+ x["commen

{
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "message": {
        "content": "{\n    \"marks\": 65,\n    \"comments\": \"Great job overall! You have provided detailed and organized answers to all the questions. Keep up the good work!\",\n    \"details\":[\n        {\"task\": \"1. Job Responsibility and Roles\", \"marks\": 8, \"comments\":\"Your job responsibilities have been explained clearly. However, try to be more positive about your job role and avoid using words like 'passive'.\"},        \n        {\"task\": \"2. Workplace experience\", \"marks\": 9, \"comments\":\"You have shared your experience in a structured and detailed manner. The example of working with the government headquarters was particularly well-explained.\"}, \n        {\"task\": \"3. Learning in Workplace\", \"marks\": 10, \"comments\":\"You have mentioned various technical as well as soft skills you learned. Great job expressing the importance of communication in the workplace.\"}

In [50]:
df_marked

Unnamed: 0,Student,Path,ContainsDocxFile,ContainsPdfFile,ContainsZipFile,Sources,Answers,Marks,Comments,Details,CopyFromInternet,GenerativeAI,ChatGptTokens,Error
1,LEUNG Chun Wa,tmp/submission/LEUNG Chun Wa_2461652_assignsub...,True,False,False,[tmp/submission/LEUNG Chun Wa_2461652_assignsu...,\n\nITP4121 Workplace Reflective Journal\n\nLe...,65.0,You have provided a well-organized reflective ...,1. Job Responsibility and Roles: 7 marks\nYou ...,0.1,0.1,2003.0,False
2,CHEUNG Chi Him,tmp/submission/CHEUNG Chi Him_2461641_assignsu...,True,False,False,[tmp/submission/CHEUNG Chi Him_2461641_assigns...,\n\nJob Responsibility and Roles\n\nWorkplace ...,0.0,Error,,0.0,0.0,0.0,True


### Embeddings and clustering

In [71]:
import openai
import os
import re
import os
import pandas as pd
import numpy as np
from openai.embeddings_utils import get_embedding
import tiktoken

API_KEY = os.getenv("AZURE_OPENAI_KEY") 
RESOURCE_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT") 

openai.api_type = "azure"
openai.api_key = API_KEY
openai.api_base = RESOURCE_ENDPOINT
openai.api_version = "2022-12-01"

url = openai.api_base + "/openai/deployments?api-version=2022-12-01" 

In [72]:
df_Answers = df_marked[['Student','Answers']]

Data cleaning by removing redundant whitespace and cleaning up

In [73]:
pd.options.mode.chained_assignment = None #https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#evaluation-order-matters

# s is input text
def normalize_text(s, sep_token = " \n "):
    s = re.sub(r'\s+',  ' ', s).strip()
    s = re.sub(r". ,","",s)
    # remove all instances of multiple spaces
    s = s.replace("..",".")
    s = s.replace(". .",".")
    s = s.replace("\n", "")
    s = s.strip()
    
    return s

df_Answers['Answers']= df_Answers["Answers"].apply(lambda x : normalize_text(x))

Remove any answers that are too long for the token limit (8192 tokens).

In [66]:
tokenizer = tiktoken.get_encoding("cl100k_base")
df_Answers['n_tokens'] = df_Answers["Answers"].apply(lambda x: len(tokenizer.encode(x)))
df_Answers = df_Answers[df_Answers.n_tokens<8192]
len(df_Answers)

2

In [67]:
df_Answers['ada_v2'] = df_Answers["Answers"].apply(lambda x : get_embedding(x, engine = 'text-embedding-ada-002')) 
# engine should be set to the deployment name you chose when you deployed the text-embedding-ada-002 (Version 2) model

In [68]:
df_Answers

Unnamed: 0,Student,Answers,n_tokens,ada_v2
1,LEUNG Chun Wa,ITP4121 Workplace Reflective Journal Leung Chu...,655,"[0.010281223803758621, -0.010676655918359756, ..."
2,CHEUNG Chi Him,Job Responsibility and Roles Workplace experie...,1237,"[-0.012838901951909065, -0.027127781882882118,..."


### Clustering based on the Embeddings

In [74]:
from sklearn.cluster import KMeans
from tqdm.notebook import tqdm
from sklearn.metrics import silhouette_score

X = df_Answers
cluster_results_km = pd.DataFrame({'K': range(6, 25), 'SIL': np.nan})
cluster_results_km.set_index('K', inplace=True)
for k in tqdm(cluster_results_km.index):
    km_model = KMeans(n_clusters = k, init ='k-means++', random_state = 42)
    y = km_model.fit_predict(X)
    cluster_results_km.loc[k, 'SIL'] = silhouette_score(X, y)
    
cluster_results_km.idxmax(), cluster_results_km.max()

ImportError: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html