### Install packages

In [2]:
%pip install -q pandas pypandoc docx2txt openai

Note: you may need to restart the kernel to use updated packages.


## Extract all submissions to tmp folder


In [3]:
# Import the zipfile module
from zipfile import ZipFile
# Create a zip file object using ZipFile class
with ZipFile("data/submission.zip", "r") as zip_obj:
    # Extract all the files into a directory
    zip_obj.extractall("tmp/") 

In [4]:
# Import the os module
import os
import pandas as pd

# Define the path to list
temp_path = "tmp/submission/"

def is_folder_contains_file(folder_path, extension): 
    # Get a list of all files and directories in the path 
    names = os.listdir(folder_path) 
    for name in names: 
        if name.endswith(extension): 
            return True 
    return False   
    
# Get a list of all files and directories in the path
def get_submissions_df(path):
    assignment_folders = []
    names = os.listdir(path)
    # Loop through the list
    for name in names:
        # Join the path and the name
        full_path = os.path.join(path, name)
        # Check if it is a directory
        if os.path.isdir(full_path):
            # Print the directory name
            assignment_folders.append({
                "Student": name.split("_")[0],
                "Path": full_path,
                "ContainsDocxFile": is_folder_contains_file(full_path, ".docx"),                
                "ContainsPdfFile": is_folder_contains_file(full_path, ".pdf"),
                "ContainsZipFile": is_folder_contains_file(full_path, ".zip")
                })
    df = pd.DataFrame([p for p in assignment_folders])
    return df
df = get_submissions_df(temp_path)

In [5]:
df

Unnamed: 0,Student,Path,ContainsDocxFile,ContainsPdfFile,ContainsZipFile
0,NG Man Hei,tmp/submission/NG Man Hei_2461662_assignsubmis...,False,True,False
1,LEUNG Chun Wa,tmp/submission/LEUNG Chun Wa_2461652_assignsub...,True,False,False
2,CHEUNG Chi Him,tmp/submission/CHEUNG Chi Him_2461641_assignsu...,True,False,False
3,NG Man Chun,tmp/submission/NG Man Chun_2461585_assignsubmi...,True,False,False
4,CHAN Hiu Po,tmp/submission/CHAN Hiu Po_2461655_assignsubmi...,True,False,False
...,...,...,...,...,...
67,HUI Man Chun,tmp/submission/HUI Man Chun_2461600_assignsubm...,True,False,False
68,CHEUNG Ka Kwai,tmp/submission/CHEUNG Ka Kwai_2461642_assignsu...,True,False,False
69,LAU Yuen Ting,tmp/submission/LAU Yuen Ting_2461612_assignsub...,True,False,False
70,LI Hoi Yung,tmp/submission/LI Hoi Yung_2461659_assignsubmi...,True,False,False


In [6]:
def filter_df_by_not_contains_any_expected_files(df):
    return df[(df["ContainsDocxFile"] == False) & (df["ContainsPdfFile"] == False) & (df["ContainsZipFile"] == False)]
filter_df_by_not_contains_any_expected_files(df)

Unnamed: 0,Student,Path,ContainsDocxFile,ContainsPdfFile,ContainsZipFile


In [7]:
import os
import shutil


def flatten(directory):
    for dirpath, _, filenames in os.walk(directory, topdown=False):
        for filename in filenames:
            i = 0
            source = os.path.join(dirpath, filename)
            target = os.path.join(directory, filename)

            while os.path.exists(target):
                i += 1
                file_parts = os.path.splitext(os.path.basename(filename))

                target = os.path.join(
                    directory,
                    file_parts[0] + "_" + str(i) + file_parts[1],
                )

            shutil.move(source, target)

            print("Moved ", source, " to ", target)

        if dirpath != directory:
            os.rmdir(dirpath)
            print("Deleted ", dirpath)

def get_first_file_path(path, ext):
    names = os.listdir(path)
    for name in names:
        if name.endswith(ext):
            return os.path.join(path, name)

def extract_zip_file_in_place(path):
    zip_path = get_first_file_path(path, ".zip")
    print(zip_path)
    import zipfile
    # Create a zip file object using ZipFile class
    with zipfile.ZipFile(zip_path, "r") as zip_obj:
        # Extract all the files into a directory
        zip_obj.extractall(path)
    flatten(path) 


def filter_df_by_contains_zip_file(df):
    return df[(df["ContainsZipFile"] == True)]

paths = filter_df_by_contains_zip_file(df)["Path"].values
for path in paths:
    extract_zip_file_in_place(path)

tmp/submission/KAN Nin Chun_2461630_assignsubmission_file_/EA3_KanNinChun_210226456.zip
Moved  tmp/submission/KAN Nin Chun_2461630_assignsubmission_file_/EA3_210226456_Writing_1.docx  to  tmp/submission/KAN Nin Chun_2461630_assignsubmission_file_/EA3_210226456_Writing_1_1.docx
Moved  tmp/submission/KAN Nin Chun_2461630_assignsubmission_file_/EA3_KanNinChun_210226456.zip  to  tmp/submission/KAN Nin Chun_2461630_assignsubmission_file_/EA3_KanNinChun_210226456_2.zip
Moved  tmp/submission/KAN Nin Chun_2461630_assignsubmission_file_/EA3_210226456_VideoLink_1.docx  to  tmp/submission/KAN Nin Chun_2461630_assignsubmission_file_/EA3_210226456_VideoLink_1_1.docx
Moved  tmp/submission/KAN Nin Chun_2461630_assignsubmission_file_/EA3_210226456_Writing.docx  to  tmp/submission/KAN Nin Chun_2461630_assignsubmission_file_/EA3_210226456_Writing_1.docx
Moved  tmp/submission/KAN Nin Chun_2461630_assignsubmission_file_/EA3_KanNinChun_210226456_1.zip  to  tmp/submission/KAN Nin Chun_2461630_assignsubmissi

In [8]:
df = get_submissions_df(temp_path)
filter_df_by_contains_zip_file(df)

Unnamed: 0,Student,Path,ContainsDocxFile,ContainsPdfFile,ContainsZipFile
17,KAN Nin Chun,tmp/submission/KAN Nin Chun_2461630_assignsubm...,True,False,True
27,MOU Rong,tmp/submission/MOU Rong_2461616_assignsubmissi...,True,False,True
28,SHIU Chung Hei,tmp/submission/SHIU Chung Hei_2461591_assignsu...,True,False,True
41,YU Nga Man,tmp/submission/YU Nga Man_2461593_assignsubmis...,True,False,True
55,LEUNG Yu Ho,tmp/submission/LEUNG Yu Ho_2461615_assignsubmi...,True,False,True
59,LEE Kam Chun Jerry,tmp/submission/LEE Kam Chun Jerry_2461658_assi...,True,False,True
66,LI Hoi Ming,tmp/submission/LI Hoi Ming_2461640_assignsubmi...,True,False,True


In [9]:
def filter_df_by_contains_docx_or_pdf(df):
    return df[(df["ContainsDocxFile"] == True) | (df["ContainsPdfFile"] == True)]
len(filter_df_by_contains_docx_or_pdf(df))

72

## Processing Docx

In [24]:
def filter_df_by_contains_docx(df):
    return df[(df["ContainsDocxFile"] == True)]
words_df = filter_df_by_contains_docx(df)
paths = words_df["Path"].values

def get_all_docx_or_doc_files(path):
    import glob
    return glob.glob(path + "/*.docx")

import docx2txt
from functools import reduce

students_words_files = list(map(get_all_docx_or_doc_files, paths)) # List of lists of word files

file_contents =[];
for word_files in students_words_files:  
    file_contents.append(reduce(lambda x, y: x + y, map(lambda f: docx2txt.process(f), word_files), "\n\n"))
# reduce(map(lambda f: docx2txt.process(f), word_files), lambda x, y: x + y, "")
words_df.loc[:, "Sources"] = students_words_files
words_df.loc[:, "Answers"] = file_contents


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  words_df.loc[:, "Sources"] = students_words_files
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  words_df.loc[:, "Answers"] = file_contents


In [23]:
words_df


Unnamed: 0,Student,Path,ContainsDocxFile,ContainsPdfFile,ContainsZipFile,Words,Answers
1,LEUNG Chun Wa,tmp/submission/LEUNG Chun Wa_2461652_assignsub...,True,False,False,[tmp/submission/LEUNG Chun Wa_2461652_assignsu...,\n\nITP4121 Workplace Reflective Journal\n\nLe...
2,CHEUNG Chi Him,tmp/submission/CHEUNG Chi Him_2461641_assignsu...,True,False,False,[tmp/submission/CHEUNG Chi Him_2461641_assigns...,\n\nJob Responsibility and Roles\n\nWorkplace ...
3,NG Man Chun,tmp/submission/NG Man Chun_2461585_assignsubmi...,True,False,False,[tmp/submission/NG Man Chun_2461585_assignsubm...,\n\nITP4121 Cloud and Data Centre Workplace Pr...
4,CHAN Hiu Po,tmp/submission/CHAN Hiu Po_2461655_assignsubmi...,True,False,False,[tmp/submission/CHAN Hiu Po_2461655_assignsubm...,\n\nChan HiuPo 210151024\n\n\n\nTable of conte...
5,IP Siu Fung Ernest,tmp/submission/IP Siu Fung Ernest_2461586_assi...,True,False,False,[tmp/submission/IP Siu Fung Ernest_2461586_ass...,\n\nIP Siu Fung Ernest (210123688)\n\nIT114115...
...,...,...,...,...,...,...,...
67,HUI Man Chun,tmp/submission/HUI Man Chun_2461600_assignsubm...,True,False,False,[tmp/submission/HUI Man Chun_2461600_assignsub...,\n\nITP4121 Cloud and Data Centre Workplace Pr...
68,CHEUNG Ka Kwai,tmp/submission/CHEUNG Ka Kwai_2461642_assignsu...,True,False,False,[tmp/submission/CHEUNG Ka Kwai_2461642_assigns...,\n\nWorkplace Reflective Journal: Smartone NOC...
69,LAU Yuen Ting,tmp/submission/LAU Yuen Ting_2461612_assignsub...,True,False,False,[tmp/submission/LAU Yuen Ting_2461612_assignsu...,\n\nJob Responsibilities and Roles\n\nAs an IT...
70,LI Hoi Yung,tmp/submission/LI Hoi Yung_2461659_assignsubmi...,True,False,False,[tmp/submission/LI Hoi Yung_2461659_assignsubm...,\n\nITP4121 Cloud and Data Centre Workplace Pr...
