In [2]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
# Several later cells show more than one object per cell and depend on this setting.
InteractiveShell.ast_node_interactivity = "all"

In [3]:
import json
import pandas as pd
import os
import re
import unicodedata
import seaborn as sns
import matplotlib.pyplot as plt


In [4]:
# Raw input: the JSON file read by load_json_data().
input_file_path = os.path.join('dataset', 'keath_dataset.json')
# Main output: filtered, de-duplicated records carrying a sequential 'assignment_id'.
output_file_path = os.path.join('dataset', 'data_with_ids.json')
# Side output for manual review: every record whose 'paper_content' occurs more than once.
# NOTE(review): the first component hard-codes a '/' separator instead of passing
# 'dataset' and 'discussion' to os.path.join separately.
output_duplicates_file = os.path.join('dataset/discussion', 'duplicates_grouped.json')

In [5]:
# --- DATA LOADING ---

def load_json_data(filepath: str) -> list:
    """Read a UTF-8 JSON file and return its content as a list of records.

    A top-level list is returned unchanged, a top-level dict is wrapped in a
    one-element list, and any other (or empty) payload yields an empty list.
    A missing file, invalid JSON or any other failure is reported on stdout
    and also yields an empty list; nothing is raised to the caller.

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        The loaded records, or [] when nothing usable could be loaded.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as handle:
            payload = json.load(handle)

        # Empty containers, null, 0, "" ... -> nothing to work with.
        if not payload:
            print("WARNING: JSON file was loaded but appears to be empty or None.")
            return []

        if isinstance(payload, list):
            print(f"Successfully loaded JSON. Found {len(payload)} records.")
            return payload

        # Anything else: only a single dict can sensibly be treated as one record.
        print(f"Loaded JSON, but it's not a list. Type: {type(payload)}. Handling appropriately.")
        if isinstance(payload, dict):
            return [payload]
        return []

    except FileNotFoundError:
        print(f"ERROR: File not found at '{filepath}'. Please check the path.")
    except json.JSONDecodeError as err:
        print(f"ERROR: Could not decode JSON from file {filepath}. The file may be corrupted. Error: {err}")
    except Exception as err:
        print(f"ERROR: An unexpected error occurred during file loading: {err}")
    return []

In [6]:
# --- DATA VALIDATION & CLEANING FUNCTIONS ---
def validate_data(df: pd.DataFrame, min_ratio: float = 0.80, min_words: int = 300) -> list:
    """Report duplicate papers and flag records whose text looks non-substantive.

    Two checks are run and summarised on stdout:

    1. Uniqueness - counts the rows whose 'paper_content' repeats an earlier row.
    2. Substantive content - a record is flagged when
       * its text declares a word count ("word count: N") and the number of
         word tokens actually present is below `min_ratio` of that figure, or
       * it declares no word count and contains fewer than `min_words` tokens.

    Args:
        df: One paper per row; the text is read from the 'paper_content' column.
        min_ratio: Minimum actual/declared word-count ratio for a record to pass.
        min_words: Minimum number of words for records that declare no count.

    Returns:
        Index labels of the flagged rows, in frame order.
    """
    # Check: Content Uniqueness
    if 'paper_content' in df.columns:
        # duplicated() treats missing values as equal to each other, so this figure
        # matches what drop_duplicates(subset=['paper_content']) removes later on.
        duplicate_count = int(df['paper_content'].duplicated().sum())
        if duplicate_count == 0:
            print("Uniqueness Check: All 'paper_content' entries are unique.")
        else:
            print(f"Uniqueness Check: Found {duplicate_count} duplicate 'paper_content' entries.")
    else:
        print("ERROR: 'paper_content' column not found for uniqueness check.")

    # Check: Substantive Content
    flagged_for_review = []
    for i, row in df.iterrows():
        content = row.get('paper_content', '')
        # Missing or non-text content (None, NaN, numbers) cannot be searched with a
        # regex; treat it as an empty paper so it is flagged instead of crashing.
        if not isinstance(content, str):
            content = ''

        # Only the first "word count" statement is used.
        # NOTE(review): a figure written with a thousands separator ("5,000") is read
        # as its leading digits only - confirm whether the data contains such cases.
        declared_match = re.search(r'word count:?\s*(\d{1,5})', content, re.IGNORECASE)
        declared_count = int(declared_match.group(1)) if declared_match else None
        actual_count = len(re.findall(r'\w+', content))

        reason = ""
        if declared_count is not None:
            # A declared count of 0 yields a ratio of 0 and is therefore always flagged.
            difference_ratio = actual_count / declared_count if declared_count > 0 else 0
            if difference_ratio < min_ratio:
                reason = f"Huge discrepancy -> Declared: {declared_count}, Actual: {actual_count}"
        elif actual_count < min_words:
            reason = f"No declared word count and content is very short -> Actual: {actual_count}"

        if reason:
            flagged_for_review.append({'index': i, 'reason': reason})

    if flagged_for_review:
        print(f"\n Substantive Content Check: Found {len(flagged_for_review)} records that may be non-substantive.")
        print("Details of Flagged Non-Substantive Records")
        for flag in flagged_for_review:
            print(f"  - Index {flag['index']}: {flag['reason']}")
    else:
        print("\n Substantive Content Check: All records appear to have substantive content.")

    # Return just the list of indices to be dropped
    return [flag['index'] for flag in flagged_for_review]

# --- DATA PREPROCESSING FUNCTIONS ---
def deduplicate_data(df: pd.DataFrame) -> pd.DataFrame:
    """Keep only the first occurrence of each distinct 'paper_content'.

    Args:
        df: Frame that is expected to carry a 'paper_content' column.

    Returns:
        A new frame without the repeated rows (original index labels kept).
        When the column is missing, a message is printed and `df` itself is
        returned unchanged.
    """
    if 'paper_content' in df.columns:
        # True for every row that repeats the content of an earlier row.
        is_repeat = df.duplicated(subset=['paper_content'], keep='first')
        unique_df = df.loc[~is_repeat].copy()
        kept = len(unique_df)
        print(f"Removed {len(df) - kept} duplicate records. {kept} unique records remain.")
        return unique_df

    print("Cannot de-duplicate: 'paper_content' column missing.")
    return df

# Give every record an ID equal to its position in the (re-indexed) frame.
def assign_sequential_ids(df: pd.DataFrame) -> pd.DataFrame:
    """Return a re-indexed copy of `df` with an 'assignment_id' column.

    The index is replaced by a clean 0..N-1 sequence and the same sequence is
    stored in 'assignment_id'. The input frame is left untouched.
    """
    print("\nAssigning Final Sequential IDs")
    # Discard the old index labels so positions and IDs coincide.
    renumbered = df.reset_index(drop=True)
    renumbered = renumbered.assign(assignment_id=renumbered.index)
    print(f"Assigned new 'assignment_id' to {len(renumbered)} records.")
    return renumbered


In [7]:
# --- SCRIPT EXECUTION ---
# Pipeline: load -> drop non-substantive records -> export duplicate groups for
# review -> de-duplicate -> assign sequential IDs -> save.
# NOTE(review): when nothing is loaded none of the frames below are created, so
# later cells that use `final_df` will fail with a NameError.

# Load the raw data
raw_data = load_json_data(input_file_path)

if raw_data:
    main_df = pd.DataFrame(raw_data)
    
    # Perform validation checks and get indices of non-substantive records
    non_substantive_indices = validate_data(main_df)
    
    # Filter out non-substantive records
    print("\nFiltering Non-Substantive Records")
    if non_substantive_indices:
        substantive_df = main_df.drop(non_substantive_indices).copy()
        print(f"Removed {len(non_substantive_indices)} records. Proceeding with {len(substantive_df)} records.")
    else:
        substantive_df = main_df.copy()
        print("No non-substantive records to remove.")

    # Identify and export duplicates BEFORE removing them
    # keep=False marks every member of a duplicate group, including the first one.
    print("\nExporting Duplicates for Analysis")
    duplicate_mask = substantive_df.duplicated(subset=['paper_content'], keep=False)
    duplicates_df = substantive_df[duplicate_mask]

    if not duplicates_df.empty:
        # Sorting by content places the members of each duplicate group next to each other.
        sorted_duplicates_df = duplicates_df.sort_values(by='paper_content').reset_index(drop=True)
        # DataFrame.to_json does not create missing parent directories, so make sure
        # the review folder exists before writing into it.
        duplicates_dir = os.path.dirname(output_duplicates_file)
        if duplicates_dir:
            os.makedirs(duplicates_dir, exist_ok=True)
        sorted_duplicates_df.to_json(output_duplicates_file, orient='records', indent=4)
        print(f"Analysis file of {len(sorted_duplicates_df)} duplicate records saved to '{output_duplicates_file}'")
    else:
        print("No duplicate records found to export.")

    # Clean the data by de-duplicating
    deduplicated_df = deduplicate_data(substantive_df)
    
    # Assign final IDs to the clean, de-duplicated data
    final_df = assign_sequential_ids(deduplicated_df)

    # Save the final clean file
    try:
        final_df.to_json(output_file_path, orient='records', indent=4)
        print(f"\n Final clean data successfully saved to: '{output_file_path}'")
    except Exception as e:
        print(f"ERROR: Could not save the clean file. Error: {e}")


Successfully loaded JSON. Found 1979 records.
Uniqueness Check: Found 210 duplicate 'paper_content' entries.

 Substantive Content Check: Found 91 records that may be non-substantive.
Details of Flagged Non-Substantive Records
  - Index 0: Huge discrepancy -> Declared: 5764, Actual: 33
  - Index 364: No declared word count and content is very short -> Actual: 251
  - Index 370: No declared word count and content is very short -> Actual: 251
  - Index 375: No declared word count and content is very short -> Actual: 251
  - Index 378: No declared word count and content is very short -> Actual: 257
  - Index 379: No declared word count and content is very short -> Actual: 0
  - Index 428: No declared word count and content is very short -> Actual: 81
  - Index 441: Huge discrepancy -> Declared: 2340, Actual: 265
  - Index 622: No declared word count and content is very short -> Actual: 253
  - Index 624: No declared word count and content is very short -> Actual: 274
  - Index 626: No dec

In [8]:
# Sanity check after the pipeline: sample rows plus column dtypes and non-null counts.
# head() is rendered even though it is not the last expression because the notebook
# sets ast_node_interactivity = "all".
final_df.head()  # Display the first few rows of the final DataFrame for verification
final_df.info()

Unnamed: 0,project_id,paper_content,assignment_id,evaluation,score,rubrics
0,464,BPP Business School \n\nCoursework Cover Sheet...,0,"[{""item"":""Task 1 - Management Practices"",""sub_...",13.0,"[{'item': 'Task 1 - Management Practices', 'sc..."
1,464,\n\nCourseworkCoverSheet\n\nModuleName\n\nMana...,1,"[{""item"":""Task 1 - Management Practices"",""sub_...",14.0,"[{'item': 'Task 1 - Management Practices', 'sc..."
2,464,BPP Business School \n\nCoursework Cover Shee...,2,"[{""item"":""Task 1 - Management Practices"",""sub_...",14.0,"[{'item': 'Task 1 - Management Practices', 'sc..."
3,464,BPP Business School \n\nCoursework Cover Sheet...,3,"[{""item"":""Task 1 - Management Practices"",""sub_...",14.0,"[{'item': 'Task 1 - Management Practices', 'sc..."
4,464,BPP Coursework Cover Sheet ...,4,"[{""item"":""Task 1 - Management Practices"",""sub_...",14.0,"[{'item': 'Task 1 - Management Practices', 'sc..."


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1693 entries, 0 to 1692
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   project_id     1693 non-null   int64  
 1   paper_content  1693 non-null   object 
 2   assignment_id  1693 non-null   int64  
 3   evaluation     1693 non-null   object 
 4   score          1691 non-null   float64
 5   rubrics        1693 non-null   object 
dtypes: float64(1), int64(2), object(3)
memory usage: 79.5+ KB


In [9]:
# Inspect the 'evaluation' value of the record labelled 0.
# The rendered output shows it is still a JSON-encoded string at this point.
final_df['evaluation'][0]

'[{"item":"Task 1 - Management Practices","sub_score":14,"score":0,"comment":"<b>Positive Aspects</b>: \\n- The submission effectively outlines <b>Qatar Airways\' mission and values</b>, demonstrating a foundational understanding of the company\'s strategic direction.\\n- The identification of <b>leadership influence</b> and <b>technological investments</b> as key components of management practices aligns well with the learning objectives.\\n\\n<b>Areas for Development</b>: \\n- The linkage between <b>Qatar Airways\' values and specific management practices</b> could be further detailed to demonstrate a deeper understanding of strategic alignment.\\n- Incorporation of <b>academic literature</b> to support the analysis was minimal. Utilizing contemporary sources could strengthen the argument and provide a more robust analysis.\\n\\n<b>Next Steps</b>: \\n- Engage with more <b>academic and industry-specific literature</b> to provide evidence for the alignment between the company\'s values

In [10]:
# Text-cleaning helper for 'paper_content' and the other free-text columns.
# It normalizes unicode, strips HTML markup and collapses whitespace.
def text_cleaner(text):
    """Return `text` as a single-line string without HTML markup.

    Steps, in order:
      1. Unicode NFKD normalization (compatibility decomposition).
      2. Un-escape the literal sequences backslash-quote and backslash-slash.
      3. Remove HTML comments and tags.
      4. Replace the literal two-character sequences backslash-n and backslash-t
         with a space.
      5. Collapse every run of whitespace into one space and trim the ends.

    Args:
        text: Value to clean. Anything that is not a `str` (None, NaN, ...)
            is treated as missing.

    Returns:
        The cleaned string, or "" for non-string input.
    """
    if not isinstance(text, str):
        return ""

    # 1. Normalize unicode characters. NFKD folds compatibility characters such as
    #    non-breaking spaces into their plain equivalents; it does not change
    #    typographic quotes such as U+2019. One pass is enough: the replacements in
    #    step 2 only drop ASCII backslashes.
    text = unicodedata.normalize('NFKD', text)

    # 2. Replace escaped quotes and slashes
    text = text.replace('\\"', '"').replace('\\/', '/')

    # 3. Remove HTML comments and tags. A tag must start with a letter (optionally
    #    preceded by '/' or '!'), so ordinary text such as "p < 0.05 and r > 0.3" is
    #    left alone; [^<>]* also lets a tag span several lines.
    text = re.sub(r'<!--.*?-->|<[/!]?[A-Za-z][^<>]*>', '', text, flags=re.DOTALL)

    # 4. Remove escaped newlines and tabs
    text = text.replace('\\n', ' ').replace('\\t', ' ')

    # 5. Replace multiple whitespace characters with a single space
    return re.sub(r'\s+', ' ', text).strip()

# Apply the function to the 'paper_content' column (overwrites the raw text in final_df).
# NOTE(review): this happens after de-duplication and after data_with_ids.json was
# written, so papers differing only in whitespace/markup were not merged and the saved
# file still holds the uncleaned text - confirm this ordering is intended.
final_df['paper_content'] = final_df['paper_content'].apply(text_cleaner)

# Show the first cleaned paper to verify the changes
final_df['paper_content'][0]

"BPP Business School Coursework Cover Sheet Please use this document as the cover sheet of for the 1st page of your assessment. Please complete the below table – the grey columns Module Name Management Essentials Student Reference Number (SRN) BP0262445 Assessment Title QATAR AIRWAYS BUSINESS REPORT Please complete the yellow sections in the below declaration: Declaration of Original Work: I hereby declare that I have read and understood BPP’s regulations on plagiarism and that this is my original work, researched, undertaken, completed and submitted in accordance with the requirements of BPP School of Business and Technology. The word count, excluding contents table, bibliography and appendices, is 5000 words. Student Reference Number: BP0262445 Date: 21-06-2023 By submitting this coursework you agree to all rules and regulations of BPP regarding assessments and awards for programmes. Please note that by submitting this assessment you are declaring that you are fit to sit this assessm

In [11]:
final_df.head()  # Preview the first rows after cleaning 'paper_content'

Unnamed: 0,project_id,paper_content,assignment_id,evaluation,score,rubrics
0,464,BPP Business School Coursework Cover Sheet Ple...,0,"[{""item"":""Task 1 - Management Practices"",""sub_...",13.0,"[{'item': 'Task 1 - Management Practices', 'sc..."
1,464,CourseworkCoverSheet ModuleName Management Ess...,1,"[{""item"":""Task 1 - Management Practices"",""sub_...",14.0,"[{'item': 'Task 1 - Management Practices', 'sc..."
2,464,BPP Business School Coursework Cover Sheet Ple...,2,"[{""item"":""Task 1 - Management Practices"",""sub_...",14.0,"[{'item': 'Task 1 - Management Practices', 'sc..."
3,464,BPP Business School Coursework Cover Sheet Ple...,3,"[{""item"":""Task 1 - Management Practices"",""sub_...",14.0,"[{'item': 'Task 1 - Management Practices', 'sc..."
4,464,BPP Coursework Cover Sheet Please use the tabl...,4,"[{""item"":""Task 1 - Management Practices"",""sub_...",14.0,"[{'item': 'Task 1 - Management Practices', 'sc..."


In [12]:
# --- Create a separate rubrics table ---
# Every distinct project_id receives one rubric_id; the rubric stored for it is the
# one found on the first row of that project.
def assign_rubric_ids(df: pd.DataFrame):
    """Add a 'rubric_id' column to `df` and build the matching rubrics table.

    IDs are handed out per `project_id`, in order of first appearance, starting
    at 0. The rubric content itself is not compared: only the 'rubrics' value of
    the first row seen for a project is stored.

    Args:
        df: Frame with 'project_id' and 'rubrics' columns. It is modified in
            place (a 'rubric_id' column is added) and also returned.

    Returns:
        (df, rubrics_df) where rubrics_df has one row per project with the
        columns 'rubric_id', 'source_project_id' and 'rubric_content'.
    """
    id_by_project = {}   # project_id -> rubric_id
    rubric_rows = []     # one record per rubric_id; list position == rubric_id
    ids_per_row = []     # rubric_id for every row of df, in row order

    for _, record in df.iterrows():
        pid = record['project_id']

        # Known project: reuse the ID it was given earlier.
        if pid in id_by_project:
            ids_per_row.append(id_by_project[pid])
            continue

        # First row of a new project: allocate the next free ID and keep its rubric.
        new_id = len(rubric_rows)
        id_by_project[pid] = new_id
        rubric_rows.append({
            'rubric_id': new_id,
            'source_project_id': pid,  # Keep track of where it came from
            'rubric_content': record['rubrics'],
        })
        ids_per_row.append(new_id)

    # Attach the IDs to the main frame and materialize the rubrics table.
    df['rubric_id'] = ids_per_row
    return df, pd.DataFrame(rubric_rows)

# --- Execute the function ---
# final_df is rebound to the same frame, which now carries a 'rubric_id' column.
final_df, rubrics_table = assign_rubric_ids(final_df)

# --- Display the results ---
print("Main DataFrame with 'rubric_id' column")
# The 'rubrics' column is left on final_df; its content is also stored in rubrics_table.
final_df

print("New 'rubrics_table' DataFrame")
rubrics_table

Main DataFrame with 'rubric_id' column


Unnamed: 0,project_id,paper_content,assignment_id,evaluation,score,rubrics,rubric_id
0,464,BPP Business School Coursework Cover Sheet Ple...,0,"[{""item"":""Task 1 - Management Practices"",""sub_...",13.0,"[{'item': 'Task 1 - Management Practices', 'sc...",0
1,464,CourseworkCoverSheet ModuleName Management Ess...,1,"[{""item"":""Task 1 - Management Practices"",""sub_...",14.0,"[{'item': 'Task 1 - Management Practices', 'sc...",0
2,464,BPP Business School Coursework Cover Sheet Ple...,2,"[{""item"":""Task 1 - Management Practices"",""sub_...",14.0,"[{'item': 'Task 1 - Management Practices', 'sc...",0
3,464,BPP Business School Coursework Cover Sheet Ple...,3,"[{""item"":""Task 1 - Management Practices"",""sub_...",14.0,"[{'item': 'Task 1 - Management Practices', 'sc...",0
4,464,BPP Coursework Cover Sheet Please use the tabl...,4,"[{""item"":""Task 1 - Management Practices"",""sub_...",14.0,"[{'item': 'Task 1 - Management Practices', 'sc...",0
...,...,...,...,...,...,...,...
1688,722,Critically evaluate the accuracy of eyewitness...,1688,"[{""item"":""Introduction"",""comment"":""<b>Positive...",10.0,"[{'item': 'Introduction', 'scope': 'Full', 'ru...",48
1689,722,"Q1. Drawing upon empirical studies, discuss so...",1689,"[{""item"":""Introduction"",""comment"":""<b>Positive...",1.0,"[{'item': 'Introduction', 'scope': 'Full', 'ru...",48
1690,722,"Drawing upon empirical studies, discuss some o...",1690,"[{""item"":""Introduction"",""comment"":""<b>Positive...",22.0,"[{'item': 'Introduction', 'scope': 'Full', 'ru...",48
1691,722,Experiencing Racial/Ethnic Discrimination Intr...,1691,"[{""item"":""Introduction"",""comment"":""<b>Positive...",1.0,"[{'item': 'Introduction', 'scope': 'Full', 'ru...",48


New 'rubrics_table' DataFrame


Unnamed: 0,rubric_id,source_project_id,rubric_content
0,0,464,"[{'item': 'Task 1 - Management Practices', 'sc..."
1,1,428,[{'item': 'AC1 – Is the session plan presented...
2,2,429,[{'item': 'AC1 – Is the session plan presented...
3,3,446,"[{'item': 'Task 1 - Management Practices', 'sc..."
4,4,477,[{'item': 'Generic skills: communication and p...
...,...,...,...
71,71,716,"[{'item': 'Business Summary', 'scope': 'Full',..."
72,72,718,"[{'item': 'The strategic analysis', 'scope': '..."
73,73,719,"[{'item': 'Introduction', 'scope': 'Full', 'ru..."
74,74,723,"[{'item': 'Content', 'scope': 'Full', 'rubrics..."


In [13]:
# Inspect the full nested rubric stored for rubric_id 0 (a list of task dicts).
rubrics_table.loc[0, 'rubric_content']

[{'item': 'Task 1 - Management Practices',
  'scope': 'Full',
  'rubrics': [{'score_range': '16-20',
    'rubric': "Outstanding analysis of Qatar Airways' mission, values, and management practices, brilliantly demonstrating alignment between management practices and the company's core values. Extensively supported by academic literature and independent research."},
   {'score_range': '14-15',
    'rubric': "Excellent analysis of Qatar Airways' mission, values, and management practices with coherent arguments demonstrating the alignmezzt between them. Well supported by academic literature and independent research."},
   {'score_range': '12-13',
    'rubric': "Good analysis of Qatar Airways' mission, values, and management practices but depth in demonstrating their alignment is limited. Moderately supported by academic resources."},
   {'score_range': '10-11',
    'rubric': "Adequate analysis of Qatar Airways' mission and values with basic attention given to management practices. Limited

In [14]:
# Turn the JSON-encoded 'evaluation' strings into Python objects
def parse_json_string(json_string):
    """Parse one JSON-encoded cell value, tolerating bad or missing input.

    Args:
        json_string: The cell value. Normally a JSON string; a list or dict is
            accepted as "already parsed".

    Returns:
        * the value itself when it is already a list or dict, so re-running
          the parsing cell does not wipe previously parsed data;
        * the decoded object when the string is valid JSON;
        * an empty list when the value is missing (None, NaN, ...) or is not
          valid JSON - the problem is reported on stdout.
    """
    # Already parsed: json.loads would reject it and the data would be lost.
    if isinstance(json_string, (list, dict)):
        return json_string

    try:
        return json.loads(json_string)
    except (TypeError, ValueError, RecursionError) as e:
        # TypeError: not a str/bytes value; ValueError covers json.JSONDecodeError.
        print("--- PARSING FAILED! ---")
        print(f"Error: {e}")
        # str() keeps the report working for values that cannot be sliced (None, NaN).
        print(f"Problematic String: {str(json_string)[:200]}...")  # Print first 200 chars
        print("-" * 25)
        return []

# Parse the 'evaluation' column for every row of final_df (not just a sample).
# parse_json_string reports each value it cannot parse and substitutes an empty list.
print("Parsing the 'evaluation' column for all rows...")
final_df['evaluation'] = final_df['evaluation'].apply(parse_json_string)


# Look at the result 
print("Final DataFrame")
final_df

Running the debug function on the first 5 rows...
Final DataFrame


Unnamed: 0,project_id,paper_content,assignment_id,evaluation,score,rubrics,rubric_id
0,464,BPP Business School Coursework Cover Sheet Ple...,0,"[{'item': 'Task 1 - Management Practices', 'su...",13.0,"[{'item': 'Task 1 - Management Practices', 'sc...",0
1,464,CourseworkCoverSheet ModuleName Management Ess...,1,"[{'item': 'Task 1 - Management Practices', 'su...",14.0,"[{'item': 'Task 1 - Management Practices', 'sc...",0
2,464,BPP Business School Coursework Cover Sheet Ple...,2,"[{'item': 'Task 1 - Management Practices', 'su...",14.0,"[{'item': 'Task 1 - Management Practices', 'sc...",0
3,464,BPP Business School Coursework Cover Sheet Ple...,3,"[{'item': 'Task 1 - Management Practices', 'su...",14.0,"[{'item': 'Task 1 - Management Practices', 'sc...",0
4,464,BPP Coursework Cover Sheet Please use the tabl...,4,"[{'item': 'Task 1 - Management Practices', 'su...",14.0,"[{'item': 'Task 1 - Management Practices', 'sc...",0
...,...,...,...,...,...,...,...
1688,722,Critically evaluate the accuracy of eyewitness...,1688,"[{'item': 'Introduction', 'comment': '<b>Posit...",10.0,"[{'item': 'Introduction', 'scope': 'Full', 'ru...",48
1689,722,"Q1. Drawing upon empirical studies, discuss so...",1689,"[{'item': 'Introduction', 'comment': '<b>Posit...",1.0,"[{'item': 'Introduction', 'scope': 'Full', 'ru...",48
1690,722,"Drawing upon empirical studies, discuss some o...",1690,"[{'item': 'Introduction', 'comment': '<b>Posit...",22.0,"[{'item': 'Introduction', 'scope': 'Full', 'ru...",48
1691,722,Experiencing Racial/Ethnic Discrimination Intr...,1691,"[{'item': 'Introduction', 'comment': '<b>Posit...",1.0,"[{'item': 'Introduction', 'scope': 'Full', 'ru...",48


In [15]:
# Confirm that the parsed 'evaluation' value of record 0 is now a Python list.
print(type(final_df['evaluation'][0]))

<class 'list'>


In [16]:
# Snapshot the current frame (cleaned text, 'rubric_id', parsed evaluations) to disk.
# NOTE(review): this path is hard-coded here instead of being defined with the other
# file paths in the configuration cell.
final_df.to_json('dataset/evaluation_to_json.json', orient='records', indent=4)

PARSING RUBRICS FOR AI

In [17]:
# --- Flatten the nested rubrics: one row per (rubric, task, score band) ---
# Step 1: one row per task - explode the list stored in 'rubric_content'.
df_exploded_tasks = rubrics_table.explode('rubric_content')

# Step 2: spread each task dict into columns.
df_normalized_tasks = pd.json_normalize(df_exploded_tasks['rubric_content'])

# Step 3: re-attach the IDs. explode() repeats the original index labels, so the index
# is reset to line up positionally with the fresh 0..N-1 index from json_normalize.
df_with_tasks = pd.concat([
    df_exploded_tasks[['rubric_id', 'source_project_id']].reset_index(drop=True),
    df_normalized_tasks
], axis=1)

# Step 4: one row per score band - explode each task's nested 'rubrics' list and
# spread the band dicts into columns.
df_exploded_scores = df_with_tasks.explode('rubrics')
df_normalized_scores = pd.json_normalize(df_exploded_scores['rubrics'])

# Step 5: replace the nested 'rubrics' column with the normalized band columns
# (same reset-index alignment as in step 3).
# NOTE(review): the rendered output still lists a 'rubrics' column in the result;
# presumably some band entries carry their own 'rubrics' key - verify.
structured_rubrics_df = pd.concat([
    df_exploded_scores.drop(columns=['rubrics']).reset_index(drop=True),
    df_normalized_scores
], axis=1)


#  Display the nested source table followed by the flattened result
rubrics_table
structured_rubrics_df

Unnamed: 0,rubric_id,source_project_id,rubric_content
0,0,464,"[{'item': 'Task 1 - Management Practices', 'sc..."
1,1,428,[{'item': 'AC1 – Is the session plan presented...
2,2,429,[{'item': 'AC1 – Is the session plan presented...
3,3,446,"[{'item': 'Task 1 - Management Practices', 'sc..."
4,4,477,[{'item': 'Generic skills: communication and p...
...,...,...,...
71,71,716,"[{'item': 'Business Summary', 'scope': 'Full',..."
72,72,718,"[{'item': 'The strategic analysis', 'scope': '..."
73,73,719,"[{'item': 'Introduction', 'scope': 'Full', 'ru..."
74,74,723,"[{'item': 'Content', 'scope': 'Full', 'rubrics..."


Unnamed: 0,rubric_id,source_project_id,item,scope,scopeType,id,score_range,rubric,rubrics
0,0,464,Task 1 - Management Practices,Full,,,16-20,Outstanding analysis of Qatar Airways' mission...,
1,0,464,Task 1 - Management Practices,Full,,,14-15,"Excellent analysis of Qatar Airways' mission, ...",
2,0,464,Task 1 - Management Practices,Full,,,12-13,"Good analysis of Qatar Airways' mission, value...",
3,0,464,Task 1 - Management Practices,Full,,,10-11,Adequate analysis of Qatar Airways' mission an...,
4,0,464,Task 1 - Management Practices,Full,,,8-9,Poor analysis with superficial treatment of Qa...,
...,...,...,...,...,...,...,...,...,...
1888,75,727,Explores Professional Ethics in the Light of O...,Full,,,0-9,Lacks exploration of professional ethics in re...,
1889,75,727,"Considers Strategies for Promoting Equality, J...",Full,,,14-20,Exceptional strategies with comprehensive and ...,
1890,75,727,"Considers Strategies for Promoting Equality, J...",Full,,,12-13,Good strategies with clear approaches for prom...,
1891,75,727,"Considers Strategies for Promoting Equality, J...",Full,,,10-11,Basic strategies with some approaches but limi...,


EVALUATION_TABLE

In [18]:
# Give each assignment's evaluation its own evaluation_id and move the evaluation
# content into a separate table.
def assign_evaluation_ids(df: pd.DataFrame):
    """Add an 'evaluation_id' column to `df` and build the evaluations table.

    IDs are handed out per `assignment_id`, in order of first appearance,
    starting at 0. Should an assignment_id occur more than once, the later rows
    reuse its ID and only the first row's evaluation is stored.

    Args:
        df: Frame with 'assignment_id' and 'evaluation' columns. It is modified
            in place (an 'evaluation_id' column is added) and also returned.

    Returns:
        (df, evaluations_table) where evaluations_table has one row per
        assignment with the columns 'evaluation_id', 'source_assignment_id'
        and 'evaluation_content'.
    """
    id_by_assignment = {}   # assignment_id -> evaluation_id
    evaluation_rows = []    # one record per evaluation_id; list position == evaluation_id
    ids_per_row = []        # evaluation_id for every row of df, in row order

    for _, record in df.iterrows():
        aid = record['assignment_id']

        # Assignment already registered: reuse its evaluation_id.
        if aid in id_by_assignment:
            ids_per_row.append(id_by_assignment[aid])
            continue

        # New assignment: allocate the next free ID and store its evaluation.
        new_id = len(evaluation_rows)
        id_by_assignment[aid] = new_id
        evaluation_rows.append({
            'evaluation_id': new_id,
            'source_assignment_id': aid,
            'evaluation_content': record['evaluation'],
        })
        ids_per_row.append(new_id)

    # Attach the IDs to the main frame and materialize the evaluations table.
    df['evaluation_id'] = ids_per_row
    return df, pd.DataFrame(evaluation_rows)

# Execute the new function 
# final_df is rebound to the same frame, which now carries an 'evaluation_id' column.
final_df, evaluations_table = assign_evaluation_ids(final_df)

# Display the results 
print("Main DataFrame with 'evaluation_id' and 'rubric_id'")
final_df

# One row per assignment; 'evaluation_content' still holds the nested list of items.
print("New 'evaluations_table' DataFrame")
evaluations_table 

Main DataFrame with 'evaluation_id' and 'rubric_id'


Unnamed: 0,project_id,paper_content,assignment_id,evaluation,score,rubrics,rubric_id,evaluation_id
0,464,BPP Business School Coursework Cover Sheet Ple...,0,"[{'item': 'Task 1 - Management Practices', 'su...",13.0,"[{'item': 'Task 1 - Management Practices', 'sc...",0,0
1,464,CourseworkCoverSheet ModuleName Management Ess...,1,"[{'item': 'Task 1 - Management Practices', 'su...",14.0,"[{'item': 'Task 1 - Management Practices', 'sc...",0,1
2,464,BPP Business School Coursework Cover Sheet Ple...,2,"[{'item': 'Task 1 - Management Practices', 'su...",14.0,"[{'item': 'Task 1 - Management Practices', 'sc...",0,2
3,464,BPP Business School Coursework Cover Sheet Ple...,3,"[{'item': 'Task 1 - Management Practices', 'su...",14.0,"[{'item': 'Task 1 - Management Practices', 'sc...",0,3
4,464,BPP Coursework Cover Sheet Please use the tabl...,4,"[{'item': 'Task 1 - Management Practices', 'su...",14.0,"[{'item': 'Task 1 - Management Practices', 'sc...",0,4
...,...,...,...,...,...,...,...,...
1688,722,Critically evaluate the accuracy of eyewitness...,1688,"[{'item': 'Introduction', 'comment': '<b>Posit...",10.0,"[{'item': 'Introduction', 'scope': 'Full', 'ru...",48,1688
1689,722,"Q1. Drawing upon empirical studies, discuss so...",1689,"[{'item': 'Introduction', 'comment': '<b>Posit...",1.0,"[{'item': 'Introduction', 'scope': 'Full', 'ru...",48,1689
1690,722,"Drawing upon empirical studies, discuss some o...",1690,"[{'item': 'Introduction', 'comment': '<b>Posit...",22.0,"[{'item': 'Introduction', 'scope': 'Full', 'ru...",48,1690
1691,722,Experiencing Racial/Ethnic Discrimination Intr...,1691,"[{'item': 'Introduction', 'comment': '<b>Posit...",1.0,"[{'item': 'Introduction', 'scope': 'Full', 'ru...",48,1691


New 'evaluations_table' DataFrame


Unnamed: 0,evaluation_id,source_assignment_id,evaluation_content
0,0,0,"[{'item': 'Task 1 - Management Practices', 'su..."
1,1,1,"[{'item': 'Task 1 - Management Practices', 'su..."
2,2,2,"[{'item': 'Task 1 - Management Practices', 'su..."
3,3,3,"[{'item': 'Task 1 - Management Practices', 'su..."
4,4,4,"[{'item': 'Task 1 - Management Practices', 'su..."
...,...,...,...
1688,1688,1688,"[{'item': 'Introduction', 'comment': '<b>Posit..."
1689,1689,1689,"[{'item': 'Introduction', 'comment': '<b>Posit..."
1690,1690,1690,"[{'item': 'Introduction', 'comment': '<b>Posit..."
1691,1691,1691,"[{'item': 'Introduction', 'comment': '<b>Posit..."


In [19]:
# --- Explode and normalize the evaluations: one row per evaluated item ---

# A. Explode the list in the 'evaluation_content' column
exploded_df = evaluations_table.explode('evaluation_content')

# B. Normalize the dictionaries into new columns
normalized_df = pd.json_normalize(exploded_df['evaluation_content'])

# C. Combine the original IDs with the new normalized columns.
#    explode() repeats index labels, so both sides are re-indexed to align by position.
structured_evaluations_df = pd.concat([
    exploded_df.drop(columns=['evaluation_content']).reset_index(drop=True),
    normalized_df.reset_index(drop=True)
], axis=1)


# D. Drop the columns that are not kept in the structured table and show the result.
#    errors='ignore' keeps the cell working when one of these keys does not occur
#    in the loaded data (drop would otherwise raise a KeyError).
print("Fully Parsed and Structured Evaluations Table")
structured_evaluations_df = structured_evaluations_df.drop(
    columns=['is_evaluation', 'expand', 'opacity', 'is_check', 'hidden_score', 'hidden_item'],
    errors='ignore',
)
structured_evaluations_df.head(10)

Fully Parsed and Structured Evaluations Table


Unnamed: 0,evaluation_id,source_assignment_id,item,sub_score,score,comment,evidence,type,loading,agree,evidence.first_point,evidence.second_point,evidence.Text 2,evidence.Text 3,evidence.fact,evidence.opinion,rate,feedback,positivity
0,0,0,Task 1 - Management Practices,14,0.0,<b>Positive Aspects</b>: \n- The submission ef...,The submission clearly delineates <q>Qatar Air...,sub_score,,,,,,,,,,,
1,0,0,Task 2 - Analysis of Business Challenges,12,0.0,<b>Positive Aspects</b>: \n- Successfully iden...,The submission outlines significant challenges...,sub_score,,,,,,,,,,,
2,0,0,Task 3 - Management Behaviors of Emotional Int...,13,0.0,<b>Positive Aspects</b>: \n- Acknowledges the ...,The submission provides a general discussion o...,sub_score,,,,,,,,,,,
3,0,0,Task 4 - Management Competencies Development,18,0.0,<b>Positive Aspects</b>: \n- Identifies compet...,The submission identifies some competencies bu...,sub_score,,,,,,,,,,,
4,0,0,"Presentation, Structure, and Referencing",10,0.0,<b>Positive Aspects</b>: \n- The report is wel...,"The report is well-organized, following the as...",sub_score,,,,,,,,,,,
5,0,0,Total Score,0,52.0,Comments: The student has comprehensively addr...,,score,,,,,,,,,,,
6,1,1,Task 1 - Management Practices,14,0.0,<b>Positive Aspects</b>: \n- The submission de...,The submission demonstrates a good understandi...,sub_score,,,,,,,,,,,
7,1,1,Task 2 - Analysis of Business Challenges,13,0.0,<b>Positive Aspects</b>: \n- The submission id...,The submission outlines current significant ch...,sub_score,,,,,,,,,,,
8,1,1,Task 3 - Management Behaviors of Emotional Int...,13,0.0,<b>Positive Aspects</b>: \n- The submission's ...,The submission details examples of how Qatar A...,sub_score,,,,,,,,,,,
9,1,1,Task 4 - Management Competencies Development,18,0.0,<b>Positive Aspects</b>: \n- The submission ef...,The submission identifies core values and mana...,sub_score,,,,,,,,,,,


In [20]:
# Apply the text_cleaner function to the 'comment' and 'evidence' columns of the
# structured evaluations DataFrame (both columns are overwritten in place).
# text_cleaner returns "" for non-string input, so missing values become empty strings.
structured_evaluations_df['comment'] = structured_evaluations_df['comment'].apply(text_cleaner)
structured_evaluations_df['evidence'] = structured_evaluations_df['evidence'].apply(text_cleaner)

# Display the result
structured_evaluations_df.head(10)  # Display the first 10 rows to verify the cleaning

Unnamed: 0,evaluation_id,source_assignment_id,item,sub_score,score,comment,evidence,type,loading,agree,evidence.first_point,evidence.second_point,evidence.Text 2,evidence.Text 3,evidence.fact,evidence.opinion,rate,feedback,positivity
0,0,0,Task 1 - Management Practices,14,0.0,Positive Aspects: - The submission effectively...,The submission clearly delineates Qatar Airway...,sub_score,,,,,,,,,,,
1,0,0,Task 2 - Analysis of Business Challenges,12,0.0,Positive Aspects: - Successfully identifies ke...,The submission outlines significant challenges...,sub_score,,,,,,,,,,,
2,0,0,Task 3 - Management Behaviors of Emotional Int...,13,0.0,Positive Aspects: - Acknowledges the importanc...,The submission provides a general discussion o...,sub_score,,,,,,,,,,,
3,0,0,Task 4 - Management Competencies Development,18,0.0,Positive Aspects: - Identifies competencies re...,The submission identifies some competencies bu...,sub_score,,,,,,,,,,,
4,0,0,"Presentation, Structure, and Referencing",10,0.0,Positive Aspects: - The report is well-organiz...,"The report is well-organized, following the as...",sub_score,,,,,,,,,,,
5,0,0,Total Score,0,52.0,Comments: The student has comprehensively addr...,,score,,,,,,,,,,,
6,1,1,Task 1 - Management Practices,14,0.0,Positive Aspects: - The submission demonstrate...,The submission demonstrates a good understandi...,sub_score,,,,,,,,,,,
7,1,1,Task 2 - Analysis of Business Challenges,13,0.0,Positive Aspects: - The submission identifies ...,The submission outlines current significant ch...,sub_score,,,,,,,,,,,
8,1,1,Task 3 - Management Behaviors of Emotional Int...,13,0.0,Positive Aspects: - The submission's focus on ...,The submission details examples of how Qatar A...,sub_score,,,,,,,,,,,
9,1,1,Task 4 - Management Competencies Development,18,0.0,Positive Aspects: - The submission effectively...,The submission identifies core values and mana...,sub_score,,,,,,,,,,,


THIS IS WHERE WE MERGE

In [21]:
# --- Merge Process ---

# Step 1: attach the structured evaluation rows to the main table.
# 'evaluation_id' is the link; the left join keeps every row of final_df
# even when it has no matching evaluation.
merged_evals_df = final_df.merge(
    structured_evaluations_df,
    how='left',
    on='evaluation_id',
)

# Step 2: attach the structured rubric rows using the composite key
# ('rubric_id', 'item'); the left join keeps every evaluation row
# even when it has no matching rubric.
master_df = merged_evals_df.merge(
    structured_rubrics_df,
    how='left',
    on=['rubric_id', 'item'],
)

# --- Final Cleanup and Display ---
# Only these columns are carried forward; everything else is dropped.
final_columns_to_keep = [
    'project_id',
    'paper_content',
    'assignment_id',
    # '_y' is pandas' default suffix for the right-hand frame's copy of an
    # overlapping column - presumably the evaluation-level score; TODO confirm
    'score_y',
    'sub_score',
    'type',
    'item',
    'comment',
    'evidence',
    'score_range',
    'rubric',
]

# Select the wanted columns and give 'score_y' its plain name back
master_df = master_df[final_columns_to_keep].rename(columns={'score_y': 'score'})

print("The Final, Merged Master DataFrame")
master_df.head(10)

The Final, Merged Master DataFrame


Unnamed: 0,project_id,paper_content,assignment_id,score,sub_score,type,item,comment,evidence,score_range,rubric
0,464,BPP Business School Coursework Cover Sheet Ple...,0,0.0,14,sub_score,Task 1 - Management Practices,Positive Aspects: - The submission effectively...,The submission clearly delineates Qatar Airway...,16-20,Outstanding analysis of Qatar Airways' mission...
1,464,BPP Business School Coursework Cover Sheet Ple...,0,0.0,14,sub_score,Task 1 - Management Practices,Positive Aspects: - The submission effectively...,The submission clearly delineates Qatar Airway...,14-15,"Excellent analysis of Qatar Airways' mission, ..."
2,464,BPP Business School Coursework Cover Sheet Ple...,0,0.0,14,sub_score,Task 1 - Management Practices,Positive Aspects: - The submission effectively...,The submission clearly delineates Qatar Airway...,12-13,"Good analysis of Qatar Airways' mission, value..."
3,464,BPP Business School Coursework Cover Sheet Ple...,0,0.0,14,sub_score,Task 1 - Management Practices,Positive Aspects: - The submission effectively...,The submission clearly delineates Qatar Airway...,10-11,Adequate analysis of Qatar Airways' mission an...
4,464,BPP Business School Coursework Cover Sheet Ple...,0,0.0,14,sub_score,Task 1 - Management Practices,Positive Aspects: - The submission effectively...,The submission clearly delineates Qatar Airway...,8-9,Poor analysis with superficial treatment of Qa...
5,464,BPP Business School Coursework Cover Sheet Ple...,0,0.0,14,sub_score,Task 1 - Management Practices,Positive Aspects: - The submission effectively...,The submission clearly delineates Qatar Airway...,0-7,Lacks substantial analysis or understanding of...
6,464,BPP Business School Coursework Cover Sheet Ple...,0,0.0,12,sub_score,Task 2 - Analysis of Business Challenges,Positive Aspects: - Successfully identifies ke...,The submission outlines significant challenges...,16-20,Comprehensive and critical discussion of the k...
7,464,BPP Business School Coursework Cover Sheet Ple...,0,0.0,12,sub_score,Task 2 - Analysis of Business Challenges,Positive Aspects: - Successfully identifies ke...,The submission outlines significant challenges...,14-15,Thorough discussion of key challenges with det...
8,464,BPP Business School Coursework Cover Sheet Ple...,0,0.0,12,sub_score,Task 2 - Analysis of Business Challenges,Positive Aspects: - Successfully identifies ke...,The submission outlines significant challenges...,12-13,Identifies significant challenges and discusse...
9,464,BPP Business School Coursework Cover Sheet Ple...,0,0.0,12,sub_score,Task 2 - Analysis of Business Challenges,Positive Aspects: - Successfully identifies ke...,The submission outlines significant challenges...,10-11,Highlights some challenges and briefly touches...


In [22]:
# Print the dtype and non-null count of every column in master_df
master_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34845 entries, 0 to 34844
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   project_id     34845 non-null  int64  
 1   paper_content  34845 non-null  object 
 2   assignment_id  34845 non-null  int64  
 3   score          34843 non-null  float64
 4   sub_score      34845 non-null  object 
 5   type           34845 non-null  object 
 6   item           34845 non-null  object 
 7   comment        34845 non-null  object 
 8   evidence       34845 non-null  object 
 9   score_range    30125 non-null  object 
 10  rubric         30125 non-null  object 
dtypes: float64(1), int64(2), object(8)
memory usage: 2.9+ MB


In [23]:
# --- Prepare the Final Clean Data for Analysis ---

# Parse 'sub_score' as numbers; values that cannot be parsed become NaN
numeric_sub_scores = pd.to_numeric(master_df['sub_score'], errors='coerce')
master_df['sub_score'] = numeric_sub_scores

# This function will find the correct rubric based on the score and the rubric group.
def find_correct_rubric(score, rubric_group):
    """Return the rubric band whose score range contains ``score``.

    Args:
        score: Numeric score to look up. ``None`` or NaN never matches a band.
        rubric_group: DataFrame with a 'score_range' column holding
            "low-high" strings and a 'rubric' column, one row per band.

    Returns:
        A ``(rubric, score_range)`` tuple taken from the first row (in
        iteration order) whose inclusive range contains ``score``, or
        ``(None, None)`` when no row matches. The returned ``score_range``
        is the original, unmodified cell value.
    """
    # A missing score cannot fall inside any band; comparing None with
    # numbers would raise TypeError, so answer "no match" up front.
    if score is None:
        return None, None

    for _, row in rubric_group.iterrows():
        try:
            # Accept en/em dashes as the separator, and parse the bounds as
            # floats so bands such as "12.5-15" are not silently skipped.
            bounds = row['score_range'].replace('\u2013', '-').replace('\u2014', '-')
            low, high = map(float, bounds.split('-'))
            if low <= score <= high:
                # Return a tuple with both pieces of information
                return row['rubric'], row['score_range']
        except (ValueError, AttributeError):
            # Missing (None/NaN) or malformed ranges are skipped.
            continue
    return None, None # Return None for both if no match is found

# One group per (assignment_id, item) pair
grouped = master_df.groupby(['assignment_id', 'item'])

# Collects one flat record per group
prepared_data = []

for (assignment_key, item_key), rows in grouped:
    # The first row's 'type' decides how the whole group is handled
    row_type = rows['type'].iloc[0]

    if row_type == 'score':
        # Total-score feedback: uses the main 'score' column and carries
        # no evidence, score range or rubric
        record = {
            'assignment_id': assignment_key,
            'paper_content': rows['paper_content'].iloc[0],
            'item': item_key,
            'type': 'score',
            'score': rows['score'].iloc[0],
            'comment': rows['comment'].iloc[0],
            'evidence': None,
            'score_range': None,
            'rubric': None,
        }
    elif row_type == 'sub_score':
        # Sub-score feedback: pick the rubric row whose range contains the sub-score
        item_score = rows['sub_score'].iloc[0]
        matched_rubric, matched_range = find_correct_rubric(item_score, rows)
        record = {
            'assignment_id': assignment_key,
            'paper_content': rows['paper_content'].iloc[0],
            'item': item_key,
            'type': 'sub_score',
            'score': item_score,  # same 'score' key as the total-score records
            'comment': rows['comment'].iloc[0],
            'evidence': rows['evidence'].iloc[0],
            'score_range': matched_range,
            'rubric': matched_rubric,
        }
    else:
        # Groups of any other type produce no record
        continue

    prepared_data.append(record)

# Rebuild master_df from the flat records
master_df = pd.DataFrame(prepared_data)

# Display the result
master_df.head(10)

Unnamed: 0,assignment_id,paper_content,item,type,score,comment,evidence,score_range,rubric
0,0,BPP Business School Coursework Cover Sheet Ple...,"Presentation, Structure, and Referencing",sub_score,10.0,Positive Aspects: - The report is well-organiz...,"The report is well-organized, following the as...",7-10,The report is exceptionally well structured an...
1,0,BPP Business School Coursework Cover Sheet Ple...,Task 1 - Management Practices,sub_score,14.0,Positive Aspects: - The submission effectively...,The submission clearly delineates Qatar Airway...,14-15,"Excellent analysis of Qatar Airways' mission, ..."
2,0,BPP Business School Coursework Cover Sheet Ple...,Task 2 - Analysis of Business Challenges,sub_score,12.0,Positive Aspects: - Successfully identifies ke...,The submission outlines significant challenges...,12-13,Identifies significant challenges and discusse...
3,0,BPP Business School Coursework Cover Sheet Ple...,Task 3 - Management Behaviors of Emotional Int...,sub_score,13.0,Positive Aspects: - Acknowledges the importanc...,The submission provides a general discussion o...,12-13,Good evaluation showing general effects of Emo...
4,0,BPP Business School Coursework Cover Sheet Ple...,Task 4 - Management Competencies Development,sub_score,18.0,Positive Aspects: - Identifies competencies re...,The submission identifies some competencies bu...,18-20,Competent assessment of management competencie...
5,0,BPP Business School Coursework Cover Sheet Ple...,Total Score,score,52.0,Comments: The student has comprehensively addr...,,,
6,1,CourseworkCoverSheet ModuleName Management Ess...,"Presentation, Structure, and Referencing",sub_score,10.0,Positive Aspects: - The report is well-structu...,"The report is well-structured, with clear sect...",7-10,The report is exceptionally well structured an...
7,1,CourseworkCoverSheet ModuleName Management Ess...,Task 1 - Management Practices,sub_score,14.0,Positive Aspects: - The submission demonstrate...,The submission demonstrates a good understandi...,14-15,"Excellent analysis of Qatar Airways' mission, ..."
8,1,CourseworkCoverSheet ModuleName Management Ess...,Task 2 - Analysis of Business Challenges,sub_score,13.0,Positive Aspects: - The submission identifies ...,The submission outlines current significant ch...,12-13,Identifies significant challenges and discusse...
9,1,CourseworkCoverSheet ModuleName Management Ess...,Task 3 - Management Behaviors of Emotional Int...,sub_score,13.0,Positive Aspects: - The submission's focus on ...,The submission details examples of how Qatar A...,12-13,Good evaluation showing general effects of Emo...


SOME ROWS DON'T HAVE RUBRICS || WE DROP THEM

In [24]:
# Condition 1: the rubric is present and is more than just whitespace
has_rubric = master_df['rubric'].notna() & master_df['rubric'].str.strip().ne('')

# Condition 2: rows of type 'score' (the total-score rows) are kept regardless
is_total_score_row = master_df['type'].eq('score')

# A row survives when it satisfies at least one of the two conditions
rows_to_keep = has_rubric | is_total_score_row
master_df = master_df.loc[rows_to_keep]

master_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8979 entries, 0 to 12077
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   assignment_id  8979 non-null   int64  
 1   paper_content  8979 non-null   object 
 2   item           8979 non-null   object 
 3   type           8979 non-null   object 
 4   score          8979 non-null   float64
 5   comment        8979 non-null   object 
 6   evidence       7288 non-null   object 
 7   score_range    7288 non-null   object 
 8   rubric         7288 non-null   object 
dtypes: float64(1), int64(1), object(7)
memory usage: 701.5+ KB


CHECK FOR EMPTY TEXT FIELDS & DELETE THEM

Text columns that are crucial for the Judge LLM test cases 

In [25]:
# Count blank (empty after stripping) values per text column before filtering
master_df['evidence'].str.strip().eq('').sum()
master_df['comment'].str.strip().eq('').sum()
master_df['rubric'].str.strip().eq('').sum()
master_df['paper_content'].str.strip().eq('').sum()

# Text columns that are crucial for the Judge LLM test cases
important_text_cols = ['paper_content', 'comment', 'rubric', 'evidence']

for text_col in important_text_cols:
    # Columns that are not present in the DataFrame are skipped
    if text_col not in master_df.columns:
        continue
    # Keep rows whose stripped value differs from ''. A missing value (None/NaN)
    # also compares as "different", so rows with a missing value are kept.
    is_not_blank = master_df[text_col].str.strip().ne('')
    master_df = master_df.loc[is_not_blank]

# Same counts after filtering
master_df['evidence'].str.strip().eq('').sum()
master_df['comment'].str.strip().eq('').sum()
master_df['rubric'].str.strip().eq('').sum()
master_df['paper_content'].str.strip().eq('').sum()

master_df.info()

125

109

0

0

0

0

0

0

<class 'pandas.core.frame.DataFrame'>
Index: 8745 entries, 0 to 12077
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   assignment_id  8745 non-null   int64  
 1   paper_content  8745 non-null   object 
 2   item           8745 non-null   object 
 3   type           8745 non-null   object 
 4   score          8745 non-null   float64
 5   comment        8745 non-null   object 
 6   evidence       7134 non-null   object 
 7   score_range    7134 non-null   object 
 8   rubric         7134 non-null   object 
dtypes: float64(1), int64(1), object(7)
memory usage: 683.2+ KB


CHECK FOR ROWS THAT HAVE DUPLICATED 'COMMENTS' & EVIDENCE

In [26]:
# Mark every row whose 'comment' also appears on another row (all copies are marked)
is_repeated_comment = master_df.duplicated(subset='comment', keep=False)
dupes = master_df.loc[is_repeated_comment]
dupes['comment'].to_list()

["Positive Aspects: - The assignment demonstrates a well-structured narrative, particularly in the discussion of Kraft’s acquisition strategy and the subsequent integration of Cadbury. The sections detailing the hostile takeover and the strategic implications for both companies are clearly delineated, reflecting the LO's emphasis on structured communication. - The use of citations such as Wiggins 2010 and Jones and Dorfman 2010 shows an attempt to back arguments with relevant sources, aligning with the LOs on research skills. Areas for Development: - The report could benefit from a more consistent application of the Harvard referencing style. For instance, the citation format for Wiggins 2010 and Jones and Dorfman 2010 should be checked for consistency with the latest guidelines. - While the narrative is generally well-structured, there are sections where the argument could be made more coherent by directly linking Kraft’s strategic objectives with the outcomes of the acquisition, as s

In [27]:
# Keep only the first row for each distinct 'comment'
master_df = master_df.drop_duplicates(subset=['comment'], keep='first')
master_df.info()
master_df.head(10)

<class 'pandas.core.frame.DataFrame'>
Index: 8634 entries, 0 to 12077
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   assignment_id  8634 non-null   int64  
 1   paper_content  8634 non-null   object 
 2   item           8634 non-null   object 
 3   type           8634 non-null   object 
 4   score          8634 non-null   float64
 5   comment        8634 non-null   object 
 6   evidence       7023 non-null   object 
 7   score_range    7023 non-null   object 
 8   rubric         7023 non-null   object 
dtypes: float64(1), int64(1), object(7)
memory usage: 674.5+ KB


Unnamed: 0,assignment_id,paper_content,item,type,score,comment,evidence,score_range,rubric
0,0,BPP Business School Coursework Cover Sheet Ple...,"Presentation, Structure, and Referencing",sub_score,10.0,Positive Aspects: - The report is well-organiz...,"The report is well-organized, following the as...",7-10,The report is exceptionally well structured an...
1,0,BPP Business School Coursework Cover Sheet Ple...,Task 1 - Management Practices,sub_score,14.0,Positive Aspects: - The submission effectively...,The submission clearly delineates Qatar Airway...,14-15,"Excellent analysis of Qatar Airways' mission, ..."
2,0,BPP Business School Coursework Cover Sheet Ple...,Task 2 - Analysis of Business Challenges,sub_score,12.0,Positive Aspects: - Successfully identifies ke...,The submission outlines significant challenges...,12-13,Identifies significant challenges and discusse...
3,0,BPP Business School Coursework Cover Sheet Ple...,Task 3 - Management Behaviors of Emotional Int...,sub_score,13.0,Positive Aspects: - Acknowledges the importanc...,The submission provides a general discussion o...,12-13,Good evaluation showing general effects of Emo...
4,0,BPP Business School Coursework Cover Sheet Ple...,Task 4 - Management Competencies Development,sub_score,18.0,Positive Aspects: - Identifies competencies re...,The submission identifies some competencies bu...,18-20,Competent assessment of management competencie...
5,0,BPP Business School Coursework Cover Sheet Ple...,Total Score,score,52.0,Comments: The student has comprehensively addr...,,,
6,1,CourseworkCoverSheet ModuleName Management Ess...,"Presentation, Structure, and Referencing",sub_score,10.0,Positive Aspects: - The report is well-structu...,"The report is well-structured, with clear sect...",7-10,The report is exceptionally well structured an...
7,1,CourseworkCoverSheet ModuleName Management Ess...,Task 1 - Management Practices,sub_score,14.0,Positive Aspects: - The submission demonstrate...,The submission demonstrates a good understandi...,14-15,"Excellent analysis of Qatar Airways' mission, ..."
8,1,CourseworkCoverSheet ModuleName Management Ess...,Task 2 - Analysis of Business Challenges,sub_score,13.0,Positive Aspects: - The submission identifies ...,The submission outlines current significant ch...,12-13,Identifies significant challenges and discusse...
9,1,CourseworkCoverSheet ModuleName Management Ess...,Task 3 - Management Behaviors of Emotional Int...,sub_score,13.0,Positive Aspects: - The submission's focus on ...,The submission details examples of how Qatar A...,12-13,Good evaluation showing general effects of Emo...


Analyze the distribution of character lengths in your evidence column. 

This will almost certainly reveal two very different groups of data: the short, valid evidence snippets and the long, corrupted "walls of text."

In [28]:
# # --- 1. Calculate the Length of Each 'evidence' String ---
# # We create a new column to hold the character count.
# # We use .str.len() and fill any missing values with 0.
# master_df['evidence_length'] = master_df['evidence'].str.len().fillna(0)


# # --- 2. Visualize the Distribution ---
# plt.figure(figsize=(10, 6))
# sns.histplot(master_df['evidence_length'], bins=50)
# plt.title('Distribution of Evidence Character Lengths')
# plt.xlabel('Character Count')
# plt.ylabel('Frequency')
# plt.show()

# # --- 3. Analyze the Results ---
# print("--- Descriptive Statistics for Evidence Length ---")
# print(master_df['evidence_length'].describe())

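Based on this data, a very safe and effective threshold would be 3,000 characters. (Note: the filtering cell below is currently commented out, and its cut-off is 1,000 characters rather than 3,000 — reconcile the two before re-enabling it.)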

This is well above the 75th percentile (so the large majority of valid data is retained) but will almost certainly remove all the corrupted rows.

In [29]:
# master_df = master_df[master_df['evidence'].str.len() < 1000]
# master_df.drop(columns=['evidence_length'], inplace=True)
# master_df.info()
# master_df.head(10)

Check for Scores above 100

In [37]:
# master_df['score'].describe()
# master_df[master_df['score'] > 100].count()
# master_df = master_df[master_df['score'] <= 100]
# master_df[master_df['score'] > 100].count()

CREATING DATAFRAMES FOR EACH METRIC

In [30]:
# For Rubric Alignment and Factual Consistency
# Only rows where neither 'evidence' nor 'rubric' is NaN are kept, so every
# remaining row has both fields available for further analysis.
required_cols = ['evidence', 'rubric']
master_second_df = master_df.dropna(subset=required_cols, how='any')
master_second_df.info()
master_second_df

<class 'pandas.core.frame.DataFrame'>
Index: 7023 entries, 0 to 12077
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   assignment_id  7023 non-null   int64  
 1   paper_content  7023 non-null   object 
 2   item           7023 non-null   object 
 3   type           7023 non-null   object 
 4   score          7023 non-null   float64
 5   comment        7023 non-null   object 
 6   evidence       7023 non-null   object 
 7   score_range    7023 non-null   object 
 8   rubric         7023 non-null   object 
dtypes: float64(1), int64(1), object(7)
memory usage: 548.7+ KB


Unnamed: 0,assignment_id,paper_content,item,type,score,comment,evidence,score_range,rubric
0,0,BPP Business School Coursework Cover Sheet Ple...,"Presentation, Structure, and Referencing",sub_score,10.0,Positive Aspects: - The report is well-organiz...,"The report is well-organized, following the as...",7-10,The report is exceptionally well structured an...
1,0,BPP Business School Coursework Cover Sheet Ple...,Task 1 - Management Practices,sub_score,14.0,Positive Aspects: - The submission effectively...,The submission clearly delineates Qatar Airway...,14-15,"Excellent analysis of Qatar Airways' mission, ..."
2,0,BPP Business School Coursework Cover Sheet Ple...,Task 2 - Analysis of Business Challenges,sub_score,12.0,Positive Aspects: - Successfully identifies ke...,The submission outlines significant challenges...,12-13,Identifies significant challenges and discusse...
3,0,BPP Business School Coursework Cover Sheet Ple...,Task 3 - Management Behaviors of Emotional Int...,sub_score,13.0,Positive Aspects: - Acknowledges the importanc...,The submission provides a general discussion o...,12-13,Good evaluation showing general effects of Emo...
4,0,BPP Business School Coursework Cover Sheet Ple...,Task 4 - Management Competencies Development,sub_score,18.0,Positive Aspects: - Identifies competencies re...,The submission identifies some competencies bu...,18-20,Competent assessment of management competencie...
...,...,...,...,...,...,...,...,...,...
12072,1691,Experiencing Racial/Ethnic Discrimination Intr...,Writing Style and Referencing,sub_score,5.0,Positive Aspects: - The writing style is clear...,The feedback highlights the positive aspects o...,0-25,Poor writing quality with numerous errors. Inc...
12073,1692,How do individuals experience discrimination? ...,Conclusion,sub_score,12.5,Positive Aspects: - The conclusion summarizes ...,The conclusion summarizes the key themes of di...,0-25,Inadequate conclusion with no clear summary of...
12074,1692,How do individuals experience discrimination? ...,Content and Argument,sub_score,42.5,Positive Aspects: - The content covers various...,The content covers various aspects of racial a...,26-50,Basic content with some arguments supported by...
12075,1692,How do individuals experience discrimination? ...,Introduction,sub_score,4.5,Positive Aspects: - The introduction provides ...,The introduction provides a broad overview of ...,0-25,Poor or missing introduction with little to no...


In [31]:
# Write both frames, plus a 100-row preview of each, as indented JSON records.
# The output directory is created first so the export also works when
# 'dataset/preprocessed' does not exist yet (e.g. on a fresh checkout).
preprocessed_dir = os.path.join('dataset', 'preprocessed')
os.makedirs(preprocessed_dir, exist_ok=True)

for export_name, export_df in (
    ('master_df', master_df),
    ('master_second_df', master_second_df),
):
    export_df.to_json(
        os.path.join(preprocessed_dir, f'{export_name}.json'),
        orient='records',
        indent=4,
    )
    export_df.head(100).to_json(
        os.path.join(preprocessed_dir, f'{export_name}_preview.json'),
        orient='records',
        indent=4,
    )

In [33]:
# Columns shown in the appendix, in display order; only the first five rows are used
appendix_columns = ['comment', 'evidence', 'rubric']
appendix_table = master_df[appendix_columns].head(5)

# Cell styling: let long text wrap inside the cell and left-align it
cell_properties = {
    'white-space': 'pre-wrap',
    'text-align': 'left',
}

# Header styling: left-align the header cells as well
header_styles = [
    {'selector': 'th', 'props': [('text-align', 'left')]},
]

styled_table = (
    appendix_table.style
    .set_properties(**cell_properties)
    .set_table_styles(header_styles)
)

# Display the final, styled table
styled_table

Unnamed: 0,comment,evidence,rubric
0,"Positive Aspects: - The report is well-organized, following the assignment's structure effectively and providing a clear overview in the introduction. - The conclusion summarizes key findings, although it could more clearly state actionable recommendations. Areas for Development: - The depth of analysis and integration of sections could be improved to provide a more comprehensive understanding of the topics discussed. - The submission does not specify adherence to the Harvard referencing style, and the quality of citations could be enhanced for academic rigor. Next Steps: - Enhance the depth of analysis in each section, ensuring a cohesive flow between the report's various components. - Ensure all references are correctly formatted according to the Harvard style, and incorporate a wider range of academic sources to support arguments.","The report is well-organized, following the assignment's structure effectively and providing a clear overview in the introduction. The information is organized in a structured manner, following the assignment's tasks. The introduction sets the purpose and scope of the report effectively, providing a clear overview of the content. The conclusion summarizes key findings, although it could more clearly state actionable recommendations. The conclusion summarizes key findings but lacks a comprehensive summary of recommendations. The effectiveness of the conclusion could be enhanced by more clearly stating actionable recommendations. The depth of analysis and integration of sections could be improved to provide a more comprehensive understanding of the topics discussed. However, the depth of analysis and integration of sections could be improved. The submission does not specify adherence to the Harvard referencing style, and the quality of citations could be enhanced for academic rigor. The sources cited appear to follow a structured format, but the submission does not specify if it adheres to the Harvard referencing style. 
A detailed review of referencing quality is not possible without direct examination of the references list and in-text citations.","The report is exceptionally well structured and thoughtfully presented, featuring flawless grammar and punctuation. The introduction provides an excellent definition of leadership and management with highly credible references. The conclusion excellently summarizes the recommended changes. Referencing is impeccable, adhering strictly to the Harvard style with a wide range of credible academic sources effectively supporting arguments."
1,"Positive Aspects: - The submission effectively outlines Qatar Airways' mission and values, demonstrating a foundational understanding of the company's strategic direction. - The identification of leadership influence and technological investments as key components of management practices aligns well with the learning objectives. Areas for Development: - The linkage between Qatar Airways' values and specific management practices could be further detailed to demonstrate a deeper understanding of strategic alignment. - Incorporation of academic literature to support the analysis was minimal. Utilizing contemporary sources could strengthen the argument and provide a more robust analysis. Next Steps: - Engage with more academic and industry-specific literature to provide evidence for the alignment between the company's values and management practices. - Develop a more critical analysis of how Qatar Airways' leadership directly influences its management practices, supported by empirical evidence or case studies.","The submission clearly delineates Qatar Airways' mission to become the world’s best airline by providing incomparable service and quality and identifies the company's values, such as service, safety, innovation, global connection, leadership values, and corporate social responsibility. However, the direct linkage of these values to specific management practices could be further detailed. The submission provides examples demonstrating alignment, such as the focus on customer service, technological investment, and safety commitment. The submission mentions Mr. Akbar Al Baker's role in guiding Qatar Airways to success, highlighting leadership influence on corporate culture and strategy. The submission reflects on the integration of the company’s values into its management practices but does so without a critical analysis backed by contemporary academic literature. The discussion remains descriptive rather than analytical. 
The submission lacks actionable recommendations and theoretical models for better alignment of management practices with the company's vision and mission. It does not provide a clear framework or suggestions for improvement. The submission does not offer an in-depth and critical examination of the mission and values, nor their influence on the organization. The analysis remains surface-level, missing a critical perspective.","Excellent analysis of Qatar Airways' mission, values, and management practices with coherent arguments demonstrating the alignmezzt between them. Well supported by academic literature and independent research."
2,"Positive Aspects: - Successfully identifies key challenges faced by Qatar Airways, including the impact of COVID-19 and financial challenges, which is crucial for understanding the current business environment. Areas for Development: - The submission lacks detailed examples of creative problem-solving and the application of innovation frameworks to address these challenges. - There is an inadequate linkage between the identified challenges and strategic decisions within management practices, limiting the depth of analysis. Next Steps: - Integrate specific models or theories that foster creativity and innovation at Qatar Airways to provide a clearer framework for analysis. - Conduct an impact assessment of the strategies mentioned to evaluate their effectiveness in addressing the challenges.","The submission outlines significant challenges faced by Qatar Airways, such as the COVID-19 impact, financial challenges, social challenges, and operational challenges, supported with recent data and statements from the CEO. This indicates a successful identification of key challenges, acknowledging the complex environment in which Qatar Airways operates, particularly under the unprecedented pressures of the global pandemic and its financial repercussions on the aviation industry. However, when it comes to addressing these challenges through creative problem-solving and innovation, the submission falls short. It mentions the importance of creativity and innovation but does not provide detailed examples or relevant theories to back up this claim. The lack of specificity and depth in this area suggests a superficial treatment of how Qatar Airways employs innovative strategies to navigate its challenges. Furthermore, the submission does not mention specific models or theories that foster creativity and innovation at Qatar Airways, leaving a gap in the analysis. 
The discussion on innovation lacks depth and clarity, failing to provide a clear framework for understanding how the company might leverage creativity and innovation to overcome its identified challenges. Moreover, the submission inadequately links the identified challenges to strategic decisions within management practices. The discussion on how challenges influence strategic decisions is not explored in depth, which is a critical oversight. Understanding the connection between the challenges faced by the company and the strategic decisions made in response is essential for a comprehensive analysis. This linkage is crucial for demonstrating a thorough understanding of the strategic management process within the context of Qatar Airways. In summary, while the submission successfully identifies the challenges faced by Qatar Airways, it lacks detailed exploration of creative problem-solving and the application of innovation frameworks. The absence of specific examples, theoretical frameworks, and a clear linkage between challenges and strategic decisions limits the depth and comprehensiveness of the analysis.",Identifies significant challenges and discusses the role of creativity and innovation with moderate depth. Supported by some examples and basic data.
3,"Positive Aspects: - Acknowledges the importance of emotional intelligence in management practices, which is a key component of effective leadership. Areas for Development: - The discussion on emotional intelligence lacks detailed examples within Qatar Airways' operations, making it difficult to assess its practical application. - The correlation between emotional intelligence, employee engagement, and organizational performance is mentioned but not explored in detail. Next Steps: - Provide specific, contextually relevant examples of emotional intelligence in action within Qatar Airways to illustrate its impact on the organization. - Draw a clearer connection between emotional intelligence and organizational outcomes, supported by empirical data or theoretical frameworks.","The submission provides a general discussion on the importance of emotional intelligence in management practices but lacks detailed examples of its demonstration within Qatar Airways' operations. The submission reflects on the integration of the company’s values into its management practices but does so without a critical analysis backed by contemporary academic literature. The discussion remains descriptive rather than analytical. Furthermore, the article includes minimal behavioral evidence that showcases Emotional Intelligence in leadership. Empirical evidence supporting these behaviors is not provided. Additionally, the analysis does not draw a clear connection between Emotional Intelligence and both employee engagement levels and organizational performance. The correlation is mentioned but not explored in detail. While the submission mentions aspects of Emotional Intelligence, such as self-awareness and empathy, applicable theoretical frameworks or models are not used to analyze EQ within the context of Qatar Airways. Their applications are not clearly defined. 
Lastly, the submission lacks proposed actionable steps and academically supported suggestions to enhance the impact of Emotional Intelligence on organizational outcomes.","Good evaluation showing general effects of Emotional Intelligence on employee engagement and organizational performance, with some examples and theoretical backdrop."
4,"Positive Aspects: - Identifies competencies relevant to Qatar Airways but does not delineate them as strengths or areas for improvement clearly. Areas for Development: - Lacks a critical examination of how these competencies are vital for maintaining Qatar Airways' industry leadership. - Does not provide clear theories or models to enhance these competencies, limiting the depth of strategic recommendations. Next Steps: - Conduct a more detailed analysis of management competencies, identifying specific strengths and weaknesses within Qatar Airways. - Recommend actionable steps and theoretical models to enhance these competencies, linking them directly to Qatar Airways' strategic objectives.",The submission identifies some competencies but does not clearly delineate them as strengths or areas for improvement within Qatar Airways. The critical examination of how these competencies are vital for maintaining Qatar Airways' industry leadership is lacking. The analysis is not deeply explored. The submission does not provide clear theories or models to enhance these competencies. The relevance of any recommendations to Qatar Airways is not clearly explained. There are no case studies or empirical evidence included that show successful or unsuccessful strategy implementations concerning competency development.,"Competent assessment of management competencies with some actionable suggestions for development, although deeper integration with theory could enhance the analysis."
