In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Configuration parameters:
##### Configure which columns to include or exclude, set the renaming rules, and specify formatting options here.

In [2]:
# Configuration parameters - modify these for different datasets.
# All column names below are the ORIGINAL (pre-rename) names: clean_data_pipeline
# fills missing values, converts datetimes and removes columns before it renames.
CONFIG = {
    # Overwritten with the file_path argument each time clean_data_pipeline is called.
    'file_path': "data/2023-json-ANONYM.xlsx",
    'engine': 'openpyxl',
    # 0 means we will remove all students with extra time.
    # NOTE(review): the call to remove_extra_time_students inside
    # clean_data_pipeline is commented out, so this value is currently unused.
    'extra_time_mins_to_remove': 0,
    'columns_to_remove': [
        'Column1.result.ext_inspera_attendance',
        'Column1.result.lineItem.sourcedId',
        'Column1.result.lineItem.type',
        'Column1.result.student.type',
        'Column1.result.ext_inspera_autoScore',
        'Column1.result.sourcedId',
        'Column1.result.ext_inspera_userAssessmentSetupId',
        'Column1.result.ext_inspera_userAssessmentId',
        'Column1.result.student.sourcedId',
        'Column1.result.ext_inspera_questions.ext_inspera_questionId',
        'Column1.result.ext_inspera_questions.ext_inspera_questionContentItemId',
        'Column1.result.ext_inspera_questions.ext_inspera_questionWeight',
        'Column1.result.ext_inspera_endTime',
        'Column1.result.ext_inspera_questions.ext_inspera_candidateResponses.ext_insper.2',
    ],
    # Columns parsed with pd.to_datetime. Note that ext_inspera_endTime is also
    # listed in columns_to_remove, so it is converted first and then dropped.
    'datetime_columns': [
        'Column1.result.ext_inspera_startTime',
        'Column1.result.ext_inspera_endTime',
        'Column1.result.dateLastModified'
    ],
    # old name -> new name. rename_columns only applies keys that exist in the
    # loaded frame and prints a warning for the rest.
    # Several keys share a target ('candidate_id', 'question_duration_seconds',
    # 'auto_score_per_question'); if more than one of them is present in the same
    # file the result will contain duplicate column names.
    'column_rename_map': {
        'Column1.result.ext_inspera_startTime': 'exam_start_time',
        'Column1.result.dateLastModified': 'last_modified_time',
        # NOTE(review): the doubled 'Column1Column1' prefix looks like a possible
        # typo for 'Column1.result.ext_inspera_candidateId' - confirm against the workbook.
        'Column1Column1.result.ext_inspera_candidateId': 'candidate_id',
        'Column1.result.ext_inspera_questions.ext_inspera_maxQuestionScore': 'max_question_score',
        'Column1.result.ext_inspera_questions.ext_inspera_questionNumber': 'question_number',
        'Column1.result.ext_inspera_questions.ext_inspera_questionTitle': 'question_title',
        'Column1.result.ext_inspera_questions.ext_inspera_durationSeconds': 'question_duration_seconds',
        'Column1.result.ext_inspera_questions.ext_inspera_autoScore': 'auto_score_per_question',
        'Column1.result.ext_inspera_questions.ext_inspera_candidateResponses.ext_inspera_': 'candidate_response_code',
        'Column1.result.ext_inspera_questions.ext_inspera_candidateResponses.ext_insper.1': 'candidate_response_text',
        'Column1.result.ext_inspera_extraTimeMins' : 'extra_time_mins',
        'Column1.result.ext_inspera_incidentTimeMins' : 'incident_time_mins',
        'Column1.result.score': 'total_score',
        'Column1' : 'candidate_id',
        # Presumably headers from an alternative (Norwegian-language) export - verify.
        "Oppgave Tid" : 'question_duration_seconds',
        "Oppgave Poeng" : 'auto_score_per_question'
    },
    # Missing values in 'target' are filled from 'source' on the same row
    # (see fill_missing_response_values).
    'fill_missing_columns': {
        'target': 'Column1.result.ext_inspera_questions.ext_inspera_candidateResponses.ext_insper.1',
        'source': 'Column1.result.ext_inspera_questions.ext_inspera_candidateResponses.ext_inspera_'
    }
}

def load_data(file_path, engine='openpyxl'):
    """
    Read an Excel workbook into a DataFrame and report the outcome on stdout.

    Args:
        file_path (str): Location of the Excel file to read
        engine (str): Engine name forwarded to pandas.read_excel

    Returns:
        pandas.DataFrame or None: The loaded frame, or None when reading
        raised an exception (the error is printed, not re-raised)
    """
    loaded = None
    try:
        loaded = pd.read_excel(file_path, engine=engine)
        report = (
            f"Data loaded successfully from {file_path}",
            f"Shape: {loaded.shape}",
        )
        for line in report:
            print(line)
    except Exception as err:
        # Best effort: callers test the return value for None instead of catching.
        print(f"Error loading data: {err}")
        loaded = None
    return loaded

### Usage Instructions:

##### Steps to use this notebook with new datasets:
1. Modify the CONFIG dictionary at the top
2. Update file_path to point to your new dataset
3. Adjust columns_to_remove, column_rename_map, and other parameters
4. Run the clean_data_pipeline function
5. Optionally save the cleaned data using save_cleaned_data function

In [3]:
# Example: how to use this notebook for other datasets.
# Either modify the CONFIG dictionary at the top of the notebook, or build a
# separate dictionary with the same keys, as shown here.

# Example configuration for a different dataset (all column names are placeholders):
EXAMPLE_CONFIG = {
    'file_path': "data/2023-json-ANONYM.xlsx",  # overwritten with the file_path argument when clean_data_pipeline(EXAMPLE_CONFIG, file_path) is called
    'engine': 'openpyxl',
    'extra_time_mins_to_remove': 45,  # Adjust threshold as needed
    'columns_to_remove': [
        # Add columns you want to remove for your specific dataset
        'unwanted_column1',
        'unwanted_column2',
    ],
    'datetime_columns': [
        # Add datetime columns for your dataset
        'start_time_column',
        'end_time_column',
    ],
    'column_rename_map': {
        # Map old column names to new names
        'old_column_name': 'new_column_name',
    },
    # Missing values in 'target' are filled from 'source'; the pipeline skips
    # this step when the 'fill_missing_columns' key is absent.
    'fill_missing_columns': {
        'target': 'column_to_fill',
        'source': 'column_to_use_as_source'
    }
}

# To use with a different dataset, uncomment the line below and replace
# file_name with the path to your Excel file:
# df_new_dataset = clean_data_pipeline(EXAMPLE_CONFIG, file_name)

### Functions
##### The functions below are executed automatically when the main function is called.

 

In [4]:
def remove_extra_time_students(df, extra_time_column='Column1.result.ext_inspera_extraTimeMins',
                               extra_time_mins_to_remove=0):
    """
    Remove rows whose extra time exceeds a threshold.

    Rows are kept when their extra time is less than or equal to
    ``extra_time_mins_to_remove``. Rows where the extra-time value is missing
    (NaN) are also kept: a bare ``<=`` comparison evaluates to False for NaN,
    which would silently discard those rows and report them as having extra time.

    Args:
        df (pandas.DataFrame): Input DataFrame
        extra_time_column (str): Name of the extra time column
        extra_time_mins_to_remove (int): Largest extra time (in minutes) that
            is still kept; 0 removes every row with any extra time

    Returns:
        pandas.DataFrame: Filtered copy of the input. The input itself is
        returned unchanged when ``extra_time_column`` does not exist.
    """
    if extra_time_column not in df.columns:
        print(f"Column '{extra_time_column}' not found in DataFrame")
        return df

    extra_time = df[extra_time_column]
    # NOTE(review): a missing value is treated as "no extra time" - confirm this
    # matches how the export encodes students without extra time.
    keep_mask = (extra_time <= extra_time_mins_to_remove) | extra_time.isna()

    # .copy() so later column assignments on the result do not hit a view of df.
    df_filtered = df.loc[keep_mask].copy()
    removed_count = len(df) - len(df_filtered)

    print(f"Removed {removed_count} lines with extra time > {extra_time_mins_to_remove} mins")
    # The frame may hold many rows per student, so this is a row count.
    print(f"Remaining rows: {len(df_filtered)}")

    return df_filtered

def remove_columns(df, columns_to_remove):
    """
    Drop the requested columns, ignoring (but reporting) names that are absent.

    Args:
        df (pandas.DataFrame): Input DataFrame
        columns_to_remove (list): Column names that should be dropped

    Returns:
        pandas.DataFrame: New DataFrame without the requested columns; a plain
        copy of the input when none of the names exist
    """
    # Split the request into names that exist and names that do not,
    # keeping the caller's ordering in both lists.
    present, absent = [], []
    for name in columns_to_remove:
        bucket = present if name in df.columns else absent
        bucket.append(name)

    if absent:
        print(f"Warning: These columns were not found: {absent}")

    if not present:
        untouched = df.copy()
        print("No columns to remove")
        return untouched

    trimmed = df.drop(columns=present)
    print(f"Removed {len(present)} columns")
    return trimmed

In [5]:
def fill_missing_response_values(df, target_column, source_column):
    """
    Replace missing entries of one column with the same-row value of another.

    Args:
        df (pandas.DataFrame): Input DataFrame
        target_column (str): Column whose missing values get filled
        source_column (str): Column supplying the replacement values

    Returns:
        pandas.DataFrame: Copy of the input with the target column filled. The
        input object itself is returned when either column does not exist.
    """
    # Target is checked before source, and the first missing one ends the call.
    required = (
        (target_column, f"Target column '{target_column}' not found"),
        (source_column, f"Source column '{source_column}' not found"),
    )
    for column, complaint in required:
        if column not in df.columns:
            print(complaint)
            return df

    result = df.copy()
    gaps = result[target_column].isna().sum()

    if gaps == 0:
        print(f"No missing values found in '{target_column}'")
        return result

    result[target_column] = result[target_column].fillna(result[source_column])
    print(f"Filled {gaps} missing values in '{target_column}'")
    return result

def convert_datetime_columns(df, datetime_columns):
    """
    Parse the listed columns as datetimes and strip any timezone information.

    Columns that are absent are reported and skipped. A column whose conversion
    raises is reported and left in whatever state it had when the error occurred.

    Args:
        df (pandas.DataFrame): Input DataFrame
        datetime_columns (list): Names of the columns to parse

    Returns:
        pandas.DataFrame: Copy of the input with the listed columns converted
    """
    result = df.copy()

    for name in datetime_columns:
        if name not in result.columns:
            print(f"Column '{name}' not found for datetime conversion")
            continue

        try:
            # Store the parsed values first, then inspect the stored column.
            result[name] = pd.to_datetime(result[name])
            parsed = result[name]
            # tz_localize(None) drops the offset and keeps the local wall time.
            if parsed.dt.tz is not None:
                result[name] = parsed.dt.tz_localize(None)
            print(f"Converted '{name}' to datetime (timezone removed)")
        except Exception as err:
            print(f"Error converting '{name}' to datetime: {err}")

    return result

def rename_columns(df, column_rename_map):
    """
    Apply a rename mapping, using only the entries whose old name exists.

    Args:
        df (pandas.DataFrame): Input DataFrame
        column_rename_map (dict): Mapping of current column name -> new name

    Returns:
        pandas.DataFrame: New DataFrame with the applicable renames applied; a
        plain copy of the input when no key matches a column
    """
    applicable = {}
    unmatched = []
    for old_name, new_name in column_rename_map.items():
        if old_name in df.columns:
            applicable[old_name] = new_name
        else:
            unmatched.append(old_name)

    if unmatched:
        print(f"Warning: These columns were not found for renaming: {unmatched}")

    if not applicable:
        untouched = df.copy()
        print("No columns to rename")
        return untouched

    renamed = df.rename(columns=applicable)
    print(f"Renamed {len(applicable)} columns")
    return renamed


### Main function 
##### - Executes the pre-defined cleaning process.
##### - Takes two arguments: `config` and `file_path`.

In [6]:
def clean_data_pipeline(config, file_path=None):
    """
    Complete data cleaning pipeline.

    Steps, in order: load the workbook, fill missing values (if configured),
    convert datetime columns, remove columns, rename columns. Because renaming
    happens last, every column name in ``config`` must be the original name.

    Args:
        config (dict): Configuration dictionary. Reads 'file_path', 'engine',
            'fill_missing_columns', 'datetime_columns', 'columns_to_remove'
            and 'column_rename_map'. Every key except 'file_path' is optional.
        file_path (str, optional): Path to the Excel file. When given it is
            stored into ``config['file_path']`` (the caller's dictionary is
            updated); when omitted, ``config['file_path']`` is used as is.

    Returns:
        pandas.DataFrame: Cleaned DataFrame, or None if the file could not be loaded
    """
    print("Starting data cleaning pipeline...")
    print("=" * 50)

    # An explicit path overrides the configured one and is recorded in config.
    if file_path is not None:
        config['file_path'] = file_path

    # Load data
    df = load_data(config['file_path'], config.get('engine', 'openpyxl'))
    if df is None:
        return None

    # NOTE: the extra-time filter is currently disabled, so
    # config['extra_time_mins_to_remove'] has no effect on the result.
    # Remove students with extra time
    #df = remove_extra_time_students(
       # df,
        # extra_time_mins_to_remove=config['extra_time_mins_to_remove']
    #)

    # Fill missing values if configured
    fill_spec = config.get('fill_missing_columns')
    if fill_spec:
        df = fill_missing_response_values(
            df,
            fill_spec['target'],
            fill_spec['source']
        )

    # Convert datetime columns
    df = convert_datetime_columns(df, config.get('datetime_columns', []))

    # Remove specified columns
    df = remove_columns(df, config.get('columns_to_remove', []))

    # Rename columns
    df = rename_columns(df, config.get('column_rename_map', {}))

    print("=" * 50)
    print(f"Data cleaning completed. Final shape: {df.shape}")

    return df

# Load and clean data using the configuration.
# The path passed here replaces CONFIG['file_path'] for this run.
# NOTE(review): the identical call is repeated in the "Executing Main Function"
# cell further down, so on Run All the workbook is loaded and cleaned twice.
df_cleaned = clean_data_pipeline(CONFIG, "data/2024-json ANONYM.xlsx")

Starting data cleaning pipeline...
Data loaded successfully from data/2024-json ANONYM.xlsx
Shape: (65880, 27)
Filled 36598 missing values in 'Column1.result.ext_inspera_questions.ext_inspera_candidateResponses.ext_insper.1'
Converted 'Column1.result.ext_inspera_startTime' to datetime (timezone removed)
Converted 'Column1.result.ext_inspera_endTime' to datetime (timezone removed)
Converted 'Column1.result.dateLastModified' to datetime (timezone removed)
Removed 14 columns
Renamed 13 columns
Data cleaning completed. Final shape: (65880, 13)


### Export Function
#### Export the cleaned file to the desired format.

In [7]:
def save_cleaned_data(df, output_path, file_format='xlsx'):
    """
    Write a DataFrame to disk in one of the supported formats.

    Any failure, including an unsupported ``file_format``, is printed rather
    than raised, so the function always returns None.

    Args:
        df (pandas.DataFrame): Cleaned DataFrame
        output_path (str): Destination file path
        file_format (str): One of 'xlsx', 'csv' or 'json'
    """
    # One zero-argument writer per format; the index is never written for
    # xlsx/csv, and json is emitted as an indented list of records.
    writers = {
        'xlsx': lambda: df.to_excel(output_path, index=False),
        'csv': lambda: df.to_csv(output_path, index=False),
        'json': lambda: df.to_json(output_path, orient='records', indent=2),
    }

    try:
        if file_format not in writers:
            raise ValueError(f"Unsupported format: {file_format}")
        writers[file_format]()
        print(f"Data saved successfully to {output_path}")
    except Exception as err:
        print(f"Error saving data: {err}")

# Optional: Save the cleaned data
#save_cleaned_data(df_cleaned, "data/cleaned_data.xlsx", "xlsx")

### Executing Main Function

In [8]:
# Load and clean data using the configuration.
# The second argument is the workbook to read; clean_data_pipeline stores it in
# CONFIG['file_path'], replacing the value written in the configuration cell.
df_cleaned = clean_data_pipeline(CONFIG, "data/2024-json ANONYM.xlsx")

Starting data cleaning pipeline...
Data loaded successfully from data/2024-json ANONYM.xlsx
Shape: (65880, 27)
Filled 36598 missing values in 'Column1.result.ext_inspera_questions.ext_inspera_candidateResponses.ext_insper.1'
Converted 'Column1.result.ext_inspera_startTime' to datetime (timezone removed)
Converted 'Column1.result.ext_inspera_endTime' to datetime (timezone removed)
Converted 'Column1.result.dateLastModified' to datetime (timezone removed)
Removed 14 columns
Renamed 13 columns
Data cleaning completed. Final shape: (65880, 13)


### Export Cleaned File

In [9]:
# Write the cleaned frame to an Excel workbook (save_cleaned_data omits the index).
save_cleaned_data(df_cleaned, "data/cleaned_data.xlsx", "xlsx")

Data saved successfully to data/cleaned_data.xlsx


## This file contains preparation of exam data 2024

In [10]:
# Display basic information about the cleaned dataset
print("Dataset Information:")
print(f"Shape: {df_cleaned.shape}")
print(f"Columns: {len(df_cleaned.columns)}")
# df.info() prints a per-column summary (non-null counts and dtypes), not rows,
# so the heading is labelled accordingly.
print("\nColumn summary:")
df_cleaned.info()

Dataset Information:
Shape: (65880, 13)
Columns: 13

First few rows:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65880 entries, 0 to 65879
Data columns (total 13 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   last_modified_time         65880 non-null  datetime64[ns]
 1   exam_start_time            65880 non-null  datetime64[ns]
 2   extra_time_mins            65880 non-null  int64         
 3   incident_time_mins         65880 non-null  int64         
 4   candidate_id               65880 non-null  int64         
 5   max_question_score         65880 non-null  float64       
 6   question_number            65880 non-null  float64       
 7   question_title             65880 non-null  object        
 8   question_duration_seconds  65880 non-null  int64         
 9   auto_score_per_question    65880 non-null  float64       
 10  candidate_response_code    65561 non-null  object        
 11

In [11]:
# Display column information
print("Column Names:")
for i, col in enumerate(df_cleaned.columns):
    print(f"{i+1}. {col}")
    
print(f"\nTotal columns: {len(df_cleaned.columns)}")
print(f"Data types:\n{df_cleaned.dtypes}")
print(f"\nMemory usage: {df_cleaned.memory_usage().sum() / 1024**2:.2f} MB")

Column Names:
1. last_modified_time
2. exam_start_time
3. extra_time_mins
4. incident_time_mins
5. candidate_id
6. max_question_score
7. question_number
8. question_title
9. question_duration_seconds
10. auto_score_per_question
11. candidate_response_code
12. candidate_response_text
13. total_score

Total columns: 13
Data types:
last_modified_time           datetime64[ns]
exam_start_time              datetime64[ns]
extra_time_mins                       int64
incident_time_mins                    int64
candidate_id                          int64
max_question_score                  float64
question_number                     float64
question_title                       object
question_duration_seconds             int64
auto_score_per_question             float64
candidate_response_code              object
candidate_response_text              object
total_score                         float64
dtype: object

Memory usage: 6.53 MB


In [12]:
# Data quality check
print("Data Quality Summary:")
print("=" * 30)
print(f"Total rows: {len(df_cleaned)}")
print(f"Total columns: {len(df_cleaned.columns)}")
print(f"Missing values per column:")
missing_values = df_cleaned.isnull().sum()
for col, missing in missing_values.items():
    if missing > 0:
        print(f"  {col}: {missing} ({missing/len(df_cleaned)*100:.1f}%)")

if missing_values.sum() == 0:
    print("  No missing values found!")

# Show sample of cleaned data
print("\nSample of cleaned data:")
df_cleaned.head(10)

Data Quality Summary:
Total rows: 65880
Total columns: 13
Missing values per column:
  candidate_response_code: 319 (0.5%)
  candidate_response_text: 319 (0.5%)

Sample of cleaned data:


Unnamed: 0,last_modified_time,exam_start_time,extra_time_mins,incident_time_mins,candidate_id,max_question_score,question_number,question_title,question_duration_seconds,auto_score_per_question,candidate_response_code,candidate_response_text,total_score
0,2024-12-11 15:57:52,2024-12-11 14:00:02,0,0,17107,2.4,1.1,Tallsystemer,85,2.4,simpleChoice_1368096164456,Heksadesimale tall,71.61
1,2024-12-11 15:57:52,2024-12-11 14:00:02,0,0,17107,2.4,1.1,Tallsystemer,85,2.4,simpleChoice_IA172399520449584402464-f255-46e6...,Oktale tall,71.61
2,2024-12-11 15:57:52,2024-12-11 14:00:02,0,0,17107,2.4,1.1,Tallsystemer,85,2.4,simpleChoice_IA17239952044953d28e157-ac4c-49db...,Titallsystemet,71.61
3,2024-12-11 15:57:52,2024-12-11 14:00:02,0,0,17107,2.4,1.1,Tallsystemer,85,2.4,simpleChoice_IA1723995204495960f601e-73d5-48f8...,Heksadesimale tall,71.61
4,2024-12-11 15:57:52,2024-12-11 14:00:02,0,0,17107,2.4,1.1,Tallsystemer,85,2.4,simpleChoice_IA17239952636882cb13f25-bd73-4352...,Titallsystemet,71.61
5,2024-12-11 15:57:52,2024-12-11 14:00:02,0,0,17107,2.4,1.1,Tallsystemer,85,2.4,simpleChoice_IA17239952636881f913a66-8ab2-4533...,Oktale tall,71.61
6,2024-12-11 15:57:52,2024-12-11 14:00:02,0,0,17107,2.4,1.1,Tallsystemer,85,2.4,simpleChoice_IA1723995263688fff30f23-beb3-435e...,Binære tall,71.61
7,2024-12-11 15:57:52,2024-12-11 14:00:02,0,0,17107,2.4,1.1,Tallsystemer,85,2.4,simpleChoice_IA1723995263688cc10ddd6-1786-49a3...,Heksadesimale tall,71.61
8,2024-12-11 15:57:52,2024-12-11 14:00:02,0,0,17107,3.0,1.2,Tallsortering,166,3.0,gapImg_IA172400165100658ec50ba-bb49-44cd-99f0-...,gapImg_IA172400165100658ec50ba-bb49-44cd-99f0-...,71.61
9,2024-12-11 15:57:52,2024-12-11 14:00:02,0,0,17107,3.0,1.2,Tallsortering,166,3.0,gapImg_IA1724001578419272ee4ee-4b37-4e3f-bce7-...,gapImg_IA1724001578419272ee4ee-4b37-4e3f-bce7-...,71.61
