In [None]:
import pandas as pd
# import numpy as np
# import matplotlib.pyplot as plt
# import seaborn as sns

## This jupyter notebook cleans raw data and creates new dataframes that represent one observational unit each

### Usage Instruction:

##### Steps to use this notebook with new datasets:
1. Modify the CONFIG dictionary at the top
2. Update file_path to point to your new dataset
3. Adjust columns_to_remove, column_rename_map, and other parameters
4. Run the clean_data_pipeline function
5. Optionally save the cleaned data using save_cleaned_data function

### Configutation parameters:
##### Configure which columns to include or exclude, set renaming rules, and specify formatting options here.

In [100]:
# Configuration parameters - modify these for different datasets
CONFIG = {
    'file_path': "data/2024-json-ANONYM.xlsx",
    'engine': 'openpyxl',
    'columns_to_remove': [
        'Column1.result.ext_inspera_attendance',
        'Column1.result.lineItem.sourcedId',
        'Column1.result.lineItem.type',
        'Column1.result.student.type',
        'Column1.result.ext_inspera_autoScore',
        'Column1.result.sourcedId',
        'Column1.result.ext_inspera_userAssessmentSetupId',
        'Column1.result.ext_inspera_userAssessmentId',
        'Column1.result.student.sourcedId',
        'Column1.result.ext_inspera_questions.ext_inspera_questionId',
        'Column1.result.ext_inspera_questions.ext_inspera_questionContentItemId',
        'Column1.result.ext_inspera_questions.ext_inspera_questionWeight',
        'Column1.result.ext_inspera_endTime',
        'Column1.result.ext_inspera_questions.ext_inspera_candidateResponses.ext_insper.2',
    ],
    'datetime_columns': [
        'Column1.result.ext_inspera_startTime',
        'Column1.result.ext_inspera_endTime',
        'Column1.result.dateLastModified'
    ],
    'string_columns': [
        'Column1.result.ext_inspera_questions.ext_inspera_candidateResponses.ext_inspera_',      
        'Column1.result.ext_inspera_questions.ext_inspera_candidateResponses.ext_insper.1',      
        'Column1.result.ext_inspera_questions.ext_inspera_questionTitle'                
    ],
    'column_rename_map': {
        'Column1.result.ext_inspera_startTime': 'exam_start_time',
        'Column1.result.dateLastModified': 'exam_last_modified_time',
        'Column1Column1.result.ext_inspera_candidateId': 'candidate_id',
        'Column1.result.ext_inspera_questions.ext_inspera_maxQuestionScore': 'max_question_score',
        'Column1.result.ext_inspera_questions.ext_inspera_questionNumber': 'question_number',
        'Column1.result.ext_inspera_questions.ext_inspera_questionTitle': 'question_title',
        'Column1.result.ext_inspera_questions.ext_inspera_durationSeconds': 'question_duration (sec)',
        'Column1.result.ext_inspera_questions.ext_inspera_autoScore': 'auto_score_per_question',
        'Column1.result.ext_inspera_questions.ext_inspera_candidateResponses.ext_inspera_': 'candidate_response_code',
        'Column1.result.ext_inspera_questions.ext_inspera_candidateResponses.ext_insper.1': 'candidate_response_text',
        'Column1.result.ext_inspera_extraTimeMins' : 'extra_time_mins',
        'Column1.result.ext_inspera_incidentTimeMins' : 'incident_time_mins',
        'Column1.result.score': 'total_score',
    },
    'fill_missing_columns': {
        'target': 'Column1.result.ext_inspera_questions.ext_inspera_candidateResponses.ext_insper.1',
        'source': 'Column1.result.ext_inspera_questions.ext_inspera_candidateResponses.ext_inspera_'
    }
}

## Functions
##### below functions executed automatically via calling the main function ***clean_data_pipeline(CONFIG, "file_path")***

### Load data

In [101]:
def load_data(file_path, engine='openpyxl'):
    """
    Load data from Excel file
    
    Args:
        file_path (str): Path to the Excel file
        engine (str): Excel engine to use
    
    Returns:
        pandas.DataFrame: Loaded DataFrame
    """
    try:
        df = pd.read_excel(file_path, engine=engine)
        print(f"Data loaded successfully from {file_path}")
        print(f"Shape: {df.shape}")
        return df
    except Exception as e:
        print(f"Error loading data: {e}")
        return None

### Remove columns

In [102]:
def remove_columns(df, columns_to_remove):
    """
    Remove specified columns from DataFrame
    
    Args:
        df (pandas.DataFrame): Input DataFrame
        columns_to_remove (list): List of column names to remove
    
    Returns:
        pandas.DataFrame: DataFrame with columns removed
    """
    existing_columns = [col for col in columns_to_remove if col in df.columns]
    missing_columns = [col for col in columns_to_remove if col not in df.columns]
    
    if missing_columns:
        print(f"Warning: These columns were not found: {missing_columns}")
    
    if existing_columns:
        df_cleaned = df.drop(columns=existing_columns)
        print(f"Removed {len(existing_columns)} columns")
    else:
        df_cleaned = df.copy()
        print("No columns to remove")
    
    return df_cleaned

### Filling NaN values (students' responses)

In [103]:
def fill_missing_response_values(df, target_column, source_column):
    """
    Fill missing values in target column with values from source column
    
    Args:
        df (pandas.DataFrame): Input DataFrame
        target_column (str): Column to fill missing values
        source_column (str): Column to use as source for filling
    
    Returns:
        pandas.DataFrame: DataFrame with filled values
    """
    if target_column not in df.columns:
        print(f"Target column '{target_column}' not found")
        return df
    
    if source_column not in df.columns:
        print(f"Source column '{source_column}' not found")
        return df
    
    df_filled = df.copy()
    missing_count = df_filled[target_column].isna().sum()
    
    if missing_count > 0:
        df_filled[target_column] = df_filled[target_column].fillna(df_filled[source_column])
        print(f"Filled {missing_count} missing values in '{target_column}'")
    else:
        print(f"No missing values found in '{target_column}'")
    
    return df_filled


# After filling missing values from source to target column, there are still NaN values in the DataFrame.
# We will fill them with a default values "No Response"
# And count how many NaN values were filled
def fill_NaN_values(df, fill_value="No Response"):
    """
    Fill NaN values in DataFrame with a specified value
    
    Args:
        df (pandas.DataFrame): Input DataFrame
        fill_value (str): Value to fill NaN entries with
    
    Returns:
        pandas.DataFrame: DataFrame with NaN values filled
    """
    df_filled = df.fillna(fill_value)
    filled_count = df_filled.isna().sum().sum()
    
    if filled_count > 0:
        print(f"Filled {filled_count} NaN values with '{fill_value}'")
    else:
        print("No NaN values found to fill")
    
    return df_filled


### Rename columns

In [104]:
def rename_columns(df, column_rename_map):
    """
    Rename columns according to the provided mapping
    
    Args:
        df (pandas.DataFrame): Input DataFrame
        column_rename_map (dict): Dictionary mapping old names to new names
    
    Returns:
        pandas.DataFrame: DataFrame with renamed columns
    """
    existing_columns = {old: new for old, new in column_rename_map.items() if old in df.columns}
    missing_columns = {old: new for old, new in column_rename_map.items() if old not in df.columns}
    
    if missing_columns:
        print(f"Warning: These columns were not found for renaming: {list(missing_columns.keys())}")
    
    if existing_columns:
        df_renamed = df.rename(columns=existing_columns)
        print(f"Renamed {len(existing_columns)} columns")
    else:
        df_renamed = df.copy()
        print("No columns to rename")
    
    return df_renamed

### Convert columns with object type to datetime type

In [105]:
def convert_datetime_columns(df, datetime_columns):
    """
    Convert specified columns to datetime format and remove timezone info
    
    Args:
        df (pandas.DataFrame): Input DataFrame
        datetime_columns (list): List of column names to convert to datetime
    
    Returns:
        pandas.DataFrame: DataFrame with datetime columns converted
    """
    df_converted = df.copy()
    
    for col in datetime_columns:
        if col in df_converted.columns:
            try:
                # Convert to datetime
                df_converted[col] = pd.to_datetime(df_converted[col])
                # Remove timezone info if present
                if df_converted[col].dt.tz is not None:
                    df_converted[col] = df_converted[col].dt.tz_localize(None)
                print(f"Converted '{col}' to datetime (timezone removed)")
            except Exception as e:
                print(f"Error converting '{col}' to datetime: {e}")
        else:
            print(f"Column '{col}' not found for datetime conversion")
    
    return df_converted

### Convert columns with object type to string type

In [106]:
# def convert_string_columns(df, string_columns):
#     """
#     Convert specified columns to string type
    
#     Args:
#         df (pandas.DataFrame): Input DataFrame
#         string_columns (list): List of column names to convert to string
    
#     Returns:
#         pandas.DataFrame: DataFrame with string columns converted
#     """
#     df_converted = df.copy()
    
#     for col in string_columns:
#         if col in df_converted.columns:
#             try:
#                 df_converted[col] = df_converted[col].astype(str)
#                 print(f"Converted '{col}' to string")
#             except Exception as e:
#                 print(f"Error converting '{col}' to string: {e}")
#         else:
#             print(f"Column '{col}' not found for string conversion")
    
#     return df_converted

### Main function 
##### - Execute the pre-defined cleaning process. 
##### - Take two arguments config and file_path.

In [107]:
def clean_data_pipeline(config, file_path):
    """
    Complete data cleaning pipeline
    
    Args:
        config (dict): Configuration dictionary with all parameters
    
    Returns:
        pandas.DataFrame: Cleaned DataFrame
    """
    print("Starting data cleaning pipeline...")
    print("=" * 50)

    # Read Data
    config['file_path'] = file_path
    # Load data
    df = load_data(config['file_path'], config['engine'])
    if df is None:
        return None
    
    
    # Fill missing values if configured
    if 'fill_missing_columns' in config:
        df = fill_missing_response_values(
            df, 
            config['fill_missing_columns']['target'],
            config['fill_missing_columns']['source']
        )

    # Fill NaN values with a default value
    df = fill_NaN_values(df, fill_value="No Response")
    
    # Convert datetime columns (before renaming, using original column names)
    df = convert_datetime_columns(df, config['datetime_columns'])
    
    # Remove specified columns
    df = remove_columns(df, config['columns_to_remove'])
    
    # Rename columns
    df = rename_columns(df, config['column_rename_map'])

    
    print("=" * 50)
    print(f"Data cleaning completed. Final shape: {df.shape}")
    
    return df

# Load and clean data using the configuration
# df_cleaned = clean_data_pipeline(CONFIG, "data/2024-json-ANONYM.xlsx")

### Export Function
#### export the cleaned file to desired formats

In [108]:
def save_cleaned_data(df, output_path, file_format='xlsx'):
    """
    Save cleaned data to file
    
    Args:
        df (pandas.DataFrame): Cleaned DataFrame
        output_path (str): Path to save the file
        file_format (str): Format to save ('xlsx', 'csv', 'json')
    """
    try:
        if file_format == 'xlsx':
            df.to_excel(output_path, index=False)
        elif file_format == 'csv':
            df.to_csv(output_path, index=False)
        elif file_format == 'json':
            df.to_json(output_path, orient='records', indent=2)
        else:
            raise ValueError(f"Unsupported format: {file_format}")
        
        print(f"Data saved successfully to {output_path}")
    except Exception as e:
        print(f"Error saving data: {e}")

### Executing Main Function

In [109]:
# Load and clean data using the configuration
df_cleaned = clean_data_pipeline(CONFIG, "data/2024-json-ANONYM.xlsx")

Starting data cleaning pipeline...
Data loaded successfully from data/2024-json-ANONYM.xlsx
Shape: (65880, 27)
Filled 36598 missing values in 'Column1.result.ext_inspera_questions.ext_inspera_candidateResponses.ext_insper.1'
No NaN values found to fill
Converted 'Column1.result.ext_inspera_startTime' to datetime (timezone removed)
Converted 'Column1.result.ext_inspera_endTime' to datetime (timezone removed)
Converted 'Column1.result.dateLastModified' to datetime (timezone removed)
Removed 14 columns
Renamed 13 columns
Data cleaning completed. Final shape: (65880, 13)


### Export Cleaned File

In [110]:
# save_cleaned_data(df_cleaned, "data/cleaned_data2024.xlsx", "xlsx")

## This file contains preparation of exam data 2024

In [111]:
# Display basic information about the cleaned dataset
print("Dataset Information:")
print(f"Shape: {df_cleaned.shape}")
print(f"Columns: {len(df_cleaned.columns)}")
print("\nFirst few rows:")
# df_cleaned.head()
df_cleaned.info()

Dataset Information:
Shape: (65880, 13)
Columns: 13

First few rows:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65880 entries, 0 to 65879
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   exam_last_modified_time  65880 non-null  datetime64[ns]
 1   exam_start_time          65880 non-null  datetime64[ns]
 2   extra_time_mins          65880 non-null  int64         
 3   incident_time_mins       65880 non-null  int64         
 4   candidate_id             65880 non-null  int64         
 5   max_question_score       65880 non-null  float64       
 6   question_number          65880 non-null  float64       
 7   question_title           65880 non-null  object        
 8   question_duration (sec)  65880 non-null  int64         
 9   auto_score_per_question  65880 non-null  float64       
 10  candidate_response_code  65880 non-null  object        
 11  candidate_response_text 

In [112]:
# Display column information
print("Column Names:")
for i, col in enumerate(df_cleaned.columns):
    print(f"{i+1}. {col}")
    
print(f"\nTotal columns: {len(df_cleaned.columns)}")
print(f"Data types:\n{df_cleaned.dtypes}")
print(f"\nMemory usage: {df_cleaned.memory_usage().sum() / 1024**2:.2f} MB")

Column Names:
1. exam_last_modified_time
2. exam_start_time
3. extra_time_mins
4. incident_time_mins
5. candidate_id
6. max_question_score
7. question_number
8. question_title
9. question_duration (sec)
10. auto_score_per_question
11. candidate_response_code
12. candidate_response_text
13. total_score

Total columns: 13
Data types:
exam_last_modified_time    datetime64[ns]
exam_start_time            datetime64[ns]
extra_time_mins                     int64
incident_time_mins                  int64
candidate_id                        int64
max_question_score                float64
question_number                   float64
question_title                     object
question_duration (sec)             int64
auto_score_per_question           float64
candidate_response_code            object
candidate_response_text            object
total_score                       float64
dtype: object

Memory usage: 6.53 MB


### Cleaned DataFrame: rows dropped, types converted, columns renamed, NaNs filled

In [113]:
# Data quality check
print("Data Quality Summary:")
print("=" * 30)
print(f"Total rows: {len(df_cleaned)}")
print(f"Total columns: {len(df_cleaned.columns)}")
print(f"Missing values per column:")
missing_values = df_cleaned.isnull().sum()
for col, missing in missing_values.items():
    if missing > 0:
        print(f"  {col}: {missing} ({missing/len(df_cleaned)*100:.1f}%)")

if missing_values.sum() == 0:
    print("  No missing values found!")

# Show sample of cleaned data
print("\nSample of cleaned data:")
df_cleaned.head(10)

Data Quality Summary:
Total rows: 65880
Total columns: 13
Missing values per column:
  No missing values found!

Sample of cleaned data:


Unnamed: 0,exam_last_modified_time,exam_start_time,extra_time_mins,incident_time_mins,candidate_id,max_question_score,question_number,question_title,question_duration (sec),auto_score_per_question,candidate_response_code,candidate_response_text,total_score
0,2024-12-11 15:57:52,2024-12-11 14:00:02,0,0,17107,2.4,1.1,Tallsystemer,85,2.4,simpleChoice_1368096164456,Heksadesimale tall,71.61
1,2024-12-11 15:57:52,2024-12-11 14:00:02,0,0,17107,2.4,1.1,Tallsystemer,85,2.4,simpleChoice_IA172399520449584402464-f255-46e6...,Oktale tall,71.61
2,2024-12-11 15:57:52,2024-12-11 14:00:02,0,0,17107,2.4,1.1,Tallsystemer,85,2.4,simpleChoice_IA17239952044953d28e157-ac4c-49db...,Titallsystemet,71.61
3,2024-12-11 15:57:52,2024-12-11 14:00:02,0,0,17107,2.4,1.1,Tallsystemer,85,2.4,simpleChoice_IA1723995204495960f601e-73d5-48f8...,Heksadesimale tall,71.61
4,2024-12-11 15:57:52,2024-12-11 14:00:02,0,0,17107,2.4,1.1,Tallsystemer,85,2.4,simpleChoice_IA17239952636882cb13f25-bd73-4352...,Titallsystemet,71.61
5,2024-12-11 15:57:52,2024-12-11 14:00:02,0,0,17107,2.4,1.1,Tallsystemer,85,2.4,simpleChoice_IA17239952636881f913a66-8ab2-4533...,Oktale tall,71.61
6,2024-12-11 15:57:52,2024-12-11 14:00:02,0,0,17107,2.4,1.1,Tallsystemer,85,2.4,simpleChoice_IA1723995263688fff30f23-beb3-435e...,Binære tall,71.61
7,2024-12-11 15:57:52,2024-12-11 14:00:02,0,0,17107,2.4,1.1,Tallsystemer,85,2.4,simpleChoice_IA1723995263688cc10ddd6-1786-49a3...,Heksadesimale tall,71.61
8,2024-12-11 15:57:52,2024-12-11 14:00:02,0,0,17107,3.0,1.2,Tallsortering,166,3.0,gapImg_IA172400165100658ec50ba-bb49-44cd-99f0-...,gapImg_IA172400165100658ec50ba-bb49-44cd-99f0-...,71.61
9,2024-12-11 15:57:52,2024-12-11 14:00:02,0,0,17107,3.0,1.2,Tallsortering,166,3.0,gapImg_IA1724001578419272ee4ee-4b37-4e3f-bce7-...,gapImg_IA1724001578419272ee4ee-4b37-4e3f-bce7-...,71.61


## Cleaned DataFrame divided into separate DataFrames, each representing a single observational unit.

### Total score per candidate

In [134]:
def total_score_per_candidate_dataframe(df):
    """
    Create a DataFrame with total score per candidate.
    
    Args:
        df (pandas.DataFrame): Cleaned DataFrame
    
    Returns:
        pandas.DataFrame: DataFrame with total score per candidate
    """
    if 'candidate_id' not in df.columns or 'total_score' not in df.columns:
        print("Required columns are missing for total score calculation.")
        return pd.DataFrame()
    
    total_score_df = (df.groupby('candidate_id')
                   .agg({
                       'total_score': 'first',
                   })
                   .reset_index())
    
    return total_score_df

In [135]:
total_score_df = total_score_per_candidate_dataframe(df_cleaned)
total_score_df.info()
total_score_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   candidate_id  569 non-null    int64  
 1   total_score   569 non-null    float64
dtypes: float64(1), int64(1)
memory usage: 9.0 KB


Unnamed: 0,candidate_id,total_score
0,17104,85.84
1,17105,65.08
2,17106,35.64
3,17107,71.61
4,17111,31.91


### Exam time per candidate

In [114]:
def exam_time_dataframe(df):
    """
    Calculate total exam time for each candidate
    
    Args:
        df (pandas.DataFrame): DataFrame with exam start and end times
    
    Returns:
        pandas.DataFrame: DataFrame with total exam time per candidate
    """
    # Select and aggregate data in one step using groupby
    exam_time_df = (df.groupby('candidate_id')
                   .agg({
                       'exam_start_time': 'first',
                       'exam_last_modified_time': 'first', 
                       'extra_time_mins': 'first',
                       'incident_time_mins': 'first'
                   })
                   .reset_index())
    
    # Calculate time differences
    time_diff = exam_time_df['exam_last_modified_time'] - exam_time_df['exam_start_time']
    exam_time_df['exam_duration_total (min)'] = (time_diff.dt.total_seconds() / 60).round(1)
    
    # Calculate duration without incident time
    incident_timedelta = pd.to_timedelta(exam_time_df['incident_time_mins'], unit='m')
    exam_time_df['exam_duration_no_incident_time (min)'] = ((time_diff - incident_timedelta).dt.total_seconds() / 60).round(1)
    
    return exam_time_df

In [115]:
exam_time_df = exam_time_dataframe(df_cleaned)
exam_time_df.info()
exam_time_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 7 columns):
 #   Column                                Non-Null Count  Dtype         
---  ------                                --------------  -----         
 0   candidate_id                          569 non-null    int64         
 1   exam_start_time                       569 non-null    datetime64[ns]
 2   exam_last_modified_time               569 non-null    datetime64[ns]
 3   extra_time_mins                       569 non-null    int64         
 4   incident_time_mins                    569 non-null    int64         
 5   exam_duration_total (min)             569 non-null    float64       
 6   exam_duration_no_incident_time (min)  569 non-null    float64       
dtypes: datetime64[ns](2), float64(2), int64(3)
memory usage: 31.2 KB


Unnamed: 0,candidate_id,exam_start_time,exam_last_modified_time,extra_time_mins,incident_time_mins,exam_duration_total (min),exam_duration_no_incident_time (min)
0,17104,2024-12-11 14:00:02,2024-12-11 17:55:47,0,0,235.8,235.8
1,17105,2024-12-11 14:00:01,2024-12-11 17:48:51,0,0,228.8,228.8
2,17106,2024-12-11 14:00:02,2024-12-11 17:46:31,0,0,226.5,226.5
3,17107,2024-12-11 14:00:02,2024-12-11 15:57:52,0,0,117.8,117.8
4,17111,2024-12-11 14:00:04,2024-12-11 18:14:54,30,0,254.8,254.8


### Time per task

In [140]:
def task_time_dataframe(df):
    """
    Show time spent on each task per candidate and points 
    Args:
        df (pandas.DataFrame): DataFrame with exam data
    
    Returns:
        pandas.DataFrame: DataFrame with task time per candidate
    """
    # Select and aggregate data in one step using groupby
    task_time_df = (df.groupby(['candidate_id', 'question_number'])
                   .agg({
                    #    'question_title': 'first',
                       'max_question_score': 'first',
                       'question_duration (sec)': 'first'
                   })
                   .reset_index())

    # New column: % of exam time expected to be spent on the task
    # Counts like this: 210 * max_question_score / 100
    # 210 min is 3,5 hours, which is the expected effective exam time
    task_time_df['expected_time_spent (sec)'] = 210 * task_time_df['max_question_score'] / 100 * 60

    # Drop the column after calculation
    task_time_df.drop(columns='max_question_score', inplace=True)

    return task_time_df

In [141]:
time_per_task_df = task_time_dataframe(df_cleaned)
time_per_task_df.info()
time_per_task_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17070 entries, 0 to 17069
Data columns (total 4 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   candidate_id               17070 non-null  int64  
 1   question_number            17070 non-null  float64
 2   question_duration (sec)    17070 non-null  int64  
 3   expected_time_spent (sec)  17070 non-null  float64
dtypes: float64(2), int64(2)
memory usage: 533.6 KB


Unnamed: 0,candidate_id,question_number,question_duration (sec),expected_time_spent (sec)
0,17104,1.1,295,302.4
1,17104,1.2,209,378.0
2,17104,1.3,496,478.8
3,17104,1.4,210,378.0
4,17104,1.5,373,378.0


### Score per task per candidate

## ADD A COLUMN WITH MANUAL CORRECTED SCORES from data/2024-eksamen Analyse!!!

In [124]:
def score_per_task_dataframe(df):
    """
    Calculate score per task per candidate
    
    Args:
        df (pandas.DataFrame): DataFrame with exam data
    
    Returns:
        pandas.DataFrame: DataFrame with score per task per candidate
    """
    # Select and aggregate data in one step using groupby
    score_per_task_df = (df.groupby(['candidate_id', 'question_number'])
                        .agg({
                            'max_question_score': 'first',
                            'auto_score_per_question': 'first'
                        })
                        .reset_index())
    
    return score_per_task_df

In [125]:
score_per_task_df = score_per_task_dataframe(df_cleaned)
score_per_task_df.info()
score_per_task_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17070 entries, 0 to 17069
Data columns (total 4 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   candidate_id             17070 non-null  int64  
 1   question_number          17070 non-null  float64
 2   max_question_score       17070 non-null  float64
 3   auto_score_per_question  17070 non-null  float64
dtypes: float64(3), int64(1)
memory usage: 533.6 KB


Unnamed: 0,candidate_id,question_number,max_question_score,auto_score_per_question
0,17104,1.1,2.4,2.4
1,17104,1.2,3.0,3.0
2,17104,1.3,3.8,0.0
3,17104,1.4,3.0,3.0
4,17104,1.5,3.0,3.0


### Candidate responses

In [126]:
def candidate_responses_dataframe(df):
    """
    Extract candidate responses from the DataFrame
    
    Args:
        df (pandas.DataFrame): DataFrame with exam data
    
    Returns:
        pandas.DataFrame: DataFrame with candidate responses
    """
    # Select and aggregate data in one step using groupby
    candidate_responses_df = (df.groupby(['candidate_id', 'question_number'])
                              .agg({
                                  'candidate_response_code': 'first',
                                  'candidate_response_text': 'first'
                              })
                              .reset_index())
    
    return candidate_responses_df

In [127]:
candidate_responses_df = candidate_responses_dataframe(df_cleaned)
candidate_responses_df.info()
candidate_responses_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17070 entries, 0 to 17069
Data columns (total 4 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   candidate_id             17070 non-null  int64  
 1   question_number          17070 non-null  float64
 2   candidate_response_code  17070 non-null  object 
 3   candidate_response_text  17070 non-null  object 
dtypes: float64(1), int64(1), object(2)
memory usage: 533.6+ KB


Unnamed: 0,candidate_id,question_number,candidate_response_code,candidate_response_text
0,17104,1.1,simpleChoice_1368096164456,Heksadesimale tall
1,17104,1.2,gapImg_IA172400165100658ec50ba-bb49-44cd-99f0-...,gapImg_IA172400165100658ec50ba-bb49-44cd-99f0-...
2,17104,1.3,IFI,IFI
3,17104,1.4,9,9
4,17104,1.5,simpleChoice_1367658328874,-a + b
