In [46]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [47]:
def read_data(file_path):
    '''Load an Excel workbook into a DataFrame.

    Args:
        file_path: Path of the workbook, passed straight to ``pd.read_excel``.

    Returns:
        The object returned by ``pd.read_excel`` with its default sheet
        selection, read with the ``openpyxl`` engine.
    '''
    return pd.read_excel(file_path, engine='openpyxl')

In [48]:
def prep_time_per_task(df):
    '''Reduce the raw frame to one row per candidate per question.

    Rows are grouped by ('candidate_id', 'question_number'); for each group
    the first entry of every column is taken (pandas' ``first`` skips missing
    values by default), and only the timing/score columns are kept.

    Args:
        df (pandas.DataFrame): Frame containing at least the columns listed
            in ``kept_columns`` below.

    Returns:
        pandas.DataFrame: Independent copy with the selected columns, one row
        per (candidate_id, question_number) pair.
    '''
    group_keys = ['candidate_id', 'question_number']
    kept_columns = group_keys + [
        'question_title',
        'question_duration_seconds',
        'incident_time_mins',
        'auto_score_per_question',
        'max_question_score',
    ]

    # as_index=False keeps the grouping keys as ordinary columns.
    first_per_group = df.groupby(group_keys, as_index=False).first()

    # .copy() detaches the result so later column assignments do not
    # touch (or warn about) the intermediate grouped frame.
    per_task = first_per_group[kept_columns].copy()
    return per_task

In [49]:
def convert_seconds_to_minutes(df):
    '''Add 'time_per_questions_mins': the question duration in minutes.

    Args:
        df (pandas.DataFrame): Frame with a 'question_duration_seconds'
            column. The new column is written onto this frame in place.

    Returns:
        pandas.DataFrame: The same ``df`` object that was passed in.
    '''
    seconds = df['question_duration_seconds']
    # seconds -> minutes, kept to two decimals
    df['time_per_questions_mins'] = seconds.div(60).round(2)
    return df

In [50]:
def calculate_ideal_time_per_task(df, total_time_mins=210, total_score=100):
    '''Add 'expected_time_spent (mins)': time budget proportional to question score.

    Each row gets ``total_time_mins * max_question_score / total_score``,
    rounded to two decimals.

    Args:
        df (pandas.DataFrame): Frame with a 'max_question_score' column.
            The new column is written onto this frame in place.
        total_time_mins (float): Minutes to distribute across the questions.
            Defaults to 210 (presumably the exam length; TODO confirm).
        total_score (float): Score that corresponds to the whole time budget.
            Defaults to 100 (presumably the exam's total points; TODO confirm).

    Returns:
        pandas.DataFrame: The same ``df`` object that was passed in.
    '''
    # Read the scores from the `df` argument. The earlier version looked up a
    # notebook-level `task_time_df` here, which raised NameError on a fresh
    # kernel and used the wrong frame whenever the caller passed a different one.
    df['expected_time_spent (mins)'] = (
        total_time_mins * df['max_question_score'] / total_score
    ).round(2)
    return df

In [51]:
def run_analysis(file_path):
    '''Run the whole time-per-question pipeline for one Excel file.

    Steps, in order: read the workbook, reduce it to one row per
    candidate/question, add the minutes column, add the expected-time column.

    Args:
        file_path: Path of the Excel workbook to analyse.

    Returns:
        The frame produced by the last pipeline step.
    '''
    return (
        read_data(file_path)
        .pipe(prep_time_per_task)             # extract the per-task timing columns
        .pipe(convert_seconds_to_minutes)     # seconds -> minutes
        .pipe(calculate_ideal_time_per_task)  # ideal time per task
    )


In [52]:
def export_data(df, output_path, file_format='xlsx'):
    """
    Write a DataFrame to disk in the requested format.

    Nothing is raised to the caller: any exception, including the
    ValueError for an unsupported ``file_format``, is caught and reported
    with a printed message instead.

    Args:
        df (pandas.DataFrame): Frame to write
        output_path (str): Path to save the file
        file_format (str): Format to save ('xlsx', 'csv', 'json')

    Returns:
        None
    """
    # (format name, writer) pairs. The xlsx and csv writers leave out the
    # index; the json writer emits one record per row.
    writers = (
        ('xlsx', lambda: df.to_excel(output_path, index=False)),
        ('csv', lambda: df.to_csv(output_path, index=False)),
        ('json', lambda: df.to_json(output_path, orient='records', indent=2)),
    )

    try:
        write = next((fn for name, fn in writers if file_format == name), None)
        if write is None:
            raise ValueError(f"Unsupported format: {file_format}")

        write()
        print(f"Data saved successfully to {output_path}")
    except Exception as err:
        print(f"Error saving data: {err}")

### Execution

In [53]:
# Load data/cleaned_data.xlsx and preview its first rows.
df = read_data("data/cleaned_data.xlsx")
df.head()

Unnamed: 0,last_modified_time,exam_start_time,extra_time_mins,incident_time_mins,candidate_id,max_question_score,question_number,question_title,question_duration_seconds,auto_score_per_question,candidate_response_code,candidate_response_text,total_score
0,2024-12-11 15:57:52,2024-12-11 14:00:02,0,0,17107,2.4,1.1,Tallsystemer,85,2.4,simpleChoice_1368096164456,Heksadesimale tall,71.61
1,2024-12-11 15:57:52,2024-12-11 14:00:02,0,0,17107,2.4,1.1,Tallsystemer,85,2.4,simpleChoice_IA172399520449584402464-f255-46e6...,Oktale tall,71.61
2,2024-12-11 15:57:52,2024-12-11 14:00:02,0,0,17107,2.4,1.1,Tallsystemer,85,2.4,simpleChoice_IA17239952044953d28e157-ac4c-49db...,Titallsystemet,71.61
3,2024-12-11 15:57:52,2024-12-11 14:00:02,0,0,17107,2.4,1.1,Tallsystemer,85,2.4,simpleChoice_IA1723995204495960f601e-73d5-48f8...,Heksadesimale tall,71.61
4,2024-12-11 15:57:52,2024-12-11 14:00:02,0,0,17107,2.4,1.1,Tallsystemer,85,2.4,simpleChoice_IA17239952636882cb13f25-bd73-4352...,Titallsystemet,71.61


In [54]:
# Reduce the raw rows to one row per (candidate_id, question_number) and preview the result.
task_time_df = prep_time_per_task(df)
task_time_df.head()

Unnamed: 0,candidate_id,question_number,question_title,question_duration_seconds,incident_time_mins,auto_score_per_question,max_question_score
0,17104,1.1,Tallsystemer,295,0,2.4,2.4
1,17104,1.2,Tallsortering,209,0,3.0,3.0
2,17104,1.3,ASCII - Hullbånd,496,0,0.0,3.8
3,17104,1.4,Minnebruk,210,0,3.0,3.0
4,17104,1.5,LMC - Aritmetikk,373,0,3.0,3.0


In [55]:
# Add 'time_per_questions_mins' (question_duration_seconds / 60, rounded to 2 decimals)
# and preview the result.
task_time_df = convert_seconds_to_minutes(task_time_df)
task_time_df.head()

Unnamed: 0,candidate_id,question_number,question_title,question_duration_seconds,incident_time_mins,auto_score_per_question,max_question_score,time_per_questions_mins
0,17104,1.1,Tallsystemer,295,0,2.4,2.4,4.92
1,17104,1.2,Tallsortering,209,0,3.0,3.0,3.48
2,17104,1.3,ASCII - Hullbånd,496,0,0.0,3.8,8.27
3,17104,1.4,Minnebruk,210,0,3.0,3.0,3.5
4,17104,1.5,LMC - Aritmetikk,373,0,3.0,3.0,6.22


In [56]:
# Add 'expected_time_spent (mins)' to the per-question frame and display it.
task_time_df = calculate_ideal_time_per_task(task_time_df)
task_time_df

Unnamed: 0,candidate_id,question_number,question_title,question_duration_seconds,incident_time_mins,auto_score_per_question,max_question_score,time_per_questions_mins,expected_time_spent (mins)
0,17104,1.1,Tallsystemer,295,0,2.4,2.4,4.92,5.04
1,17104,1.2,Tallsortering,209,0,3.0,3.0,3.48,6.30
2,17104,1.3,ASCII - Hullbånd,496,0,0.0,3.8,8.27,7.98
3,17104,1.4,Minnebruk,210,0,3.0,3.0,3.50,6.30
4,17104,1.5,LMC - Aritmetikk,373,0,3.0,3.0,6.22,6.30
...,...,...,...,...,...,...,...,...,...
15475,17844,4.4,CIDR til punktnotasjon,1136,8,0.0,4.0,18.93,8.40
15476,17844,4.5,Subnet + Broadcast,2267,8,0.0,6.0,37.78,12.60
15477,17844,4.6,Transportlagsprotokoller,502,8,0.5,2.0,8.37,4.20
15478,17844,4.7,HTTP,149,8,0.0,1.0,2.48,2.10


In [57]:
# Ideal minutes per question used by the two comparison columns below.
# NOTE(review): this is one fixed value for every question, whereas
# 'expected_time_spent (mins)' in task_time_df scales with max_question_score.
# Confirm which of the two is the intended baseline.
IDEAL_TIME_PER_QUESTION_MINS = 1.8

# Per-question averages across all candidates.
question_summary = (
    task_time_df.groupby('question_number')
    .agg({
        'time_per_questions_mins': 'mean',
        'auto_score_per_question': 'mean',
        # 'max' keeps the numeric maximum score. The earlier 'all' aggregation
        # collapsed this column to a boolean, so pct_score divided by True/False.
        'max_question_score': 'max'
    })
    .reset_index()
)

# Ideal time comparison
question_summary['over_ideal_time'] = (
    question_summary['time_per_questions_mins'] - IDEAL_TIME_PER_QUESTION_MINS
)
question_summary['pct_of_ideal_time'] = (
    question_summary['time_per_questions_mins'] / IDEAL_TIME_PER_QUESTION_MINS * 100
)

# Score as percentage of the question's maximum score
question_summary['pct_score'] = (
    question_summary['auto_score_per_question'] / question_summary['max_question_score'] * 100
)


In [59]:
# Correlation (DataFrame.corr with its default method) between minutes spent
# and points scored, computed over every candidate/question row.
time_score_corr = task_time_df[['time_per_questions_mins', 'auto_score_per_question']].corr()
correlation = time_score_corr.loc['time_per_questions_mins', 'auto_score_per_question']
print(f"Correlation between time spent and score (all rows): {correlation:.2f}")

Correlation between time spent and score (all rows): 0.12


In [60]:
# Re-run the whole pipeline from the workbook through run_analysis; this rebinds
# task_time_df to the frame it returns.
task_time_df = run_analysis("data/cleaned_data.xlsx")
task_time_df

Unnamed: 0,candidate_id,question_number,question_title,question_duration_seconds,incident_time_mins,auto_score_per_question,max_question_score,time_per_questions_mins,expected_time_spent (mins)
0,17104,1.1,Tallsystemer,295,0,2.4,2.4,4.92,5.04
1,17104,1.2,Tallsortering,209,0,3.0,3.0,3.48,6.30
2,17104,1.3,ASCII - Hullbånd,496,0,0.0,3.8,8.27,7.98
3,17104,1.4,Minnebruk,210,0,3.0,3.0,3.50,6.30
4,17104,1.5,LMC - Aritmetikk,373,0,3.0,3.0,6.22,6.30
...,...,...,...,...,...,...,...,...,...
15475,17844,4.4,CIDR til punktnotasjon,1136,8,0.0,4.0,18.93,8.40
15476,17844,4.5,Subnet + Broadcast,2267,8,0.0,6.0,37.78,12.60
15477,17844,4.6,Transportlagsprotokoller,502,8,0.5,2.0,8.37,4.20
15478,17844,4.7,HTTP,149,8,0.0,1.0,2.48,2.10


In [61]:
# Write the per-question timing table to an Excel workbook. export_data reports
# failures by printing a message rather than raising, so check the output below.
export_data(task_time_df, 'processed data/Time_Per_Question_Analysis_2024.xlsx', 'xlsx')

Data saved successfully to processed data/Time_Per_Question_Analysis_2024.xlsx
