In [102]:
import pandas as pd

In [103]:
df = pd.read_excel("Eksamen-2024/2024-json ANONYM.xlsx", engine = 'openpyxl')

In [104]:
# df.head()

In [105]:
df1 = pd.read_excel("Eksamen-2024/2023-json-ANONYM.xlsx", engine = 'openpyxl')

## Handling Missing Data

#### Finding Missing Data

In [106]:
# finding missing data/Null Data
def find_missing_data(df):
    """Check for missing values and return boolean"""
    df.isna() # return boolean True if values are Null and False if not missing. 
    return df

### Check for duplicated values

In [107]:
def find_duplicated(df):
    """Check for duplicates and return boolean."""
    
    df.duplicated()
    return df

### Drop Duplicates

In [108]:
# dropping duplicates
def drop_duplicated(df):
    """Drop duplicated rows"""
    
    df.drop_duplicates(inplace = True)
    return df

### Fill Null Values

In [109]:
def fill_missing_values(df):
    """Fill in missing values"""
    
    cols_to_fill = [
        'Column1.result.ext_inspera_questions.ext_inspera_candidateResponses.ext_inspera_',
        'Column1.result.ext_inspera_questions.ext_inspera_candidateResponses.ext_insper.2',
        'Column1.result.ext_inspera_questions.ext_inspera_candidateResponses.ext_insper.1'
    ]
    
    for col in cols_to_fill:
        if col in df.columns:
            df[col] = df[col].fillna("Unknown")
    
    return df

### Drop missing/null values 

In [110]:
# dropping missing/null values
def drop_missing(df):
    """Drop missing (null) values"""
    df.dropna(inplace = True)
    # rest default indexes after dropping so it not longer shows older indexes numbers
    df = df.reset_index(drop=True)
    return df

### Drop Non-Useful Columns by Name

In [111]:
# dropping columns using their indexes. 
#df.drop(df.columns[[0, 1, 2, 9, 10, 11, 12, 13, 16, 17, 20, 23, 25, 26]], axis = 1, inplace = True)

def drop_columns(df):
    """ Drop non-useful columns """
    
    # defining columns to drop based on the pattern found on the JSON file
    columns_to_drop = ["Column1.result.sourcedId", 
         "Column1.result.ext_inspera_userAssessmentSetupId", 
         "Column1.result.ext_inspera_userAssessmentId",
         "Column1.result.ext_inspera_attendance", 
         "Column1.result.lineItem.sourcedId",
         "Column1.result.lineItem.type", 
         "Column1.result.student.sourcedId",
         "Column1.result.student.type", 
         "Column1.result.ext_inspera_questions.ext_inspera_questionId",
         "Column1.result.ext_inspera_questions.ext_inspera_questionContentItemId", 
         "Column1.result.ext_inspera_questions.ext_inspera_questionWeight",
         "Column1.result.ext_inspera_questions.ext_inspera_candidateResponses.ext_inspera_",
         "Column1.result.ext_inspera_questions.ext_inspera_candidateResponses.ext_insper.2",
         "Column1.result.score", "Autoscore", "Column1.result.dateLastModified" ]

    # condition to take only columns matching the current dataframe (df) 
    current_df_columns = [col for col in columns_to_drop if col in df.columns]

    # drop these columns
    df.drop(current_df_columns, axis=1, inplace=True)
    
   # df.drop(["Column1.result.sourcedId", 
        # "Column1.result.ext_inspera_userAssessmentSetupId", 
        # "Column1.result.ext_inspera_userAssessmentId",
        # "Column1.result.ext_inspera_attendance", 
        # "Column1.result.lineItem.sourcedId",
        # "Column1.result.lineItem.type", 
        # "Column1.result.student.sourcedId",
        # "Column1.result.student.type", 
        # "Column1.result.ext_inspera_questions.ext_inspera_questionId",
        # "Column1.result.ext_inspera_questions.ext_inspera_questionContentItemId", 
        # "Column1.result.ext_inspera_questions.ext_inspera_questionWeight",
        # "Column1.result.ext_inspera_questions.ext_inspera_candidateResponses.ext_inspera_"] ,axis = 1, inplace = True)
    
   # if "Column1.result.ext_inspera_questions.ext_inspera_candidateResponses.ext_insper.2" in df.columns:
       # df.drop("Column1.result.ext_inspera_questions.ext_inspera_candidateResponses.ext_insper.2", axis = 1, inplace = True)
        
    return df

### Renaming Columns 

In [112]:
def rename_columns(df):
    """Rename columns to be easily readable"""
    
    rename_map = {
        "Column1.result.ext_inspera_startTime" : "Starttid",
        "Column1.result.ext_inspera_endTime" : "Sluttid",
        "Column1.result.ext_inspera_extraTimeMins" : "Ekstratid (minutter)",
        "Column1.result.ext_inspera_incidentTimeMins" : "Hendelsestid (minutter)",
        "Column1Column1.result.ext_inspera_candidateId" : "Kandidat‑ID",
        "Column1.result.ext_inspera_autoScore" : "Resultatpoeng",
        "Column1.result.ext_inspera_questions.ext_inspera_maxQuestionScore" : "Oppgave MaxPoeng",
        "Column1.result.ext_inspera_questions.ext_inspera_questionNumber" : "Oppgave",
        "Column1.result.ext_inspera_questions.ext_inspera_questionTitle" : "Oppgavetittel",
        "Column1.result.ext_inspera_questions.ext_inspera_durationSeconds" : "Oppgavetid (sekunder)",
        "Column1.result.ext_inspera_questions.ext_inspera_autoScore" : "Oppnådd poeng per oppgave",
        "Column1.result.ext_inspera_questions.ext_inspera_candidateResponses.ext_insper.1" : "Kandidatens svar",
        "Column1" : "Kandidat‑ID",
        "Oppgave Tid" : "Oppgavetid (sekunder)",
        "Oppgave Poeng" : "Oppnådd poeng per oppgave"     
    }

    filtered_rename_map = {k: v for k, v in rename_map.items() if k in df.columns}

    df = df.rename(columns = filtered_rename_map)
    return df
    

## Convert Date into time 

In [113]:
def convert_into_datetime(df):
    """ Convert dates into datetime format and extract time only for start and end time"""
    # convert specified columns into datetime format
    # df['Sist endret'] = pd.to_datetime(df['Sist endret']).dt.tz_localize(None) # dt.tz_localize(None) remove +00:00 UTC time information
    df['Starttid'] = pd.to_datetime(df['Starttid'])
    df['Sluttid'] = pd.to_datetime(df['Sluttid'])

    # convert speicfied column into time only.
    df['Starttid'] = df['Starttid'].dt.time
    df['Sluttid'] = df['Sluttid'].dt.time

    return df 

## Re-order Columns

In [114]:
def reorder_columns(df):
    """Re-order columns in the dataframe"""

    # intilizing desired order 
    ordered_columns = [
    "Kandidat‑ID",
    "Oppgave",
    "Oppgavetittel",
    "Starttid",
    "Sluttid",
    "Sist endret",
    "Ekstratid (minutter)",
    "Hendelsestid (minutter)",
    "Oppgavetid (sekunder)",
    "Kandidatens svar",
    "Oppnådd poeng per oppgav",
    "Oppgave MaxPoeng",
    "Resultatpoeng"]

    # check if columns name exist in current Dataframe (df)
    current_df_columns = [col for col in ordered_columns if col in df.columns]

    # re-order df based on the new order
    df = df[current_df_columns]
    
    return df

## Cleaning 2023 Exam File

In [117]:
# Dropping duplicated values 
df1 = drop_duplicated(df1)
# df1

In [118]:
# fill missing value with unknown
df1 = fill_missing_values(df1)
df1 

Unnamed: 0,Column1.result.sourcedId,Column1.result.ext_inspera_userAssessmentSetupId,Column1.result.ext_inspera_userAssessmentId,Column1.result.dateLastModified,Column1.result.ext_inspera_startTime,Column1.result.ext_inspera_endTime,Column1.result.ext_inspera_extraTimeMins,Column1.result.ext_inspera_incidentTimeMins,Column1,Column1.result.ext_inspera_attendance,...,Oppgave MaxPoeng,Column1.result.ext_inspera_questions.ext_inspera_questionId,Column1.result.ext_inspera_questions.ext_inspera_questionContentItemId,Oppgave,Column1.result.ext_inspera_questions.ext_inspera_questionTitle,Column1.result.ext_inspera_questions.ext_inspera_questionWeight,Oppgave Tid,Oppgave Poeng,Column1.result.ext_inspera_questions.ext_inspera_candidateResponses.ext_inspera_,Autoscore
0,24499445,28048982,18845779,2023-12-11T11:54:57Z,2023-12-11T08:00:12Z,2023-12-11T11:54:57Z,0,0,13747,True,...,4.5,187896995,185850970,1.1,Dataformater,1,641,4.5,gapImg__9164195750 associableHotspot__146705536,63.0
1,24499445,28048982,18845779,2023-12-11T11:54:57Z,2023-12-11T08:00:12Z,2023-12-11T11:54:57Z,0,0,13747,True,...,4.5,187896995,185850970,1.1,Dataformater,1,641,4.5,gapImg_IA1701156824392d16c8498-010d-48e2-8773-...,63.0
2,24499445,28048982,18845779,2023-12-11T11:54:57Z,2023-12-11T08:00:12Z,2023-12-11T11:54:57Z,0,0,13747,True,...,4.5,187896995,185850970,1.1,Dataformater,1,641,4.5,gapImg__361670718761 associableHotspot45546867419,63.0
3,24499445,28048982,18845779,2023-12-11T11:54:57Z,2023-12-11T08:00:12Z,2023-12-11T11:54:57Z,0,0,13747,True,...,4.5,187896995,185850970,1.1,Dataformater,1,641,4.5,gapImg_IA1701156731070520d3d02-90c1-49b6-b58e-...,63.0
4,24499445,28048982,18845779,2023-12-11T11:54:57Z,2023-12-11T08:00:12Z,2023-12-11T11:54:57Z,0,0,13747,True,...,4.5,187896995,185850970,1.1,Dataformater,1,641,4.5,gapImg_IA1701156329933eb2abd04-c1f3-401f-847a-...,63.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64804,24500614,28049048,18846596,2023-12-11T11:59:35Z,2023-12-11T08:00:26Z,2023-12-11T11:59:35Z,0,0,13624,True,...,5.0,187912801,26745308,4.8,Transportprotokoller,1,654,2.0,gapImg__361670718761 associableHotspot45546867419,77.5
64805,24500614,28049048,18846596,2023-12-11T11:59:35Z,2023-12-11T08:00:26Z,2023-12-11T11:59:35Z,0,0,13624,True,...,5.0,187912801,26745308,4.8,Transportprotokoller,1,654,2.0,gapImg_IA1516303918137d707410f-4cf6-4fe7-8715-...,77.5
64806,24500614,28049048,18846596,2023-12-11T11:59:35Z,2023-12-11T08:00:26Z,2023-12-11T11:59:35Z,0,0,13624,True,...,3.0,187912858,37417552,4.9,Content Delivery Network,1,158,3.0,simpleChoice_1368096164457,77.5
64807,24500614,28049048,18846596,2023-12-11T11:59:35Z,2023-12-11T08:00:26Z,2023-12-11T11:59:35Z,0,0,13624,True,...,3.0,187912858,37417552,4.9,Content Delivery Network,1,158,3.0,simpleChoice_IA1543841241358f507f222-54bf-40d8...,77.5


In [119]:
# Dropping missing values 
# df1 = drop_missing(df1)
# df1

In [120]:
# Dropping non-useful columns
df1 = drop_columns(df1)
# df1

In [121]:
# Renaming Columns
df1 = rename_columns(df1)
# df1

In [122]:
# changing time and date format
df1 = convert_into_datetime(df1)
# df1

In [123]:
df1 = reorder_columns(df1)
df1

Unnamed: 0,Kandidat‑ID,Oppgave,Oppgavetittel,Starttid,Sluttid,Ekstratid (minutter),Hendelsestid (minutter),Oppgavetid (sekunder),Oppgave MaxPoeng,Resultatpoeng
0,13747,1.1,Dataformater,08:00:12,11:54:57,0,0,641,4.5,63.0
1,13747,1.1,Dataformater,08:00:12,11:54:57,0,0,641,4.5,63.0
2,13747,1.1,Dataformater,08:00:12,11:54:57,0,0,641,4.5,63.0
3,13747,1.1,Dataformater,08:00:12,11:54:57,0,0,641,4.5,63.0
4,13747,1.1,Dataformater,08:00:12,11:54:57,0,0,641,4.5,63.0
...,...,...,...,...,...,...,...,...,...,...
64804,13624,4.8,Transportprotokoller,08:00:26,11:59:35,0,0,654,5.0,77.5
64805,13624,4.8,Transportprotokoller,08:00:26,11:59:35,0,0,654,5.0,77.5
64806,13624,4.9,Content Delivery Network,08:00:26,11:59:35,0,0,158,3.0,77.5
64807,13624,4.9,Content Delivery Network,08:00:26,11:59:35,0,0,158,3.0,77.5


## Cleaning 2024 Exam File

In [124]:
# Dropping duplicated values 
df = drop_duplicated(df)

In [125]:
# Fill missing value with unknown
df = fill_missing_values(df)
df

Unnamed: 0,Column1.result.sourcedId,Column1.result.ext_inspera_userAssessmentSetupId,Column1.result.ext_inspera_userAssessmentId,Column1.result.dateLastModified,Column1.result.ext_inspera_startTime,Column1.result.ext_inspera_endTime,Column1.result.ext_inspera_extraTimeMins,Column1.result.ext_inspera_incidentTimeMins,Column1Column1.result.ext_inspera_candidateId,Column1.result.ext_inspera_attendance,...,Column1.result.ext_inspera_questions.ext_inspera_questionContentItemId,Column1.result.ext_inspera_questions.ext_inspera_questionNumber,Column1.result.ext_inspera_questions.ext_inspera_questionTitle,Column1.result.ext_inspera_questions.ext_inspera_questionWeight,Column1.result.ext_inspera_questions.ext_inspera_durationSeconds,Column1.result.ext_inspera_questions.ext_inspera_autoScore,Column1.result.ext_inspera_questions.ext_inspera_candidateResponses.ext_inspera_,Column1.result.ext_inspera_questions.ext_inspera_candidateResponses.ext_insper.1,Column1.result.ext_inspera_questions.ext_inspera_candidateResponses.ext_insper.2,Column1.result.score
0,39719815,46609915,34764844,2024-12-11T15:57:52Z,2024-12-11T14:00:02Z,2024-12-11T15:57:52Z,0,0,17107,True,...,267129466,1.1,Tallsystemer,1,85,2.4,simpleChoice_1368096164456,Heksadesimale tall,2,71.61
1,39719815,46609915,34764844,2024-12-11T15:57:52Z,2024-12-11T14:00:02Z,2024-12-11T15:57:52Z,0,0,17107,True,...,267129466,1.1,Tallsystemer,1,85,2.4,simpleChoice_IA172399520449584402464-f255-46e6...,Oktale tall,7,71.61
2,39719815,46609915,34764844,2024-12-11T15:57:52Z,2024-12-11T14:00:02Z,2024-12-11T15:57:52Z,0,0,17107,True,...,267129466,1.1,Tallsystemer,1,85,2.4,simpleChoice_IA17239952044953d28e157-ac4c-49db...,Titallsystemet,8,71.61
3,39719815,46609915,34764844,2024-12-11T15:57:52Z,2024-12-11T14:00:02Z,2024-12-11T15:57:52Z,0,0,17107,True,...,267129466,1.1,Tallsystemer,1,85,2.4,simpleChoice_IA1723995204495960f601e-73d5-48f8...,Heksadesimale tall,6,71.61
4,39719815,46609915,34764844,2024-12-11T15:57:52Z,2024-12-11T14:00:02Z,2024-12-11T15:57:52Z,0,0,17107,True,...,267129466,1.1,Tallsystemer,1,85,2.4,simpleChoice_IA17239952636882cb13f25-bd73-4352...,Titallsystemet,12,71.61
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65875,39721255,46610190,34766243,2024-12-11T16:38:05Z,2024-12-11T14:00:15Z,2024-12-11T16:38:05Z,0,0,17383,True,...,131559664,4.6,Transportlagsprotokoller,1,335,1.0,simpleAssociableChoice_1373281683439 simpleAss...,Unknown,1 1,64.09
65876,39721255,46610190,34766243,2024-12-11T16:38:05Z,2024-12-11T14:00:15Z,2024-12-11T16:38:05Z,0,0,17383,True,...,36280738,4.7,HTTP,1,348,0.0,simpleChoice_1368096164459,HTTP-forespørsler kan ikke multiplekses over d...,4,64.09
65877,39721255,46610190,34766243,2024-12-11T16:38:05Z,2024-12-11T14:00:15Z,2024-12-11T16:38:05Z,0,0,17383,True,...,36280738,4.7,HTTP,1,348,0.0,simpleChoice_1368096164457,Den samme TCP-forbindelsen blir gjenbrukt til ...,3,64.09
65878,39721255,46610190,34766243,2024-12-11T16:38:05Z,2024-12-11T14:00:15Z,2024-12-11T16:38:05Z,0,0,17383,True,...,135245596,4.8,HTTP-streaming,1,200,0.0,simpleChoice_1368096164456,Video is divided in small segments and differe...,2,64.09


In [126]:
# Dropping missing values 
# df = drop_missing(df)
# df

In [127]:
# Dropping non-useful columns
df = drop_columns(df)

In [128]:
# Renaming Columns
df = rename_columns(df)
# df

In [129]:
# changing time and date format
df = convert_into_datetime(df)

In [130]:
df = reorder_columns(df)
df

Unnamed: 0,Kandidat‑ID,Oppgave,Oppgavetittel,Starttid,Sluttid,Ekstratid (minutter),Hendelsestid (minutter),Oppgavetid (sekunder),Kandidatens svar,Oppgave MaxPoeng,Resultatpoeng
0,17107,1.1,Tallsystemer,14:00:02,15:57:52,0,0,85,Heksadesimale tall,2.4,71.61
1,17107,1.1,Tallsystemer,14:00:02,15:57:52,0,0,85,Oktale tall,2.4,71.61
2,17107,1.1,Tallsystemer,14:00:02,15:57:52,0,0,85,Titallsystemet,2.4,71.61
3,17107,1.1,Tallsystemer,14:00:02,15:57:52,0,0,85,Heksadesimale tall,2.4,71.61
4,17107,1.1,Tallsystemer,14:00:02,15:57:52,0,0,85,Titallsystemet,2.4,71.61
...,...,...,...,...,...,...,...,...,...,...,...
65875,17383,4.6,Transportlagsprotokoller,14:00:15,16:38:05,0,0,335,Unknown,2.0,64.09
65876,17383,4.7,HTTP,14:00:15,16:38:05,0,0,348,HTTP-forespørsler kan ikke multiplekses over d...,1.0,64.09
65877,17383,4.7,HTTP,14:00:15,16:38:05,0,0,348,Den samme TCP-forbindelsen blir gjenbrukt til ...,1.0,64.09
65878,17383,4.8,HTTP-streaming,14:00:15,16:38:05,0,0,200,Video is divided in small segments and differe...,2.0,64.09


### Export Cleaned data to excel

In [None]:
# df1.to_excel('clean_exam_2023.xlsx', index = False)

In [None]:
# df.to_excel('clean_exam_2024.xlsx', index=False)