In [280]:
import pandas as pd

### Read Data File

In [None]:
# Load the 2024 exam export into `df`, reading the workbook with the openpyxl engine.
# NOTE(review): this filename has a space before "ANONYM" while the 2023 file
# uses a hyphen — confirm it matches the file on disk.
df = pd.read_excel("Eksamen-2024/2024-json ANONYM.xlsx", engine = 'openpyxl')

In [None]:
# Load the 2023 exam export into `df1`, reading the workbook with the openpyxl engine.
df1 = pd.read_excel("Eksamen-2024/2023-json-ANONYM.xlsx", engine = 'openpyxl')

#### Finding Missing Data

In [None]:
# finding missing data/Null Data
def find_missing_data(df):
    """Return a boolean mask marking the missing values in ``df``.

    Args:
        df: DataFrame to inspect. It is not modified.

    Returns:
        A DataFrame with the same shape as ``df`` whose cells are ``True``
        where the value is missing and ``False`` otherwise.
    """
    # ``isna`` builds a new mask and leaves ``df`` untouched, so the mask
    # itself has to be returned for the check to be of any use.
    return df.isna()

### Check for duplicated values

In [None]:
def find_duplicated(df):
    """Return a boolean mask marking the duplicated rows of ``df``.

    Args:
        df: DataFrame to inspect. It is not modified.

    Returns:
        A boolean Series indexed like ``df``; an entry is ``True`` when the
        row repeats an earlier row (the first occurrence is ``False``).
    """
    # ``duplicated`` returns a new Series and leaves ``df`` untouched, so the
    # mask itself has to be returned for the check to be of any use.
    return df.duplicated()

### Drop Duplicates

In [None]:
# dropping duplicates
def drop_duplicated(df):
    """Remove repeated rows from ``df`` in place.

    The first occurrence of each row is kept and the surviving rows retain
    their original index labels.

    Args:
        df: DataFrame to de-duplicate. It is modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    df.drop_duplicates(keep="first", inplace=True)
    return df

### Fill Null Values

In [None]:
def fill_missing_values(df):
    """Replace missing values in the candidate-response columns.

    Only the three response columns listed below are touched, and only when
    they are present in ``df``; missing entries become the string "Unknown".

    Args:
        df: DataFrame to update. The affected columns are reassigned on it.

    Returns:
        The same DataFrame object that was passed in.
    """
    response_columns = (
        'Column1.result.ext_inspera_questions.ext_inspera_candidateResponses.ext_inspera_',
        'Column1.result.ext_inspera_questions.ext_inspera_candidateResponses.ext_insper.2',
        'Column1.result.ext_inspera_questions.ext_inspera_candidateResponses.ext_insper.1',
    )

    # Skip names this particular export does not contain.
    present = [name for name in response_columns if name in df.columns]
    for name in present:
        df[name] = df[name].fillna("Unknown")

    return df

### Drop missing/null values 

In [None]:
# dropping missing/null values
def drop_missing(df):
    """Drop every row that contains a missing value and renumber the index.

    Both steps are applied in place. Previously only the row removal was
    in place while the index reset happened on a copy, which left the
    caller's frame with the rows removed but the old index labels.

    Args:
        df: DataFrame to clean. It is modified in place.

    Returns:
        The same DataFrame object that was passed in, with a fresh
        0..n-1 index.
    """
    df.dropna(inplace=True)
    # Reset the default index so it no longer shows the pre-drop row numbers.
    df.reset_index(drop=True, inplace=True)
    return df

### Drop Non-Useful Columns by Name

In [None]:
# Earlier approach (disabled): dropping columns by positional index.
#df.drop(df.columns[[0, 1, 2, 9, 10, 11, 12, 13, 16, 17, 20, 23, 25, 26]], axis = 1, inplace = True)

def drop_columns(df):
    """Remove the columns that are not needed for the analysis.

    Names from the list below that are absent from ``df`` are ignored, so the
    same function can be applied to exports with different column sets.

    Args:
        df: DataFrame to trim. It is modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    # Column names follow the pattern found in the JSON export.
    unwanted = (
        "Column1.result.sourcedId",
        "Column1.result.ext_inspera_userAssessmentSetupId",
        "Column1.result.ext_inspera_userAssessmentId",
        "Column1.result.ext_inspera_attendance",
        "Column1.result.lineItem.sourcedId",
        "Column1.result.lineItem.type",
        "Column1.result.student.sourcedId",
        "Column1.result.student.type",
        "Column1.result.ext_inspera_questions.ext_inspera_questionId",
        "Column1.result.ext_inspera_questions.ext_inspera_questionContentItemId",
        "Column1.result.ext_inspera_questions.ext_inspera_questionWeight",
        "Column1.result.ext_inspera_questions.ext_inspera_candidateResponses.ext_inspera_",
        "Column1.result.ext_inspera_questions.ext_inspera_candidateResponses.ext_insper.2",
        "Column1.result.score",
        "Autoscore",
        "Column1.result.dateLastModified",
    )

    # Restrict the drop to names that exist in this frame.
    present = [name for name in unwanted if name in df.columns]
    df.drop(columns=present, inplace=True)

    return df

### Renaming Columns 

In [None]:
def rename_columns(df):
    """Rename the export's raw column names to readable Norwegian labels.

    Only keys that are present in ``df`` are applied, so the function works
    on exports that contain different subsets of the known columns.

    Args:
        df: DataFrame whose columns should be renamed. It is not modified.

    Returns:
        A new DataFrame with the matching columns renamed.
    """
    rename_map = {
        "Column1.result.ext_inspera_startTime": "Starttid",
        "Column1.result.ext_inspera_endTime": "Sluttid",
        "Column1.result.ext_inspera_extraTimeMins": "Ekstratid (minutter)",
        "Column1.result.ext_inspera_incidentTimeMins": "Hendelsestid (minutter)",
        # Candidate id under the same "Column1.result." prefix as every other
        # key; without this entry such a column is never renamed.
        "Column1.result.ext_inspera_candidateId": "Kandidat‑ID",
        # Legacy key kept for backward compatibility. The doubled "Column1"
        # prefix looks like a typo — TODO confirm against the export.
        "Column1Column1.result.ext_inspera_candidateId": "Kandidat‑ID",
        "Column1.result.ext_inspera_autoScore": "Resultatpoeng",
        "Column1.result.ext_inspera_questions.ext_inspera_maxQuestionScore": "Oppgave MaxPoeng",
        "Column1.result.ext_inspera_questions.ext_inspera_questionNumber": "Oppgave",
        "Column1.result.ext_inspera_questions.ext_inspera_questionTitle": "Oppgavetittel",
        "Column1.result.ext_inspera_questions.ext_inspera_durationSeconds": "Oppgavetid (sekunder)",
        "Column1.result.ext_inspera_questions.ext_inspera_autoScore": "Oppnådd poeng per oppgave",
        "Column1.result.ext_inspera_questions.ext_inspera_candidateResponses.ext_insper.1": "Kandidatens svar",
        "Column1": "Kandidat‑ID",
        "Oppgave Tid": "Oppgavetid (sekunder)",
        "Oppgave Poeng": "Oppnådd poeng per oppgave",
    }

    # Several keys share a target label; if more than one of them is present
    # the result will contain duplicate column names.
    applicable = {old: new for old, new in rename_map.items() if old in df.columns}

    return df.rename(columns=applicable)
    

## Convert Dates to Datetime

In [None]:
def convert_into_datetime(df):
    """Parse the start and end time columns as datetimes.

    Args:
        df: DataFrame with "Starttid" and "Sluttid" columns. Both columns
            are reassigned on it.

    Returns:
        The same DataFrame object that was passed in.
    """
    # Full timestamps are kept on purpose: reducing the columns to time-only
    # values (``.dt.time``) does not work with ``dt.total_seconds()``.
    for column in ("Starttid", "Sluttid"):
        df[column] = pd.to_datetime(df[column])

    return df

### Adding column for time related statistics 

In [None]:
def add_time_usage_columns(df, max_minutes = 240):
    """Return a copy of ``df`` extended with time-usage columns.

    Added columns (all rounded to two decimals):
      - "Brukt tid (minutter)": minutes between "Starttid" and "Sluttid"
      - "Tid igjen (minutter)": ``max_minutes`` minus the minutes used
      - "%Tid brukt": minutes used as a percentage of ``max_minutes``

    Args:
        df: DataFrame with datetime "Starttid" and "Sluttid" columns. It is
            not modified.
        max_minutes: Allowed time in minutes.

    Returns:
        A new DataFrame containing the original columns plus the three above.
    """
    result = df.copy()

    elapsed = result["Sluttid"] - result["Starttid"]
    used_minutes = (elapsed.dt.total_seconds() / 60).round(2)

    result["Brukt tid (minutter)"] = used_minutes
    result["Tid igjen (minutter)"] = (max_minutes - used_minutes).round(2)
    result["%Tid brukt"] = ((used_minutes / max_minutes) * 100).round(2)
    return result

### Re-order Columns

In [None]:
def reorder_columns(df):
    """Select and order the columns of ``df`` for presentation.

    Columns are returned in the order listed below. Listed names that are
    absent from ``df`` are skipped, and columns of ``df`` that are not listed
    are left out of the result.

    Args:
        df: DataFrame to reorder. It is not modified.

    Returns:
        A DataFrame containing only the listed columns, in the listed order.
    """
    # Desired column order.
    ordered_columns = [
        "Kandidat‑ID",
        "Oppgave",
        "Oppgavetittel",
        "Starttid",
        "Sluttid",
        "Ekstratid (minutter)",
        "Hendelsestid (minutter)",
        "Oppgavetid (sekunder)",
        "Kandidatens svar",
        # Spelled exactly as produced by rename_columns; the previous
        # misspelling made this column disappear from the output.
        "Oppnådd poeng per oppgave",
        "Oppgave MaxPoeng",
        "Brukt tid (minutter)",
        "Tid igjen (minutter)",
        "%Tid brukt",
        "Resultatpoeng",
    ]

    # Keep only the names that exist in the current frame.
    present = [name for name in ordered_columns if name in df.columns]

    return df[present]

## Cleaning 2023 Exam File

In [None]:
# Remove duplicated rows from the 2023 data (first occurrence is kept).
df1 = drop_duplicated(df1)
# df1

In [None]:
# Replace missing values in the candidate-response columns with "Unknown".
df1 = fill_missing_values(df1)
# df1

In [None]:
# Dropping missing values 
# df1 = drop_missing(df1)
# df1

In [None]:
# Drop the columns that are not needed for the analysis (matched by name).
df1 = drop_columns(df1)
# df1

In [None]:
# Rename the raw export column names to readable Norwegian labels.
df1 = rename_columns(df1)
# df1

In [None]:
# Parse "Starttid" and "Sluttid" as datetimes.
df1 = convert_into_datetime(df1)
# df1

In [None]:
# Add the time-usage columns, using 240 minutes as the allowed time.
df1 = add_time_usage_columns(df1, 240)
# Bare expression: shows the frame as the cell output.
df1

In [None]:
# Keep only the presentation columns, in their final order.
df1 = reorder_columns(df1)
# Bare expression: shows the frame as the cell output.
df1

## Cleaning 2024 Exam File

In [None]:
# Remove duplicated rows from the 2024 data (first occurrence is kept).
df = drop_duplicated(df)

In [None]:
# Replace missing values in the candidate-response columns with "Unknown".
df = fill_missing_values(df)
# df

In [None]:
# Dropping missing values 
# df = drop_missing(df)
# df

In [None]:
# Drop the columns that are not needed for the analysis (matched by name).
df = drop_columns(df)

In [None]:
# Rename the raw export column names to readable Norwegian labels.
df = rename_columns(df)
# df

In [None]:
# Parse "Starttid" and "Sluttid" as datetimes.
df = convert_into_datetime(df)

In [None]:
# Add the time-usage columns, using 240 minutes as the allowed time.
df = add_time_usage_columns(df, 240)
# Bare expression: shows the frame as the cell output.
df

In [None]:
# Keep only the presentation columns, in their final order.
df = reorder_columns(df)
# Bare expression: shows the frame as the cell output.
df

### Export Cleaned data to excel

In [None]:
# df1.to_excel('clean_exam_2023.xlsx', index = False)

In [None]:
# df.to_excel('clean_exam_2024.xlsx', index=False)