In [2]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [8]:
# Raw string: the backslashes of the Windows path must not be read as escape sequences
df = pd.read_csv(r"E:\Group Folder\Data\cleaned_data_backup.csv", low_memory=False)

In [9]:
# Remaining clean-up: lower-case the column names and the subject labels
df.columns = df.columns.str.lower()
df['exam_subject'] = df['exam_subject'].str.lower()

# Keep the rows that are not civics exams and that have a test state
is_civics = df['exam_subject'] == 'civics'
has_state = df['t_state'].notnull()
df = df[~is_civics & has_state]

In [6]:
# Sum of the ged_ready column for every (candidate_id, exam_subject) pair
ged_ready_groups = df[["candidate_id","ged_ready","exam_subject"]].groupby(["candidate_id","exam_subject"])
num_ged_ready_tests = {
    group_key: group_rows["ged_ready"].sum()
    for group_key, group_rows in ged_ready_groups
}

In [10]:
# Keep only the rows whose ged_ready value is 0, i.e. drop the practice tests
not_practice = df['ged_ready'].eq(0)
df = df.loc[not_practice]

In [10]:
# Scores below 100 are outside the valid range and are blanked out (set to NaN)
below_valid_range = df["score"].lt(100)
df.loc[below_valid_range, "score"] = np.nan

In [1]:
# Pass / completion indicator columns; they are blanked wherever the score is missing
flag_cols = ['first_passed', 'first_complete', 'make_passed', 'make_complete']
score_missing = df["score"].isnull()
df.loc[score_missing, flag_cols] = np.nan

### Feature Engineering
Following features are created at the test level before aggregation:
- Based on Candidate's Test Performance/Behaviour
    - `no_show`: If the exam score is 0, candidate did not show up to the exam
    - `first_try`: Candidate passed the exam in the first try
- Based on dates: The difference is in days.
    - `prep_time`: Difference between `account_setup_complete_date` and `exam_start`
    - `time_to_credential`: Difference between `account_setup_complete_date` and `credential_date`
    - `age`: Difference between `birth_year` and year the exam was taken in.
- Based on location:
    - `out_of_town`: The zip of the candidate's location is different from the zipcode of the test location. Zip codes are compared because matching cities is problematic: the city is a free-text string.
    - `out_of_state`

In [12]:
# Parse the date columns; values that cannot be parsed are coerced to NaT
date_cols = ['account_setup_complete_date','exam_start','credential_date']
df[date_cols] = df[date_cols].apply(pd.to_datetime, errors="coerce")

In [14]:
# Date-derived features; the two durations are whole days counted from account setup
setup_date = df["account_setup_complete_date"]
exam_date = df["exam_start"]
df["prep_time"] = (exam_date - setup_date).dt.days
df["time_to_credential"] = (df["credential_date"] - setup_date).dt.days
df["age"] = exam_date.dt.year - df["birth_year"]

In [15]:
# Test-level location features.
# `zip` is the candidate's location, `zipcode` is the test location.
df["out_of_town"] = df["zip"].ne(df["zipcode"])
df["out_of_state"] = df["c_state"].ne(df["t_state"])

In [5]:
# Creating features based on the test results
# NOTE(review): an earlier cell replaces every score below 100 with NaN, so when the
# notebook runs top to bottom this comparison can never be True. If no-shows (score 0)
# are meant to be flagged, `no_show` has to be derived before that replacement.
df['no_show'] = df['score'] == 0

# Filling in no shows with false for passing flags.
# A flag counts as True when it holds the string "True" or the boolean True (numeric 1
# compares equal to True as well); everything else, including NaN, becomes False.
# Accepting both spellings keeps the result independent of how the column was parsed.
base_flag_cols = [col for col in flag_cols if col != "first_try"]
for col in base_flag_cols:
    df[col] = df[col].isin(["True", True])

df['first_try'] = df['first_passed'] & df['first_complete']
# Guarded so that re-running this cell does not register the column twice
if "first_try" not in flag_cols:
    flag_cols.append("first_try")

In [3]:
# Continuous columns ("time_to_credential" is not listed; the original note cites null values)
continuous_colums = ["score", "prep_time", "age"]

# Binary columns that describe the candidate
binary_candidate_lvl = [
    'study_helpful_adult_education_class',
    'study_helpful_adult_education_teacher',
    'study_helpful_audio_study_materials',
    'study_helpful_books_printed_study_material',
    'study_helpful_ged_ready',
    'study_helpful_materials_mobile_app',
    'study_helpful_online_course_video_study_materials',
    'study_helpful_other',
    'study_helpful_social_networking_website',
    'study_helpful_tv_study_program',
    'study_location_test_preparation_center',
    'indian_or_alaskan',
    'asian',
    'african_american',
    'race_decline',
    'white',
    'race_none',
    'hawaiian_or_pacific',
]

# Binary columns that describe a single test
binary_test_lvl = ['on_vue', 'no_show', "out_of_town", "out_of_state"]

# Columns holding more than two categories
multi_category_columns = [
    'gender',
    'highest_grade_completed',
    'c_state',
    't_state',
    'juris_name',
    "studied_for_ged",
    'testing_reason',
    'school_incomplete_reason',
    'language_code',
    'zipcode',
]

### Aggregation

#### At Candidate Level

Since there may be candidate-level or even regional differences in the performance for each subject, we aggregate separately for each subject a candidate appeared for. Depending on the type of the column, we treat them differently:

- __Continuous Columns__: We take the mean of the continuous columns to get the central tendency at the candidate level.
- __Categorical Columns__:
    - Binary Candidate Level: These columns stay constant for a particular candidate, we can take the "first" value.
    - Binary Test Level: We need to sum up these values to get an aggregation of the performance of a candidate across time.
        - Flag Columns (`flag_cols`) are a special category for this as they indicate the overall result for a candidate, thus we should take the latest result or the max result.
    - Multi Category Columns: These are also candidate level columns, and hence we should take the mode.

In [19]:
# Cast every flag / binary indicator column to integers
indicator_cols = [*flag_cols, *binary_candidate_lvl, *binary_test_lvl]
df[indicator_cols] = df[indicator_cols].astype(int)

In [20]:
# Creating aggregation dictionary based on the logic discussed above
def _most_frequent(values):
    """Return the most frequent non-null value of `values`, or NaN when there is none."""
    modes = values.mode()
    # `mode()` is empty for an all-null group; indexing it directly would raise
    return modes.iloc[0] if len(modes) else np.nan


cand_agg_dict = {}

for col in continuous_colums:
    cand_agg_dict[col] = "mean"

for col in flag_cols:
    cand_agg_dict[col] = "max"

for col in binary_candidate_lvl:
    cand_agg_dict[col] = "first"

# The string name is used instead of the builtin `sum`: pandas deprecates passing the
# builtin as an aggregator, and the string keeps the current group-wise sum.
for col in binary_test_lvl:
    cand_agg_dict[col] = "sum"

for col in multi_category_columns:
    cand_agg_dict[col] = _most_frequent

# Number of distinct result ids per group
cand_agg_dict["result_id"] = "nunique"

In [21]:
# One group per candidate and exam subject
candidate_grouper = df.groupby(by=["candidate_id", "exam_subject"])

In [23]:
# Apply the per-column aggregation rules to every (candidate, subject) group
candidate_agg = candidate_grouper.aggregate(cand_agg_dict)

In [25]:
# Look up the practice-test totals collected earlier, keyed by the group index
practice_counts = candidate_agg.index.map(num_ged_ready_tests)
candidate_agg["num_practice_tests"] = practice_counts

# The number of distinct result ids is the number of tests taken
candidate_agg = candidate_agg.rename(
    columns={"result_id": "num_tests", "c_state": "candidate_state"}
)

In [26]:
def prop_fill(df, prop_fill_cols=["gender"],
              values_to_replace=["DECLINE","NONBINARY"]):
    """
    Replace placeholder values with the remaining categories, keeping their proportions.

    For every column in `prop_fill_cols`, the rows whose value is listed in
    `values_to_replace` are overwritten with the other non-null values of that
    column. Each replacement value receives a share of the affected rows equal to
    its relative frequency among the rows that are kept. Shares are truncated to
    whole rows and the last (least frequent) value takes the remainder. Rows are
    filled in their existing order, without shuffling.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to update; it is modified in place.
    prop_fill_cols : list of str
        Columns to process. The default list is only read, never mutated.
    values_to_replace : list
        Values that should be replaced. The default list is only read.

    Returns
    -------
    pandas.DataFrame
        The same `df` object, after every listed column has been processed.
    """
    for col in prop_fill_cols:
        replace_mask = df[col].isin(values_to_replace)
        # Row positions instead of labels, so any index type (e.g. a MultiIndex) works
        fill_positions = np.flatnonzero(replace_mask.to_numpy())
        # value_counts drops nulls and orders the values by frequency; iterating over
        # it keeps every value paired with its own proportion
        fill_props = df.loc[~replace_mask, col].value_counts(normalize=True)
        if len(fill_positions) == 0 or fill_props.empty:
            continue  # nothing to replace, or nothing to replace it with

        col_position = df.columns.get_loc(col)
        start = 0
        for rank, (val, share) in enumerate(fill_props.items(), start=1):
            if rank == len(fill_props):
                # the last value absorbs what truncation left over
                stop = len(fill_positions)
            else:
                stop = start + int(len(fill_positions) * share)
            if stop > start:
                df.iloc[fill_positions[start:stop], col_position] = val
            start = stop
    # Returned only after all columns are done
    return df

# Spread the undisclosed gender values over the known categories, in proportion
candidate_agg = prop_fill(candidate_agg, prop_fill_cols=["gender"])

In [27]:
# One-hot encode gender and language; the first level of each is dropped
dummy_source_cols = ["gender", "language_code"]
candidate_agg = pd.get_dummies(candidate_agg, columns=dummy_source_cols, drop_first=True)

In [33]:
# Raw string: the backslashes of the Windows path must not be read as escape sequences
candidate_agg.to_csv(r'E:\Group Folder\Data\candidate_lvl_feats.csv')

#### At zipcode level
Our logic changes slightly as now we need to:
- For `flag_cols`: Calculate the mean to get the average pass rate and so on.
- For `continuous_columns`: The mean is still appropriate, for example the average age of the candidates from the zip code
- For `binary_candidate_lvl`: We take the sum to get the total counts in various categories
- For `binary_test_lvl`: Again we take the sum to get the total counts of tests taken remotely, out of town/state etc.
- For `multi_category_columns`: We take the mode to find the most frequent category.

In [4]:
# Creating aggregation dictionary based on the logic discussed above
zip_agg_dict = {}

# NOTE(review): "distances" is not among the columns produced by the candidate-level
# aggregation in this notebook; confirm that the candidate-level file read below
# really contains it, otherwise the zipcode aggregation raises a KeyError.
for col in continuous_colums+["distances"]:
    zip_agg_dict[col] = "mean"

for col in flag_cols:
    zip_agg_dict[col] = "mean"

## Adding candidate level aggregate columns
# The string "sum" is used instead of the builtin `sum`, which pandas deprecates as an aggregator
for col in binary_candidate_lvl+["num_tests","num_practice_tests"]:
    zip_agg_dict[col] = "sum"

# Removing zipcode and categorical columns for which we have dummy variables now.
# A list comprehension (rather than a set difference) keeps the listed order, so the
# aggregated columns come out in the same order on every run.
_zip_excluded = {"gender","language_code","zipcode","zip","c_state"}
multi_category_columns = [col for col in multi_category_columns if col not in _zip_excluded]

for col in multi_category_columns:
    # most frequent value; NaN when the group holds no non-null value at all
    zip_agg_dict[col] = lambda x: x.mode().iloc[0] if x.notna().any() else np.nan

# adding new dummy variables (which need to be summed up for each zip)
for col in ['gender_MALE','language_code_ESP']:
    if col not in binary_test_lvl:  # keeps the list free of duplicates when the cell is re-run
        binary_test_lvl.append(col)
for col in binary_test_lvl:
    zip_agg_dict[col] = "sum"

# we want to count the number of candidates
zip_agg_dict["candidate_id"] = "nunique"

In [6]:
# Raw string: the backslashes of the Windows path must not be read as escape sequences
candidate_agg = pd.read_csv(r'E:\Group Folder\Data\candidate_lvl_feats.csv')

In [7]:
# Aggregate the candidate-level rows per zipcode and exam subject
zip_groups = candidate_agg.reset_index().groupby(["zipcode", "exam_subject"])
zip_agg = zip_groups.agg(zip_agg_dict)

In [11]:
# Distinct test centers per (zipcode, exam_subject), attached to the zipcode-level table
test_center_counts = (
    df[["zipcode", "exam_subject", "test_center_id"]]
    .groupby(["zipcode", "exam_subject"])
    .agg('nunique')
)
zip_agg["num_test_centers"] = test_center_counts

In [12]:
# The distinct-candidate count is exposed under the name `num_candidates`
zip_agg = zip_agg.rename(columns={"candidate_id": "num_candidates"})

In [13]:
# Raw string: the backslashes of the Windows path must not be read as escape sequences
zip_agg.to_csv(r'E:\Group Folder\Data\zipcode_lvl_feats.csv')