### Load in the data

In [4]:
## Imports 
import io

import pandas as pd
import matplotlib.pyplot as plt


In [5]:
# Input CSVs, all relative to the notebook's working directory.
file_path_raw = './data/compas-scores-raw.csv'
file_path_non_violent = './data/compas-scores-two-years.csv'
# The violent-recidivism scores need their own file: if this path equals
# file_path_non_violent, the later merge just pairs the table with itself.
# NOTE(review): confirm compas-scores-two-years-violent.csv is present in ./data.
file_path_violent = './data/compas-scores-two-years-violent.csv'

compas_data_raw = pd.read_csv(file_path_raw)
two_years_data = pd.read_csv(file_path_non_violent)
two_years_violent_data = pd.read_csv(file_path_violent)

# DataFrame.info() writes its report to a buffer (stdout by default) and
# returns None, so capture the text to give compas_data_info a usable value,
# then echo it so the cell still displays the summary.
_info_buffer = io.StringIO()
compas_data_raw.info(buf=_info_buffer)
compas_data_info = _info_buffer.getvalue()
print(compas_data_info, end='')
compas_data_head = compas_data_raw.head()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60843 entries, 0 to 60842
Data columns (total 28 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Person_ID                60843 non-null  int64  
 1   AssessmentID             60843 non-null  int64  
 2   Case_ID                  60843 non-null  int64  
 3   Agency_Text              60843 non-null  object 
 4   LastName                 60843 non-null  object 
 5   FirstName                60843 non-null  object 
 6   MiddleName               15624 non-null  object 
 7   Sex_Code_Text            60843 non-null  object 
 8   Ethnic_Code_Text         60843 non-null  object 
 9   DateOfBirth              60843 non-null  object 
 10  ScaleSet_ID              60843 non-null  int64  
 11  ScaleSet                 60843 non-null  object 
 12  AssessmentReason         60843 non-null  object 
 13  Language                 60843 non-null  object 
 14  LegalStatus           

In [10]:
def process_data(df):
    """Attach 0/1 indicator columns derived from age, race, sex and charge degree.

    The frame is modified in place and the same object is returned.

    Args:
        df: DataFrame providing the columns 'age', 'race', 'sex' and
            'c_charge_degree'.

    Returns:
        The input ``df`` with these integer columns added (or overwritten):
        above_45, below_25, African_American, Asian, Hispanic,
        Native_American, Other, Female, misdemeanor.

    Raises:
        KeyError: if one of the required source columns is missing.
    """
    # Age buckets use strict comparisons: exactly 45 and exactly 25 map to 0.
    ages = df['age']
    df['above_45'] = ages.gt(45).astype(int)
    df['below_25'] = ages.lt(25).astype(int)

    # Output column name -> exact label expected in df['race']. A race value
    # not listed here ends up with 0 in every one of these indicators.
    race_labels = {
        'African_American': 'African-American',
        'Asian': 'Asian',
        'Hispanic': 'Hispanic',
        'Native_American': 'Native American',
        'Other': 'Other',
    }
    for indicator_name, label in race_labels.items():
        df[indicator_name] = df['race'].eq(label).astype(int)

    df['Female'] = df['sex'].eq('Female').astype(int)
    df['misdemeanor'] = df['c_charge_degree'].eq('M').astype(int)

    return df

def add_one_hot_encoded_columns(raw_df, merged_df):
    """Left-join one-hot MaritalStatus / Language indicators onto ``merged_df``.

    Indicators are built from ``raw_df`` and collapsed to one row per
    'Person_ID' before joining, so the result always has exactly one row per
    row of ``merged_df`` even when ``raw_df`` holds several rows per person.
    A person's indicator is 1 if any of their raw rows carried that value.

    Args:
        raw_df: DataFrame with a 'Person_ID' column and, optionally,
            'MaritalStatus' and/or 'Language' columns. A missing optional
            column simply contributes no indicator columns.
        merged_df: DataFrame with an 'id' column to join on.

    Returns:
        A new DataFrame: ``merged_df`` plus 'MaritalStatus_*' / 'Language_*'
        columns. Rows whose 'id' has no match in ``raw_df`` get NaN there.
        The helper 'Person_ID' column is dropped from the result.

    Raises:
        KeyError: if 'Person_ID' is missing from ``raw_df`` or 'id' is missing
            from ``merged_df``.
    """
    key_column = 'Person_ID'
    merge_column = 'id'
    if key_column not in raw_df.columns:
        raise KeyError(f"Key column '{key_column}' not found in raw_df")
    if merge_column not in merged_df.columns:
        raise KeyError(f"Key column '{merge_column}' not found in merged_df")

    # dtype=int keeps the indicators as 0/1 integers, matching the indicator
    # columns produced by process_data.
    dummy_frames = [
        pd.get_dummies(raw_df[source_column], prefix=source_column, dtype=int)
        for source_column in ('MaritalStatus', 'Language')
        if source_column in raw_df.columns
    ]
    raw_with_dummies = pd.concat([raw_df[[key_column]], *dummy_frames], axis=1)

    # Collapse to one row per person. Without this, a left merge against
    # repeated Person_ID values would duplicate the matching merged_df rows.
    per_person_dummies = raw_with_dummies.groupby(key_column).max().reset_index()

    # NOTE(review): assumes merged_df['id'] and raw_df['Person_ID'] identify
    # people in the same ID space -- confirm against the data sources.
    one_hot_encoded_df = pd.merge(
        merged_df,
        per_person_dummies,
        left_on=merge_column,
        right_on=key_column,
        how='left',
    )
    return one_hot_encoded_df.drop(columns=[key_column])



In [None]:
# Derive the 0/1 indicator columns on both score tables.
two_years_data = process_data(two_years_data)
two_years_violent_data = process_data(two_years_violent_data)

# Merge datasets on shared identifiers
merged_data = pd.merge(
    two_years_data,
    two_years_violent_data,
    on='id',
    suffixes=('_general', '_violent'),
    how='outer'
)

columns_of_interest = [
    'priors_count', 'above_45', 'below_25', 'African_American', 'Asian',
    'Hispanic', 'Native_American', 'Other', 'Female', 'misdemeanor',
    'v_decile_score', 'age', 'juv_fel_count', 'two_year_recid', 'is_recid'
]
additional_features = [
    'juv_misd_count', 'juv_other_count', 'days_b_screening_arrest', 
    'c_charge_desc', 'screening_date', 'decile_score', 'score_text'
]

# The merge renames every non-key column present in BOTH inputs to
# '<name>_general' / '<name>_violent', so a plain-name lookup misses them.
# Resolve each wanted feature to its plain column if it survived, otherwise
# to the '_general' copy, and rename that copy back to the plain name.
# 'id' is always kept because add_one_hot_encoded_columns joins on it.
# Because the merge is an outer join, rows that exist only in the violent
# table carry NaN in the '_general' columns.
selected_columns = {'id': 'id'}
for feature in columns_of_interest + additional_features:
    general_name = f'{feature}_general'
    if feature in merged_data.columns:
        selected_columns[feature] = feature
    elif general_name in merged_data.columns:
        selected_columns[general_name] = feature

resolved_names = set(selected_columns.values())
columns_of_interest = [col for col in columns_of_interest if col in resolved_names]
additional_features = [col for col in additional_features if col in resolved_names]
merged_data = merged_data[list(selected_columns)].rename(columns=selected_columns)

final_data = add_one_hot_encoded_columns(compas_data_raw, merged_data)

final_output_path = './data/compas_final_combined_dataset.csv'
final_data.to_csv(final_output_path, index=False)

