In [30]:
import pandas as pd
import numpy as np
import matplotlib as mp
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [31]:
data = pd.read_csv('NIJ_s_Recidivism_Challenge_Training_Dataset.csv')

In [47]:
df = pd.DataFrame(data)

In [38]:
data.dtypes

ID                                                     int64
Gender                                                 int32
Race                                                   int32
Age_at_Release                                         int32
Residence_PUMA                                         int64
Gang_Affiliated                                       object
Supervision_Risk_Score_First                         float64
Supervision_Level_First                               object
Education_Level                                       object
Dependents                                            object
Prison_Offense                                        object
Prison_Years                                          object
Prior_Arrest_Episodes_Felony                          object
Prior_Arrest_Episodes_Misd                            object
Prior_Arrest_Episodes_Violent                         object
Prior_Arrest_Episodes_Property                        object
Prior_Arrest_Episodes_Dr

In [50]:
##Label Encoder Needs Uniform Data Types so switching all Categorical Columns to String
df = df.astype({'ID': int,
                'Gender': str,
               'Race': str,
               'Age_at_Release': str,
                'Residence_PUMA': int,
                'Gang_Affiliated': str,
                'Supervision_Risk_Score_First': float,
               'Supervision_Level_First': str,
               'Education_Level': str,
               'Dependents': str,
               'Prison_Offense': str,
               'Prison_Years': str,
               'Prior_Arrest_Episodes_Felony': str,
               'Prior_Arrest_Episodes_Misd': str,
               'Prior_Arrest_Episodes_Violent': str,
               'Prior_Arrest_Episodes_Property': str,
               'Prior_Arrest_Episodes_Drug': str,
               'Prior_Arrest_Episodes_PPViolationCharges': str,
               'Prior_Arrest_Episodes_DVCharges': str,
               'Prior_Arrest_Episodes_GunCharges': str,
               'Prior_Conviction_Episodes_Felony': str,
               'Prior_Conviction_Episodes_Misd': str,
               'Prior_Conviction_Episodes_Viol': str,
               'Prior_Conviction_Episodes_Prop': str,
               'Prior_Conviction_Episodes_Drug': str,
               'Prior_Conviction_Episodes_PPViolationCharges': str,
               'Prior_Conviction_Episodes_DomesticViolenceCharges': str,
               'Prior_Conviction_Episodes_GunCharges': str,
               'Prior_Revocations_Parole': str,
               'Prior_Revocations_Probation': str,
               'Condition_MH_SA': str,
               'Condition_Cog_Ed': str,
               'Condition_Other': str,
               'Violations_ElectronicMonitoring': str,
               'Violations_Instruction': str,
               'Violations_FailToReport': str,
               'Violations_MoveWithoutPermission': str,
               'Delinquency_Reports': str,
               'Program_Attendances': str,
               'Program_UnexcusedAbsences': str,
               'Residence_Changes': str,                
                'Avg_Days_per_DrugTest': float,
                'DrugTests_THC_Positive': float,
                'DrugTests_Cocaine_Positive': float,
                'DrugTests_Meth_Positive': float,
                'DrugTests_Other_Positive': float,
                'Percent_Days_Employed': float,
                'Jobs_Per_Year': float,                
               'Employment_Exempt': str,
               'Recidivism_Within_3years': str,
               'Recidivism_Arrest_Year1': str,
               'Recidivism_Arrest_Year2': str,
               'Recidivism_Arrest_Year3': str
               })

In [51]:
print(df.dtypes)

ID                                                     int32
Gender                                                object
Race                                                  object
Age_at_Release                                        object
Residence_PUMA                                         int32
Gang_Affiliated                                       object
Supervision_Risk_Score_First                         float64
Supervision_Level_First                               object
Education_Level                                       object
Dependents                                            object
Prison_Offense                                        object
Prison_Years                                          object
Prior_Arrest_Episodes_Felony                          object
Prior_Arrest_Episodes_Misd                            object
Prior_Arrest_Episodes_Violent                         object
Prior_Arrest_Episodes_Property                        object
Prior_Arrest_Episodes_Dr

In [52]:
le = LabelEncoder()

In [79]:
## Use Label Encoder to Encode Each Categorical Column Prior to One-Hot Encoding
df['Gender'] = le.fit_transform(df['Gender'])
df['Race'] = le.fit_transform(df['Race'])
df['Age_at_Release'] = le.fit_transform(df['Age_at_Release'])
df['Gang_Affiliated'] = le.fit_transform(df['Gang_Affiliated'])
df['Supervision_Level_First'] = le.fit_transform(df['Supervision_Level_First'])
df['Education_Level'] = le.fit_transform(df['Education_Level'])
df['Dependents'] = le.fit_transform(df['Dependents'])
df['Prison_Offense'] = le.fit_transform(df['Prison_Offense'])
df['Prison_Years'] = le.fit_transform(df['Prison_Years'])
df['Prior_Arrest_Episodes_Felony'] = le.fit_transform(df['Prior_Arrest_Episodes_Felony'])
df['Prior_Arrest_Episodes_Misd'] = le.fit_transform(df['Prior_Arrest_Episodes_Misd'])
df['Prior_Arrest_Episodes_Violent'] = le.fit_transform(df['Prior_Arrest_Episodes_Violent'])
df['Prior_Arrest_Episodes_Property'] = le.fit_transform(df['Prior_Arrest_Episodes_Property'])
df['Prior_Arrest_Episodes_Drug'] = le.fit_transform(df['Prior_Arrest_Episodes_Drug'])
df['Prior_Arrest_Episodes_PPViolationCharges'] = le.fit_transform(df['Prior_Arrest_Episodes_PPViolationCharges'])
df['Prior_Arrest_Episodes_DVCharges'] = le.fit_transform(df['Prior_Arrest_Episodes_DVCharges'])
df['Prior_Arrest_Episodes_GunCharges'] = le.fit_transform(df['Prior_Arrest_Episodes_GunCharges'])
df['Prior_Conviction_Episodes_Felony'] = le.fit_transform(df['Prior_Conviction_Episodes_Felony'])
df['Prior_Conviction_Episodes_Misd'] = le.fit_transform(df['Prior_Conviction_Episodes_Misd'])
df['Prior_Conviction_Episodes_Viol'] = le.fit_transform(df['Prior_Conviction_Episodes_Viol'])
df['Prior_Conviction_Episodes_Prop'] = le.fit_transform(df['Prior_Conviction_Episodes_Prop'])
df['Prior_Conviction_Episodes_Drug'] = le.fit_transform(df['Prior_Conviction_Episodes_Drug'])
df['Prior_Conviction_Episodes_PPViolationCharges'] = le.fit_transform(df['Prior_Conviction_Episodes_PPViolationCharges'])
df['Prior_Conviction_Episodes_DomesticViolenceCharges'] = le.fit_transform(df['Prior_Conviction_Episodes_DomesticViolenceCharges'])
df['Prior_Conviction_Episodes_GunCharges'] = le.fit_transform(df['Prior_Conviction_Episodes_GunCharges'])
df['Prior_Revocations_Parole'] = le.fit_transform(df['Prior_Revocations_Parole'])
df['Prior_Revocations_Probation'] = le.fit_transform(df['Prior_Revocations_Probation'])
df['Condition_MH_SA'] = le.fit_transform(df['Condition_MH_SA'])
df['Condition_Cog_Ed'] = le.fit_transform(df['Condition_Cog_Ed'])
df['Condition_Other'] = le.fit_transform(df['Condition_Other'])
df['Violations_ElectronicMonitoring'] = le.fit_transform(df['Violations_ElectronicMonitoring'])
df['Violations_Instruction'] = le.fit_transform(df['Violations_Instruction'])
df['Violations_FailToReport'] = le.fit_transform(df['Violations_FailToReport'])
df['Violations_MoveWithoutPermission'] = le.fit_transform(df['Violations_MoveWithoutPermission'])
df['Delinquency_Reports'] = le.fit_transform(df['Delinquency_Reports'])
df['Program_Attendances'] = le.fit_transform(df['Program_Attendances'])
df['Program_UnexcusedAbsences'] = le.fit_transform(df['Program_UnexcusedAbsences'])
df['Residence_Changes'] = le.fit_transform(df['Residence_Changes'])
df['Employment_Exempt'] = le.fit_transform(df['Employment_Exempt'])
df['Recidivism_Within_3years'] = le.fit_transform(df['Recidivism_Within_3years'])
df['Recidivism_Arrest_Year1'] = le.fit_transform(df['Recidivism_Arrest_Year1'])
df['Recidivism_Arrest_Year2'] = le.fit_transform(df['Recidivism_Arrest_Year2'])
df['Recidivism_Arrest_Year3'] = le.fit_transform(df['Recidivism_Arrest_Year3'])

In [80]:
df.head()

Unnamed: 0,ID,Gender,Race,Age_at_Release,Residence_PUMA,Gang_Affiliated,Supervision_Risk_Score_First,Supervision_Level_First,Education_Level,Dependents,...,DrugTests_Meth_Positive,DrugTests_Other_Positive,Percent_Days_Employed,Jobs_Per_Year,Employment_Exempt,Recidivism_Within_3years,Recidivism_Arrest_Year1,Recidivism_Arrest_Year2,Recidivism_Arrest_Year3,PPrior_Conviction_Episodes_Viol
0,1,1,0,5,16,0,3.0,2,0,3,...,0.0,0.0,0.488562,0.44761,0,0,0,0,0,0
1,2,1,0,3,16,0,6.0,1,2,1,...,0.0,0.0,0.425234,2.0,0,1,0,0,1,1
2,3,1,0,6,24,0,7.0,0,0,3,...,0.166667,0.0,0.0,0.0,0,1,0,1,0,1
3,4,1,1,4,16,0,7.0,0,2,1,...,0.0,0.0,1.0,0.718996,0,0,0,0,0,0
4,5,1,1,3,16,0,4.0,1,2,3,...,0.058824,0.0,0.203562,0.929389,0,1,1,0,0,1


In [81]:
print(df.dtypes)

ID                                                     int32
Gender                                                 int32
Race                                                   int32
Age_at_Release                                         int32
Residence_PUMA                                         int32
Gang_Affiliated                                        int32
Supervision_Risk_Score_First                         float64
Supervision_Level_First                                int32
Education_Level                                        int32
Dependents                                             int32
Prison_Offense                                         int32
Prison_Years                                           int32
Prior_Arrest_Episodes_Felony                           int32
Prior_Arrest_Episodes_Misd                             int32
Prior_Arrest_Episodes_Violent                          int32
Prior_Arrest_Episodes_Property                         int32
Prior_Arrest_Episodes_Dr

In [82]:
df.shape

(18028, 54)

In [57]:
onehotencoder = OneHotEncoder()
columnTransformer = columnTransformer = ColumnTransformer([('encoder',
                                        OneHotEncoder(),
                                        [0])],
                                      remainder='passthrough')

In [85]:
data = np.array(columnTransformer.fit_transform(df), dtype = np.int32)

SyntaxError: unexpected EOF while parsing (<ipython-input-85-d19d6abea0b8>, line 1)