In [161]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

In [199]:
ticker = 'High_Value_Dataset_July_2021'
input_dir = r'./data/'
file = os.path.join(input_dir, ticker + '.csv')

df = pd.read_csv(file)
df.head()

Unnamed: 0,SID Number,TDCJ Number,Name,Current Facility,Gender,Race,Age,Projected Release,Maximum Sentence Date,Parole Eligibility Date,Case Number,County,Offense Code,TDCJ Offense,Sentence Date,Offense Date,Sentence (Years),Last Parole Decision,Next Parole Review Date,Parole Review Status
0,524524,758486,"HARPER,DAVID JOHN JR",Estelle,M,W,92,01/01/9999,01/01/9999,08/03/2026,95-765-C,McLennan,11180000,AGG SEXUAL ASSAULT,08/13/1996,09/15/1995,Life,,08/03/2026,NOT IN REVIEW PROCESS
1,703267,841623,"RODRIGUEZ,THEODORE",Beto,M,W,83,08/03/2043,08/03/2043,02/01/2021,97-396-C,McLennan,11990003,AGG SEX ASLT,08/05/1998,06/01/1996,45.0,,Unavailable at this time.,IN PAROLE REVIEW PROCESS
2,766785,365547,"BROWN,NARRIES EARL",W. Scott,M,W,85,01/01/9999,01/01/9999,08/24/2003,F83-89728-HI,Dallas,11220000,AGG SEX ABUSE CHILD/U14,08/24/1983,03/15/1983,Life,Denied on 12/05/2016,Unavailable at this time.,NOT IN REVIEW PROCESS
3,770626,449674,"FLORES,ISABEL",Pack,M,H,84,01/01/9999,01/01/9999,09/02/2006,86CR-1234-B,Nueces,9150000,MURDER W/DEADLY WPN,04/13/1987,09/02/1986,Life,Denied on 07/21/2021,07/2024,NOT IN REVIEW PROCESS
4,771601,1491019,"MOLETT,JOHN HENRY",Pack,M,B,79,12/02/2037,12/02/2037,12/02/2022,1144294,Harris,12990002,AGG ROBBERY W/DW,02/21/2008,12/03/2007,30.0,,12/02/2022,NOT IN REVIEW PROCESS


In [200]:
df.isna().sum()

SID Number                     0
TDCJ Number                    0
Name                           0
Current Facility               0
Gender                         0
Race                           0
Age                            0
Projected Release             11
Maximum Sentence Date       1914
Parole Eligibility Date     9847
Case Number                   38
County                         0
Offense Code                   0
TDCJ Offense                   0
Sentence Date               1990
Offense Date                   0
Sentence (Years)              31
Last Parole Decision        8857
Next Parole Review Date    22368
Parole Review Status        8857
dtype: int64

In [201]:
# drop all rows with null values, drop where doesn't have a last parole decision
df = df.dropna()
df = df[df['Last Parole Decision'] != 'None']
# df = df[df['Next Parole Review Date'] != 'Unavailable at this time.']

# drop next parole review date because those who were approved do not have one
df = df.drop(columns='Next Parole Review Date')

# drop columns because they are not useful
drop_cols = ['Name', 'SID Number', 'TDCJ Number', 'Current Facility', 'County']
df = df.drop(columns=drop_cols)

In [202]:
## Change data types here
le = LabelEncoder()
## Categorical to numerical
df['Gender'] = le.fit_transform(df['Gender'])
df['Race'] = le.fit_transform(df['Race'])
df['TDCJ Offense'] = le.fit_transform(df['TDCJ Offense'])
df['Age'].astype(int)

#create a class attribute that is formulated if the person was approved or not
df['Last Parole Decision'] = np.where(df['Last Parole Decision'].str.startswith('Approve'), 1, 0)

## If Parole Review Status is IN PAROLE REVIEW PROCESS, then 1, else 0
df['Parole Review Status'] = np.where(df['Parole Review Status'] == 'IN PAROLE REVIEW PROCESS', 1, 0)

"""
999 if Sentence (Years) is Life/Captial Life (not sure if this is the best way to do this)
Something to discuss
"""
df['Sentence (Years)'] = np.where((df['Sentence (Years)'] == 'Capital Life') | (df['Sentence (Years)'] == 'Life'), 999, df['Sentence (Years)'])


## converting date columns to date columns
df['Sentence Date'] = pd.to_datetime(df['Sentence Date'], format='%m/%d/%Y')
df['Offense Date'] = pd.to_datetime(df['Offense Date'], format='%m/%d/%Y')

In [203]:
df.head()

Unnamed: 0,Gender,Race,Age,Projected Release,Maximum Sentence Date,Parole Eligibility Date,Case Number,Offense Code,TDCJ Offense,Sentence Date,Offense Date,Sentence (Years),Last Parole Decision,Parole Review Status
2,1,6,85,01/01/9999,01/01/9999,08/24/2003,F83-89728-HI,11220000,952,1983-08-24,1983-03-15,999.0,0,0
3,1,2,84,01/01/9999,01/01/9999,09/02/2006,86CR-1234-B,9150000,6202,1987-04-13,1986-09-02,999.0,0,0
5,1,1,83,06/19/2023,06/19/2023,06/18/2013,"04CR-16,149",11990003,1194,2004-05-13,1999-07-24,20.0,0,0
6,1,6,84,01/01/9999,01/01/9999,05/24/1981,225031,9130000,3071,1975-04-30,1974-06-28,999.0,0,1
12,1,2,80,01/01/9999,01/01/9999,05/22/1982,233429,9130000,3071,1975-10-06,1975-03-24,999.0,0,0


In [204]:
# find the current date to set all other dates against (number of days away)
current_date = 0

if max(df['Sentence Date']) > max(df['Offense Date']):
    current_date = max(df['Sentence Date'])
else:
    current_date = max(df['Offense Date'])
print(current_date)

2021-07-12 00:00:00


In [205]:
# find number of days Parole Eligibility is away from current date
pe_date = []

for i in df['Parole Eligibility Date']:
    if i != '01/01/9999':
        temp = pd.to_datetime(i, format='%m/%d/%Y')
        num_years = temp - current_date
        num_years = round(np.absolute((num_years / np.timedelta64(1, 'Y'))))
        pe_date.append(num_years)

df['Parole Eligibility (Years)'] = pe_date

In [212]:
# find number of days Max Release Date  is away from current date
ms_date = []

## Simplified
for i in df['Maximum Sentence Date']:
    """
    If the sentence is more than current_date + 100 years, then set it to 999 Years
    """
    if int(i[-4:]) > current_date.year + 100:
        ms_date.append(999)
    # if int(i[-4:]) >= 9999 or int(i[-4:]) > 2262:
    #     ms_date.append(999)
    # elif int(i[-4:]) < 9999 and int(i[-4:]) >= 8888:
    #     ms_date.append(99)
    # elif int(i[-4:]) < 8888 and int(i[-4:]) >= 5555:
    #     ms_date.append(1000000)
    # elif int(i[-4:]) < 5555 and int(i[-4:]) > 2262:   # since only 8 tuples fall into this time period
    #     ms_date.append(100000)
    else:
        temp = pd.to_datetime(i, format='%m/%d/%Y')
        num_years = temp - current_date
        num_years = round((num_years/np.timedelta64(1, 'Y')))  ## converted to years
        ms_date.append(num_years)

df['Sentence Left (Years)'] = ms_date

In [211]:
df.head(5)

Unnamed: 0,Gender,Race,Age,Projected Release,Maximum Sentence Date,Parole Eligibility Date,Case Number,Offense Code,TDCJ Offense,Sentence Date,Offense Date,Sentence (Years),Last Parole Decision,Parole Review Status,Parole Eligibility (Years),Sentence Left (Years)
2,1,6,85,01/01/9999,01/01/9999,08/24/2003,F83-89728-HI,11220000,952,1983-08-24,1983-03-15,999.0,0,0,18,999
3,1,2,84,01/01/9999,01/01/9999,09/02/2006,86CR-1234-B,9150000,6202,1987-04-13,1986-09-02,999.0,0,0,15,999
5,1,1,83,06/19/2023,06/19/2023,06/18/2013,"04CR-16,149",11990003,1194,2004-05-13,1999-07-24,20.0,0,0,8,2
6,1,6,84,01/01/9999,01/01/9999,05/24/1981,225031,9130000,3071,1975-04-30,1974-06-28,999.0,0,1,40,999
12,1,2,80,01/01/9999,01/01/9999,05/22/1982,233429,9130000,3071,1975-10-06,1975-03-24,999.0,0,0,39,999


In [179]:
columns = ['Gender', 'Race', 'Age', 'Sentence (Years)', 'Last Parole Decision', 'Parole Review Status', 'Parole Eligibility (Years)', 'Sentence Left (Years)']
# df = df[columns]
df[columns]

Unnamed: 0,Gender,Race,Age,Sentence (Years),Last Parole Decision,Parole Review Status,Parole Eligibility (Years),Sentence Left (Years)
2,1,6,85,999,0,0,18,999
3,1,2,84,999,0,0,15,999
5,1,1,83,20.0,0,0,8,2
6,1,6,84,999,0,1,40,999
12,1,2,80,999,0,0,39,999
...,...,...,...,...,...,...,...,...
118304,1,1,36,6.0,0,0,1,4
118305,1,6,30,10.0,0,0,0,9
118307,1,2,20,6.0,0,0,0,3
118313,1,1,24,7.0,0,1,1,6


In [51]:
df.to_csv(r'./data/' + ticker + '_preprocessed.csv')

        SID Number  Last Parole Decision
2           766785                     0
3           770626                     0
5           779423                     0
6           799447                     0
8           834039                     0
...            ...                   ...
118305    50823966                     0
118307    50824199                     0
118310    50824228                     1
118313    50824270                     0
118317    50824569                     0

[61209 rows x 2 columns]
