In [1]:
import pandas as pd
import pickle
import numpy as np
import boto3
from io import StringIO

# Location of the district extract: S3 bucket name and object key.
# `file` doubles as the local filename that pd.read_csv opens below.
bucket = 'team-3-project-data'
file = 'district_misdemeanor_felony.csv'

# S3 handles: a low-level client (used by the optional download below)
# and a resource object (used later to upload the model data).
s3_session = boto3.Session().client("s3")
s3_resource = boto3.resource("s3")

# Lookup objects loaded from local pickles: the courts mapper is applied to
# `fips` and the race mapper to `Race` in later cells.
# NOTE(review): unpickling can execute arbitrary code - only load pickle
# files that were produced by this project.
courts_map = pd.read_pickle('courts.pickle')
race_map = pd.read_pickle('race_map.pickle')

# Columns to load from the CSV; everything else in the file is skipped
load_cols = [
    'FinalDisposition', 'SentenceTime', 'Complainant', 'AmendedCharge',
    'DefenseAttorney', 'Gender', 'Race', 'CaseType', 'Class',
    'CodeSection', 'fips',
]

# Download from S3
## Only need to run once to get the file
# s3_session.download_file(Bucket=bucket, Key=file, Filename=file)

# Read in District Data, restricted to load_cols.
# (No de-duplication is performed at this step.)
district = pd.read_csv(file, usecols=load_cols)

In [2]:
# Preview the first five rows to sanity-check the load
district.head(n=5)

Unnamed: 0,fips,DefenseAttorney,Gender,Race,CodeSection,CaseType,Class,Complainant,AmendedCharge,FinalDisposition,SentenceTime
0,91,,Male,White Caucasian(Non-Hispanic),46.2-300,Misdemeanor,2,CARPENTER,,Complied With Law,0.0
1,91,,Male,White Caucasian(Non-Hispanic),B.46.2-853,Misdemeanor,1,CARPENTER,,Guilty In Absentia,
2,91,,Male,White Caucasian(Non-Hispanic),18.2-250.1,Misdemeanor,U,HYLTON,,Guilty,0.0
3,91,,Male,White Caucasian(Non-Hispanic),A.46.2-301,Misdemeanor,1,BRENDEL,LICENSE: DRIVE W/O,Guilty,0.0
4,91,,Male,White Caucasian(Non-Hispanic),B.46.2-853,Misdemeanor,1,BRENDEL,,Dismissed,0.0


In [3]:
# Feature Creation

# Exact DefenseAttorney spellings that are treated as a public defender
public_defender_labels = [
    'Public Defender',
    'PD',
    'PUB DEF',
    'P DEF',
    'PUBLIC DEFENDER(TSR)',
    'P/D',
    'PUBLIC DEFENDER 703-934-5600',
    'PUBLIC DEFENDER  703-934-5600',
    'PUBLIC DEFENDER 934-5600',
    'PUBLIC DEFENDER (TSR)',
    'PUBLIC DEF',
    'PDEF',
]

# Every distinct (non-null) attorney value containing the substring 'PD'.
# NOTE(review): this is a case-sensitive substring test, so any attorney
# value that merely contains "PD" is flagged too - confirm that is intended.
attorney_counts = district['DefenseAttorney'].value_counts()
pd_substring_labels = [name for name in attorney_counts.index if 'PD' in name]

# Create Public Defender Column: True when the attorney matches either list
district['PublicDefender'] = district['DefenseAttorney'].isin(
    public_defender_labels + pd_substring_labels
)

# Create Column indicating if a charge was amended (AmendedCharge present)
district['ChargeAmended'] = district['AmendedCharge'].notna()

# Add in Courts Data by looking each FIPS code up in the courts mapper
district['Court'] = district['fips'].map(courts_map)

In [4]:
# Clean Data
#
# Column-level fixes are applied to the loaded frame first and the row
# filtering is done last, in one pass. Assigning a column on a frame that
# was just produced by boolean-mask filtering triggers pandas'
# SettingWithCopyWarning (on versions without copy-on-write); this ordering
# avoids that and scans the frame once instead of three times.

# Clean Mappings for Race variable.
# NOTE(review): Series.map gives NaN for any label missing from race_map, and
# re-running this cell maps the already-mapped values again - run it once
# per fresh load of `district`.
district['Race'] = district['Race'].map(race_map)

# Give NULL Charge Class an explicit 'None' label
district['Class'] = district['Class'].fillna('None')

# Give NULL Sentence Time a 0 value - as no sentence would equate to 0.
# NOTE(review): astype(int) truncates any fractional value - confirm
# SentenceTime only holds whole numbers.
district['SentenceTime'] = district['SentenceTime'].fillna(0).astype(int)

# Drop records where any required field is NULL. Counts noted by the
# original author: CodeSection (6 records), FinalDisposition (89854 records),
# Complainant (1 record).
district = district.dropna(subset=['CodeSection', 'FinalDisposition', 'Complainant'])

In [5]:
# Columns to keep for modelling: only the response and predictor variables.
# This list is used to subset the data frame when the model data is written.
usecols = [
    'FinalDisposition',
    'SentenceTime',
    'Court',
    'Complainant',
    'PublicDefender',
    'Gender',
    'Race',
    'CaseType',
    'Class',
    'CodeSection',
    'ChargeAmended',
]

In [15]:
# Write Data for Model

# Serialise the modelling columns to CSV text in memory (index omitted).
# No row filtering happens here - every row of `district` is written.
model_csv = district[usecols].to_csv(index=False)

# Write the CSV text to the S3 bucket as model_data.csv
response = s3_resource.Object(bucket, 'model_data.csv').put(Body=model_csv)

# HTTP status code from the response metadata (None when it is absent)
status = response.get("ResponseMetadata", {}).get("HTTPStatusCode")

# Print whether it was successful
if status != 200:
    print(f"Unsuccessful S3 put_object - response. Status - {status}")
else:
    print(f"Successful S3 put_object - response. Status - {status}")

Successful S3 put_object - response. Status - 200


# Data Exploration

In [None]:
#TODO