In [26]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.colors import ListedColormap

from sklearn.linear_model import LogisticRegression

from sklearn import tree
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

In [21]:
url = "https://data.ojp.usdoj.gov/resource/ynf5-u8nk.csv"
df = pd.read_csv(url)

# Explore the dataset
print(df.head())

   id gender   race age_at_release  residence_puma gang_affiliated  \
0   1      M  BLACK          43-47              16           False   
1   2      M  BLACK          33-37              16           False   
2   3      M  BLACK    48 or older              24           False   
3   4      M  WHITE          38-42              16           False   
4   5      M  WHITE          33-37              16           False   

   supervision_risk_score_first supervision_level_first  \
0                           3.0                Standard   
1                           6.0             Specialized   
2                           7.0                    High   
3                           7.0                    High   
4                           4.0             Specialized   

         education_level dependents  ... drugtests_meth_positive  \
0  At least some college  3 or more  ...                0.000000   
1   Less than HS diploma          1  ...                0.000000   
2  At least some col

In [22]:
#print(df.info())

In [23]:
#print(df.describe())

In [24]:
# Drop columns
df = df.drop(columns=["id"])

# Convert categorical variables to dummy variables
df = pd.get_dummies(df, drop_first=True)

# Handle missing values if any
df = df.dropna()

# Define features and target variable
X = df.drop(
    columns=[
        "recidivism_within_3years",
        "recidivism_arrest_year1",
        "recidivism_arrest_year2",
        "recidivism_arrest_year3",
    ],
    axis=1,
)
y = df["recidivism_within_3years"]

print(len(df.columns))

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

119


**LOGISTIC REGRESSION**

In [27]:
# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the Lasso logistic regression model
lasso_model = LogisticRegression(penalty="l1", solver="liblinear", random_state=42)
lasso_model.fit(X_train_scaled, y_train)

# Print coefficients
coef_df = pd.DataFrame({"Feature": X.columns, "Coefficient": lasso_model.coef_[0]})
print(coef_df)

# Predictions and Evaluation
y_pred_logistic = lasso_model.predict(X_test_scaled)

                                 Feature  Coefficient
0                         residence_puma    -0.039025
1           supervision_risk_score_first     0.000000
2                prior_arrest_episodes_1     0.036453
3                prior_arrest_episodes_2     0.078853
4            prior_conviction_episodes_2     0.178448
..                                   ...          ...
110          program_unexcusedabsences_2    -0.108559
111  program_unexcusedabsences_3 or more     0.118189
112                  residence_changes_1     0.000242
113                  residence_changes_2     0.061021
114          residence_changes_3 or more     0.367359

[115 rows x 2 columns]


**DECISION TREE**

In [28]:
#train decision tree

clf = DecisionTreeClassifier() #clf to denote classifier

In [29]:
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 5, 10, 15, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Using GridSearchCV for hyperparameter tuning
grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best parameters found by GridSearchCV
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

# Using the best estimator found by GridSearchCV
best_clf = grid_search.best_estimator_

# Fit the model with the best parameters on the training data
best_clf.fit(X_train, y_train)

Best Parameters: {'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5}


In [None]:
from sklearn.metrics import confusion_matrix, classification_report

# Evaluate Decision Tree Classifier
y_pred_tree = best_clf.predict(X_test)
conf_matrix_tree = confusion_matrix(y_test, y_pred_tree) #conf matrix of Decision Tree
class_report_tree = classification_report(y_test, y_pred_tree)

# Evaluate Logistic Regression
conf_matrix_logistic = confusion_matrix(y_test, y_pred_logistic)
class_report_logistic = classification_report(y_test, y_pred_logistic)


# Display confusion matrices and classification reports
print("Confusion Matrix - Decision Tree Classifier:")
print(conf_matrix_tree)
print("\nClassification Report - Decision Tree Classifier:")
print(class_report_tree)

print("\nConfusion Matrix - Logistic Regression:")
print(conf_matrix_logistic)
print("\nClassification Report - Logistic Regression:")
print(class_report_logistic)