In [4]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

In [18]:
# Location of the input CSV -- adjust to wherever the file lives
file_path = 'crime_randomforest.csv'

# Columns fed to the model as inputs, and the column to be predicted
features = ['vict_age', 'vict_sex', 'vict_descent', 'AREA NAME', 'period']
target = 'Crm.Cd.Group'

# Read the table, then keep only rows that are complete in every modelling
# column (a deliberately simple missing-value strategy)
data = pd.read_csv(file_path)
data = data.dropna(subset=[*features, target])

# Integer-encode every modelling column (inputs and target alike). Each fitted
# encoder is kept, keyed by column name, so codes can be translated back later.
label_encoders = {}
for col in [*features, target]:
    encoder = LabelEncoder()
    column_as_text = data[col].astype(str)  # uniform string type before encoding
    data[col] = encoder.fit_transform(column_as_text)
    label_encoders[col] = encoder

# Separate the model inputs (X) from the prediction target (y)
X, y = data[features], data[target]


# Train/Test Split: hold out 20% of the rows; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the Random Forest Classifier (seeded so repeated runs give the same forest)
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Make predictions on the held-out rows
y_pred = model.predict(X_test)

# Evaluate the model.
# Some classes are never predicted, which leaves their precision undefined.
# zero_division=0 reports those entries as 0 (the same value the default shows)
# without emitting a warning for each affected metric.
print("Classification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

# Predict probabilities for the first 5 samples (example);
# each row holds one probability per class
y_proba = model.predict_proba(X_test)
print("Probabilities for first 5 samples:")
print(y_proba[:5])

Classification Report:
              precision    recall  f1-score   support

           0       0.27      0.02      0.03      3884
           1       0.38      0.14      0.21      4923
           2       0.40      0.09      0.14      2298
           3       0.00      0.00      0.00        77
           4       0.45      0.15      0.23      4706
           5       0.50      0.92      0.65     16563
           6       0.00      0.00      0.00       478
           7       0.00      0.00      0.00       299
           8       0.21      0.01      0.02      1403
           9       0.00      0.00      0.00       304

    accuracy                           0.48     34935
   macro avg       0.22      0.13      0.13     34935
weighted avg       0.41      0.48      0.38     34935



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Probabilities for first 5 samples:
[[0.11143029 0.12902367 0.04324296 0.         0.13131697 0.52063585
  0.01455894 0.01151693 0.03827439 0.        ]
 [0.23386889 0.14031314 0.02846849 0.00553156 0.05840734 0.44116314
  0.00511563 0.         0.0653648  0.02176701]
 [0.09220385 0.12586218 0.06702907 0.         0.08832314 0.5871523
  0.00263481 0.00791832 0.02363583 0.00524051]
 [0.11071177 0.15898335 0.03098706 0.         0.16823936 0.42789707
  0.         0.04540353 0.05777785 0.        ]
 [0.23759176 0.19802936 0.0221639  0.00624614 0.03826113 0.36046741
  0.00785389 0.0020251  0.11313907 0.01422224]]


In [36]:
# Get predicted probabilities (one column per entry of model.classes_)
y_proba = model.predict_proba(X_test)

# Convert y_test to a NumPy array so it can be broadcast against the rankings
y_test_array = y_test.to_numpy()

# Rank the classes of every sample from least to most probable.
# argsort returns column positions, so they are translated through
# model.classes_ to obtain actual class labels; comparing raw positions with
# the labels in y_test is only correct when the labels happen to be 0..k-1.
# NOTE: the order among classes with equal probability (e.g. several zeros)
# is whatever argsort produces, so the "low 3" set can be arbitrary on ties.
ranked_classes = model.classes_[np.argsort(y_proba, axis=1)]

# Calculate Top-3 Accuracy: the true class is among the 3 most probable classes
top_3_preds = ranked_classes[:, -3:]
correct_count_top_3 = int((top_3_preds == y_test_array[:, None]).any(axis=1).sum())
top_3_accuracy = correct_count_top_3 / len(y_test_array)

# Calculate Low-3 Accuracy: the true class is among the 3 least probable classes
low_3_preds = ranked_classes[:, :3]
correct_count_low_3 = int((low_3_preds == y_test_array[:, None]).any(axis=1).sum())
low_3_accuracy = correct_count_low_3 / len(y_test_array)

# Print Results
print(f"Top-3 Accuracy: {top_3_accuracy:.2f}")
print(f"Low-3 Accuracy: {low_3_accuracy:.2f}")

print()

# Pair each input column with the importance the fitted forest assigned to it,
# then list the columns from most to least important
importances = model.feature_importances_
importance_table = pd.DataFrame(dict(feature=features, importance=importances))
feature_importance_df = importance_table.sort_values(by='importance', ascending=False)

print("Feature Importance:")
print(feature_importance_df)

Top-3 Accuracy: 0.81
Low-3 Accuracy: 0.02

Feature Importance:
        feature  importance
3     AREA NAME    0.399735
2  vict_descent    0.245697
1      vict_sex    0.130315
4        period    0.127966
0      vict_age    0.096287


In [39]:
# For every encoded column, build {integer code: original category label}.
# An encoder's classes_ array is indexed by the code it assigns, so enumerating
# it yields exactly the code -> label pairs.
label_mapping = {
    col: dict(enumerate(label_encoders[col].classes_))
    for col in [*features, target]
}

print("Label Mappings:")
print(label_mapping)

Label Mappings:
{'vict_age': {0: 'Adult (15~64)', 1: 'Elderly (65~)', 2: 'Youth (0~14)'}, 'vict_sex': {0: 'F', 1: 'H', 2: 'M'}, 'vict_descent': {0: 'Asian', 1: 'Black', 2: 'Descent is not Specified', 3: 'Hispanic', 4: 'Native American', 5: 'Other', 6: 'Pacific Islander', 7: 'White'}, 'AREA NAME': {0: '77TH STREET', 1: 'CENTRAL', 2: 'DEVONSHIRE', 3: 'FOOTHILL', 4: 'HARBOR', 5: 'HOLLENBECK', 6: 'HOLLYWOOD', 7: 'MISSION', 8: 'NEWTON', 9: 'NORTH HOLLYWOOD', 10: 'NORTHEAST', 11: 'OLYMPIC', 12: 'PACIFIC', 13: 'RAMPART', 14: 'SOUTHEAST', 15: 'SOUTHWEST', 16: 'TOPANGA', 17: 'VAN NUYS', 18: 'WEST LOS ANGELES', 19: 'WEST VALLEY', 20: 'WILSHIRE'}, 'period': {0: 'afternoon', 1: 'late_night', 2: 'morning', 3: 'night'}, 'Crm.Cd.Group': {0: 'AGG.ASSAULTS', 1: 'BURG.THEFT.FROMVEICHLE', 2: 'BURGLARY', 3: 'HOMICIDE', 4: 'OTHER.THEFT', 5: 'PART2 Crime', 6: 'PERSONAL.THEFT', 7: 'RAPE', 8: 'ROBBERY', 9: 'VEICHLE.THEFT'}}


In [46]:
# Function to map numerical age to age groups
def map_age_to_group(age):
    """
    Map a single numerical age to its age-group label.

    Args:
        age: A scalar age. None, NaN, zero and negative values are all treated
            as invalid or missing.

    Returns:
        "Youth (0~14)" for ages below 15, "Elderly (65~)" for ages above 64,
        "Adult (15~64)" otherwise, or np.nan when the age is invalid or missing.
    """
    # Missing values must be caught first: NaN fails every comparison below and
    # would otherwise fall through to the "Adult" branch, and None cannot be
    # compared with a number at all.
    if pd.isna(age):
        return np.nan
    # Zero and every negative value (not only -1) mark an invalid age
    if age <= 0:
        return np.nan
    if age < 15:
        return "Youth (0~14)"
    if age > 64:
        return "Elderly (65~)"
    return "Adult (15~64)"

# Updated Function to Predict for Numerical `vict_age`
def predict_new_sample(input_data, model, features, label_mapping, target_label_mapping):
    """
    Predict the output for one new sample given raw (un-encoded) feature values.

    Args:
        input_data (dict): Raw value for every name in `features`. `vict_age`
            may be a numerical age (converted with map_age_to_group) or an
            age-group label string (used as is). The dict is not modified.
        model (sklearn model): Fitted classifier exposing `predict`,
            `predict_proba` and `classes_`.
        features (list): Feature names, in the order the model was trained on.
        label_mapping (dict): For each feature, a dict of encoded value ->
            original label; used in reverse to encode the input.
        target_label_mapping (dict): Mapping of encoded values to original target labels.

    Returns:
        prediction (str): Predicted class label.
        probabilities (dict): Probability for each class, keyed by class label.

    Raises:
        KeyError: If a feature is missing from `input_data` or `label_mapping`.
        ValueError: If a feature value is not one of the known categories
            (this includes an invalid age, which maps to NaN).
    """
    # Work on a copy so the caller's dict keeps its raw values; mutating it
    # would make a second call with the same dict fail on the converted age.
    sample = dict(input_data)

    # Map a numerical `vict_age` to its group; a string is taken to be a group label already
    if 'vict_age' in sample and not isinstance(sample['vict_age'], str):
        sample['vict_age'] = map_age_to_group(sample['vict_age'])

    # Encode the input using the mappings that were passed in (label -> code),
    # rather than relying on a global encoder object
    encoded_input = []
    for feature in features:
        code_by_label = {str(label): code for code, label in label_mapping[feature].items()}
        value = str(sample[feature])
        if value not in code_by_label:
            raise ValueError(
                f"Unknown value {value!r} for feature {feature!r}; "
                f"expected one of {sorted(code_by_label)}"
            )
        encoded_input.append(code_by_label[value])

    # Reshape to a single-row 2D input
    encoded_input = np.array(encoded_input).reshape(1, -1)
    # A model fitted on a DataFrame records its column names; supplying them
    # again avoids the "X does not have valid feature names" warning.
    if hasattr(model, "feature_names_in_"):
        encoded_input = pd.DataFrame(encoded_input, columns=list(features))

    # Predict
    pred = model.predict(encoded_input)
    pred_proba = model.predict_proba(encoded_input)

    # Decode prediction
    predicted_label = target_label_mapping[pred[0]]

    # Map probabilities to target labels. Probability columns are ordered like
    # model.classes_, so pair them with the class codes instead of assuming
    # the codes are exactly 0..k-1.
    proba_dict = {
        target_label_mapping[class_code]: float(prob)
        for class_code, prob in zip(model.classes_, pred_proba[0])
    }

    return predicted_label, proba_dict

# Example input: a raw numerical age plus raw category labels for the other features
example_input = {
    'vict_age': 30,
    'vict_sex': 'M',
    'vict_descent': 'Hispanic',
    'AREA NAME': 'CENTRAL',
    'period': 'night',
}

# Run the prediction; the target column's own code -> label mapping decodes the result
predicted_label, probabilities = predict_new_sample(
    input_data=example_input,
    model=model,
    features=features,
    label_mapping=label_mapping,
    target_label_mapping=label_mapping[target],
)

# Show the predicted class and the per-class probabilities
print("Predicted Label:", predicted_label)
print("Probabilities:", probabilities)

Predicted Label: PART2 Crime
Probabilities: {'AGG.ASSAULTS': 0.1565328352659695, 'BURG.THEFT.FROMVEICHLE': 0.2745549228336818, 'BURGLARY': 0.018096595754899702, 'HOMICIDE': 0.0, 'OTHER.THEFT': 0.14779251069984242, 'PART2 Crime': 0.31260298998508107, 'PERSONAL.THEFT': 0.027389198524593615, 'RAPE': 0.0, 'ROBBERY': 0.06016227836958181, 'VEICHLE.THEFT': 0.002868668566350221}




In [47]:
import joblib
from sklearn.ensemble import RandomForestClassifier

# Persist the already-fitted model. It is not retrained here: fitting again on
# the same training data with the same seed would only rebuild the same forest.
joblib.dump(model, "random_forest_model.joblib")
print("Model saved as random_forest_model.joblib")

# The model only understands integer codes, so the fitted label encoders and
# the feature order are saved next to it; without them a reloaded model can
# neither encode new inputs nor decode its predictions.
joblib.dump(
    {"features": features, "label_encoders": label_encoders},
    "random_forest_label_encoders.joblib",
)
print("Label encoders saved as random_forest_label_encoders.joblib")

Model saved as random_forest_model.joblib
