In [19]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import PCA
from pyspark.ml.linalg import Vectors

In [20]:
# Colab-specific step: expose Google Drive under /content/drive/ so the
# dataset stored there can be read like a local file.
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [21]:
# Work from the Drive root so that relative file paths used later in the
# notebook (e.g. the CSV read below) resolve against it.
%cd /content/drive/MyDrive

/content/drive/MyDrive


In [22]:
# Reuse the active Spark session if one exists, otherwise start a new one.
spark = (
    SparkSession.builder
    .appName("FBI_Data")
    .getOrCreate()
)

# Load the NIBRS extract: the first row holds the column names and the
# column types are inferred from the data.
df = spark.read.csv(
    "city_scout_fbi_nibrs_2011_2021.csv",
    header=True,
    inferSchema=True,
)

In [23]:
# Preview the first 20 rows without shortening long cell values.
df.show(n=20, truncate=False)

+---------------+---------+---------+----------------+--------------+----------+--------+----------+-----------+-------+--------+-------------------------+------------+--------------------+----------+----------------------------+----------------------------+
|offense_type_id|victim_id|agency_id|city_name       |primary_county|state_abbr|state_id|population|incident_id|age_num|sex_code|race_id                  |ethnicity_id|resident_status_code|offense_id|offense_name                |offense_category_name       |
+---------------+---------+---------+----------------+--------------+----------+--------+----------+-----------+-------+--------+-------------------------+------------+--------------------+----------+----------------------------+----------------------------+
|45             |62205773 |8714     |Sterling Heights|Macomb        |MI        |26      |132255    |57347297   |10     |M       |White                    |2           |R                   |66872038  |All Other Larceny      

In [24]:
from pyspark.sql.functions import col

# Collect the distinct offense *category* names: group by the category
# column, then keep only that column (the per-group counts are discarded).
df_offense_category = (
    df.groupBy("offense_category_name")
    .count()
    .select("offense_category_name")
)

# Bring the (small) list of categories to the driver as a pandas DataFrame.
df_offense_category_pd = df_offense_category.toPandas()

# pandas Series of category names; this is what gets classified further down.
offenses = df_offense_category_pd["offense_category_name"]
print(offenses)

0                               Fraud Offenses
1                     Stolen Property Offenses
2                                      Robbery
3                                 Embezzlement
4                             Assault Offenses
5                        Prostitution Offenses
6                                      Bribery
7     Destruction/Damage/Vandalism of Property
8                                        Arson
9                 Pornography/Obscene Material
10                           Human Trafficking
11                         Extortion/Blackmail
12                      Counterfeiting/Forgery
13                      Drug/Narcotic Offenses
14                           Homicide Offenses
15                           Gambling Offenses
16                              Animal Cruelty
17                                Sex Offenses
18                      Larceny/Theft Offenses
19                Burglary/Breaking & Entering
20                        Kidnapping/Abduction
21           

In [25]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from collections import defaultdict
import pandas as pd

In [26]:
def create_training_data():
    """
    Return the hand-labelled training examples for the offense classifier.

    Every entry is a ``(offense_category_name, [group, ...])`` tuple where the
    list names the at-risk groups the offense category was manually assigned
    to. Each offense belongs to at least one group. A fresh list (with fresh
    inner lists) is built on every call, so callers may mutate the result.
    """
    children = "Children"
    elderly = "Elderly"
    women = "Women travelling solo"
    tourists = "Tourists"
    poc = "People of Color"

    # Immutable master table: (offense category, groups in labelling order).
    labelled = (
        ("Human Trafficking", (children, women)),
        ("Sex Offenses", (children, women)),
        ("Kidnapping/Abduction", (children, women)),
        ("Pornography/Obscene Material", (children,)),
        ("Fraud Offenses", (elderly, tourists)),
        ("Extortion/Blackmail", (elderly, tourists)),
        ("Counterfeiting/Forgery", (elderly, tourists)),
        ("Assault Offenses", (women, poc)),
        ("Prostitution Offenses", (women,)),
        ("Robbery", (women, elderly, tourists)),
        ("Stolen Property Offenses", (tourists,)),
        ("Motor Vehicle Theft", (tourists,)),
        ("Burglary/Breaking & Entering", (tourists, elderly)),
        ("Larceny/Theft Offenses", (tourists, elderly)),
        ("Destruction/Damage/Vandalism of Property", (poc,)),
        ("Drug/Narcotic Offenses", (poc, tourists)),
        ("Homicide Offenses", (poc, women)),
        ("Gambling Offenses", (tourists, elderly)),
        ("Weapon Law Violations", (poc,)),
        ("Bribery", (tourists, elderly)),
        ("Arson", (poc, tourists)),
        ("Animal Cruelty", (children, elderly)),
        ("Embezzlement", (elderly, tourists)),
    )

    return [(offense, list(groups)) for offense, groups in labelled]

In [27]:
class CategoryOffenseClassifier:
    """
    Multi-label classifier mapping an offense category name to at-risk groups.

    The offense name is turned into TF-IDF features over word uni- and
    bi-grams, and one random forest per group (wrapped in a
    ``MultiOutputClassifier``) predicts whether the offense concerns that group.

    Args:
        threshold: A group is assigned to an offense when its predicted
            probability is strictly greater than this value. Defaults to 0.3.
    """

    def __init__(self, threshold=0.3):
        self.categories = ['Children', 'Elderly', 'Women travelling solo', 'Tourists', 'People of Color']
        self.threshold = threshold
        self.vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1)
        self.classifier = MultiOutputClassifier(RandomForestClassifier(n_estimators=100, random_state=42))

    def fit(self):
        """
        Train the vectorizer and the per-group forests on the manual labels
        returned by ``create_training_data()``.

        Raises:
            ValueError: if a training label is not listed in ``self.categories``.
        """
        training_data = create_training_data()
        X = [offense for offense, _ in training_data]

        # Multi-hot target matrix: one row per offense, one column per group.
        y = np.zeros((len(training_data), len(self.categories)))
        for i, (_, cats) in enumerate(training_data):
            for cat in cats:
                y[i, self.categories.index(cat)] = 1

        X_transformed = self.vectorizer.fit_transform(X)
        self.classifier.fit(X_transformed, y)

    @staticmethod
    def _positive_proba(estimator, X):
        """Return P(label == 1) per sample, or zeros if the estimator never saw label 1."""
        proba = estimator.predict_proba(X)
        # Columns of predict_proba follow estimator.classes_; do not assume
        # that the positive class always sits in column 1.
        classes = list(estimator.classes_)
        if 1 in classes:
            return proba[:, classes.index(1)]
        return np.zeros(proba.shape[0])

    def predict(self, offenses):
        """
        Assign groups to each offense name.

        Args:
            offenses: iterable of offense (category) names.

        Returns:
            pandas.DataFrame with an ``'Offense'`` column followed by one 0/1
            column per entry of ``self.categories``. Every row has at least
            one 1: when no probability exceeds the threshold, the most
            probable group is assigned. Empty input yields an empty frame
            with the same columns.
        """
        # Materialise once so that generators / pandas Series can be both
        # vectorised and stored in the result.
        offenses = list(offenses)
        if not offenses:
            return pd.DataFrame(columns=['Offense'] + self.categories)

        X_new = self.vectorizer.transform(offenses)

        # Shape (n_offenses, n_categories): positive-class probability per group.
        proba = np.column_stack([
            self._positive_proba(estimator, X_new)
            for estimator in self.classifier.estimators_
        ])

        assigned = proba > self.threshold

        # Offenses that cleared the threshold for no group fall back to their
        # single most probable group.
        orphan_rows = np.flatnonzero(~assigned.any(axis=1))
        assigned[orphan_rows, proba[orphan_rows].argmax(axis=1)] = True

        result = pd.DataFrame(assigned.astype(np.int64), columns=self.categories)
        result.insert(0, 'Offense', offenses)
        return result

In [28]:
def create_csv_outputs(offense_list, output_prefix='offense_classifications'):
    """
    Classify offenses into at-risk groups and write the result as two CSV files:

    1. ``{output_prefix}_matrix.csv`` - binary matrix showing which offenses
       belong to which categories (one row per offense).
    2. ``{output_prefix}_by_category.csv`` - long list with one
       ``(Category, Offense)`` row per assignment, grouped by category.

    Args:
        offense_list: iterable of offense names; missing values (None / NaN)
            are skipped.
        output_prefix: path prefix shared by both output files.

    Returns:
        Tuple ``(matrix_df, category_df)`` holding the two tables that were
        written.
    """
    classifier = CategoryOffenseClassifier()
    classifier.fit()

    # Missing names cannot be vectorised, so drop both None and NaN.
    offense_list = [offense for offense in offense_list if pd.notna(offense)]

    # Get binary classification matrix
    if offense_list:
        matrix_df = classifier.predict(offense_list)
    else:
        # Nothing to classify: still produce tables (and files) with headers.
        matrix_df = pd.DataFrame(columns=['Offense'] + list(classifier.categories))
    matrix_df.to_csv(f'{output_prefix}_matrix.csv', index=False)

    # Create category-based list: categories in their canonical order, and
    # within each category the offenses in matrix row order.
    category_based = [
        {'Category': category, 'Offense': offense}
        for category in classifier.categories
        for offense in matrix_df.loc[matrix_df[category] == 1, 'Offense']
    ]

    category_df = pd.DataFrame(category_based, columns=['Category', 'Offense'])
    category_df.to_csv(f'{output_prefix}_by_category.csv', index=False)

    return matrix_df, category_df

In [29]:
# Train the classifier, label every offense category and export the results.
matrix_df, category_df = create_csv_outputs(offenses)

# Show the first rows of the wide (one row per offense) table ...
print("\nMatrix Format Preview (First 5 rows):", matrix_df.head(5), sep="\n")

# ... and of the long (one row per category/offense pair) table.
print("\nCategory-Based Format Preview (First 5 rows):", category_df.head(5), sep="\n")


Matrix Format Preview (First 5 rows):
                    Offense  Children  Elderly  Women travelling solo  \
0            Fraud Offenses         0        1                      0   
1  Stolen Property Offenses         0        0                      0   
2                   Robbery         0        1                      1   
3              Embezzlement         0        1                      0   
4          Assault Offenses         0        0                      1   

   Tourists  People of Color  
0         1                0  
1         1                0  
2         1                0  
3         1                0  
4         0                1  

Category-Based Format Preview (First 5 rows):
   Category                       Offense
0  Children  Pornography/Obscene Material
1  Children             Human Trafficking
2  Children                Animal Cruelty
3  Children                  Sex Offenses
4  Children          Kidnapping/Abduction
