In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import PCA
from pyspark.ml.linalg import Vectors

# Start a Spark session for this notebook, or attach to one that is already running.
spark = (
    SparkSession.builder
    .appName("PCAExample")
    .getOrCreate()
)


In [None]:
import torch

# Report whether PyTorch can see a CUDA-capable GPU in this runtime.
cuda_available = torch.cuda.is_available()
print("CUDA available:", cuda_available)


CUDA available: True


In [None]:
# Mount Google Drive inside the Colab runtime so its files are reachable by path.
from google.colab import drive
# Later cells read the input CSV from /content/drive/MyDrive/ under this mount point.
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
# Make the Drive root the working directory so relative paths
# (e.g. the CSV written by the last cell) resolve there.
%cd /content/drive/MyDrive

/content/drive/MyDrive


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql import functions as F
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler
from pyspark.ml import Pipeline
from keras.models import Model
from keras.layers import Input, Dense
import numpy as np
# Session handle used by the rest of the notebook. getOrCreate() hands back an
# already-running session when one exists (e.g. the one built in the first cell).
spark = (
    SparkSession.builder
    .appName("AutoencoderRiskScore")
    .getOrCreate()
)

In [None]:
import pandas as pd


# Input CSV, located under the mounted Drive root. Update with the correct path.
file_path = 'city_scout_fbi_nibrs_2011_2021.csv'
csv_location = "/content/drive/MyDrive/" + file_path

# First row is the header; column types are inferred by Spark.
data = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(csv_location)
)

# Despite the name, this is the full dataset, not a 1,000-row sample.
firstThousDF = data

In [None]:

# Alias of the loaded dataset used by the preprocessing cells below.
# Autoencoder inputs: population, age_num, sex_code, race,
# resident_status_code and offense_name.
bendDF = firstThousDF

total_rows = bendDF.count()
print(total_rows)

6822798


In [None]:
# Approximate medians used to impute the two numeric columns.
# relativeError=0.1 trades accuracy for speed: the returned value may lie
# anywhere between roughly the 40th and 60th percentile.
median_value = bendDF.approxQuantile("population", [0.5], 0.1)[0]
median_value_age = bendDF.approxQuantile("age_num", [0.5], 0.1)[0]

# (source column, new column, replacement for nulls), listed in the order the
# new columns are appended to the frame.
fill_plan = [
    ("age_num", "age_filled", median_value_age),
    ("population", "population_filled", median_value),
    ("race_id", "race_id_filled", "Unknown"),
    ("offense_name", "offense_name_filled", "Unknown"),
    ("resident_status_code", "resident_status_code_filled", "Unknown"),
    ("sex_code", "sex_code_filled", "Unknown"),
]

# Originals are left untouched; every imputed value goes to a *_filled column.
res = bendDF
for source_col, filled_col, replacement in fill_plan:
    res = res.withColumn(
        filled_col,
        when(col(source_col).isNull(), replacement).otherwise(col(source_col)),
    )

res.show()

+---------------+---------+---------+----------------+--------------+----------+--------+----------+-----------+-------+--------+--------------------+------------+--------------------+----------+--------------------+---------------------+----------+-----------------+--------------------+--------------------+---------------------------+---------------+
|offense_type_id|victim_id|agency_id|       city_name|primary_county|state_abbr|state_id|population|incident_id|age_num|sex_code|             race_id|ethnicity_id|resident_status_code|offense_id|        offense_name|offense_category_name|age_filled|population_filled|      race_id_filled| offense_name_filled|resident_status_code_filled|sex_code_filled|
+---------------+---------+---------+----------------+--------------+----------+--------+----------+-----------+-------+--------+--------------------+------------+--------------------+----------+--------------------+---------------------+----------+-----------------+--------------------+----

In [None]:
# Count the nulls in every column of `res` (one output column per input column).
# F.sum / F.col are referenced explicitly: a bare `sum` only resolves to the
# Spark aggregate because the wildcard import shadows Python's builtin, and the
# builtin would raise on a Column expression.
null_counts = res.select(
    [
        F.sum(F.col(column_name).isNull().cast("int")).alias(column_name)
        for column_name in res.columns
    ]
)
null_counts.show()

+---------------+---------+---------+---------+--------------+----------+--------+----------+-----------+-------+--------+-------+------------+--------------------+----------+------------+---------------------+-----------------+--------------+-------------------+---------------------------+---------------+
|offense_type_id|victim_id|agency_id|city_name|primary_county|state_abbr|state_id|population|incident_id|age_num|sex_code|race_id|ethnicity_id|resident_status_code|offense_id|offense_name|offense_category_name|population_filled|race_id_filled|offense_name_filled|resident_status_code_filled|sex_code_filled|
+---------------+---------+---------+---------+--------------+----------+--------+----------+-----------+-------+--------+-------+------------+--------------------+----------+------------+---------------------+-----------------+--------------+-------------------+---------------------------+---------------+
|              0|        0|        0|  1700668|         50094|         1|   

In [None]:
from pyspark.ml.feature import StandardScaler

# Column groups fed into the feature pipeline.
currNumCols = ["population_filled", "age_filled"]
otherCols = [
    "sex_code_filled",
    "race_id_filled",
    "resident_status_code_filled",
    "offense_name_filled",
]

# One StringIndexer per categorical column: each maps string labels to
# numeric indices.
raceIndexer = StringIndexer(
    inputCol="race_id_filled",
    outputCol="race_id_index",
)
offenseIndexer = StringIndexer(
    inputCol="offense_name_filled",
    outputCol="crime_type_index",
)
residentIndexer = StringIndexer(
    inputCol="resident_status_code_filled",
    outputCol="resident_status_index",
)
sexCodeIndexer = StringIndexer(
    inputCol="sex_code_filled",
    outputCol="sex_code_index",
)

# Numeric columns first, then the indexed categoricals, packed into one vector.
indexed_cols = ["sex_code_index", "race_id_index", "crime_type_index", "resident_status_index"]
assembler = VectorAssembler(
    inputCols=currNumCols + indexed_cols,
    outputCol="features_for_scaling",
)

# Standardize every vector component (centered, unit standard deviation).
scaler = StandardScaler(
    inputCol="features_for_scaling",
    outputCol="scaledFeatures",
    withStd=True,
    withMean=True,
)

feature_stages = [
    raceIndexer,
    offenseIndexer,
    residentIndexer,
    sexCodeIndexer,
    assembler,
    scaler,
]
pipeline = Pipeline(stages=feature_stages)

# Fit and apply on the same frame, so every label seen at transform time was
# also seen when the indexers were fitted.
fitted_pipeline = pipeline.fit(res)
scaled_df = fitted_pipeline.transform(res)
scaled_df.show()


+---------------+---------+---------+----------------+--------------+----------+--------+----------+-----------+-------+--------+--------------------+------------+--------------------+----------+--------------------+---------------------+----------+-----------------+--------------------+--------------------+---------------------------+---------------+-------------+----------------+---------------------+--------------+--------------------+--------------------+
|offense_type_id|victim_id|agency_id|       city_name|primary_county|state_abbr|state_id|population|incident_id|age_num|sex_code|             race_id|ethnicity_id|resident_status_code|offense_id|        offense_name|offense_category_name|age_filled|population_filled|      race_id_filled| offense_name_filled|resident_status_code_filled|sex_code_filled|race_id_index|crime_type_index|resident_status_index|sex_code_index|features_for_scaling|      scaledFeatures|
+---------------+---------+---------+----------------+--------------+---

In [None]:
# Pull every scaled feature vector to the driver as a 2-D NumPy array
# (one row per record). This materializes the whole column in driver memory.
scaled_vectors = scaled_df.select("scaledFeatures").rdd.map(lambda row: row[0].toArray())
features_array = np.array(scaled_vectors.collect())



In [None]:
# Autoencoder sizing: the input width is taken from the collected feature matrix.
input_dim = features_array.shape[1]
# NOTE(review): the assembler feeds 6 features, so a 6-unit code layer is not a
# bottleneck. Consider a smaller value if reconstruction error is meant to act
# as an anomaly / risk score.
encoding_dim = 6  # Adjust based on your dataset

# Define the autoencoder architecture: input -> ReLU code layer -> reconstruction.
input_layer = Input(shape=(input_dim,))
encoder = Dense(encoding_dim, activation='relu')(input_layer)
# The targets are standardized features (zero mean, unit variance), so they are
# frequently negative or greater than 1. A sigmoid output is confined to (0, 1)
# and cannot reproduce them, which pins the MSE near a constant; a linear output
# has unrestricted range.
decoder = Dense(input_dim, activation='linear')(encoder)

autoencoder = Model(inputs=input_layer, outputs=decoder)
autoencoder.compile(optimizer='adam', loss='mean_squared_error')

In [None]:
# Train the network to reproduce its own input (the features are both x and y).
autoencoder.fit(
    x=features_array,
    y=features_array,
    epochs=5,
    batch_size=32,
    shuffle=True,
)


Epoch 1/5
[1m213213/213213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m326s[0m 2ms/step - loss: 0.5135
Epoch 2/5
[1m213213/213213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m380s[0m 2ms/step - loss: 0.5139
Epoch 3/5
[1m213213/213213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m382s[0m 2ms/step - loss: 0.5138
Epoch 4/5
[1m213213/213213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m380s[0m 2ms/step - loss: 0.5133
Epoch 5/5
[1m213213/213213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m384s[0m 2ms/step - loss: 0.5136


<keras.src.callbacks.history.History at 0x7aa046bff8e0>

In [None]:
# Run every record through the trained autoencoder.
reconstructed_features = autoencoder.predict(features_array)
print(reconstructed_features)

# Per-record reconstruction error: mean of the squared residuals across features.
residuals = features_array - reconstructed_features
reconstruction_error = (residuals ** 2).mean(axis=1)

print(reconstruction_error)
# Dataset-wide average of the per-record errors.
print(reconstruction_error.mean())

[1m213213/213213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m267s[0m 1ms/step
[[1.99430063e-03 5.02994794e-07 9.99992847e-01 9.96359518e-07
  1.08034816e-03 2.75678392e-15]
 [1.19443583e-04 9.99955773e-01 3.85481478e-08 9.95519940e-07
  3.42459907e-03 1.41599270e-13]
 [1.20082877e-04 9.99954820e-01 6.74541312e-08 9.98615974e-07
  7.14824438e-01 3.09277696e-13]
 ...
 [4.44600475e-04 6.90870820e-06 1.72083318e-07 1.00029933e-06
  9.99991536e-01 6.15903543e-16]
 [4.17151896e-04 1.07037595e-04 9.99995112e-01 9.98637006e-07
  1.39991827e-02 1.88412840e-14]
 [8.51684454e-05 1.22941157e-03 1.00000000e+00 1.00000000e+00
  1.00000000e+00 1.00000000e+00]]
[ 0.63277002  0.5108052   0.49589767 ...  0.78927482  0.26158986
 24.41835297]
0.5136450201402303


In [None]:
import pyspark.sql.functions as F
from pyspark.sql.types import LongType, StructField, StructType

# The per-record reconstruction error is used directly as the risk score.
risk_scores = reconstruction_error

# Pair every score with its explicit 0-based position in the array.
# monotonically_increasing_id() must not be used as the join key here: its
# values are unique but not consecutive (the partition id is encoded in the
# upper bits), and two unrelated DataFrames get unrelated ids, so a join on
# them silently drops and mispairs rows.
risk_scores_df = spark.createDataFrame(
    [(float(score), position) for position, score in enumerate(risk_scores)],
    ["risk_score", "id"],
)

# Give each row of scaled_df a consecutive 0-based index with zipWithIndex,
# which numbers rows in partition order.
# NOTE(review): this assumes scaled_df yields rows in the same order as the
# collect() that produced the feature array (same lineage, no shuffle) — confirm
# if the upstream pipeline changes.
indexed_schema = StructType(
    scaled_df.schema.fields + [StructField("id", LongType(), False)]
)
indexed_rows = scaled_df.rdd.zipWithIndex().map(
    lambda row_and_position: tuple(row_and_position[0]) + (row_and_position[1],)
)
# Rows come straight from a DataFrame with this schema, so per-row schema
# verification is skipped.
processed_data = spark.createDataFrame(
    indexed_rows, schema=indexed_schema, verifySchema=False
)

# Add the risk scores back to the original rows via the positional index.
final_df = processed_data.join(risk_scores_df, "id")

# Drop the intermediate ML vector columns before displaying the result.
newFinalDF = final_df.drop("features_for_scaling", "scaledFeatures")
newFinalDF.show(truncate=False)  # Show the final DataFrame with risk scores

KeyboardInterrupt: 

In [None]:
from pyspark.ml import Pipeline
# Remove the intermediate ML vector columns so only plain columns plus the
# risk score remain, then display the result without truncating cell values.
vector_cols = ("features_for_scaling", "scaledFeatures")
newFinalDF = final_df.drop(*vector_cols)
newFinalDF.show(truncate=False)
# Choose age and crime type as features to get a risk score using the autoencoder.
# import pandas as pd
# import numpy as np
# from sklearn.preprocessing import OneHotEncoder, StandardScaler
# from tensorflow.keras.models import Model
# from tensorflow.keras.layers import Input, Dense
# from sklearn.metrics import mean_squared_error
# vector_assemblerAge = VectorAssembler(inputCols=["age_num"], outputCol="age_vec")
# vector_assemblerPopulation = VectorAssembler(inputCols=["population_filled"], outputCol="population_vec")

# scaler_age = StandardScaler(inputCol="age_vec", outputCol="scaled_age_vector", withMean=True, withStd=True)
#scaler_pop = StandardScaler(inputCol="population_vec", outputCol="scaled_pop_vector", withMean=True, withStd=True)
# vector_df = vector_assemblerAge.transform(res)
#vector_df.show()

+---+---------------+---------+---------+----------------+--------------+----------+--------+----------+-----------+-------+--------+-------+------------+--------------------+----------+----------------------------------------+----------------------------------------+----------+-----------------+--------------+----------------------------------------+---------------------------+---------------+-------------+----------------+---------------------+--------------+-------------------+
|id |offense_type_id|victim_id|agency_id|city_name       |primary_county|state_abbr|state_id|population|incident_id|age_num|sex_code|race_id|ethnicity_id|resident_status_code|offense_id|offense_name                            |offense_category_name                   |age_filled|population_filled|race_id_filled|offense_name_filled                     |resident_status_code_filled|sex_code_filled|race_id_index|crime_type_index|resident_status_index|sex_code_index|risk_score         |
+---+---------------+-------

In [None]:
# Row count after the risk-score join. It is expected to equal the row count of
# the loaded dataset; a smaller number means the join dropped records.
joined_row_count = newFinalDF.count()
print(joined_row_count)


2032348


In [None]:
# Collapse to a single partition so Spark emits one part file, then write it
# with a header row. The path is relative to the current working directory and
# Spark creates it as a directory.
single_partition_df = newFinalDF.coalesce(1)
single_partition_df.write.option("header", True).csv("FinalCityDataRiskScores.csv")
# assembler = VectorAssembler(inputCols=["age_num", "log_population"], outputCol="features_vec")
# vector_df = assembler.transform(res)
# vector_df.show()