<a href="https://colab.research.google.com/github/yesufsa/PersonalInterestProject/blob/main/Homicide_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow_hub as hub
from imblearn.over_sampling import RandomOverSampler
import tensorflow as tf

import pandas as pd

import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

In [2]:
label_encoder = LabelEncoder()

In [3]:
df = pd.read_csv('database.csv')
df.head()
print(df["Perpetrator Race"].value_counts())
print(len(df.columns))

Perpetrator Race
White                            12638
Black                            12582
Unknown                           9955
Asian/Pacific Islander             200
Native American/Alaska Native      192
Name: count, dtype: int64
24


In [4]:
CATEGORICAL_COLUMNS = ['Agency Code', 'Agency Name', 'Agency Type', 'City', 'State', 'Month', 'Crime Type', 'Crime Solved', 'Victim Sex', 'Victim Race', 'Victim Ethnicity', 'Perpetrator Sex', 'Perpetrator Race','Perpetrator Ethnicity', 'Relationship', 'Weapon', 'Record Source']

In [5]:
for entry in CATEGORICAL_COLUMNS:
  df[entry] = label_encoder.fit_transform(df[entry])
  df[entry].unique()
df = df.dropna()
df.head()

Unnamed: 0,Record ID,Agency Code,Agency Name,Agency Type,City,State,Year,Month,Incident,Crime Type,...,Victim Ethnicity,Perpetrator Sex,Perpetrator Age,Perpetrator Race,Perpetrator Ethnicity,Relationship,Weapon,Victim Count,Perpetrator Count,Record Source
0,1,0,65,1,29,1,1980,4,1.0,1,...,2,1,15.0,2,2,0,0,0.0,0.0,0
1,2,0,65,1,29,1,1980,7,1.0,1,...,2,1,42.0,4,2,0,13,0.0,0.0,0
2,3,0,65,1,29,1,1980,7,2.0,1,...,2,2,0.0,3,2,26,15,0.0,0.0,0
3,4,0,65,1,29,1,1980,0,1.0,1,...,2,1,42.0,4,2,0,13,0.0,0.0,0
4,5,0,65,1,29,1,1980,0,2.0,1,...,2,2,0.0,3,2,26,15,0.0,1.0,0


In [6]:
df.dtypes

Record ID                  int64
Agency Code                int64
Agency Name                int64
Agency Type                int64
City                       int64
State                      int64
Year                       int64
Month                      int64
Incident                 float64
Crime Type                 int64
Crime Solved               int64
Victim Sex                 int64
Victim Age               float64
Victim Race                int64
Victim Ethnicity           int64
Perpetrator Sex            int64
Perpetrator Age          float64
Perpetrator Race           int64
Perpetrator Ethnicity      int64
Relationship               int64
Weapon                     int64
Victim Count             float64
Perpetrator Count        float64
Record Source              int64
dtype: object

In [7]:
df["Perpetrator Race"].unique()

array([2, 4, 3, 1, 0])

In [8]:
df["Perpetrator Race"].value_counts()
#white
#black
#Unknown
#asian
#native

Perpetrator Race
4    12638
1    12582
3     9955
0      200
2      192
Name: count, dtype: int64

In [9]:
df = df.drop('Perpetrator Age', axis=1)
x= df[df.columns[:-1]].values.astype(np.float32)
y = df[df.columns[17]].values.astype(np.float32)
print(x)

[[1.0000e+00 0.0000e+00 6.5000e+01 ... 0.0000e+00 0.0000e+00 0.0000e+00]
 [2.0000e+00 0.0000e+00 6.5000e+01 ... 1.3000e+01 0.0000e+00 0.0000e+00]
 [3.0000e+00 0.0000e+00 6.5000e+01 ... 1.5000e+01 0.0000e+00 0.0000e+00]
 ...
 [3.5565e+04 2.7790e+03 3.2180e+03 ... 9.0000e+00 0.0000e+00 0.0000e+00]
 [3.5566e+04 2.7800e+03 3.2180e+03 ... 8.0000e+00 0.0000e+00 0.0000e+00]
 [3.5567e+04 2.7800e+03 3.2180e+03 ... 2.0000e+00 0.0000e+00 0.0000e+00]]


In [10]:
over = RandomOverSampler()
x, y = over.fit_resample(x, y)
data = np.hstack((x, np.reshape(y,(-1, 1))))
transformed_df = pd.DataFrame(data, columns = df.columns)

In [11]:
len(df[df["Perpetrator Race"] == 4]), len(df[df["Perpetrator Race"] == 1]), len(df[df["Perpetrator Race"] == 3]),len(df[df["Perpetrator Race"] == 0]), len(df[df["Perpetrator Race"] == 2]), len(df[df["Perpetrator Race"] == 5])

(12638, 12582, 9955, 200, 192, 0)

In [12]:
len(transformed_df[transformed_df["Perpetrator Race"] == 4]), len(transformed_df[transformed_df["Perpetrator Race"] == 1]), len(df[df["Perpetrator Race"] == 3]),len(df[df["Perpetrator Race"] == 0]), len(df[df["Perpetrator Race"] == 2]), len(df[df["Perpetrator Race"] == 5])

(28767, 15012, 9955, 200, 192, 0)

In [13]:
x_train, x_temp, y_train, y_temp = train_test_split(x, y, test_size = 0.4, random_state = 0)

In [14]:
scaler = StandardScaler().fit(x_train)
scaler = StandardScaler().fit(x_temp)
print(scaler)
scaler.mean_

StandardScaler()


array([1.81325638e+04, 1.76522917e+03, 1.50826006e+03, 1.43988769e+00,
       6.34421937e+02, 1.84621651e+01, 1.98037922e+03, 5.54975605e+00,
       1.23248642e+01, 9.86790021e-01, 8.25554635e-01, 7.99871122e-01,
       3.41305809e+01, 2.92317960e+00, 9.38552886e-01, 1.08225168e+00,
       2.96294762e+00, 1.00349811e+00, 1.59864218e+01, 7.99235939e+00,
       1.10650833e-01, 1.98379821e-01])

In [15]:
scaler.scale_

array([1.01184756e+04, 1.32028954e+03, 8.26688903e+02, 9.76926887e-01,
       3.24663396e+02, 1.47896702e+01, 4.85193702e-01, 3.60649400e+00,
       2.36056125e+01, 1.14173006e-01, 3.79492002e-01, 4.01703981e-01,
       2.85890652e+01, 1.43804494e+00, 7.22814014e-01, 5.09778100e-01,
       1.29617465e+00, 8.16714548e-01, 1.12988270e+01, 3.21615877e+00,
       4.85165911e-01, 5.85778517e-01])

In [16]:
scaler.transform(x_train)

array([[ 1.4758583 ,  0.19751032, -1.6442219 , ...,  0.0023757 ,
        -0.22806802, -0.3386601 ],
       [-0.7470062 ,  0.17706028,  0.82587284, ...,  0.0023757 ,
        -0.22806802, -0.3386601 ],
       [-0.9569192 , -0.25617805, -1.2293139 , ...,  0.0023757 ,
        -0.22806802, -0.3386601 ],
       ...,
       [ 0.627015  , -1.0491859 ,  0.02508797, ...,  2.1788852 ,
         1.8330826 , -0.3386601 ],
       [ 0.5754262 , -1.1264416 ,  0.71458554, ...,  0.0023757 ,
        -0.22806802, -0.3386601 ],
       [-1.5219252 , -1.0188894 ,  0.02508797, ...,  0.0023757 ,
        -0.22806802, -0.3386601 ]], dtype=float32)

In [17]:
x_train_scaled = scaler.transform(x_train)
x_temp_scaled = scaler.transform(x_temp)

In [18]:
model = tf.keras.Sequential([
    tf.keras.layers.Dense(128, activation = 'relu'), #if x<=0 --> 0, x>0 --> x
    tf.keras.layers.Dense(128, activation = 'relu'),
    tf.keras.layers.Dense(16, activation = 'relu'),
    tf.keras.layers.Dense(1, activation = 'sigmoid')
])

In [19]:
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate = 0.01),
              loss = tf.keras.losses.BinaryCrossentropy(),
              metrics=['accuracy'])

In [20]:
model.evaluate(x_train, y_train)
model.evaluate(x_train_scaled, y_train)
model.evaluate



In [21]:
model.fit(x_train, y_train, batch_size=1024, epochs=20, validation_data=(x_temp, y_temp))


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x7cb34d688d90>

In [22]:
model.fit(x_train_scaled, y_train, batch_size=1024, epochs=20, validation_data=(x_temp_scaled, y_temp))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x7cb33bfd58d0>