<a href="https://colab.research.google.com/github/uninstallit/ati580_final_project/blob/edvin-1/ati580_ml.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Machine Learning portion of ATI580 Final Project**

In [None]:
pip install dnspython

In [None]:
pip install --upgrade tf-nightly

In [1]:
from pymongo import MongoClient

class Connect(object):

  @staticmethod    
  def get_connection(database):
    username = "mdbUser"
    password = "ati580"
    return MongoClient('mongodb+srv://{}:{}@ati580-cluster.s5t5z.gcp.mongodb.net/{}?retryWrites=true&w=majority'.format(username, password, database))

In [None]:
# # save
# from google.colab import drive
# drive.mount('/content/gdrive')
# pip freeze --local > /content/gdrive/My\ Drive/colab_installed.txt

# # restore
# from google.colab import drive
# drive.mount('/content/gdrive')
# pip install --upgrade --force-reinstall `cat/content/gdrive/My\ Drive/colab_installed.txt`

In [None]:
#plotly: https://plotly.com/python/ipython-notebook-tutorial/

In [2]:
import tensorflow as tf
import pandas as pd
import numpy as np
import pymongo
import datetime

class PoliceInterviews(object):

  def __init__(self, num_rows, output, eval_percent, batch):
    # select columns
    self._columns = dict({
        'FieldInterviewID':0,     
        'NOPD_Item':0,            
        'EventDate':1,            
        'District':1,             
        'Zone':1,                 
        'OfficerAssignment':0, 
        'StopDescription':1, 
        'ActionsTaken':0, 
        'VehicleYear':1, 
        'VehicleMake':1, 
        'VehicleModel':1, 
        'VehicleStyle':1, 
        'VehicleColor':1, 
        'SubjectID':0, 
        'SubjectRace':1, 
        'SubjectGender':1, 
        'SubjectAge':1, 
        'SubjectHasPhotoID':0, 
        'SubjectHeight':1, 
        'SubjectWeight':1, 
        'SubjectEyeColor':0, 
        'SubjectHairColor':1, 
        'SubjectDriverLicState':1, 
        'CreatedDateTime':0, 
        'LastModifiedDateTime':0, 
        'Longitude':0, 
        'Latitude':0, 
        'Zip':1, 
        'BlockAddress':0})

    # connect to database
    self._connect = Connect()
    self._mdb_client = self._connect.get_connection("POLICE_DATABASE")
    self._mdb_database = self._mdb_client['POLICE_DATABASE'] 
    self._mdb_collection = self._mdb_database['POLICE_INTERVIEWS']
    self._rows = num_rows

    # data model attributes
    self._output = output
    self._eval_percent = eval_percent
    self._batch = batch

    # split the data into a training and validation set
    # self._eval_df  = self._dataframe.sample(frac=self._eval_percent, random_state=123)
    # self._train_df = self._dataframe.drop(self._eval_df.index)

  # convert queries to dataframe or numpy array
  def query_and_convert(self, filter, projection, to="dataframe", rows=None):
    if rows is not None:
      _cursor = self._mdb_collection.find(filter, projection).limit(rows)
    else:
      _cursor = self._mdb_collection.find(filter, projection)
    _dataframe = pd.DataFrame(list(_cursor))
    if to is "dataframe":
      return _dataframe
    elif to is "numpy":
      _np_array = np.transpose(np.squeeze(_dataframe.to_numpy()))
      return _np_array
  
  # convert time to seconds 
  @staticmethod
  def time_to_int_seconds(datetime_str):
    time = datetime.datetime.strptime(datetime_str, '%m/%d/%Y %H:%M:%S %p').time()
    [hours, minutes, seconds] = [int(t) for t in str(time).split(':')]
    return datetime.timedelta(hours=hours, minutes=minutes, seconds=seconds).seconds

  # convert license tag to binary
  @staticmethod
  def instate_or_outofstate(license):
    if license is "LA":
      return 1
    return 0

  # convert stop description to binary
  @staticmethod
  def cause_or_suspicion(description):
    _probable_cause_list = ["CRIMINAL VIOLATION", "JUVENILE VIOLATION", "TRAFFIC VIOLATION"]
    if description in _probable_cause_list:
      return 0
    return 1

  def load_dataframe(self, split=True):
    _filter = []
    _projection = dict({'_id':0})
    # create filter
    for key, value in self._columns.items():
      if value is 1:
        _filter.append({key:{"$exists": True}})
        _filter.append({key:{"$ne": ""}})
        _filter.append({key:{"$ne": None}})
        _filter.append({key:{"$ne": 0}})
    # create projection
    for key, value in self._columns.items():
      if value is 1:
        _projection[key] = value
    # query database
    _query_dataframe = self.query_and_convert({"$and":_filter}, _projection, rows=self._rows)
    # convert specific columns
    _query_dataframe['EventDate'] = _query_dataframe['EventDate'].apply(lambda x: self.time_to_int_seconds(x))
    _query_dataframe['SubjectDriverLicState'] = _query_dataframe['SubjectDriverLicState'].apply(lambda x: self.instate_or_outofstate(x))
    _query_dataframe['StopDescription'] = _query_dataframe['StopDescription'].apply(lambda x: self.cause_or_suspicion(x))
    _query_dataframe['Zip'] = _query_dataframe['Zip'].apply(lambda x: int(x))
    _query_dataframe['VehicleYear'] = _query_dataframe['VehicleYear'].apply(lambda x: int(x))
    _query_dataframe['SubjectAge'] = _query_dataframe['SubjectAge'].apply(lambda x: int(x))
    _query_dataframe['SubjectHeight'] = _query_dataframe['SubjectHeight'].apply(lambda x: int(x))
    _query_dataframe['SubjectWeight'] = _query_dataframe['SubjectWeight'].apply(lambda x: int(x))
    # split data into train and evaluate set
    _eval_df = _query_dataframe.sample(frac=self._eval_percent, random_state=1234)
    _train_df = _query_dataframe.drop(_eval_df.index)
    if split is True:
      # split data into train and evaluate set
      _eval_df = _query_dataframe.sample(frac=self._eval_percent, random_state=1234)
      _train_df = _query_dataframe.drop(_eval_df.index)
      return _train_df, _eval_df
    return _query_dataframe

  # convert dataframe_to_dataset 
  @staticmethod
  def dataframe_to_dataset(dataframe, output):
    _dataframe = dataframe.copy()
    _labels    = _dataframe.pop(output)
    _dataset   = tf.data.Dataset.from_tensor_slices((dict(_dataframe), _labels))
    _dataset   = _dataset.shuffle(buffer_size=len(_dataframe))
    return _dataset

  def load_dataset(self):
    _train_df, _eval_df = self.load_dataframe()

    _train_ds = self.dataframe_to_dataset(_train_df, self._output)
    _eval_ds  = self.dataframe_to_dataset(_eval_df, self._output)

    _train_ds = _train_ds.batch(self._batch)
    _eval_ds  = _eval_ds.batch(self._batch)
    return _train_ds, _eval_ds


## **Classification with Neural Networks**
- Source: [Keras | Code examples](https://keras.io/examples/structured_data/structured_data_classification_from_scratch/)
- CategoricalEncoding API: [Tensorflow](https://tensorflow.google.cn/api_docs/python/tf/keras/layers/experimental/preprocessing/CategoryEncoding#methods)


In [3]:
# pull data from db and convert
police_interviews = PoliceInterviews(num_rows=1000, output="StopDescription", eval_percent=0.2, batch=25)
train_ds, eval_ds = police_interviews.load_dataset()

# visual check
# train_df, eval_df = police_interviews.load_dataframe()
# print('{:<25} {} \n'.format("Column Name", "Sample"))
# for (column_name, column_data) in train_df.iteritems():
#   print('{:<25} {}'.format(column_name, column_data[0]))

In [4]:
from tensorflow.keras.layers.experimental.preprocessing import Normalization
from tensorflow.keras.layers.experimental.preprocessing import CategoryEncoding
from tensorflow.keras.layers.experimental.preprocessing import StringLookup
from tensorflow.keras import layers
from tensorflow import keras

def encode_numerical_feature(feature, name, dataset):
    # Create a Normalization layer for our feature
    normalizer = Normalization()
    # Prepare a Dataset that only yields our feature
    feature_ds = dataset.map(lambda x, y: x[name])
    feature_ds = feature_ds.map(lambda x: tf.expand_dims(x, -1))
    # Learn the statistics of the data
    normalizer.adapt(feature_ds)
    # Normalize the input feature
    encoded_feature = normalizer(feature)
    return encoded_feature

def encode_string_categorical_feature(feature, name, dataset):
    # Create a StringLookup layer which will turn strings into integer indices
    index = StringLookup()
    # Prepare a Dataset that only yields our feature
    feature_ds = dataset.map(lambda x, y: x[name])
    feature_ds = feature_ds.map(lambda x: tf.expand_dims(x, -1))
    # Learn the set of possible string values and assign them a fixed integer index
    index.adapt(feature_ds)
    # Turn the string input into integer indices
    encoded_feature = index(feature)
    # Create a CategoryEncoding for our integer indices
    encoder = CategoryEncoding(output_mode="binary")
    # Prepare a dataset of indices
    feature_ds = feature_ds.map(index)
    # Learn the space of possible indices
    encoder.adapt(feature_ds)
    # Apply one-hot encoding to our indices
    encoded_feature = encoder(encoded_feature)
    return encoded_feature

def encode_integer_categorical_feature(feature, name, dataset):
    # Create a CategoryEncoding for our integer indices
    encoder = CategoryEncoding(output_mode="binary")
    # Prepare a Dataset that only yields our feature
    feature_ds = dataset.map(lambda x, y: x[name])
    feature_ds = feature_ds.map(lambda x: tf.expand_dims(x, -1))
    # Learn the space of possible indices
    encoder.adapt(feature_ds)
    # Apply one-hot encoding to our indices
    encoded_feature = encoder(feature)
    return encoded_feature

# categorical features encoded as integers
inout_state  = keras.Input(shape=(1,), name="SubjectDriverLicState", dtype="int64")
district     = keras.Input(shape=(1,), name="District", dtype="int64")
vehicle_year = keras.Input(shape=(1,), name="VehicleYear", dtype="int64")
zip          = keras.Input(shape=(1,), name="Zip", dtype="int64")
# output
# stop_description = keras.Input(shape=(1,), name="StopDescription", dtype="int64")

# categorical feature encoded as string
zone             = keras.Input(shape=(1,), name="Zone", dtype="string")
vehicle_make     = keras.Input(shape=(1,), name="VehicleMake", dtype="string")
vehicle_model    = keras.Input(shape=(1,), name="VehicleModel", dtype="string")
vehicle_style    = keras.Input(shape=(1,), name="VehicleStyle", dtype="string")
vehicle_color    = keras.Input(shape=(1,), name="VehicleColor", dtype="string")
subject_race     = keras.Input(shape=(1,), name="SubjectRace", dtype="string")
subject_gender   = keras.Input(shape=(1,), name="SubjectGender", dtype="string")
hair_color       = keras.Input(shape=(1,), name="SubjectHairColor", dtype="string")

# numerical features
time           = keras.Input(shape=(1,), name="EventDate")
subject_age    = keras.Input(shape=(1,), name="SubjectAge")
subject_height = keras.Input(shape=(1,), name="SubjectHeight")
subject_weight = keras.Input(shape=(1,), name="SubjectWeight")

all_inputs = [inout_state,
              district, 
              vehicle_year, 
              zip, 
              zone, 
              vehicle_make, 
              vehicle_model, 
              vehicle_style, 
              vehicle_color, 
              subject_race, 
              subject_gender, 
              hair_color, 
              time, 
              subject_age, 
              subject_height, 
              subject_weight]

# integer categorical features
inout_state_encoded  = encode_integer_categorical_feature(inout_state, "SubjectDriverLicState", train_ds)
district_encoded     = encode_integer_categorical_feature(district, "District", train_ds)
vehicle_year_encoded = encode_integer_categorical_feature(vehicle_year, "VehicleYear", train_ds)
zip_encoded          = encode_integer_categorical_feature(zip, "Zip", train_ds)

# string categorical features
zone_encoded             = encode_string_categorical_feature(zone, "Zone", train_ds)
vehicle_make_encoded     = encode_string_categorical_feature(vehicle_make, "VehicleMake", train_ds)
vehicle_model_encoded    = encode_string_categorical_feature(vehicle_model, "VehicleModel", train_ds)
vehicle_style_encoded    = encode_string_categorical_feature(vehicle_style, "VehicleStyle", train_ds)
vehicle_color_encoded    = encode_string_categorical_feature(vehicle_color, "VehicleColor", train_ds)
subject_race_encoded     = encode_string_categorical_feature(subject_race, "SubjectRace", train_ds)
subject_gender_encoded   = encode_string_categorical_feature(subject_gender, "SubjectGender", train_ds)
hair_color_encoded       = encode_string_categorical_feature(hair_color, "SubjectHairColor", train_ds)

# numerical features
time_encoded           = encode_numerical_feature(time, "EventDate", train_ds)
subject_age_encoded    = encode_numerical_feature(subject_age, "SubjectAge", train_ds)
subject_height_encoded = encode_numerical_feature(subject_height, "SubjectHeight", train_ds)
subject_weight_encoded = encode_numerical_feature(subject_weight, "SubjectWeight", train_ds)

all_features = layers.concatenate([
                                   inout_state_encoded,
                                   district_encoded,
                                   vehicle_year_encoded,
                                   zip_encoded,
                                   zone_encoded,
                                   vehicle_make_encoded,
                                   vehicle_model_encoded,
                                   vehicle_style_encoded,
                                   vehicle_color_encoded,
                                   subject_race_encoded,
                                   subject_gender_encoded,
                                   hair_color_encoded,
                                   time_encoded,
                                   subject_age_encoded,
                                   subject_height_encoded,
                                   subject_weight_encoded])

# build model
x = layers.Dense(32, activation="relu")(all_features)
x = layers.Dropout(0.2)(x)
x = layers.Dense(16, activation="sigmoid")(x)
x = layers.Dropout(0.2)(x)
x = layers.Dense(8, activation="sigmoid")(x)
output = layers.Dense(1, activation="sigmoid")(x)
model = keras.Model(all_inputs, output)
model.compile("adam", "binary_crossentropy", metrics=["accuracy"])

In [None]:
# model diagram
keras.utils.plot_model(model, show_shapes=True, rankdir="LR")

In [None]:
# train model
model.fit(train_ds, epochs=5, validation_data=eval_ds)

In [None]:
# predict from model
sample = dict({
    "SubjectDriverLicState":0,
    "District":7,
    "VehicleYear":1997,
    "Zip":70127,
    "Zone":"O",
    "VehicleMake":"CHEVROLET",
    "VehicleModel":"TAHOE",
    "VehicleStyle":"SPORTS UTILITY",
    "VehicleColor":"BLUE",
    "SubjectRace":"BLACK",
    "SubjectGender":"MALE",
    "SubjectHairColor":"Black",
    "EventDate":34500,
    "SubjectAge":21,
    "SubjectHeight":72,
    "SubjectWeight":169,
})

input_dict = {name: tf.convert_to_tensor([value]) for name, value in sample.items()}
tf.print(input_dict)
model.predict(input_dict)

## **Logistic Regression**

Binary Logistic Regression, in which the target variable has only two possible values, e.g., pass/fail or win/lose.

In [23]:
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder, normalize
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

def encode_cat_features(features, dataframe, encoder):
  _dataframe = dataframe.copy()
  _dataframe = dataframe[features]
  transform_array = encoder.transform(_dataframe).toarray()
  return np.transpose(transform_array)

def z_score_norm(x, mean, std):
  return ( float(x) - float(mean) ) / float(std)

def inv_z_score_norm(z, mean, std):
  return float(mean) + float(z) * float(std)

def encode_norm_features(features, dataframe, mean=None, std=None):
  _dataframe = dataframe.copy()
  _dataframe = _dataframe[features]
  for feature in features:
    _mean = mean[feature]
    _std  = std[feature]
    _dataframe[feature] = _dataframe[feature].apply(lambda x: z_score_norm(x, _mean, _std))
  return np.transpose(_dataframe.to_numpy())
  
# load data
police_df = police_interviews.load_dataframe(split=False)
police_label_df = police_df.pop("StopDescription")

# categorical features
categorical = ["Zone", "VehicleMake", "VehicleModel", "VehicleStyle",             # string
               "VehicleColor", "SubjectRace", "SubjectGender","SubjectHairColor", # string
               "SubjectDriverLicState", "District", "VehicleYear", "Zip"]         # int

numerical = ["EventDate", "SubjectAge", "SubjectHeight", "SubjectWeight"]

# one-hot encoder
# train on the whole set to avoid feature dim errors
onehot_encoder = OneHotEncoder(sparse=True)
onehot_encoder.fit(police_df[categorical])

# split the data into train and test
x_train, x_test, y_train, y_test = train_test_split(police_df, police_label_df, test_size=0.2, random_state=1234)

# categorical features encoding
x_train_cat_encoded = encode_cat_features(categorical, x_train, onehot_encoder)
x_test_cat_encoded  = encode_cat_features(categorical, x_test, onehot_encoder)

# # numerical features encoding
mean = x_train[numerical].mean(axis=0)
std  = x_train[numerical].std(axis=0)
x_train_num_encoded = encode_norm_features(numerical, x_train, mean, std)
x_test_num_encoded  = encode_norm_features(numerical, x_test, mean, std)

# # combine categorical and numerical
x_train_combined = np.vstack([x_train_cat_encoded, x_train_num_encoded])
x_train_combined = np.transpose(x_train_combined)
x_test_combined = np.vstack([x_test_cat_encoded, x_test_num_encoded])
x_test_combined = np.transpose(x_test_combined)

# train model
logistic_regression = LogisticRegression(random_state=0).fit(x_train_combined, y_train)

# # accuracy on test set
print(logistic_regression.score(x_train_combined, y_train))
print(logistic_regression.score(x_test_combined, y_test))

# # predict classes
predicted = logistic_regression.predict(x_test_combined[:200, :])
expected = y_test[:200].to_numpy()

# predict probability
logistic_regression.predict_proba(x_test_combined[:200, :])

# confusion matrix
tn, fp, fn, tp = confusion_matrix(expected, predicted, normalize='pred').ravel()

print("True Negative:  ", tn) 
print("False Positive: ", fp)
print("False Negative: ", fn)
print("True Prositive: ", tp)

0.88
0.83
True Negative:   0.8429319371727748
False Positive:  0.4444444444444444
False Negative:  0.15706806282722513
True Prositive:  0.5555555555555556
