In [1]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import json

In [2]:
# Root folder holding one sub-directory of training files per label id.
basedir = r"C:\Users\tmhnguyen\Documents\lalamove\lalamove\data\Clean_extracted_240115_uncal\train"
labels = [5, 6, 7]
# NOTE(review): presumably the share of synthetic samples per label; not used in the visible cells - confirm.
synthetic_percent = {5: 0.8, 6: 0.1, 7: 0.5}

# The split parameters file sits one directory above `basedir`.
params_path = basedir + '/../data_split_params.json'
with open(params_path, 'r') as params_file:
    split_params = json.load(params_file)
features = split_params['FEATURES']
print(features)

{'5': ['z_gyro_clean', 'y_gyro_clean', 'x_gyro_clean', 'x_acc_clean', 'y_acc_clean', 'z_acc_clean', 'acceleration', 'speed_kmh'], '6': ['z_gyro_clean', 'y_gyro_clean', 'x_gyro_clean', 'x_acc_clean', 'y_acc_clean', 'z_acc_clean', 'acceleration', 'speed_kmh'], '7': ['z_gyro_clean', 'y_gyro_clean', 'x_gyro_clean', 'x_acc_clean', 'y_acc_clean', 'z_acc_clean', 'acceleration', 'speed_kmh']}


In [3]:
# Label id used to build the file paths read below and to pick the feature list shown in plot titles.
label = 5

In [4]:
# Label table for the selected label id.
y = pd.read_csv(basedir + f'/{label}/train_label_{label}.csv')

# The extracted features are stored as numbered chunk files; the number of
# chunks is derived from the label count and `step`.
# NOTE(review): assumes every chunk file holds at most `step` rows - confirm against the extraction script.
step = 30_000
n_chunks = int(np.ceil(len(y) / step))
feature_chunks = [
    pd.read_csv(basedir + f'/{label}/extract_features_{label}_{chunk_no}.csv', index_col=0)
    for chunk_no in range(n_chunks)
]
X = pd.concat(feature_chunks)

# Features and labels must line up row for row.
assert len(X) == len(y), f"Length mismatch {len(X)}, {len(y)}"

In [5]:
# Sanity check: display the dimensions of the feature matrix and of the label table.
X.shape, y.shape

((381607, 500), (381607, 6))

In [6]:
# # generate synthetic samples BEFORE extracting features
# dates = y.date.unique()
# chosen = dates[-3]
# test_idx = y[(y.date == chosen) & (y.type == 0)].index
# train_idx = y[(y.date != chosen) & (y.type == 0)]
# train_idx_add = y[(y.date != chosen) & (y.type == 1)].sample(frac=0.2)
# train_idx = pd.concat([train_idx, train_idx_add]).index
# # train_idx = y[(y.date != chosen)].index

# X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
# y_train, y_test = y.iloc[train_idx].label, y.iloc[test_idx].label

In [7]:
from imblearn.combine import SMOTEENN

# generate synthetic samples AFTER extracting features
# One date is held out for testing; every other date is used for training.
dates = y.date.unique()
chosen = dates[-3]

# Only rows whose `type` column equals 0 are kept on either side of the split.
type_is_zero = y.type == 0
held_out_rows = (y.date == chosen) & type_is_zero
training_rows = (y.date != chosen) & type_is_zero
test_idx = y.loc[held_out_rows].index
train_idx = y.loc[training_rows].index

# NOTE(review): index labels of `y` are used as positions via iloc; this assumes
# `y` still carries the default 0..n-1 index from read_csv - confirm.
X_train = X.iloc[train_idx]
X_test = X.iloc[test_idx]
y_train = y.iloc[train_idx].label
y_test = y.iloc[test_idx].label

print('before sampling', y_train.value_counts())

# Rebalance the training split only; the test split is left untouched.
sme = SMOTEENN(sampling_strategy=0.2, random_state=42)
X_train, y_train = sme.fit_resample(X_train, y_train)

print('after sampling', y_train.value_counts())


before sampling label
0    241743
1      3845
Name: count, dtype: int64
after sampling label
0    215749
1     47507
Name: count, dtype: int64


In [8]:
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.metrics import BinaryAccuracy, Precision, Recall
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Input, Conv1D, Dense, Normalization, Activation, GlobalAveragePooling1D, Dropout, LSTM, Reshape, MaxPooling1D

In [9]:
def create_model(xtrain, input_shape=500):
    """Build the (uncompiled) fully connected binary classifier.

    Args:
        xtrain: Training feature matrix used to fit the input
            ``Normalization`` layer. Anything ``np.asarray`` can convert
            (ndarray, DataFrame, ...) is accepted.
        input_shape: Number of features per sample given as an int, or an
            already formed shape tuple (batch dimension excluded).

    Returns:
        A Keras ``Model`` that maps one feature vector to a single sigmoid
        output.
    """
    # Keras documents `shape` as a tuple; wrap a bare int so the call also
    # works on releases that do not accept plain integers.
    if isinstance(input_shape, (int, np.integer)):
        input_shape = (int(input_shape),)
    inputs = Input(shape=input_shape)

    # The normalisation statistics are learned from the training data only.
    # `adapt` is documented for numpy arrays / tf.data datasets, so convert first.
    scaler = Normalization()
    scaler.adapt(np.asarray(xtrain))
    scaled_inputs = scaler(inputs)

    x = Dense(500, activation='relu')(scaled_inputs)
    # Four identical hidden stages: Dense(100) followed by 30% dropout.
    for _ in range(4):
        x = Dense(100, activation='relu')(x)
        x = Dropout(0.3)(x)
    x = Dense(50, activation='relu')(x)

    output = Dense(1, activation='sigmoid')(x)
    return Model(inputs=inputs, outputs=output)

In [10]:
# Drop any Keras state left over from a previously built model before creating a new one.
tf.keras.backend.clear_session()
model = create_model(X_train, input_shape=X_train.shape[1])

tracked_metrics = [
    BinaryAccuracy(name='acc'),
    Precision(name='precision'),
    Recall(name='recall'),
]
model.compile(
    optimizer=Adam(learning_rate=5e-6),
    loss='binary_crossentropy',
    metrics=tracked_metrics,
)

# The held-out split doubles as validation data during training.
# Early stopping is currently disabled; to turn it back on pass
#   callbacks=[EarlyStopping(patience=5,
#                            min_delta=0.0005,
#                            restore_best_weights=True)]
history = model.fit(
    X_train,
    y_train,
    batch_size=200,
    epochs=2000,
    validation_data=(X_test, y_test),
)

Epoch 1/2000
Epoch 2/2000
Epoch 3/2000
Epoch 4/2000
Epoch 5/2000
Epoch 6/2000
Epoch 7/2000
Epoch 8/2000
Epoch 9/2000
Epoch 10/2000
Epoch 11/2000
Epoch 12/2000
Epoch 13/2000
Epoch 14/2000
Epoch 15/2000
Epoch 16/2000
Epoch 17/2000
Epoch 18/2000
Epoch 19/2000
Epoch 20/2000
Epoch 21/2000
Epoch 22/2000
Epoch 23/2000
Epoch 24/2000
Epoch 25/2000
Epoch 26/2000
Epoch 27/2000
Epoch 28/2000
Epoch 29/2000
Epoch 30/2000
Epoch 31/2000
Epoch 32/2000
Epoch 33/2000
Epoch 34/2000
Epoch 35/2000
Epoch 36/2000
Epoch 37/2000
Epoch 38/2000
Epoch 39/2000
Epoch 40/2000
Epoch 41/2000
Epoch 42/2000
Epoch 43/2000
Epoch 44/2000
Epoch 45/2000
Epoch 46/2000
Epoch 47/2000
Epoch 48/2000
Epoch 49/2000
Epoch 50/2000
Epoch 51/2000
Epoch 52/2000
Epoch 53/2000
Epoch 54/2000
Epoch 55/2000
Epoch 56/2000
Epoch 57/2000
Epoch 58/2000
Epoch 59/2000
Epoch 60/2000
Epoch 61/2000
Epoch 62/2000
Epoch 63/2000
Epoch 64/2000
Epoch 65/2000
Epoch 66/2000
Epoch 67/2000
Epoch 68/2000
Epoch 69/2000
Epoch 70/2000
Epoch 71/2000
Epoch 72/2000
E

In [None]:
# Turn the model's outputs into hard decisions with a 0.5 cut-off.
scores = model.predict(X_test)
pred = scores >= 0.5
print(pred.shape, y_test.shape)

# Side-by-side table of ground truth and prediction, one row per test sample.
truth_column = y_test.to_numpy().reshape(-1, 1)
df = pd.DataFrame(np.hstack((truth_column, pred)), columns=['true', 'pred'])
df['pred'] = df['pred'].astype(int)

def classify(row):
    """Name the confusion-matrix cell for one row.

    Args:
        row: Object exposing ``true`` (ground truth) and ``pred`` (prediction)
            attributes.

    Returns:
        'True Negative', 'True Positive', 'False Positive' or 'False Negative'.
        Any combination not matched by the first three falls through to
        'False Negative'.
    """
    actual, predicted = row.true, row.pred
    agree = actual == predicted
    if actual == 0:
        return 'True Negative' if agree else 'False Positive'
    if agree and actual == 1:
        return 'True Positive'
    return 'False Negative'
    
# Tag every row with its confusion-matrix category and print the counts.
df['type'] = df.apply(classify, axis=1)
types = df.type.value_counts().sort_index()[::-1]
print(types)

fig, ax = plt.subplots(figsize=(20, 2.5))
colors = ['skyblue', 'blue', 'green', 'red']
types_ = ['True Negative', 'True Positive', 'False Negative', 'False Positive']

# One horizontal band per category, each 0.1 above the previous one.
for j, (t, color) in enumerate(zip(types_, colors)):
    hits = df[df['type'] == t].index
    # A category can be absent from the predictions; looking its count up in
    # `types` would raise KeyError, so count the selected rows instead.
    if len(hits) == 0:
        print(f'There is no {t}')
        continue
    ax.scatter(hits, [0.1 * j] * len(hits), label=t, c=color)

ax.legend()
ax.set_ylim(0, 2)
ax.set_xlabel('Seconds')
ax.get_yaxis().set_visible(False)
ax.set_title(f'{features[str(label)]}')


In [None]:
# Flat 1-D array of model outputs so it can be smoothed with a moving average.
pred = model.predict(X_test).flatten()
print(pred.shape, y_test.shape)

w = 5 # window in seconds
kernel = np.ones(w)
# mode='same' zero-pads both ends, so near the boundaries fewer than `w` real
# samples fall under the window. Divide by that actual count instead of `w`
# to avoid biasing the first and last averages toward 0.
coverage = np.convolve(np.ones_like(pred), kernel, mode='same')
pred = np.convolve(pred, kernel, mode='same') / coverage >= 0.5

print(pred.shape, y_test.shape)
# Side-by-side table of ground truth and smoothed prediction, one row per sample.
df = pd.DataFrame(np.stack((y_test, pred)).T, columns=['true', 'pred'])
df['pred'] = df['pred'].astype(int)

def classify(row):
    """Return the confusion-matrix label for a row with ``true`` and ``pred`` attributes.

    The result is one of 'True Negative', 'True Positive', 'False Positive'
    or 'False Negative'; the last one is also the fallback for any
    combination the other three do not cover.
    """
    # NOTE(review): same logic as the `classify` defined in the previous
    # evaluation cell; consider keeping a single definition.
    actual, predicted = row.true, row.pred
    outcome = 'False Negative'
    if actual == predicted:
        if actual == 0:
            outcome = 'True Negative'
        elif actual == 1:
            outcome = 'True Positive'
    elif actual == 0:
        outcome = 'False Positive'
    return outcome
    
# Label each row with its confusion-matrix category, then print the tallies.
df['type'] = df.apply(classify, axis=1)
types = df.type.value_counts().sort_index()[::-1]
print(types)

fig, ax = plt.subplots(figsize=(20, 2.5))
colors = ['skyblue', 'blue', 'green', 'red']
types_ = ['True Negative', 'True Positive', 'False Negative', 'False Positive']

# Each category gets its own horizontal band; the band height advances by 0.1
# per category whether or not the category occurs.
i = 0
for t, color in zip(types_, colors):
    if t in types.index:
        ax.scatter(df[df.type == t].index, [i] * types[t], label=t, c=color)
    else:
        print(f'There is no {t}')
    i += 0.1

ax.legend()
ax.set_ylim(0, 2)
ax.set_xlabel('Seconds')
ax.get_yaxis().set_visible(False)
ax.set_title(f'{features[str(label)]}')