In [None]:
import pandas as pd
import numpy as np
import scipy as sp
import plotly.express as px
import math

from tensorflow import keras
from sklearn.model_selection import train_test_split

In [None]:
# Load the customer training dataset. The file is semicolon-separated, so the
# delimiter has to be given explicitly.
TRAINING_DATASET_PATH = '../output_data/TrainingDatasetCustomer v1.csv'
df = pd.read_csv(TRAINING_DATASET_PATH, sep=';')
# Bare last expression: the notebook renders the frame as the cell output.
df

In [None]:
# Features that enter the win tally with weight +1 (a larger value scores higher).
positive_feature_names = [
    'ContractAmountAll',
    'ContractsCount',
    'ProceduresCount',
    'customer_max_price',
]
# Features that enter the win tally with weight -1 (a larger value scores lower).
negative_feature_names = [
    'SuspiciousFactsCount',
    'ContractAmountBad',
    'ContractsCountBad',
    'BadNewsCount',
    'ContractCountTerminationByCourt',
    'ContractCountTerminationByCustomer',
    'ContractCountTerminationBySupplier',
    'ComplaintsCount',
]
feature_names = positive_feature_names + negative_feature_names

# Shuffle the rows. The fixed seed keeps the row order identical across re-runs.
rows = df.sample(frac=1, random_state=42)

# Identifier column, kept separately and joined back by index when plotting.
labels = rows[['inn']]

# Per-feature replacement for missing values. NaN means "no default": rows that
# lack the feature are removed by the dropna() below.
# np.full is used on purpose: ndarray.fill() works in place and returns None,
# so `np.zeros(n).fill(np.nan)` would hand None to pd.Series.
features_default = pd.Series(np.full(len(feature_names), np.nan), index=feature_names, dtype='float64')

for feature_name in feature_names:
    rows[[feature_name]] = rows[[feature_name]].fillna(features_default[feature_name])
# NOTE: dropna() has no `subset`, so a row with NaN in ANY column of the file
# (not only the feature columns) is discarded.
rows = rows.dropna()

features = rows[feature_names].copy()

# +1 for positive features, -1 for negative ones.
features_weight = pd.Series(np.ones(len(feature_names)), index=feature_names, dtype='float64')
features_weight[negative_feature_names] = -1

features_mean = features.mean()

# A constant column has std == 0, which would turn the whole normalised column
# into NaN; dividing by 1 instead maps it to 0.
features_std = features.std().replace(0, 1)

# Z-score normalisation, column by column.
features_norm = (features - features_mean) / features_std

# Everything needed to reproduce the preprocessing; written to disk next to the model.
features_params = pd.DataFrame({
    'default': features_default,
    'mean': features_mean,
    'std': features_std,
    'weight': features_weight,
})

In [None]:
from itertools import islice

# @REFERENCE: https://github.com/python/cpython/issues/98363#issue-1411970397
def batched(iterable, n):
    """Yield consecutive lists of at most `n` items taken from `iterable`.

    The final list may be shorter than `n`; an empty iterable yields nothing.
    This is a generator, so the ValueError for `n < 1` is raised when
    iteration starts, not when the function is called.
    """
    if n < 1:
        raise ValueError('n must be >= 1')
    source = iter(iterable)
    while True:
        chunk = list(islice(source, n))
        if not chunk:
            return
        yield chunk

def _count_not_greater(values):
    """For every element, count how many elements of `values` it is >= to.

    Gives the same counts as
    ``np.count_nonzero(values[:, None] >= values, axis=1)`` but uses a sort and
    a binary search, so it needs O(N log N) time and O(N) memory instead of a
    pairwise N x N comparison.

    NaN compares false against everything, so a NaN element gets a count of 0
    and is never counted for any other element.
    """
    values = np.asarray(values, dtype='float64')
    is_nan = np.isnan(values)
    ordered = np.sort(values[~is_nan])
    # side='right' counts ties, matching `>=` (each element also counts itself).
    counts = np.searchsorted(ordered, values, side='right')
    counts[is_nan] = 0
    return counts


# Raw score per customer: for each feature, the number of customers whose value
# is not greater than this customer's, added for positive features and
# subtracted for negative ones.
wins = pd.DataFrame({'Wins': 0.0}, index=features.index)

for feature_name in feature_names:
    feature = features_norm[feature_name].values
    wins['Wins'] += features_weight[feature_name] * _count_not_greater(feature)

# NOTE: marked "tmp" by the original author, i.e. a provisional squashing function.
def sigmoid(x):
    """Logistic function 1 / (1 + e^(-x)), applied element-wise.

    Accepts anything `np.exp` accepts (scalar, ndarray, pandas Series).
    """
    denominator = 1 + np.exp(-x)
    return 1 / denominator

# Standardise the raw win totals (z-score) and squash them with the logistic
# function, so the training target is between 0 and 1 like the model's sigmoid output.
wins_centered = wins['Wins'] - wins['Wins'].mean()
wins['Wins'] = sigmoid(wins_centered / wins['Wins'].std())

In [None]:
# Hold out 10% of the samples as the test set, then 20% of the remainder as the
# validation set. A fixed random_state makes both partitions reproducible for a
# given input order.
features_train, features_test, wins_train, wins_test = train_test_split(
    features_norm, wins, test_size=0.1, random_state=42,
)
features_train, features_val, wins_train, wins_val = train_test_split(
    features_train, wins_train, test_size=0.2, random_state=42,
)

print(f"Feature shape: {features_train.shape}")
print(f"Training = {len(features_train)}, validation = {len(features_val)}, test = {len(features_test)}")

In [None]:
# Small fully connected regressor: one input per feature, four shrinking ReLU
# layers and a sigmoid output, so predictions are on the same 0..1 scale as the
# `Wins` target.
model = keras.Sequential([
    # `shape` must be a tuple. `(n)` is just a parenthesised int, which only
    # some Keras versions accept; `(n,)` is the one-element tuple.
    keras.Input(shape=(features_train.shape[1],)),
    # The single positional argument of L1L2 is `l1`; `l2` keeps its default.
    keras.layers.Dense(20, activation="relu", activity_regularizer=keras.regularizers.L1L2(0.000001)),
    keras.layers.Dense(15, activation="relu"),
    keras.layers.Dense(10, activation="relu"),
    keras.layers.Dense(5, activation="relu"),
    keras.layers.Dense(1, activation="sigmoid"),
])

model.compile(loss="mse", optimizer='adamax')

# Loss of the untrained model, recorded as the first history row (time 0, no
# batch size) so that later training chunks can be appended to it.
initial_loss = model.evaluate(features_train, wins_train)
initial_val_loss = model.evaluate(features_val, wins_val)
history = pd.DataFrame(dict(
    time=[0],
    loss=[initial_loss],
    val_loss=[initial_val_loss],
    batch_size=[math.nan],
))

model.summary()

In [None]:
from utils.time_callback import TimeCallback

# Hyper-parameters of this training run.
batch_size = 256
epochs = 500

time_callback = TimeCallback()
hist = model.fit(
    features_train,
    wins_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(features_val, wins_val),
    callbacks=[time_callback],
)

# Per-epoch metrics reported by Keras for this run.
epoch_metrics = pd.DataFrame(hist.history)
# Timings collected by the callback, tagged with the batch size used.
# NOTE(review): presumably one entry per epoch in `time_callback.times`; confirm in utils.time_callback.
epoch_timing = pd.DataFrame({'time': time_callback.times, 'batch_size': batch_size})
# Align metrics and timings row by row (inner join on the positional index).
history_chunk = pd.merge(epoch_metrics, epoch_timing, left_index=True, right_index=True)
# Append this run to the accumulated history, renumbering the rows.
history = pd.concat([history, history_chunk], ignore_index=True)

In [None]:
# Training and validation loss over the accumulated history, both axes logarithmic.
# NOTE(review): x is the row index, which starts at 0 (the pre-training row);
# check how that point renders on a log x axis.
loss_figure = px.line(history, y=['loss', 'val_loss'], log_x=True, log_y=True)
loss_figure.show()

In [None]:
# Score every customer (train, validation and test rows alike) with the trained model.
predicted_scores = model.predict(features_norm)
wins_predicted = pd.DataFrame(predicted_scores, index=features.index, columns=['Wins Predicted'])

# One wide frame for plotting: raw features, target, prediction and the `inn`
# identifier, all aligned on the row index.
tmp = features.join(wins)
tmp = tmp.join(wins_predicted)
tmp = tmp.join(labels)

# Alternative views, kept for quick switching:
# px.scatter(tmp, x="Wins", y="Wins Predicted", hover_name='inn')
# px.scatter(tmp, x="customer_max_price", y="Wins", color='Wins Predicted', hover_name='inn', log_x=True)
# px.scatter(tmp, x="AltmanIndex", y="ContractAmountBad", color='Wins Predicted', hover_name='inn', log_y=True)
# px.scatter(tmp, x="win_qty44", y="win_qty223", color='Wins Predicted', hover_name='inn', log_x=True, log_y=True)
# px.histogram(wins_predicted.sample(10000), log_y=True)

# Total vs. "bad" contract amount, coloured by predicted score. This must stay
# the last expression so the notebook displays the figure.
px.scatter(
    tmp,
    x="ContractAmountAll",
    y="ContractAmountBad",
    color='Wins Predicted',
    hover_name='inn',
    log_x=True,
    log_y=True,
)

In [None]:
# Output directory for the trained model and its companion files.
SAVE_MODEL_PATH = '../models/customer'

# Trained network.
model.save(f'{SAVE_MODEL_PATH}/model')
# Preprocessing parameters (default / mean / std / weight per feature); the
# feature names are written as the CSV index column.
features_params.to_csv(f'{SAVE_MODEL_PATH}/features_params.csv')
# Loss and timing history, without the row index.
history.to_csv(f'{SAVE_MODEL_PATH}/history.csv', index=False)