<a href="https://colab.research.google.com/github/zenAurelius/HRAI3/blob/main/notebooks/test_base.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf

from tensorflow.keras import layers

from google.colab import drive
# Mount Google Drive inside the Colab runtime, then make the project's data
# folder the working directory so the relative file names used below resolve there.
drive.mount('/content/drive')
%cd /content/drive/MyDrive/HRAI/data

In [None]:
# Load the raw data set (path is relative to the current working directory).
dfraw = pd.read_csv(filepath_or_buffer='pmu2017_os.zip')
dfraw.head(n=5)

In [None]:
# Added to combine.clean: shorten the sex labels of both horses to one letter.
sex_codes = {'FEMELLES': 'F', 'MALES': 'M', 'HONGRES': 'H'}
sex_cols = ('pfs_chSexe_1', 'pfs_chSexe_2')
for sex_col in sex_cols:
  dfraw[sex_col] = dfraw[sex_col].replace(sex_codes)

# Added to combine.un_vs_un: concatenate the two codes into one pair feature.
sexe_1, sexe_2 = (dfraw[sex_col].astype(str) for sex_col in sex_cols)
dfraw['pfs_dSexe'] = sexe_1 + sexe_2

In [None]:
# FEATURE AND TARGET SELECTION
# Keep only rows where both pff_rapportDirect columns are present, derive the
# two ratio features, recode tgf_win_1 == 0.5 as 0, then keep the final columns.
raw_cols = ['aid_cr', 'pis_cheval_1', 'pis_cheval_2', 'pfs_dSexe',
            'pff_rapportDirect_1', 'pff_rapportDirect_2',
            'pff_normcote_1', 'pff_normcote_2',
            'pff_ord_1', 'pff_ord_2', 'tgf_pwin_1', 'tgf_win_1']
kept_cols = ['aid_cr', 'pis_cheval_1', 'pis_cheval_2', 'pfs_dSexe',
             'pff_rapportDirect_1', 'pff_ord_1', 'diff_cote', 'diff_ord',
             'tgf_pwin_1', 'tgf_win_1']

df = (
    dfraw[raw_cols]
    .dropna(subset=['pff_rapportDirect_2', 'pff_rapportDirect_1'])
    .assign(
        # Share of horse 1 in the sum of both values (0.5 means equal).
        diff_cote=lambda d: d.pff_normcote_1 / (d.pff_normcote_2 + d.pff_normcote_1),
        diff_ord=lambda d: d.pff_ord_1 / (d.pff_ord_2 + d.pff_ord_1),
        tgf_win_1=lambda d: d['tgf_win_1'].replace(0.5, 0),
    )
    .loc[:, kept_cols]
    .copy()
)
df.head()

In [None]:
# TRAIN / VALIDATION / TEST SPLIT
# The cut-offs are the aid_cr values found at the 80 % and 90 % row positions;
# rows are then assigned by comparing their aid_cr with those cut-offs.
# NOTE(review): reading the cut-offs positionally presumably relies on df being
# ordered by aid_cr — confirm.
n_rows = len(df)
limit_train, limit_val = (df['aid_cr'].iloc[int(frac * n_rows)] for frac in (0.8, 0.9))
print(limit_train, limit_val)

train = df[df.aid_cr.lt(limit_train)].copy()
val = df[df.aid_cr.ge(limit_train) & df.aid_cr.lt(limit_val)].copy()
test = df[df.aid_cr.ge(limit_val)].copy()

for label, part in (('training', train), ('validation', val), ('test', test)):
  print(len(part), label, 'examples')

In [None]:
# FEATURE / TARGET / CONSTANT DEFINITIONS

NUM_FEATURES = ['diff_cote', 'diff_ord']  # passed through a Normalization layer
CAT_FEATURES = ['pfs_dSexe']              # passed through lookup + category encoding
TARGET = ['tgf_pwin_1']                   # only the first entry is used as label
FEATURES = [*NUM_FEATURES, *CAT_FEATURES]
ALL_COLS = [*FEATURES, *TARGET]

BATCH_SIZE = 256

In [None]:
# CONVERSIONS INPUT

# DF TO DATASET
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
  """Convert a DataFrame into a batched tf.data.Dataset of (features, label).

  Only the columns listed in ALL_COLS are used. The column TARGET[0] becomes
  the label; every other column becomes an entry of the feature dict, shaped
  (n_rows, 1).

  Args:
    dataframe: source DataFrame; it is not modified.
    shuffle: when True, shuffle the examples with a buffer covering every row.
    batch_size: number of examples per batch (also used as prefetch depth).

  Returns:
    A tf.data.Dataset yielding (dict of feature tensors, label tensor) batches.
  """
  df = dataframe[ALL_COLS].copy()
  labels = df.pop(TARGET[0])
  features = {key: np.array(value)[:, tf.newaxis] for key, value in df.items()}
  ds = tf.data.Dataset.from_tensor_slices((features, labels))
  if shuffle:
    # The buffer must be sized on the number of ROWS. The previous code took
    # len() of the feature dict (i.e. the number of columns), which left the
    # data almost unshuffled. max(..., 1) keeps an empty frame from failing.
    ds = ds.shuffle(buffer_size=max(len(dataframe), 1))
  ds = ds.batch(batch_size)
  ds = ds.prefetch(batch_size)
  return ds

# GET NORMALIZATION LAYER
def get_normalization_layer(name, dataset):
  """Return a Normalization layer adapted on one feature of `dataset`.

  Args:
    name: key of the feature inside the dataset's feature dict.
    dataset: dataset yielding (features dict, label) pairs.
  """
  def pick_feature(features, _label):
    # Keep only the requested feature, dropping the label.
    return features[name]

  normalizer = layers.Normalization(axis=None)
  normalizer.adapt(dataset.map(pick_feature))
  return normalizer

# GET CATEGORY ENCODING LAYER
def get_category_encoding_layer(name, dataset, dtype, max_tokens=None):
  """Build a callable that index-encodes then category-encodes one feature.

  Args:
    name: key of the feature inside the dataset's feature dict.
    dataset: dataset yielding (features dict, label) pairs, used to adapt
      the lookup layer.
    dtype: 'string' selects StringLookup; anything else selects IntegerLookup.
    max_tokens: forwarded to the lookup layer.

  Returns:
    A function applying the lookup layer followed by a CategoryEncoding layer.
    It closes over both layers, so it can be used in a Keras Functional model.
  """
  # Lookup layer: StringLookup or IntegerLookup depending on dtype.
  lookup_cls = layers.StringLookup if dtype == 'string' else layers.IntegerLookup
  index = lookup_cls(max_tokens=max_tokens)

  # Adapt the lookup on the single requested feature.
  index.adapt(dataset.map(lambda features, _label: features[name]))

  # Size the category encoding from the adapted vocabulary.
  encoder = layers.CategoryEncoding(num_tokens=index.vocabulary_size())

  def encode(feature):
    return encoder(index(feature))

  return encode

In [None]:
# BUILD THE DATASETS
# Only the training set is shuffled; validation and test keep their row order.

train_ds, val_ds, test_ds = (
    df_to_dataset(part, shuffle=do_shuffle, batch_size=BATCH_SIZE)
    for part, do_shuffle in ((train, True), (val, False), (test, False))
)

In [None]:
# Build one Keras Input per feature and its preprocessing branch.
# all_inputs maps feature name -> Input tensor; encoded_features collects the
# preprocessed tensors in the order they are later concatenated.
all_inputs = {}
encoded_features = []

# Numeric features: normalization adapted on the training set only.
for header in NUM_FEATURES:
  numeric_col = tf.keras.Input(shape=(1,), name=header)
  normalization_layer = get_normalization_layer(header, train_ds)
  encoded_numeric_col = normalization_layer(numeric_col)
  all_inputs[header] = numeric_col
  encoded_features.append(encoded_numeric_col)

# Categorical (string) features: lookup + category encoding adapted on the training set.
# NOTE(review): max_tokens=9 presumably also counts the out-of-vocabulary slot,
# which would leave fewer than 9 real categories — confirm against the number
# of distinct values of the feature.
for header in CAT_FEATURES:
  categorical_col = tf.keras.Input(shape=(1,), name=header, dtype='string')
  encoding_layer = get_category_encoding_layer(name=header,
                                               dataset=train_ds,
                                               dtype='string',
                                               max_tokens=9)
  encoded_categorical_col = encoding_layer(categorical_col)
  all_inputs[header] = categorical_col
  encoded_features.append(encoded_categorical_col)

In [None]:
# CLASSIFICATION MODEL
# One hidden layer on top of the concatenated encoded features, followed by a
# single unit with no activation: the model outputs a logit.

all_features = tf.keras.layers.concatenate(encoded_features)
x = tf.keras.layers.Dense(16, activation="relu")(all_features)
output = tf.keras.layers.Dense(1)(x)

model = tf.keras.Model(all_inputs, output)

# The loss is told the outputs are logits (from_logits=True), so the accuracy
# metric must threshold at 0.0. The plain "accuracy" string resolves to a
# binary accuracy with its default 0.5 threshold, which is only right for
# probabilities.
# NOTE(review): binary accuracy compares against the label value; it is only
# meaningful if the label column holds 0/1 values — confirm for TARGET.
# NOTE(review): run_eagerly=True looks like a debugging leftover — confirm.
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=[tf.keras.metrics.BinaryAccuracy(name="accuracy", threshold=0.0)],
              run_eagerly=True)


In [None]:
# REGRESSION MODEL
# Two hidden layers (64 then 32 units) and one linear output, trained with MSE.
# Running this cell rebinds `model`, replacing any model built earlier.
all_features = layers.concatenate(encoded_features)
x = layers.Dense(64, activation="relu")(all_features)
x = layers.Dense(32, activation='relu')(x)
output = layers.Dense(1)(x)

model = tf.keras.Model(inputs=all_inputs, outputs=output)
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss='mse')

In [None]:
# Draw the model graph with tensor shapes; rankdir="LR" lays it out left to right.
tf.keras.utils.plot_model(model, rankdir="LR", show_shapes=True)


In [None]:
# Train for 5 epochs, evaluating on the validation set after each one.
model.fit(train_ds, validation_data=val_ds, epochs=5)

In [None]:
def predict(model, ds, df, pcol, threshold=0.5):
  """Store a model's predictions for `ds` as two new columns of `df` (in place).

  Args:
    model: object exposing `predict(ds)` and returning one value per row.
    ds: dataset passed to `model.predict`; its rows must line up with `df`.
    df: DataFrame receiving the new columns.
    pcol: name of the column holding the raw prediction; the binarised
      prediction goes into `f'{pcol}_w'`.
    threshold: a raw prediction strictly above this value is coded 1, else 0.
      Defaults to 0.5; pass 0.0 when the model outputs logits.

  Returns:
    None. `df` is modified in place.
  """
  # Flatten the (n, 1) model output so each column receives a 1-D array.
  predictions = np.asarray(model.predict(ds)).reshape(-1)
  df[pcol] = predictions
  df[f'{pcol}_w'] = (predictions > threshold).astype(int)

  # Le reste pourrait être ailleurs
def prepare_comparaison(df):
  """Add the reference outcome and two baseline picks to `df` (in place).

  New 0/1 columns:
    win1: 1 when tgf_pwin_1 is strictly above 0.5.
    win_cote: 1 when pff_rapportDirect_2 is strictly above pff_rapportDirect_1.
    win_ord: 1 when pff_ord_1 is strictly above pff_ord_2.
  """
  df['win1'] = df['tgf_pwin_1'].gt(0.5).astype(int)
  df['win_cote'] = df['pff_rapportDirect_1'].lt(df['pff_rapportDirect_2']).astype(int)
  df['win_ord'] = df['pff_ord_2'].lt(df['pff_ord_1']).astype(int)

def eval(df, pcol):
  """Print, per actual outcome, how often each predictor agrees with it.

  Under the heading 'positifs' (rows where win1 == 1) and then 'negatifs'
  (rows where win1 == 0), three ratios are printed in this order: the model
  column f'{pcol}_w', then win_cote, then win_ord. Each ratio is the share of
  rows of that outcome for which the predictor has the same value.

  When an outcome has no rows at all, nan is printed instead of raising
  ZeroDivisionError.

  NOTE: the name shadows Python's builtin eval(); it is kept so existing
  callers keep working.

  Args:
    df: DataFrame holding win1, win_cote, win_ord and f'{pcol}_w'.
    pcol: prefix of the model prediction column.
  """
  def _rate(agrees, actual):
    # Share of `actual` rows for which the predictor agrees; nan if none.
    total = int(actual.sum())
    return int((agrees & actual).sum()) / total if total else float('nan')

  predictors = [df[f'{pcol}_w'], df.win_cote, df.win_ord]
  for heading, outcome in (('positifs', 1), ('negatifs', 0)):
    print(heading)
    actual = df.win1 == outcome
    for predictor in predictors:
      print(_rate(predictor == outcome, actual))

**TESTS**

In [None]:
# Keep the records whose OS_N_SG_pis_cheval_1 is strictly below its median.
# NOTE(review): the prompt this cell was generated from asked for values
# *above* the median, while the code keeps those below — confirm the intent.
median_value = dfraw['OS_N_SG_pis_cheval_1'].median()
filtered_df = dfraw[dfraw['OS_N_SG_pis_cheval_1'].lt(median_value)].copy()

# On that subset: mean OS_N_MU_pis_cheval_1 for each quartile of the ratio
# pff_chGainTotal_1 / pfi_chNbCourses_1 (quartiles are labelled 0 to 3).
gain_ratio = filtered_df['pff_chGainTotal_1'].div(filtered_df['pfi_chNbCourses_1'])
filtered_df['quartile'] = pd.qcut(gain_ratio, q=4, labels=False)

mean_by_quartile = filtered_df.groupby('quartile')['OS_N_MU_pis_cheval_1'].mean()

mean_by_quartile

In [None]:
# For every record whose pff_mu_pis_cheval_1 equals 25, replace that value
# according to the ratio pff_chGainTotal_1 / pfi_chNbCourses_1:
#   ratio < 110000            -> 24
#   110000 <= ratio < 174000  -> 29
#   174000 <= ratio < 285000  -> 32
#   ratio >= 285000           -> 36
# Rows with a missing ratio match no band and keep the value 25.

# Work on a copy to avoid SettingWithCopyWarning.
filtered_df = filtered_df.copy()

gain_ratio = filtered_df['pff_chGainTotal_1'] / filtered_df['pfi_chNbCourses_1']
# Computed once up front: the bands are disjoint and no replacement value is 25,
# so this is the same set of rows the step-by-step test would have seen.
has_default_mu = filtered_df['pff_mu_pis_cheval_1'] == 25

mu_bands = [
    (gain_ratio < 110000, 24),
    ((gain_ratio >= 110000) & (gain_ratio < 174000), 29),
    ((gain_ratio >= 174000) & (gain_ratio < 285000), 32),
    (gain_ratio >= 285000, 36),
]
for in_band, new_mu in mu_bands:
  filtered_df.loc[has_default_mu & in_band, 'pff_mu_pis_cheval_1'] = new_mu

In [None]:
# Quartile boundaries of the ratio pff_chGainTotal_1 / pfi_chNbCourses_1.

# The ratio is computed once and reused for both the labels and the bounds.
quartiles = filtered_df['pff_chGainTotal_1'] / filtered_df['pfi_chNbCourses_1']

# Quartile label (0 to 3) of each record.
filtered_df['quartile_ratio'] = pd.qcut(quartiles, q=4, labels=False)

# The three inner boundaries separating the four quartiles.
quartile_bounds = quartiles.quantile([0.25, 0.5, 0.75])

quartile_bounds

In [None]:
# Share of rows with tgf_win_1 == 1 for each sex pair, printed in this order:
# FF, FM, FH, HF, HM, HH, MM, MF, MH.
# NOTE(review): the feature-selection cell already replaced tgf_win_1 == 0.5
# by 0 in df, so the filter below presumably removes nothing — confirm.
df = df[df.tgf_win_1 != 0.5]
for sex_pair in ('FF', 'FM', 'FH', 'HF', 'HM', 'HH', 'MM', 'MF', 'MH'):
  pair_rows = df[df.pfs_dSexe == sex_pair]
  print(len(pair_rows[pair_rows.tgf_win_1 == 1]) / len(pair_rows))

In [None]:
# Smoke test of the category encoding on one training batch.
# The feature key must be one that exists in the dataset: the sex-pair feature
# is named 'pfs_dSexe' (the former 'diff_sexe' key raised KeyError).
[(train_features, label_batch)] = train_ds.take(1)
test_type_col = train_features['pfs_dSexe']
test_type_layer = get_category_encoding_layer(name='pfs_dSexe',
                                              dataset=train_ds,
                                              dtype='string',
                                              max_tokens=5)
test_type_layer(test_type_col)

In [None]:
# Save the current model and reload that very file. The save and load calls
# previously used two different file names, so the reloaded model was not the
# one just saved (or the load failed if the other file did not exist).
model_path = 'classifier_test2.keras'
model.save(model_path)
reloaded_model = tf.keras.models.load_model(model_path)

In [None]:
# Raw model outputs for the (unshuffled) test dataset.
predictions = model.predict(x=test_ds)
predictions

In [None]:
# Store the binarised predictions in a 'pred' column of the 'test' DataFrame.

# A raw output strictly above the threshold is coded 1, otherwise 0.
# NOTE(review): a 0.0 threshold presumably assumes `model` outputs logits
# (the classification model); confirm which model is currently bound.
threshold = 0.0
binary_predictions = np.greater(predictions, threshold).astype(int)

# New column on the test DataFrame.
test['pred'] = binary_predictions

# Distribution of the predicted classes.
test['pred'].value_counts()

In [None]:
# Distribution of the actual outcomes, to compare with the predicted classes.
test['tgf_win_1'].value_counts()

In [None]:
# Baseline picks on the test set, derived from the ratio features:
#   win_cote is 1 when diff_cote is strictly below 0.5, else 0
#   win_ord  is 1 when diff_ord is strictly above 0.5, else 0

test['win_cote'] = test['diff_cote'].lt(0.5).astype(int)
test['win_ord'] = test['diff_ord'].gt(0.5).astype(int)

In [None]:
# Display the test DataFrame, now including the pred / win_cote / win_ord columns.
test

In [None]:
# Among rows with tgf_win_1 == 1: share picked by the model, by win_cote,
# by win_ord, and by win_cote and win_ord together (printed in that order).
actual_win = test.tgf_win_1 == 1
n_actual_win = len(test[actual_win])
for pick_cols in (('pred',), ('win_cote',), ('win_ord',), ('win_cote', 'win_ord')):
  picked = actual_win
  for pick_col in pick_cols:
    picked = picked & (test[pick_col] == 1)
  print(len(test[picked]) / n_actual_win)

In [None]:
# Among rows with tgf_win_1 == 0: share for which the model, win_cote and
# win_ord (printed in that order) are also 0.
actual_loss = test.tgf_win_1 == 0
n_actual_loss = len(test[actual_loss])
for pick_col in ('pred', 'win_cote', 'win_ord'):
  print(len(test[(test[pick_col] == 0) & actual_loss]) / n_actual_loss)

In [None]:
# Among the rows selected by each rule, share with tgf_win_1 == 1.
# Each rule maps column name -> required value; all conditions of a rule must hold.
# 'pred2' is not created anywhere in this notebook, so a rule whose columns are
# missing is reported and skipped instead of raising AttributeError. A rule
# selecting no row prints nan instead of raising ZeroDivisionError.
precision_rules = [
    {'pred': 1},
    {'pred2': 1},
    {'win_cote': 1},
    {'win_ord': 1},
    {'win_cote': 1, 'win_ord': 1},
    {'win_cote': 0, 'pred': 1},
]
actual_win = test.tgf_win_1 == 1
for rule in precision_rules:
  missing_cols = [col for col in rule if col not in test.columns]
  if missing_cols:
    print(f'skipped {rule}: missing column(s) {missing_cols}')
    continue
  selected = pd.Series(True, index=test.index)
  for col, wanted in rule.items():
    selected = selected & (test[col] == wanted)
  n_selected = int(selected.sum())
  print(int((selected & actual_win).sum()) / n_selected if n_selected else float('nan'))