In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from tensorflow.keras.regularizers import l2
import numpy as np

In [3]:
# Chargement les données
data_path = 'turnips2.csv'
data = pd.read_csv(data_path)

# Calcul des profits pour chaque demi-journée
columns = ['Mon-AM', 'Mon-PM', 'Tues-AM', 'Tues-PM', 'Wed-AM', 'Wed-PM', 'Thurs-AM', 'Thurs-PM', 'Fri-AM', 'Fri-PM', 'Sat-AM', 'Sat-PM']
for col in columns:
    data[col] = data[col] - data['Purchase']

# Normalisation les données
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data[columns])

# Clustering pour identifier les tendances
kmeans = KMeans(n_clusters=4, random_state=0).fit(data_scaled)
data['cluster'] = kmeans.labels_



In [21]:
# Encoder les labels de cluster en format one-hot pour l'entraînement du réseau de neurones
encoder = OneHotEncoder(sparse=False)
y_train_encoded = encoder.fit_transform(y_train.values.reshape(-1, 1))  # Utilisez .values pour convertir en numpy array
y_test_encoded = encoder.transform(y_test.values.reshape(-1, 1))

# Construire le modèle
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(y_train_encoded.shape[1], activation='softmax')  # La couche de sortie a un neurone par classe
])

# Compiler le modèle
model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

# Entraîner le modèle
history = model.fit(X_train, y_train_encoded, epochs=100, batch_size=32, validation_split=0.2, verbose=2)

# Évaluer le modèle sur l'ensemble de test
loss, accuracy = model.evaluate(X_test, y_test_encoded, verbose=0)
print(f'Test accuracy: {accuracy}')



Epoch 1/100
2400/2400 - 5s - loss: 0.5893 - accuracy: 0.7406 - val_loss: 0.4571 - val_accuracy: 0.7863 - 5s/epoch - 2ms/step
Epoch 2/100
2400/2400 - 4s - loss: 0.4923 - accuracy: 0.7759 - val_loss: 0.4417 - val_accuracy: 0.7904 - 4s/epoch - 2ms/step
Epoch 3/100
2400/2400 - 4s - loss: 0.4744 - accuracy: 0.7796 - val_loss: 0.4340 - val_accuracy: 0.7893 - 4s/epoch - 2ms/step
Epoch 4/100
2400/2400 - 6s - loss: 0.4666 - accuracy: 0.7811 - val_loss: 0.4314 - val_accuracy: 0.7914 - 6s/epoch - 2ms/step
Epoch 5/100
2400/2400 - 5s - loss: 0.4598 - accuracy: 0.7840 - val_loss: 0.4252 - val_accuracy: 0.7901 - 5s/epoch - 2ms/step
Epoch 6/100
2400/2400 - 5s - loss: 0.4547 - accuracy: 0.7854 - val_loss: 0.4219 - val_accuracy: 0.7902 - 5s/epoch - 2ms/step
Epoch 7/100
2400/2400 - 4s - loss: 0.4519 - accuracy: 0.7858 - val_loss: 0.4190 - val_accuracy: 0.7913 - 4s/epoch - 2ms/step
Epoch 8/100
2400/2400 - 5s - loss: 0.4500 - accuracy: 0.7859 - val_loss: 0.4160 - val_accuracy: 0.7949 - 5s/epoch - 2ms/step


In [32]:
# Construire le modèle avec des ajustements
model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],), kernel_regularizer=l2(0.001)),
    Dropout(0.5),
    Dense(128, activation='relu', kernel_regularizer=l2(0.001)),
    Dropout(0.5),
    Dense(y_train_encoded.shape[1], activation='softmax')
])

# Compiler le modèle avec un taux d'apprentissage ajusté
model.compile(optimizer=Adam(learning_rate=0.0005), loss='categorical_crossentropy', metrics=['accuracy'])

# Entraîner le modèle avec une taille de lot ajustée
history = model.fit(X_train, y_train_encoded, epochs=150, batch_size=64, validation_split=0.2, verbose=2)

Epoch 1/150
1200/1200 - 4s - loss: 0.7091 - accuracy: 0.7387 - val_loss: 0.5502 - val_accuracy: 0.7837 - 4s/epoch - 3ms/step
Epoch 2/150
1200/1200 - 3s - loss: 0.5591 - accuracy: 0.7755 - val_loss: 0.5063 - val_accuracy: 0.7906 - 3s/epoch - 2ms/step
Epoch 3/150
1200/1200 - 3s - loss: 0.5256 - accuracy: 0.7800 - val_loss: 0.4882 - val_accuracy: 0.7910 - 3s/epoch - 2ms/step
Epoch 4/150
1200/1200 - 3s - loss: 0.5093 - accuracy: 0.7826 - val_loss: 0.4814 - val_accuracy: 0.7937 - 3s/epoch - 2ms/step
Epoch 5/150
1200/1200 - 3s - loss: 0.4984 - accuracy: 0.7863 - val_loss: 0.4707 - val_accuracy: 0.7956 - 3s/epoch - 2ms/step
Epoch 6/150
1200/1200 - 3s - loss: 0.4931 - accuracy: 0.7861 - val_loss: 0.4664 - val_accuracy: 0.7951 - 3s/epoch - 2ms/step
Epoch 7/150
1200/1200 - 3s - loss: 0.4870 - accuracy: 0.7882 - val_loss: 0.4628 - val_accuracy: 0.7953 - 3s/epoch - 2ms/step
Epoch 8/150
1200/1200 - 3s - loss: 0.4823 - accuracy: 0.7891 - val_loss: 0.4599 - val_accuracy: 0.7961 - 3s/epoch - 2ms/step


In [34]:
loss, accuracy = model.evaluate(X_test, y_test_encoded, verbose=0)
print(f'Test accuracy: {accuracy}')

Test accuracy: 0.8010250926017761


In [44]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Exemple hypothétique de construction de X_fragmented

X_fragmented_list = []  # Pour stocker les fragments

# Parcourir chaque semaine dans les données
for index, row in data.iterrows():
    for i in range(1, len(columns) + 1):  # 'columns' contient les noms des demi-journées
        fragment = row[:i].tolist()  # Prendre les profits jusqu'à la demi-journée i
        fragment += [0] * (len(columns) - i)  # Remplir le reste du fragment avec des zéros
        X_fragmented_list.append(fragment)
        y_expanded_list = []

# Pour chaque semaine dans les données
for label in y:
    for _ in range(len(columns)):  # Répéter le label pour chaque fragment partiel créé pour la semaine
        y_expanded_list.append(label)

# Convertir en array numpy pour l'utilisation avec sklearn
y_expanded = np.array(y_expanded_list)


# Convertir en ndarray ou DataFrame pour l'utilisation avec sklearn
X_fragmented = np.array(X_fragmented_list)

# Supposons que X_fragmented représente vos caractéristiques fragmentées et y les labels de clusters correspondants
X_train, X_test, y_train, y_test = train_test_split(X_fragmented, y_expanded, test_size=0.2, random_state=42)

rf = RandomForestClassifier(
    n_estimators=426,
    max_depth=15,
    max_features='sqrt',
    min_samples_leaf=1,
    min_samples_split=10,
    random_state=42
)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')


Accuracy: 0.7696891407617301


In [43]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Définition de la grille de recherche
param_dist = {
    'n_estimators': randint(100, 500),
    'max_depth': randint(3, 20),
    'min_samples_split': randint(2, 11),
    'min_samples_leaf': randint(1, 11),
    'max_features': ['sqrt', 'log2', None]
}

# Création du modèle Random Forest
rf = RandomForestClassifier(random_state=42)

# Recherche aléatoire avec validation croisée
random_search = RandomizedSearchCV(rf, param_distributions=param_dist, n_iter=20, cv=5, scoring='accuracy', random_state=42)

# Entraînement
random_search.fit(X_train, y_train)

# Meilleurs paramètres et accuracy
print("Meilleurs paramètres : ", random_search.best_params_)
print("Meilleure accuracy : ", random_search.best_score_)

Meilleurs paramètres :  {'max_depth': 15, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 426}
Meilleure accuracy :  0.7725283883737889
