In [1]:
import pandas as pd
import torch
import os
import sklearn
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from pandas.plotting import scatter_matrix
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import matplotlib.pyplot as plt

In [2]:
# Load the training set and print its schema; the column index lists nine columns.
df = pd.read_csv("train.csv")
for schema_view in (df.columns, df.dtypes):
    print(schema_view)

Index(['id', 'date', 'hour', 'bc_price', 'bc_demand', 'ab_price', 'ab_demand',
       'transfer', 'bc_price_evo'],
      dtype='object')
id                int64
date            float64
hour            float64
bc_price        float64
bc_demand       float64
ab_price        float64
ab_demand       float64
transfer        float64
bc_price_evo     object
dtype: object


## Quelques plots 

In [None]:
# Scatter matrix of the raw training frame, used to spot outliers visually.
# `scatter_matrix` and `plt` are already imported in the notebook's first cell,
# so they are not re-imported here.
scatter_matrix(df, figsize=(10, 10))
plt.show()

Valeurs aberrantes dans la colonne ab_price

## Preprocessing
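On supprime les doublons et les lignes contenant des valeurs manquantes, puis on écarte les valeurs aberrantes de `ab_price`.
On supprime aussi la colonne `id`, qui n'est pas pertinente pour notre modèle, ainsi que les colonnes `transfer`, `ab_price` et `ab_demand`.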

In [81]:
# --- Cleaning: remove duplicated rows, then rows with any missing value.
df_non_duplicates = df.drop_duplicates()
df_non_missing = df_non_duplicates.dropna()

# --- Outlier filtering on the price columns.
# First keep only rows whose ab_price is below 0.1 ...
ab_price_in_range = df_non_missing["ab_price"] < 0.1
df_non_aberrantes = df_non_missing[ab_price_in_range]
# ... then discard rows where bc_price is below 0.1 while ab_price is above 0.01.
inconsistent_prices = (df_non_aberrantes["bc_price"] < 0.1) & (
    df_non_aberrantes["ab_price"] > 0.01
)
df_non_aberrantes = df_non_aberrantes.loc[~inconsistent_prices]

# --- Split the target column from the features.
# Besides the target, the id, transfer, ab_price and ab_demand columns are
# left out of the feature frame.
target_name = "bc_price_evo"
excluded_columns = [target_name, "id", "transfer", "ab_price", "ab_demand"]
target = df_non_aberrantes[target_name]
data = df_non_aberrantes.drop(columns=excluded_columns)

In [None]:
# Scatter matrix of the cleaned feature frame (after outlier removal and
# column selection). `scatter_matrix` and `plt` are already imported in the
# notebook's first cell, so they are not re-imported here.
scatter_matrix(data, figsize=(10, 10))
plt.show()

In [59]:
# Model under study: a random forest wrapped in a single-step Pipeline.
# The forest is seeded so that repeated runs of the notebook give the same
# cross-validation scores and predictions.
model = Pipeline(
    steps=[
        (
            "classifier",
            RandomForestClassifier(
                n_estimators=200,
                criterion="entropy",
                max_features=6,
                max_depth=80,
                random_state=42,
            ),
        ),
    ]
)

# Candidate preprocessors. Scaling matters for distance-based models such as
# KNeighborsClassifier; this list is not wired into the pipeline or into
# `param_grid` below, it is only kept available for experiments.
all_preprocessors = [
    None,
    StandardScaler(),
    MinMaxScaler(),
    QuantileTransformer(n_quantiles=100),
    PowerTransformer(method="box-cox"),
]

# Hyperparameter grid explored by the grid search (keys are prefixed with the
# pipeline step name "classifier").
# NOTE(review): the feature frame built in the preprocessing cell keeps four
# columns (date, hour, bc_price, bc_demand), so the integer max_features values
# here exceed the number of available features — confirm the intended grid.
param_grid = {
    "classifier__n_estimators": [100, 200, 300],
    "classifier__max_features": [6, 8, 10],
    "classifier__max_depth": [80, 90, 100, 110],
}

In [61]:
# Inner loop: a 2-fold grid search selects the best hyperparameters.
model_grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=2,
    n_jobs=2,
)

# Outer loop: a 2-fold cross-validation estimates the performance of the tuned
# model; the fitted search object of each fold is kept for later inspection.
scores = cross_validate(
    model_grid_search,
    data,
    target,
    cv=2,
    n_jobs=2,
    return_estimator=True,
)

cv_test_scores = scores["test_score"]
cv_test_scores

array([0.88135358, 0.88045212])

In [32]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost (100 rounds) whose base learner is itself a gradient-boosting
# ensemble of depth-1 trees, scored with 2-fold cross-validation.
stump_booster = GradientBoostingClassifier(
    n_estimators=200,
    learning_rate=1.0,
    max_depth=1,
    random_state=0,
)
clf = AdaBoostClassifier(estimator=stump_booster, n_estimators=100)

scores = cross_validate(clf, data, target, cv=2, n_jobs=2, return_estimator=True)
cv_test_scores = scores["test_score"]
cv_test_scores

array([0.86210723, 0.86438679])

In [71]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost (200 rounds) over a 500-tree random forest, scored with 5-fold
# cross-validation.
cv_forest_base = RandomForestClassifier(
    n_estimators=500,
    criterion="entropy",
    max_features=5,
    max_depth=200,
)
clf = AdaBoostClassifier(estimator=cv_forest_base, n_estimators=200)

scores = cross_validate(clf, data, target, cv=5, n_jobs=2, return_estimator=True)
cv_test_scores = scores["test_score"]
cv_test_scores

array([0.89856078, 0.89682677, 0.90272239, 0.89628859, 0.89611516])

In [16]:
from sklearn.ensemble import HistGradientBoostingClassifier

# Histogram-based gradient boosting with default settings: fitted once on the
# full training data, then scored with 10-fold cross-validation.
clf3 = HistGradientBoostingClassifier()
clf3.fit(data, target)

scores = cross_validate(clf3, data, target, cv=10, n_jobs=2, return_estimator=True)
cv_test_scores = scores["test_score"]
cv_test_scores

array([0.84961885, 0.85654886, 0.85377685, 0.86417186, 0.86451836,
       0.86204506, 0.85788562, 0.85753899, 0.8592721 , 0.85337955])

In [25]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

# Weak learner instance (not passed to the model below).
tree_weak_learner = DecisionTreeClassifier()

# Gradient boosting that starts from a decision tree's predictions (`init`),
# fitted on the full training data.
gb_settings = dict(
    init=DecisionTreeClassifier(),
    n_estimators=200,
    max_depth=3,
    learning_rate=0.5,
    subsample=1.0,
    random_state=42,
)
clf = GradientBoostingClassifier(**gb_settings)
clf.fit(data, target)
# Alternative setting noted by the author: max_depth=1

# Accuracy measured on the same data the model was fitted on.
score = clf.score(data, target)
print(score)

1.0


In [26]:
# 2-fold cross-validation of the current `clf`; fitted fold estimators are kept.
scores = cross_validate(
    estimator=clf,
    X=data,
    y=target,
    cv=2,
    n_jobs=2,
    return_estimator=True,
)
cv_test_scores = scores["test_score"]
cv_test_scores

array([0.83033918, 0.84517203])

In [None]:
# Fit the pipeline on the full training set. Despite the variable name `knn`,
# `model` wraps a RandomForestClassifier.
knn = model.fit(data, target)

# Load the test set and remember its ids for the submission file.
test_raw = pd.read_csv("test.csv")
column_id = test_raw["id"]

# Keep exactly the feature columns used for training, in the same order.
# Dropping only "id" would leave transfer, ab_price and ab_demand in the test
# frame, which the training features do not contain.
data_test = test_raw[list(data.columns)]

# Predict with the fitted pipeline.
predictions = knn.predict(data_test)

# Build the submission frame with the "id" and "predictions" columns and save it.
result_df = pd.DataFrame({'id': column_id, 'predictions': predictions})
result_df.to_csv("predictions.csv", index=False)

In [65]:
# Refit the pipeline on the full training set and report its accuracy on that
# same data. This is a training score, not a held-out test score, so the
# printed label says so.
knn2 = model.fit(data, target)
accuracy = knn2.score(data, target)
print("Précision sur les données d'entraînement:", accuracy)

Précision du test: 0.8358690001732801


In [34]:
from sklearn.svm import SVC

# Linear-kernel SVM fitted on the full training set. The score is computed on
# the data used for fitting, so it is labelled as a training score.
svm_model_linear = SVC(kernel='linear').fit(data, target)
accuracy = svm_model_linear.score(data, target)
print("Précision sur les données d'entraînement:", accuracy)

Précision du test: 0.7238055613341654


In [19]:
# Random forest fitted on the full training set. `knn3` holds the fitted model
# for later prediction cells; despite its name it is a RandomForestClassifier.
clf = RandomForestClassifier(n_estimators=300, criterion='entropy', max_features=8, max_depth=100)
knn3 = clf.fit(data, target)

# The score is computed on the data used for fitting, so it is labelled as a
# training score rather than a test score.
accuracy = knn3.score(data, target)
print("Précision sur les données d'entraînement:", accuracy)

Précision du test: 1.0


In [20]:
# Obtenir l'importance des variables
feature_importances = clf.feature_importances_

# Afficher l'importance des variables
for i, importance in enumerate(feature_importances):
    print(f"Variable {data.columns[i]} : {importance}")

Variable date : 0.2787476240998431
Variable hour : 0.07802831419308959
Variable bc_price : 0.3635902734662574
Variable bc_demand : 0.12882290940035684
Variable ab_price : 0.044662904072290434
Variable ab_demand : 0.052739367090951186
Variable transfer : 0.053408607677211464


In [69]:
# 300-tree random forest (entropy criterion) scored with 5-fold cross-validation.
clf = RandomForestClassifier(
    n_estimators=300,
    criterion="entropy",
    max_features=8,
    max_depth=200,
)
scores = cross_validate(clf, data, target, cv=5, n_jobs=2, return_estimator=True)
cv_test_scores = scores["test_score"]
cv_test_scores

array([0.89647997, 0.89613317, 0.90462979, 0.89698231, 0.8997572 ])

In [None]:
# AdaBoost (50 rounds) over a 300-tree random forest, fitted on the full
# training set.
submission_forest_base = RandomForestClassifier(
    n_estimators=300,
    criterion="entropy",
    max_features=5,
    max_depth=200,
)
model_rd_forest = AdaBoostClassifier(
    estimator=submission_forest_base,
    n_estimators=50,
).fit(data, target)

# Load the test set, keep its ids, and drop the columns that are not features.
test_frame = pd.read_csv("test.csv")
column_id = test_frame["id"]
data_test = test_frame.drop(columns=["id", "transfer", "ab_price", "ab_demand"])

# Predict with the boosted random forest.
predictions = model_rd_forest.predict(data_test)

# Build the submission frame with the "id" and "predictions" columns and save it.
result_df = pd.DataFrame({"id": column_id, "predictions": predictions})
result_df.to_csv("predictions.csv", index=False)
# Base estimator settings, for reference:
# RandomForestClassifier(n_estimators=300, criterion='entropy', max_features=5, max_depth=200)

In [89]:
# Load the test set and remember its ids for the submission file.
test_raw = pd.read_csv("test.csv")
column_id = test_raw["id"]

# Select exactly the columns the fitted forest saw during fit, in the same
# order. A hard-coded drop list is easy to get out of sync with the training
# features (ab_price, for instance, is not among them), which makes predict
# reject the frame.
data_test = test_raw[list(knn3.feature_names_in_)]

# Predict with the fitted random forest stored in `knn3`.
predictions = knn3.predict(data_test)

# Build the submission frame with the "id" and "predictions" columns and save it.
result_df = pd.DataFrame({'id': column_id, 'predictions': predictions})
result_df.to_csv("predictions.csv", index=False)

In [None]:
# Reload the raw test set and draw the scatter matrix of its columns.
data_test = pd.read_csv("test.csv")
scatter_matrix(frame=data_test, figsize=(10, 10))
plt.show()

In [62]:
# Report the hyperparameters selected by the inner grid search in each outer
# cross-validation fold.
# `scores` must be the cross_validate result obtained with the GridSearchCV
# estimator. Other cells reuse the name `scores` for plain classifiers, whose
# fitted estimators expose no `best_params_`; those are reported instead of
# raising an AttributeError.
for cv_fold, estimator_in_fold in enumerate(scores["estimator"]):
    best_params = getattr(estimator_in_fold, "best_params_", None)
    if best_params is None:
        print(
            f"Validation croisée #{cv_fold + 1}: pas d'hyperparamètres disponibles "
            f"({type(estimator_in_fold).__name__} n'est pas un objet de recherche)"
        )
        continue
    print(
        f"Meilleurs hyperparamètres pour la validation croisée #{cv_fold + 1}:\n"
        f"{best_params}"
    )

Meilleurs hyperparamètres pour la validation croisée #1:
{'classifier__max_depth': 100, 'classifier__max_features': 8, 'classifier__n_estimators': 300}
Meilleurs hyperparamètres pour la validation croisée #2:
{'classifier__max_depth': 90, 'classifier__max_features': 10, 'classifier__n_estimators': 100}
