# Classification - Article
Ce notebook contient le code permettant de classifier les articles à partir des données pré-traitées.

In [1]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.feature_selection import VarianceThreshold
from sklearn.cluster import KMeans
from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import PCA



data_folder = os.path.join(os.getcwd(), '../data_original')
df_correct = pd.read_csv(os.path.join(data_folder, 'correct_data.csv'), sep=',')
df_user = pd.read_csv(os.path.join(data_folder, 'users.csv'), sep=',')

## 1. Clustering 

In [2]:
# On normalise toutes les colonnes sauf l'ID acheteur
X = df_user[df_user.columns[1:]].values

scaler = RobustScaler()
X = scaler.fit_transform(X)

df_user[df_user.columns[1:]] = X

In [3]:
column_names = df_user.columns[1:]

selector = VarianceThreshold(threshold=0.2)
X = selector.fit_transform(X)

In [4]:
model = PCA(n_components=15)
X = model.fit_transform(X)

In [5]:
model = KMeans(n_clusters=3, n_init="auto")
labels = model.fit_predict(X)

In [6]:
df_user['label'] = labels

In [7]:
df_correct.drop(columns=['Date/heure transaction', 'Prix unitaire TTC', 'Total TTC', 'Semestre', 'Automne/Printemps'], inplace=True)

In [8]:
df_correct = df_correct.loc[df_correct.index.repeat(df_correct['Quantité'])]

In [9]:
for fam in df_correct['Famille d\'article'].unique():
    articles = df_correct[df_correct['Famille d\'article'] == fam]['Article'].value_counts()
    list_article = articles.index
    
    # 80% des articles les plus achetés 
    articles = articles[articles.cumsum() < articles.sum() * 0.8].index
    
    # articles supprimés
    articles_sup = list(set(list_article) - set(articles))
    
    # suppression des articles en utilisant articles_sup
    df_correct.drop(df_correct[df_correct['Article'].isin(articles_sup)].index, inplace=True)

In [10]:
df_correct.drop(columns=['Quantité', 'Famille d\'article'], inplace=True)

In [11]:
df_correct = df_correct.merge(df_user[['ID acheteur', 'label']], on='ID acheteur', how='left')
df_correct.drop(columns=['ID acheteur'], inplace=True)

In [12]:
df_correct.head()

Unnamed: 0,Article,Periode,Jour semaine,label
0,oasis tropical,apresmidi,vendredi,0
1,coca,apresmidi,vendredi,0
2,coca,apresmidi,vendredi,0
3,coca,apresmidi,vendredi,0
4,coca,apresmidi,vendredi,0


## 2. Encodage des données

In [13]:
# One hot encoding sur la periode, le jour de la semaine, le label avec get_dummies
df_correct = pd.get_dummies(df_correct, columns=['Periode', 'Jour semaine', 'label'])

In [14]:
df_correct.head()

Unnamed: 0,Article,Periode_apresmidi,Periode_matin,Periode_midi,Periode_soir,Jour semaine_jeudi,Jour semaine_lundi,Jour semaine_mardi,Jour semaine_mercredi,Jour semaine_vendredi,label_0,label_1,label_2
0,oasis tropical,True,False,False,False,False,False,False,False,True,True,False,False
1,coca,True,False,False,False,False,False,False,False,True,True,False,False
2,coca,True,False,False,False,False,False,False,False,True,True,False,False
3,coca,True,False,False,False,False,False,False,False,True,True,False,False
4,coca,True,False,False,False,False,False,False,False,True,True,False,False


date heure
famille d'article
ARTICLE = y target
semestre



In [15]:
# labelencoding
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df_correct['Article'] = le.fit_transform(df_correct['Article'])

df_correct.head()

Unnamed: 0,Article,Periode_apresmidi,Periode_matin,Periode_midi,Periode_soir,Jour semaine_jeudi,Jour semaine_lundi,Jour semaine_mardi,Jour semaine_mercredi,Jour semaine_vendredi,label_0,label_1,label_2
0,33,True,False,False,False,False,False,False,False,True,True,False,False
1,8,True,False,False,False,False,False,False,False,True,True,False,False
2,8,True,False,False,False,False,False,False,False,True,True,False,False
3,8,True,False,False,False,False,False,False,False,True,True,False,False
4,8,True,False,False,False,False,False,False,False,True,True,False,False


## 3. Classification

In [16]:
# modeling
from sklearn.model_selection import train_test_split

X = df_correct[df_correct.columns[1:]]
y = df_correct[df_correct.columns[0]]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [17]:
# SGDClassifier
from sklearn.linear_model import SGDClassifier
model = SGDClassifier(n_jobs=-1)


model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [18]:
from sklearn.metrics import accuracy_score

print(accuracy_score(y_test, y_pred))
print(model.score(X_test, y_test))

0.0970185873605948
0.0970185873605948


In [19]:
from sklearn.model_selection import GridSearchCV

paramps_grid_sgd = {
    'penalty': ['l2', 'l1'],
    'alpha': [0.01, 0.1, 10],
    'fit_intercept': [True, False],
    'max_iter': [250, 500, 1000],
    'n_jobs': [-1],
}


grid = GridSearchCV(model, paramps_grid_sgd, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)
grid.fit(X_train, y_train)

print(grid.best_params_)
print(grid.best_score_)


Fitting 5 folds for each of 36 candidates, totalling 180 fits
{'alpha': 10, 'fit_intercept': False, 'max_iter': 250, 'n_jobs': -1, 'penalty': 'l2'}
0.13115835830244163


In [None]:
# Résultat : 
# {'alpha': 10, 'fit_intercept': False, 'max_iter': 250, 'n_jobs': -1, 'penalty': 'l2'}
# Permet de passer de 0.097 à 0.131
final_model = SGDClassifier(alpha=10, fit_intercept=False, max_iter=250, n_jobs=-1, penalty='l2')
final_model.fit(X_train, y_train)
y_pred = final_model.predict(X_test)
print(accuracy_score(y_test, y_pred))