# Classification - Famille article

Ce notebook contient le code permettant de classifier les familles d'articles à partir des données pré-traitées.

In [139]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import RobustScaler
from tqdm import tqdm



# Input CSVs are expected in ../data_original, resolved against the current
# working directory.
data_folder = os.path.join(os.getcwd(), '../data_original')

# Load both tables in a fixed order:
#   correct_data.csv -> df_correct, users.csv -> df_user.
df_correct, df_user = (
    pd.read_csv(os.path.join(data_folder, csv_name), sep=',')
    for csv_name in ('correct_data.csv', 'users.csv')
)

## 1. Clustering

In [140]:
# Scale every column except the first one (the buyer ID).
feature_columns = df_user.columns[1:]

# Fit the robust scaler and apply it to the feature values in one pass.
scaler = RobustScaler()
X = scaler.fit_transform(df_user[feature_columns].values)

# Overwrite the feature columns of df_user with their scaled values.
df_user[feature_columns] = X

In [141]:
# Names of the feature columns (everything after the first column).
column_names = df_user.columns[1:]

# Remove low-variance features (threshold 0.2): learn which columns to keep,
# then apply that selection to X.
selector = VarianceThreshold(threshold=0.2)
selector.fit(X)
X = selector.transform(X)

In [142]:
# Project the selected features onto 15 principal components.
# random_state pins the result whenever scikit-learn uses a randomized SVD
# solver, so rerunning the notebook yields the same projection.
model = PCA(n_components=15, random_state=42)
X = model.fit_transform(X)

In [143]:
# Partition the rows of X into 3 clusters.
# KMeans starts from a random initialisation: without a fixed random_state the
# assignments (and which cluster ends up with id 0, 1 or 2) can change from one
# run to the next, which would change every result derived from `labels`.
model = KMeans(n_clusters=3, n_init="auto", random_state=42)
labels = model.fit_predict(X)

In [144]:
# Attach each row's cluster id to the user table as the 'label' column.
df_user = df_user.assign(label=labels)

In [145]:
# Expand the table so that a row whose 'Quantité' is q appears q times.
units_per_row = df_correct['Quantité']
repeated_index = df_correct.index.repeat(units_per_row)
df_correct = df_correct.loc[repeated_index]


In [146]:
# Columns that are not used in the remainder of the notebook.
columns_to_drop = [
    'Date/heure transaction',
    'Article',
    'Prix unitaire TTC',
    'Quantité',
    'Total TTC',
    'Semestre',
    'Automne/Printemps',
]
df_correct = df_correct.drop(columns=columns_to_drop)

In [147]:
# Bring each buyer's cluster label onto the transaction rows (left join on the
# buyer ID), then discard the buyer ID, which is only needed as the join key.
user_labels = df_user[['ID acheteur', 'label']]
df_correct = (
    df_correct
    .merge(user_labels, on='ID acheteur', how='left')
    .drop(columns=['ID acheteur'])
)

In [148]:
# Preview the first five rows of the merged table.
df_correct.head(n=5)

Unnamed: 0,Famille d'article,Periode,Jour semaine,label
0,Softs,apresmidi,vendredi,2
1,Softs,apresmidi,vendredi,2
2,Softs,apresmidi,vendredi,2
3,Softs,apresmidi,vendredi,2
4,Softs,apresmidi,vendredi,2


## 2. Encodage des données

In [149]:
# One-hot encode the period, the day of the week and the cluster label
# with pandas.get_dummies.
categorical_columns = ['Periode', 'Jour semaine', 'label']
df_correct = pd.get_dummies(df_correct, columns=categorical_columns)

In [150]:
# Preview the first five rows of the one-hot encoded table.
df_correct.head(n=5)

Unnamed: 0,Famille d'article,Periode_apresmidi,Periode_matin,Periode_midi,Periode_soir,Jour semaine_jeudi,Jour semaine_lundi,Jour semaine_mardi,Jour semaine_mercredi,Jour semaine_vendredi,label_0,label_1,label_2
0,Softs,1,0,0,0,0,0,0,0,1,0,0,1
1,Softs,1,0,0,0,0,0,0,0,1,0,0,1
2,Softs,1,0,0,0,0,0,0,0,1,0,0,1
3,Softs,1,0,0,0,0,0,0,0,1,0,0,1
4,Softs,1,0,0,0,0,0,0,0,1,0,0,1


La colonne « Famille d'article » est la variable cible (`y`) que le modèle doit prédire.

In [151]:
# Label encoding of the target column.
# LabelEncoder is imported with the other dependencies in the notebook's
# first code cell rather than in the middle of the notebook.
# The fitted encoder is kept in `le` so the integer codes can be mapped back to
# the original article-family names (le.classes_ / le.inverse_transform).
le = LabelEncoder()
df_correct["Famille d'article"] = le.fit_transform(df_correct["Famille d'article"])

# Show the first rows with the encoded target.
df_correct.head()

Unnamed: 0,Famille d'article,Periode_apresmidi,Periode_matin,Periode_midi,Periode_soir,Jour semaine_jeudi,Jour semaine_lundi,Jour semaine_mardi,Jour semaine_mercredi,Jour semaine_vendredi,label_0,label_1,label_2
0,7,1,0,0,0,0,0,0,0,1,0,0,1
1,7,1,0,0,0,0,0,0,0,1,0,0,1
2,7,1,0,0,0,0,0,0,0,1,0,0,1
3,7,1,0,0,0,0,0,0,0,1,0,0,1
4,7,1,0,0,0,0,0,0,0,1,0,0,1


## 3. Classification

In [152]:
# Modeling: build the feature matrix X and the target vector y.
# train_test_split is imported with the other dependencies in the notebook's
# first code cell.
# The target is selected by name rather than by position, so the split stays
# correct even if the column order of df_correct changes.
target_column = "Famille d'article"
X = df_correct.drop(columns=[target_column])
y = df_correct[target_column]

# Hold out 20% of the rows for evaluation. The fixed random_state makes the
# split, and therefore the scores computed from it, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [153]:
# SGDClassifier: linear model trained with stochastic gradient descent
# (modified Huber loss, L2 penalty, alpha=0.1, intercept fitted).
# SGD shuffles the training data between epochs, so random_state is fixed to
# make the fitted model and the scores below reproducible.
model = SGDClassifier(
    alpha=0.1,
    fit_intercept=True,
    loss='modified_huber',
    n_jobs=-1,
    penalty='l2',
    random_state=42,
)
model.fit(X_train, y_train)

# Mean accuracy on the held-out set first, then on the training set.
print(model.score(X_test, y_test))
print(model.score(X_train, y_train))

0.6506703165207078
0.64956212225102
