# 1. Initializations

## 1.1 General imports

In [None]:
### data management
import pandas as pd
import numpy as np

### classification
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import confusion_matrix 

## 1.2 General dataframe functions

In [None]:
import smartcheck.dataframe_common as dfc

## 1.3 General classification functions

In [None]:
# None

# 2. Loading and Data Quality

## 2.1 Loading of data sets and general exploration

### 2.1.1 WINES

In [None]:
df_wines_raw = dfc.load_dataset_from_config('wines_data_local', sep=',')

# Fail fast: every later cell reads df_wines, so a failed load must stop the
# run here instead of surfacing further down as a NameError (fresh kernel) or
# as stale data left over from a previous run of this cell.
# isinstance() alone is enough: None is not a DataFrame.
if not isinstance(df_wines_raw, pd.DataFrame):
    raise RuntimeError(
        "Loading 'wines_data_local' did not return a DataFrame "
        f"(got {type(df_wines_raw).__name__})"
    )

# Quick look at the raw data, then the project's generic quality report.
display(df_wines_raw.head())
dfc.log_general_info(df_wines_raw)
# NOTE(review): the two returned counts presumably differ only when duplicated
# rows exist — confirm against smartcheck.dataframe_common.
nb_first, nb_total = dfc.detect_and_log_duplicates_and_missing(df_wines_raw)
if nb_first != nb_total:
    print(dfc.duplicates_index_map(df_wines_raw))

# Working frame used by the rest of the notebook.
df_wines = dfc.normalize_column_names(df_wines_raw)
display(df_wines.head())

In [None]:
# Summary statistics and pairwise correlations, restricted to the numeric columns.
wines_numeric = df_wines.select_dtypes(include=np.number)
df_wines_desc = wines_numeric.describe()
df_wines_cr = wines_numeric.corr()
display(df_wines_desc)
display(df_wines_cr)

## 2.2 Data quality refinement

### 2.2.1 WINES

In [None]:
# Keep an untouched snapshot of the current frame, then derive the working
# frame from that snapshot with duplicated rows removed.
df_wines_orig = df_wines.copy()
df_wines = df_wines_orig.drop_duplicates()

# 2. Data Classification

## 2.1 General Analysis

In [None]:

# First-pass, "human eye" estimate of how well a single feature separates the
# classes: bin the feature into its quartiles and show, for each bin, the share
# of every class.
def _cut_by_quartiles(values, stats):
    """Bin ``values`` into four quartile categories.

    Parameters
    ----------
    values : pandas.Series
        Feature column to categorise.
    stats : pandas.Series
        The matching column of a ``describe()`` table; its 'min', '25%',
        '50%', '75%' and 'max' entries are used as bin edges.

    Returns
    -------
    pandas.Series
        Categorical series labelled 'low', 'medium-', 'medium+', 'high'.
    """
    edges = [stats[key] for key in ('min', '25%', '50%', '75%', 'max')]
    return pd.cut(
        x=values,
        bins=edges,
        labels=['low', 'medium-', 'medium+', 'high'],
        # pd.cut bins are right-closed and exclude the lowest edge by default:
        # without this flag the row(s) holding the column minimum fall outside
        # the first bin, become NaN and silently vanish from the crosstab.
        include_lowest=True,
    )


malic_acid = _cut_by_quartiles(df_wines.malic_acid, df_wines_desc.malic_acid)
display(pd.crosstab(df_wines['class'], malic_acid, normalize='columns'))
flavanoids = _cut_by_quartiles(df_wines.flavanoids, df_wines_desc.flavanoids)
display(pd.crosstab(df_wines['class'], flavanoids, normalize='columns'))


## 2.2 Logistic Regression

In [None]:
# Split the frame into the variable to predict (target) and the explanatory
# variables (features, i.e. every column except 'class').
target = df_wines['class']
data = df_wines.drop(columns='class')

In [None]:
# Split into training data (80% of the rows) and test data (the remainder);
# the fixed random_state keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    train_size=0.8,
    random_state=66,
)

In [None]:
# Preprocess the explanatory variables: one-hot encode every feature column.
# The encoder learns its categories from the training split only; the test
# split is merely transformed with the already-fitted encoder, and values it
# never saw during fit are ignored (handle_unknown='ignore').
# NOTE(review): one-hot encoding treats each distinct value as its own
# category — if these features are continuous, scaling may be the better
# fit; confirm the intent.
encoder = OneHotEncoder(handle_unknown='ignore').fit(X_train)
X_train_enc = encoder.transform(X_train)
X_test_enc = encoder.transform(X_test)

In [None]:
# Fit a logistic-regression classifier on the encoded training features.
# The target is handed over as a flat 1-D NumPy array.
y_train_flat = y_train.to_numpy().ravel()
logReg = linear_model.LogisticRegression(C=1.0)
logReg.fit(X_train_enc, y_train_flat)

In [None]:
# Apply the fitted model to the encoded test features to get predicted classes
y_pred = logReg.predict(X_test_enc)

In [None]:
# Evaluate the model on the held-out test set.
cm = confusion_matrix(y_test, y_pred)
print(cm)
# Labelled view of the same counts (rows = real class, columns = predicted
# class); this is the table to display, the raw matrix is already printed above.
df_cm = pd.crosstab(y_test.to_numpy().ravel(), y_pred, rownames=['real'], colnames=['predicted'])
display(df_cm)
# Manual accuracy = correctly classified samples (the diagonal of the
# confusion matrix) divided by all samples. trace() sums the whole diagonal,
# so the formula holds for any number of classes instead of exactly three.
score = cm.trace() / cm.sum()
print("Score manuel:", score)
print("Score classifier:", logReg.score(X_test_enc, y_test))