In [59]:
import pandas as pd
import numpy as np

## encode functions
##### main custom functions
def one_hot_encode(df,columns_for_one_hot):
    '''One hot encode the columns for feeding into the model
    :df: Dataframe of the data
    :columns_for_one_hot: array name of columns to input
    :return: Dataframe of the one hot encoded data with prefix = columns_for_one_hot
    '''
#     for col_index in columns_for_one_hot:
#         df[col_index]=df[col_index].replace(['0',0], 'No_{}'.format(col_index))
    ## one hot encode
    for column_for_one_hot in columns_for_one_hot:
        #  Get one hot encoding of columns B
        one_hot = pd.get_dummies(df[column_for_one_hot],prefix=column_for_one_hot)
        # Drop column B as it is now encoded
        df = df.drop(column_for_one_hot,axis = 1)
        # Join the encoded df
        df = df.join(one_hot,rsuffix=column_for_one_hot)
    return df
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import median_absolute_error

def evaluate(y_pred,Y_val):
    f1_sep=round(f1_score(Y_val, y_pred,average='micro'),6)
    print('f1_score is:', str(f1_sep))
    acc_sep=round(accuracy_score(Y_val, y_pred),6)
    print('accuracy_score is:', str(acc_sep))

In [28]:
## read and encode data
dfg = pd.read_csv(
    './vintage_wine_description_processed.csv',index_col=0).fillna(0)
to_encode=['country','province','region_1','region_2','variety','area']
to_drop=['designation','description','title','Location_description','location','winery','price','points']
dfg=dfg.drop(to_drop,axis=1)
dfg=one_hot_encode(dfg,to_encode)
dfg.head()

Unnamed: 0,year,temp1,temp10,temp11,temp12,temp13,temp14,temp15,temp16,temp17,...,area_Eastern France,area_Jura,area_Languedoc,area_Languedoc-Roussillon,area_Loire,area_Lyonnais,area_Provence,area_Rhône,area_Savoy,area_South West France
0,2013,11.39,20.302,16.792,12.432,11.375,9.663,13.65,15.543,17.691,...,0,0,0,0,0,0,1,0,0,0
1,2011,4.69,12.437,7.362,4.967,5.142,6.502,8.337,14.164,16.951,...,0,0,0,0,1,0,0,0,0,0
2,2013,4.35,11.176,7.606,3.811,2.58,4.904,7.617,9.347,12.848,...,0,0,0,0,0,0,0,0,0,0
3,2013,-1.763,10.034,3.674,1.283,-2.77,-4.096,-0.363,6.836,16.905,...,0,0,0,0,0,0,0,0,0,0
4,2012,4.761,10.858,5.631,2.999,4.35,4.755,5.16,9.238,11.43,...,0,0,0,0,0,0,0,0,0,0


In [29]:
## convert to categorical data
for column_name in list(dfg.filter(regex='^(?:taste)').columns):
    dfg[column_name]=pd.Categorical(dfg[column_name].apply(lambda x: 1 if x>0 else 0))

In [30]:
dfg.filter(regex='^(?:taste)').dtypes

taste_aroma      category
taste_fruit      category
taste_herb       category
taste_palat      category
taste_offer      category
                   ...   
taste_make       category
taste_complex    category
taste_power      category
taste_delici     category
taste_also       category
Length: 98, dtype: object

In [34]:
### split xy and filter out columns that are not needed
y_col=list(dfg.filter(regex='^(?:taste)').columns)
# print(y_col)
# regex_out_taste='^(?!taste)'
X = dfg.drop(y_col, axis=1) # Training & Validation data
Y = dfg[y_col]              # Response / Target Variable
### normalize data
from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler()
X=scaler.fit_transform(X)
# Y=scaler.fit_transform(Y)
# print(Y)

print(X.shape, Y.shape)

# Split training set so that we validate on 20% of the data
# Note that our algorithms will never have seen the validation 

np.random.seed(5875) # set random seed for reproducibility

from sklearn.model_selection import train_test_split

X_train, X_val, Y_train, Y_val = \
                train_test_split(X, Y, test_size=0.2)

print('Training Samples:', X_train.shape, Y_train.shape)
print('Validation Samples:', X_val.shape, Y_val.shape)


['taste_aroma', 'taste_fruit', 'taste_herb', 'taste_palat', 'taste_offer', 'taste_appl', 'taste_citru', 'taste_acid', 'taste_ripe', 'taste_fruiti', 'taste_smooth', 'taste_firm', 'taste_tannin', 'taste_juici', 'taste_red', 'taste_berri', 'taste_tart', 'taste_flavor', 'taste_lime', 'taste_green', 'taste_crisp', 'taste_lemon', 'taste_orang', 'taste_bit', 'taste_finish', 'taste_bottl', 'taste_come', 'taste_tannic', 'taste_earthi', 'taste_herbal', 'taste_good', 'taste_blackberri', 'taste_raspberri', 'taste_show', 'taste_mouth', 'taste_full', 'taste_bodi', 'taste_spici', 'taste_dark', 'taste_plum', 'taste_fresh', 'taste_bright', 'taste_white', 'taste_pepper', 'taste_savori', 'taste_balanc', 'taste_soft', 'taste_spice', 'taste_textur', 'taste_peach', 'taste_eleg', 'taste_pear', 'taste_touch', 'taste_cabernet', 'taste_merlot', 'taste_chocol', 'taste_miner', 'taste_charact', 'taste_layer', 'taste_rich', 'taste_black', 'taste_cherri', 'taste_oak', 'taste_vanilla', 'taste_style', 'taste_bake', 't

In [None]:
from lightgbm.sklearn import LGBMClassifier
from sklearn.multioutput import MultiOutputClassifier
lgbm = LGBMClassifier()           # create
lgbm_regr= MultiOutputClassifier(lgbm)
lgbm_regr.fit(X_train, Y_train.astype('int'))            # train
y_pred=lgbm_regr.predict(X_val)
evaluate(y_pred,Y_val.astype('int'))

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
logreg = LogisticRegression(solver='newton-cg')           # create
regr= MultiOutputClassifier(logreg)
regr.fit(X_train, Y_train.astype('int'))            # train
y_pred=regr.predict(X_val)
evaluate(y_pred,Y_val.astype('int'))


In [48]:
# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB # Gaussian Naive Bays
from sklearn.linear_model import Perceptron
from sklearn.tree import DecisionTreeClassifier

import xgboost as xgb

In [45]:
Y_train.to_numpy().shape

(100276, 98)

In [52]:
y_pred=regr.predict(X_val)
evaluate(y_pred,Y_val.astype('int'))

ValueError: Target is multilabel-indicator but average='binary'. Please choose another average setting, one of [None, 'micro', 'macro', 'weighted', 'samples'].

In [55]:
y_pred

array([[0, 1, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [60]:
evaluate(y_pred,Y_val.astype('int'))


f1_score is: 0.293169
accuracy_score is: 0.000199
