In [257]:
import pandas as pd
import seaborn as sns
from matplotlib import rcParams

# Default canvas size for every figure drawn later in the notebook.
rcParams.update({'figure.figsize': (20, 20)})

# The file is semicolon-delimited and is read with the cp1251 codec.
df = pd.read_csv(
    './winequality/winequality-red.csv',
    sep=';',
    encoding='cp1251',
)
print('Shape - {}\nColumns - {}'.format(df.shape, df.columns))

# Target is the 'quality' column; every other column is a feature.
y = df['quality']
X = df.drop('quality', axis=1)

# Last expression of the cell: summary statistics rendered as the cell output.
df.describe()

Shape - (1599, 12)
Columns - Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality'],
      dtype='object')


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


In [258]:
from scipy.stats import shapiro, boxcox

# Columns that get a Box-Cox power transform before modelling.
non_normal_columns = ['residual sugar', 'chlorides']

for feature in non_normal_columns:
    # With no explicit lambda, boxcox returns (transformed_values, fitted_lambda).
    transformed, _fitted_lambda = boxcox(df[feature])
    # Assign the raw array so values are written positionally. Wrapping it in a
    # fresh pd.Series (0..n-1 index) would align on index labels and silently
    # produce NaN whenever df/X do not carry a default RangeIndex.
    X[feature] = transformed

# NOTE(review): the Box-Cox lambda is fitted on the full data set, i.e. before
# any cross-validation split — confirm this is acceptable for the reported scores.

# Optional normality check, left disabled:
# for feature in X.columns:
#     print("Shapiro test for '{}' = {}".format(feature, shapiro(X[feature])))

In [264]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC, LinearSVC


# Model under evaluation: min-max scaling followed by a support-vector classifier.
# Steps that were left disabled in this cell:
#   PolynomialFeatures(degree=2)
#   LogisticRegression(n_jobs=-1, multi_class='ovr', solver='lbfgs', C=1.21)  # Choose C from cross validation
#   LinearRegression(n_jobs=-1)
#   RandomForestClassifier(n_estimators=800, max_depth=17, random_state=152, n_jobs=-1)
scaler_step = MinMaxScaler()
classifier_step = SVC(gamma='scale', C=1.2)
pipeline = make_pipeline(scaler_step, classifier_step)

# Work on copies so the engineered columns never leak back into X / y.
y_n = y.copy()
X_n = X.assign(
    # ratio of fixed to volatile acidity
    n1=X['fixed acidity'] / X['volatile acidity'],
    # ratio of total to free sulfur dioxide
    dioxide=X['total sulfur dioxide'] / X['free sulfur dioxide'],
)
# Feature ideas that were left disabled in this cell:
#   X_n['is strong alcohol'] = np.where(df['alcohol'] > 10, 1, 0)
#   X_n = X_n.drop(columns=['pH'])

# 5-fold cross-validated micro-averaged F1 on the extended feature set.
scores = cross_val_score(pipeline, X_n, y_n, cv=5, scoring='f1_micro')
print("Mean f1 score: {} +-{}".format(scores.mean(), scores.std()))

Mean f1 score: 0.5842270548234773 +-0.033801164871054105


In [265]:
import numpy as np

# Accumulator for the selected columns; starts with the rows of X_n and no columns.
greedy_X = pd.DataFrame(index=X_n.index.values)


def greedy_select(X_base, prev_score):
    """Forward-select columns of ``X_base`` into the module-level ``greedy_X``.

    Every round appends each remaining candidate column, one at a time, to the
    columns already held in ``greedy_X`` and scores the result with 5-fold
    cross-validation of the module-level ``pipeline`` against ``y``
    (``scoring='f1_micro'``). The candidate with the highest mean score is
    accepted only if that score is strictly greater than the best score so
    far; otherwise the search stops. Each accepted feature is printed together
    with its score.

    Parameters
    ----------
    X_base : pandas.DataFrame
        Candidate feature columns, one row per row of ``greedy_X``. The
        caller's frame is not modified.
    prev_score : float
        Score a candidate has to exceed to be accepted in the first round.

    Returns
    -------
    list
        Names of the accepted features in the order they were added.
        ``greedy_X`` is extended in place with the same columns.

    Notes
    -----
    NOTE(review): the features are chosen on the same folds that are later
    used to report the score, so that score is likely optimistic — confirm
    with a held-out split.
    """
    selected = []
    remaining = X_base
    best_score = prev_score

    # Iterative form of the search; stops when candidates run out or none improves.
    while len(remaining.columns) > 0:
        # Size the score buffer from the candidates actually being evaluated,
        # not from an unrelated global frame.
        feature_scores = np.zeros(len(remaining.columns))

        for index, feature in enumerate(remaining.columns):
            tmp_df = greedy_X.copy()
            tmp_df.insert(len(tmp_df.columns), feature, remaining[feature].values)

            scores_for_feature = cross_val_score(pipeline, tmp_df, y, cv=5, scoring='f1_micro')
            feature_scores[index] = scores_for_feature.mean()

        # argmax returns the first position of the maximum, matching the
        # previous tie-break, and does not raise when a score is NaN.
        m_index = int(np.argmax(feature_scores))
        max_score = feature_scores[m_index]

        # Negated '>' so that a NaN score also ends the search.
        if not max_score > best_score:
            break

        feature_to_add = remaining.columns[m_index]
        print(feature_to_add, max_score)
        # Store positionally, exactly as the candidate was scored above.
        greedy_X.insert(len(greedy_X.columns), feature_to_add, remaining[feature_to_add].values)
        remaining = remaining.drop(columns=[feature_to_add])
        selected.append(feature_to_add)
        best_score = max_score

    return selected


selected_features = greedy_select(X_n, 0.0)

alcohol 0.5516961471879889
volatile acidity 0.5685976099106632
sulphates 0.5798360518107492
dioxide 0.5948935327703246
total sulfur dioxide 0.596086662646109
citric acid 0.6048251996783097


In [102]:
# Re-score the pipeline on the greedily selected feature subset (5-fold, micro F1).
# NOTE(review): this cell's execution count (102) predates the selection cell (265)
# and the recorded score differs from the last score printed by the greedy search,
# so the output shown here is presumably stale — re-run to confirm.
scores = cross_val_score(estimator=pipeline, X=greedy_X, y=y, scoring='f1_micro', cv=5)
mean_f1, std_f1 = scores.mean(), scores.std()
print("Mean f1 score: {} +-{}".format(mean_f1, std_f1))


Mean f1 score: 0.5835765224137649 +-0.03488869343344286
