In [243]:
from sklearn.base import BaseEstimator, ClassifierMixin, clone, is_classifier
from sklearn.base import MultiOutputMixin
from sklearn.base import MetaEstimatorMixin, is_regressor
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler

from sklearn.utils.deprecation import deprecated
from sklearn.utils._tags import _safe_tags
from sklearn.utils.validation import _num_samples
from sklearn.utils.validation import check_is_fitted
from sklearn.utils.multiclass import (
    _check_partial_fit_first_call,
    type_of_target
)
from sklearn.utils.metaestimators import _safe_split, available_if
from sklearn.utils.fixes import delayed
from sklearn.multiclass import (
    _fit_binary,
    _fit_ovo_binary,

    _estimators_has
)

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing, svm
from sklearn.model_selection import train_test_split

class OrdinalClassifier():

    def __init__(self, clf):
        self.clf = clf
        self.clfs = {}

    def fit(self, X, y):
        self.unique_class = np.sort(np.unique(y))
        if self.unique_class.shape[0] > 2:
            for i in range(self.unique_class.shape[0]-1):
                # for each k - 1 ordinal value we fit a binary classification problem
                binary_y = (y > self.unique_class[i]).astype(np.uint8)
                clf = clone(self.clf)
                clf.fit(X, binary_y)
                self.clfs[i] = clf

    def predict_proba(self, X):
        clfs_predict = {k: self.clfs[k].predict_proba(X) for k in self.clfs}
        predicted = []
        for i, y in enumerate(self.unique_class):
            if i == 0:
                # V1 = 1 - Pr(y > V1)
                predicted.append(1 - clfs_predict[i][:,1])
            elif i in clfs_predict:
                # Vi = Pr(y > Vi-1) - Pr(y > Vi)
                 predicted.append(clfs_predict[i-1][:,1] - clfs_predict[i][:,1])
            else:
                # Vk = Pr(y > Vk-1)
                predicted.append(clfs_predict[i-1][:,1])
        return np.vstack(predicted).T

    def predict(self, X):
        return np.argmax(self.predict_proba(X), axis=1)

    def score(self, X, y, sample_weight=None):
        _, indexed_y = np.unique(y, return_inverse=True)
        return accuracy_score(indexed_y, self.predict(X), sample_weight=sample_weight)
    

def get_sum_error(y, y_pred):
    y_tst = pd.DataFrame(data=y, index=None)
    y_pred = pd.DataFrame(data=y_pred)

    res = y_pred.join(y_tst.reset_index())
    res1 = res.drop(columns='index')
    res.head()
    res1['diff'] = res1[0] - res1['binary_quality']
    total = len(y)
    difference = res1['diff'].sum()
    correct = res1.query("diff == 0").count()
    pct_correct = (correct/total)*100
    return res1


def featurize(df):
  # fill nulls with mean
  for col in df.columns:
    if df[col].isnull().sum() > 0:
      df[col] = df[col].fillna(df[col].mean())
  
  return df

In [21]:
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
from sklearn.linear_model import LogisticRegression

In [264]:
df_red = pd.read_csv('winequality-red.csv', delimiter=';')
#df_red['type'] = 0
df_white = pd.read_csv('winequality-white.csv', delimiter=';')
#df_white['type'] = 1
df_red['bin_quality'] = [1 if 7 >= x > 5 else 2 if x > 7 else 0 for x in df_red.quality]
df_red['binary_quality'] = [1 if x >= 8 else 0 for x in df_red.quality]
df_white['bin_quality'] = [1 if 7 >= x > 5 else 2 if x > 7 else 0 for x in df_white.quality]
df_white['binary_quality'] = [1 if x >= 7 else 0 for x in df_white.quality]
df_both = pd.concat([df_red, df_white])
X_red_raw = df_red.drop(columns=["quality","binary_quality", "bin_quality"])
X_white_raw = df_white.drop(columns=["quality", "binary_quality", "bin_quality"])
X_both_raw = df_both.drop(columns="quality")
y_red = df_red["binary_quality"]
y_white = df_white["binary_quality"]
y_both = df_both['quality']





In [143]:
df_red.bin_quality.unique()

array([0, 1, 2], dtype=int64)

In [239]:
X_red = featurize(X_red_raw)
X_white = featurize(X_white_raw)
X_both = featurize(X_both_raw)

X_red = X_red.drop(columns=["total sulfur dioxide"]) # , "pH", "sulphates", "residual sugar"





In [240]:
X_train, X_test, y_train, y_test = train_test_split(X_red, y_red, test_size = 0.25)
mm_sc = MinMaxScaler(feature_range = (0, 1))
std_sc = StandardScaler()
X_train_sc = pd.DataFrame(mm_sc.fit_transform(X_train),columns = X_train.columns)
X_test_sc = pd.DataFrame(mm_sc.fit_transform(X_test),columns = X_test.columns)

In [233]:
# df_scaled.head()
from lightgbm.sklearn import LGBMClassifier

xgb_params = {'colsample_bytree': 0.7,
 'learning_rate': 0.03,
 'max_depth': 6,
 'min_child_weight': 11,
 'missing': -999,
 'n_estimators': 1000,
 'nthread': 4,
 'seed': 1337,
 'silent': 1,
 'subsample': 0.8}

xgb_clf = xgb.XGBClassifier(
      **xgb_params,
     objective="binary:logistic"
)

from sklearn.svm import SVC

svc=SVC(gamma='auto',probability=True)
svc_bal = SVC(kernel='linear', class_weight="balanced", probability=True)
svc_imb = SVC(kernel='linear', class_weight=None, probability=True)

dtc = DecisionTreeClassifier()
reg = LogisticRegression()

lgb = LGBMClassifier()

clf = OrdinalClassifier(xgb_clf)

y_train_rev = (y_train-3)*-1
y_test_rev = (y_test-3)*-1

# test = df_scaled.iloc[[51]].drop(columns="quality")
clf.fit(X_train_sc, y_train_rev)
"""
max depth
3 -> 0.51
5 -> 0.565
20 -> 0.6225
200 -> 0.6025
100 -> 0.59
40 -> 0.6175

white, DTC, 20 -> 0.599
white, xgb -> -.646
red, xgb -> 0.66
red, xgb, binary:hinge -> 0.70
white, xgb, binary:hinge -> 0.62
white, svc, sc -> 0.6175

"""
#clf.score(X_test, y_test)

score = clf.score(X_test_sc, y_test_rev)

"""
Based off the sum diff, serapating to red and white causes a huge increase in accuracy

binary:hinge, xgb params with max depth 20, 509, scaled features

0.66
5
-622

create 3 different bins
0.75
243
-25

xgb, mm scaled, reversed, binary:hinge. raw-total sulph
score: 0.7075
correct: 93
pct correct: 23.25
total diff: -328
2/3 high quality

drop more columns and fillna

"""

# print total diff of predictions
res = get_sum_error(y_test_rev, clf.predict(X_test_sc))
res.to_csv('sample_results.csv')
df = res.query("diff == 0")
df_high_quality = res.query("bin_quality == 1")
print("score: " + str(score))
print("correct: " + str(df.shape[0]))
print("pct correct: " + str((df.shape[0]/res.shape[0])*100))
print("total diff: " + str(res['diff'].sum()))
df_high_quality.head(20)


Parameters: { "silent" } are not used.

Parameters: { "silent" } are not used.

score: 0.7225
correct: 86
pct correct: 21.5
total diff: -339


Unnamed: 0,0,bin_quality,diff
146,1,1,0
169,1,1,0
185,1,1,0
296,1,1,0
348,1,1,0


In [49]:
from sklearn.metrics import r2_score

# score = r2_score(y_test, clf.predict(X_test_sc))

In [241]:
from sklearn import preprocessing
#from sklearn.cross_validation import train_test_split
from sklearn.model_selection import StratifiedKFold
#from sklearn.cross_validation import *
from sklearn.model_selection import GridSearchCV

xgb_model = xgb.XGBClassifier()

#brute force scan for all parameters, here are the tricks
#usually max_depth is 6,7,8
#learning rate is around 0.05, but small changes may make big diff
#tuning min_child_weight subsample colsample_bytree can have 
#much fun of fighting against overfit 
#n_estimators is how many round of boosting
#finally, ensemble xgboost with multiple seeds may reduce variance
parameters = {'nthread':[4], #when use hyperthread, xgboost may become slower
              'objective':['binary:logistic'],
              'learning_rate': [0.05, 0.04, 0.03, 0.06, 0.07], #so called `eta` value
              'max_depth': [6,10,20,40,100],
              'min_child_weight': [11],
              'silent': [1],
              'subsample': [0.8],
              'colsample_bytree': [0.7],
              'n_estimators': [5,10,20,40,100,500,1000], #number of trees, change it to 1000 for better results
              'missing':[-999],
              'seed': [1337]}


clf = GridSearchCV(xgb_model, parameters, n_jobs=5, 
                   cv=2, 
                   scoring='roc_auc',
                   verbose=2, refit=True)

clf.fit(X_train, y_train)

#trust your CV!
best_params = clf.best_params_
score = clf.score(X_train, y_train)
print('Raw AUC score:', score)

#test_probs = clf.predict_proba(X_test)[:,1]

# sample = pd.read_csv('../input/sample_submission.csv')
# sample.QuoteConversion_Flag = test_probs
# sample.to_csv("xgboost_best_parameter_submission.csv", index=False)

Fitting 2 folds for each of 175 candidates, totalling 350 fits
Parameters: { "silent" } are not used.

Raw AUC score: 0.9302114411726554


In [235]:
best_params

{'colsample_bytree': 0.7,
 'learning_rate': 0.05,
 'max_depth': 6,
 'min_child_weight': 11,
 'missing': -999,
 'n_estimators': 5,
 'nthread': 4,
 'objective': 'multi:softprob',
 'seed': 1337,
 'silent': 1,
 'subsample': 0.8}

In [294]:
best_params_hardcode = {'colsample_bytree': 0.7,
 'learning_rate': 0.03,
 'max_depth': 6,
 'min_child_weight': 11,
 'missing': -999,
 'n_estimators': 1000,
 'nthread': 4,
 'objective': 'binary:logistic',
 'seed': 1337,
 'silent': 1,
 'subsample': 0.8,
 'scale_pos_weight': 1}
X_features = X_red_raw
y = y_red
X = StandardScaler().fit_transform(X_features)
X_train, X_test, y_train, y_test = train_test_split(X_features, y, test_size=.11, random_state=0)
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE

#X_train, y_train = make_classification(n_samples=500, n_features=5, n_informative=3)
X_res, y_res = SMOTE().fit_resample(X_train, y_train)
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=.25, random_state=0)

xgb_final = xgb.XGBClassifier()

xgb_final.fit(X_train, y_train)

y_pred = xgb_final.predict(X_test)

from sklearn.metrics import classification_report

res = get_sum_error(y_test, y_pred)
res.to_csv('sample_results.csv')
df = res.query("diff == 0")
df_high_quality = res.query("binary_quality == 1")
print("score: " + str(score))
print("correct: " + str(df.shape[0]))
print("pct correct: " + str((df.shape[0]/res.shape[0])*100))
print("total diff: " + str(res['diff'].sum()))
print(classification_report(y_test, y_pred))
df_high_quality.head(20)

# """
# 75% correct on binary logistic but couldnt pick a 2
# """


score: 0.9302114411726554
correct: 698
pct correct: 99.14772727272727
total diff: 6
              precision    recall  f1-score   support

           0       1.00      0.98      0.99       362
           1       0.98      1.00      0.99       342

    accuracy                           0.99       704
   macro avg       0.99      0.99      0.99       704
weighted avg       0.99      0.99      0.99       704



Unnamed: 0,0,binary_quality,diff
3,1,1,0
6,1,1,0
7,1,1,0
8,1,1,0
11,1,1,0
12,1,1,0
15,1,1,0
16,1,1,0
18,1,1,0
19,1,1,0


In [288]:
y_res.to_frame().query("binary_quality == 1")

Unnamed: 0,binary_quality
202,1
269,1
277,1
288,1
402,1
...,...
2367,1
2368,1
2369,1
2370,1


1137