In [24]:
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])?  y


In [39]:
import pandas as pd
import numpy as np
import pyodbc as py

import warnings

from datetime import date, timedelta, datetime
import time

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn import preprocessing
from sklearn.utils import class_weight
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.pipeline import Pipeline

from sklearn import neighbors
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn import metrics
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold

from imblearn.over_sampling import SMOTE

import keras
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Activation,Dropout
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils

import importlib
import functions
importlib.reload(functions)

pd.set_option('display.max_columns', None)

In [40]:
# Silence every warning category for the rest of the session.
# NOTE(review): this is a blanket filter; the pandas copy-of-a-slice and sklearn metric
# messages visible in the saved outputs further down are hidden as well once it is active.
warnings.filterwarnings('ignore')

# Data Loading

## Option 1 (preferred): load the .csv file written by the Data_Cleaning_Preparation code

In [27]:
# Load the cleaned and prepared data set from disk.
# NOTE(review): absolute, machine-specific path; adjust before running on another machine.
df = pd.read_csv('/Users/thoma/Documents/seminar_data/cleaned_prepared_data.csv', low_memory = True)

In [28]:
# Convert every date-like column to pandas datetimes (same columns, same order).
for date_column in ('orderDate', 'cancellationDate', 'promisedDeliveryDate', 'shipmentDate',
                    'dateTimeFirstDeliveryMoment', 'startDateCase', 'returnDateTime',
                    'registrationDateSeller'):
    df[date_column] = pd.to_datetime(df[date_column])

In [48]:
# Variable for which historic performance features are built; it is also the prefix
# of the HISTORICX column names defined in the column-list cell.
historic_variable = 'transporterCode'

In [51]:
# Fixed columns
DATE = ['orderDate']
BASIC = [
    'totalPrice', 'quantityOrdered', 'fulfilmentByBol', 'countryCodeNL',
    'countryOriginNL', 'countryOriginBE', 'countryOriginDE', 'productTitleLength',
    'promisedDeliveryDays', 'partnerSellingDays', 'orderCorona',
]
# One dummy per weekday / month, named 'order<Day>' and 'order<Month>'
WEEK = ['order' + day for day in ('Monday', 'Tuesday', 'Wednesday', 'Thursday',
                                  'Friday', 'Saturday', 'Sunday')]
MONTH = ['order' + month for month in ('January', 'February', 'March', 'April', 'May', 'June',
                                       'July', 'August', 'September', 'October', 'November',
                                       'December')]
YEAR = ['orderYear2020']
# One dummy per clustered product group, named 'group<FirstWord>'
GROUP = ['group' + name for name in ('Health', 'Home', 'Sports', 'Computer', 'Pets', 'Toys',
                                     'Books', 'Baby', 'Music', 'Food', 'Office', 'Fashion',
                                     'Other', 'Car')]

# Dynamic columns
TRANSPORTERX = ['transporter' + code + '/X' for code in ('POSTNL', 'DHL', 'DPD', 'BRIEF', 'OTHER')]
KNOWNX = [event + 'KnownX' for event in ('case', 'return', 'cancellation',
                                         'onTimeDelivery', 'lateDelivery')]
PRODUCTX = ['product' + stat + 'X' for stat in ('OrderCount', 'TotalCount',
                                                'TotalReturned', 'ReturnFraction')]
SELLERX = ['sellerDailyOrdersX']
# Column names are prefixed with the variable chosen in `historic_variable`
HISTORICX = [historic_variable + 'Historic' + label + 'X' for label in ('Happy', 'Unhappy', 'Unknown')]

# Determinants
DETERMINANT = ['noReturn', 'noCase', 'noCancellation', 'onTimeDelivery']

# Classifications
CLASSIFICATION = [
    'generalMatchClassification',
    'detailedMatchClassification',
    'binaryMatchClassification',
    'determinantClassification',
]

## Option 2: via direct SQL connection

In [5]:
# ODBC connection string for the SQL Server instance, using Windows integrated authentication.
# NOTE(review): the server name is machine-specific.
connection_string = (    
    r'Driver={SQL Server};'
    r'Server=LAPTOP-LD74USH0\SQLEXPRESS;'
    r'Integrated Security=SSPI;'
    r'Trusted_Connection=yes;'
)
# Show all columns when displaying a DataFrame (the import cell sets the same option).
pd.set_option('display.max_columns', None)

In [6]:
def sql2df(query, params=None, parse_dates=None, dsn='SQLEXPRESS'):
    """
    Run a SQL query over the module-level ``connection_string`` and return the
    result as a DataFrame.

    Parameters
    ----------
    query : str
        SQL statement to execute.
    params : list, tuple or dict, optional
        Query parameters handed to ``pd.read_sql``. Defaults to ``None``
        instead of a shared mutable ``[]``.
    parse_dates : optional
        Passed through unchanged to ``pd.read_sql``.
    dsn : str
        Not used by the function; kept so existing calls that pass it still work.

    Returns
    -------
    pandas.DataFrame
        The rows returned by ``query``.
    """
    conn = py.connect(connection_string, readonly=True)
    try:
        return pd.read_sql(query, conn, params=params, parse_dates=parse_dates)
    finally:
        # pyodbc's ``with conn:`` only commits on exit and leaves the connection
        # open, so close it explicitly to avoid leaking one connection per query.
        conn.close()

In [7]:
# First work with a random sample of 500,000 rows (to reduce computation time) - 45secs

df = sql2df('''
SELECT TOP 500000 * FROM Seminar.dbo.cleaned_bol_data_full
ORDER BY newid();
''')


In [321]:
# Load the full table instead of the random sample - 9.5 minutes
# NOTE: this rebinds `df`, replacing whatever the previous load cell produced.

df = sql2df('''
SELECT * FROM Seminar.dbo.cleaned_bol_data_full;
''')

In [8]:
# Change type of columns: every flag column listed here is cast to bool
dtype = dict.fromkeys(
    ['calculationDefinitive', 'noCancellation', 'noCase', 'hasOneCase', 'hasMoreCases',
     'noReturn', 'orderWeekend', 'orderCorona', 'countryCodeNL', 'fulfilmentByBol',
     'countryOriginNL', 'countryOriginBE', 'countryOriginDE',
     'orderMonday', 'orderTuesday', 'orderWednesday', 'orderThursday', 'orderFriday',
     'orderSaturday', 'orderSunday',
     'orderJanuary', 'orderFebruary', 'orderMarch', 'orderApril', 'orderMay', 'orderJune',
     'orderJuly', 'orderAugust', 'orderSeptember', 'orderOctober', 'orderNovember',
     'orderDecember'],
    bool,
)

df = df.astype(dtype)

# Transform dates to date-type; values that cannot be parsed become NaT (errors='coerce')
for date_column in ('orderDate', 'cancellationDate', 'promisedDeliveryDate', 'shipmentDate',
                    'dateTimeFirstDeliveryMoment', 'startDateCase', 'returnDateTime',
                    'registrationDateSeller'):
    df[date_column] = pd.to_datetime(df[date_column], errors='coerce')

df.dtypes

orderDate             datetime64[ns]
productId                     object
sellerId                      object
totalPrice                   float64
quantityOrdered                int64
                           ...      
orderSeptember                  bool
orderOctober                    bool
orderNovember                   bool
orderDecember                   bool
productTitleLength             int64
Length: 78, dtype: object

#### Add variables

In [9]:
# Binary classification variable: 'UNKNOWN' stays 'UNKNOWN', every other label becomes 'KNOWN'.
# A vectorised comparison replaces the per-row lambda.
df['binaryMatchClassification'] = (
    df['generalMatchClassification'].eq('UNKNOWN').map({True: 'UNKNOWN', False: 'KNOWN'})
)

# Dummy for year = 2020 (the comparison itself already yields booleans)
df['orderYear2020'] = df['orderYear'] == 2020

#### Transporter Groups

In [10]:
def transporterCluster(transporterCode):
    """
    Map a raw transporter code onto a manually defined transporter cluster
    (28 -> 5 categories). Codes that are not listed fall into 'OTHER'.
    """
    clusters = (
        ('POSTNL', ('AH-NL', 'TNT', 'TNT-EXPRESS', 'TNT-EXTRA')),
        ('DHL', ('DHL', 'DHL_DE', 'DHLFORYOU')),
        ('DPD', ('DPD-NL', 'DPD-BE')),
        ('BRIEF', ('BRIEFPOST', 'BPOST_BE', 'BPOST_BRIEF', 'DHL-GLOBAL-MAIL', 'TNT_BRIEF')),
    )
    # Checked in the same order as the original if/elif chain
    for cluster, codes in clusters:
        if transporterCode in codes:
            return cluster
    return 'OTHER'

In [11]:
# Map every raw transporter code to its cluster, then show the number of rows per cluster.
df['transporterCodeGeneral'] = df['transporterCode'].apply(transporterCluster)
df['transporterCodeGeneral'].value_counts()

POSTNL    221508
BRIEF     165035
DHL        45527
DPD        34475
OTHER      33455
Name: transporterCodeGeneral, dtype: int64

#### Product Groups

In [12]:
def productGroupCluster(productGroup):
    """
    Map a raw product group onto a manually defined cluster based on the
    bol.com categories (60 -> 14 groups). Unlisted groups fall into 'Other'.
    """
    clusters = (
        ('Books', ('Dutch Books PG', 'Ebooks and Audiobooks', 'International Books PG')),
        ('Music, Film & Games', ('Games Accessories', 'Games Consoles', 'Games Software Physical',
                                 'Movies', 'Music')),
        ('Computer & Electronics', ('Camera', 'Desktop Monitor and Beamer',
                                    'Ereaders and Accessories', 'Laptop Computers',
                                    'PC Accessories', 'Personal Audio',
                                    'Sound and Vision Accessories', 'Storage and Network',
                                    'Telephone and Tablet Accessories', 'Telephones and Tablets',
                                    'Television')),
        ('Toys & Hobby', ('General Toys', 'Recreational and Outdoor Toys')),
        ('Baby & Kids', ('Baby and Kids Fashion', 'Baby PG')),
        ('Health & Care', ('Daily Care PG', 'Health PG', 'Perfumery PG', 'Personal Care')),
        ('Fashion, Shoes & Accessories', ('Footwear', 'Jewelry and Watches',
                                          'Mens and Womens Fashion', 'Wearables')),
        ('Sports, Outdoor & Travel', ('Bodyfashion and Beachwear', 'Camping and Outdoor', 'Cycling',
                                      'Sporting Equipment', 'Sportswear',
                                      'Travel Bags and Accessories')),
        ('Office & School', ('Educational Dutch', 'Educational International', 'Printing and Ink')),
        ('Food & Beverage', ('Supermarket PG',)),
        ('Home, Cooking & Household', ('Furniture', 'Heating and Air', 'Home Decoration',
                                       'Home Entertainment', 'Household', 'Household Appliances',
                                       'Kitchen', 'Kitchen Machines', 'Lighting',
                                       'Major Domestic Appliances PG', 'Plumbing and Safety')),
        ('Pets, Garden & Jobs', ('Garden', 'Pet PG', 'Textiles', 'Tools and Paint')),
        ('Car & Motor', ('Car and Motorcycle',)),
    )
    # Checked in the same order as the original if/elif chain
    for cluster, members in clusters:
        if productGroup in members:
            return cluster
    return 'Other'

In [13]:
# Map every raw product group to its cluster, then show the number of rows per cluster.
df['productGroupGeneral'] = df['productGroup'].apply(productGroupCluster)
df['productGroupGeneral'].value_counts()

Computer & Electronics          145687
Home, Cooking & Household        83348
Sports, Outdoor & Travel         54598
Toys & Hobby                     52834
Pets, Garden & Jobs              35484
Health & Care                    31315
Food & Beverage                  26802
Books                            19435
Music, Film & Games              17343
Baby & Kids                      11784
Fashion, Shoes & Accessories     11453
Office & School                   5450
Car & Motor                       3149
Other                             1318
Name: productGroupGeneral, dtype: int64

In [14]:
# Create one boolean dummy column per clustered product group present in the data.
# Column name = 'group' + first word of the label with commas removed,
# e.g. 'Music, Film & Games' -> 'groupMusic'.
for group in df['productGroupGeneral'].unique():

    columnName = 'group' + group.split(' ')[0].replace(',','')
    # Vectorised comparison replaces the per-row lambda and yields the same booleans
    df[columnName] = df['productGroupGeneral'] == group

In [15]:
# List all column names and report how many columns the frame has.
print(df.columns)
print('Total: ',len(df.columns),' columns')

Index(['orderDate', 'productId', 'sellerId', 'totalPrice', 'quantityOrdered',
       'countryCode', 'cancellationDate', 'cancellationReasonCode',
       'promisedDeliveryDate', 'shipmentDate', 'transporterCode',
       'transporterName', 'transporterNameOther',
       'dateTimeFirstDeliveryMoment', 'fulfilmentType', 'startDateCase',
       'cntDistinctCaseIds', 'returnDateTime', 'quantityReturned',
       'returnCode', 'productTitle', 'brickName', 'chunkName', 'productGroup',
       'productSubGroup', 'productSubSubGroup', 'registrationDateSeller',
       'countryOriginSeller', 'currentCountryAvailabilitySeller',
       'calculationDefinitive', 'noCancellation', 'onTimeDelivery', 'noCase',
       'hasOneCase', 'hasMoreCases', 'noReturn', 'detailedMatchClassification',
       'generalMatchClassification', 'determinantClassification', 'orderYear',
       'orderMonth', 'orderYearMonth', 'orderWeekday', 'orderWeekend',
       'orderCorona', 'transporterFeature', 'partnerSellingMonths',
   

In [16]:
#Fixed Columns:
# NOTE(review): this cell re-defines BASIC without 'promisedDeliveryDays', 'partnerSellingDays'
# and 'orderCorona', which the Option 1 column list contains. It defines no DATE, YEAR,
# HISTORICX or CLASSIFICATION, although the "Create Sample" and classification cells use them.
BASIC = ['totalPrice','quantityOrdered','fulfilmentByBol','countryCodeNL','countryOriginNL','countryOriginBE',
        'countryOriginDE','productTitleLength']
WEEK = ['orderMonday','orderTuesday','orderWednesday','orderThursday','orderFriday','orderSaturday','orderSunday']
MONTH = ['orderJanuary','orderFebruary','orderMarch','orderApril','orderMay','orderJune',
         'orderJuly','orderAugust','orderSeptember','orderOctober','orderNovember','orderDecember']
GROUP = ['groupHealth','groupHome','groupSports','groupComputer','groupPets','groupToys','groupBooks', 
         'groupBaby', 'groupMusic', 'groupFood','groupOffice','groupFashion','groupOther','groupCar']

#Dynamic Columns:
TRANSPORTERX = ['transporterPOSTNL/X','transporterDHL/X','transporterDPD/X','transporterBRIEF/X','transporterOTHER/X']
KNOWNX = ['caseKnownX','returnKnownX','cancellationKnownX','onTimeDeliveryKnownX','lateDeliveryKnownX']
PRODUCTX = ['productOrderCountX','productTotalCountX','productTotalReturnedX','productReturnFractionX']
SELLERX = ['sellerDailyOrdersX']

#Classifications
# NOTE(review): CLASS is not referenced by any other cell visible in this notebook.
CLASS = ['generalMatchClassification']

# Create Sample

In [52]:
# Feature columns: all fixed and dynamic column groups
X_col = BASIC + WEEK + MONTH + YEAR + GROUP + TRANSPORTERX + KNOWNX + PRODUCTX + SELLERX + HISTORICX
y_col = [CLASSIFICATION[0]]  # Match label prediction
#y_col = [CLASSIFICATION[2]]  # Binary classification

# Reproducible sample without replacement. The size is capped at the number of rows
# available, because df.sample raises a ValueError when n exceeds len(df) with replace=False.
#df_sample = df
SAMPLE_SIZE = 500000
df_sample = df.sample(n = min(SAMPLE_SIZE, len(df)), replace = False, random_state = 0)

In [53]:
# Add columns via functions.addProductColumns (presumably the PRODUCTX features).
# NOTE(review): the meaning of the second argument (0) is defined in functions.py — confirm.
df_sample = functions.addProductColumns(df_sample, 0)

In [54]:
# Add columns via functions.addSellerColumns (presumably the SELLERX features).
# NOTE(review): the meaning of the second argument (0) is defined in functions.py — confirm.
df_sample = functions.addSellerColumns(df_sample, 0)

In [55]:
# Use `historic_variable` rather than a second hard-coded 'transporterCode', so the columns
# added here stay in sync with the HISTORICX names, which are built from the same variable.
# NOTE(review): assumes addHistoricPerformance names its columns after the variable it is given — confirm.
df_sample = functions.addHistoricPerformance(df_sample, historic_variable, 0)

# Function-based Classification

## Classification

In [None]:
# Evaluate one estimator for every DAYS value in range(PREDICT_DAYS), i.e. 0..10.
# REP is handed to classifyLabelsNew as its `n` argument.
PREDICT_DAYS = 11
REP = 3

# Maps each DAYS value to the result returned by classifyLabelsNew
resultDic = {}

# Active estimator: 3-nearest-neighbours; the commented lines are alternative estimators.
#estimator = svm.SVC(random_state=0,class_weight='balanced')
# estimator = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=5),n_estimators=50,random_state=0)
estimator = neighbors.KNeighborsClassifier(n_neighbors = 3)
# estimator = RandomForestClassifier(n_estimators=10,random_state=0,class_weight='balanced')
# estimator = KerasClassifier(build_fn = neuralNetwork,epochs = 10,verbose = 0)
# scaler = preprocessing.MinMaxScaler()

for DAYS in range(PREDICT_DAYS):
    
    # Build features and target for this DAYS value
    X, y = functions.dataX(df_sample, DATE, X_col, y_col, DAYS)

    # This loop requests MinMax scaling for the estimator
    result = functions.classifyLabelsNew(estimator, X, y, n = REP, scale = 'MinMax')

    resultDic[DAYS] = result
    
    # Progress indicator
    print('DAYS: ',DAYS)

# One row per DAYS value
RESULT = pd.DataFrame.from_dict(resultDic, orient='index')

In [57]:
RESULT

Unnamed: 0,accuracy,precision_HAPPY,recall_HAPPY,f1_HAPPY,precision_UNHAPPY,recall_UNHAPPY,f1_UNHAPPY,precision_UNKNOWN,recall_UNKNOWN,f1_UNKNOWN
0,0.820077,0.843694,0.929958,0.884692,0.256725,0.074275,0.113283,0.837935,0.904022,0.869667
1,0.843493,0.857592,0.947459,0.900263,0.560505,0.167934,0.257884,0.84713,0.91085,0.877833
2,0.87736,0.923518,0.929914,0.926657,0.59893,0.464371,0.52173,0.876527,0.939677,0.907
3,0.907403,0.950257,0.949046,0.949604,0.716,0.572355,0.635279,0.888647,0.959893,0.922878
4,0.937299,0.96707,0.9741,0.970516,0.86573,0.642329,0.737198,0.903833,0.983797,0.941991
5,0.956152,0.972953,0.988549,0.980678,0.947327,0.70617,0.809158,0.92782,0.992818,0.959169
6,0.967027,0.976904,0.993877,0.985312,0.978427,0.76245,0.857036,0.94522,0.996382,0.9701
7,0.974365,0.980208,0.99703,0.988545,0.990231,0.804475,0.887712,0.958363,0.998101,0.977814
8,0.979301,0.982778,0.998331,0.990492,0.995598,0.837267,0.909568,0.967405,0.998975,0.98293
9,0.9828,0.984808,0.999061,0.991883,0.997782,0.862052,0.924935,0.973943,0.999384,0.986496


In [62]:
# Export the Bernoulli NB results. This cell sits above the cell that computes RESULT_BNB,
# so it only works once that cell has been run.
# NOTE(review): every export cell in this notebook writes to the same workbook path; to_excel
# given a file path presumably rewrites the file, dropping earlier sheets — confirm, and
# consider a single pd.ExcelWriter for all sheets.
RESULT_BNB.to_excel('/Users/thoma/Documents/seminar_data/base_classifier_results.xlsx', sheet_name = 'BNB, 500K')

In [60]:
def classificationPerformanceOverTime(df, DATE, X_col, y_col, DAYS, estimator, REP, split = 'TimeSeries', smote = False, scale = None, NN = False):
    """
    Evaluate one estimator for every day offset in ``range(DAYS)``.

    For each offset the features and target are built with ``functions.dataX`` and
    scored with ``functions.classifyLabelsNew``; one progress line is printed per offset.

    Parameters
    ----------
    df : DataFrame handed to ``functions.dataX``.
    DATE, X_col, y_col : column-name lists handed to ``functions.dataX``.
    DAYS : int
        Number of day offsets to evaluate (0 .. DAYS-1).
    estimator : classifier handed to ``functions.classifyLabelsNew``.
    REP : handed to ``functions.classifyLabelsNew`` as ``n``.
    scale : optional
        Forwarded to ``functions.classifyLabelsNew`` as ``scale`` when not ``None``.
        It was previously accepted but silently ignored.
    split, smote, NN :
        Accepted for interface compatibility but not forwarded.
        NOTE(review): this notebook does not show whether ``classifyLabelsNew``
        accepts them — confirm against functions.py before wiring them through.

    Returns
    -------
    pandas.DataFrame
        One row per day offset, built from the per-offset results.
    """
    # Forward `scale` only when the caller set it, so default calls behave exactly as before.
    extra = {} if scale is None else {'scale': scale}

    resultDic = {}
    for days in range(DAYS):
        X, y = functions.dataX(df, DATE, X_col, y_col, days)
        resultDic[days] = functions.classifyLabelsNew(estimator, X, y, n = REP, **extra)
        print('DAYS: ', days)

    return pd.DataFrame.from_dict(resultDic, orient = 'index')

In [61]:
# Bernoulli Naive Bayes: 11 day offsets, REP = 3
RESULT_BNB = classificationPerformanceOverTime(
    df_sample, DATE, X_col, y_col, DAYS = 11, estimator = BernoulliNB(), REP = 3
)

DAYS:  0
DAYS:  1
DAYS:  2
DAYS:  3
DAYS:  4
DAYS:  5
DAYS:  6
DAYS:  7
DAYS:  8
DAYS:  9
DAYS:  10


In [63]:
# AdaBoost on depth-3 decision trees, 50 estimators, fixed seed
ada_estimator = AdaBoostClassifier(
    base_estimator=DecisionTreeClassifier(max_depth=3),
    n_estimators=50,
    random_state=0,
)
RESULT_ADA = classificationPerformanceOverTime(
    df_sample, DATE, X_col, y_col, DAYS = 11, estimator = ada_estimator, REP = 3
)

DAYS:  0
DAYS:  1
DAYS:  2
DAYS:  3
DAYS:  4
DAYS:  5
DAYS:  6
DAYS:  7
DAYS:  8
DAYS:  9
DAYS:  10


In [1]:
# Export the decision-tree results; run this only after the cell that computes RESULT_DT
# (the saved NameError comes from running it first).
# The sheet was labelled 'KNN, 500K' although the frame holds the decision-tree run.
RESULT_DT.to_excel('/Users/thoma/Documents/seminar_data/base_classifier_results.xlsx', sheet_name = 'DT, 500K')

NameError: name 'RESULT_DT' is not defined

In [65]:
# Decision tree with balanced class weights and a fixed seed
dt_estimator = DecisionTreeClassifier(random_state=0,class_weight='balanced')

In [66]:
# Decision tree: 11 day offsets, REP = 3
RESULT_DT = classificationPerformanceOverTime(
    df_sample, DATE, X_col, y_col, DAYS = 11, estimator = dt_estimator, REP = 3
)

DAYS:  0
DAYS:  1
DAYS:  2
DAYS:  3
DAYS:  4
DAYS:  5
DAYS:  6
DAYS:  7
DAYS:  8
DAYS:  9
DAYS:  10


In [None]:
# TODO: add a feature-importance plot to show the importance of the variables

In [67]:
# Gradient boosting with a fixed seed: 11 day offsets, REP = 3
gradient_estimator = GradientBoostingClassifier(random_state = 0)
RESULT_GRADIENT = classificationPerformanceOverTime(
    df_sample, DATE, X_col, y_col, DAYS = 11, estimator = gradient_estimator, REP = 3
)

DAYS:  0
DAYS:  1
DAYS:  2
DAYS:  3
DAYS:  4
DAYS:  5
DAYS:  6
DAYS:  7
DAYS:  8
DAYS:  9
DAYS:  10


In [68]:
# Histogram-based gradient boosting with a fixed seed: 11 day offsets, REP = 3
histgradient_estimator = HistGradientBoostingClassifier(random_state = 0)
RESULT_HISTGRADIENT = classificationPerformanceOverTime(
    df_sample, DATE, X_col, y_col, DAYS = 11, estimator = histgradient_estimator, REP = 3
)

DAYS:  0
DAYS:  1
DAYS:  2
DAYS:  3
DAYS:  4
DAYS:  5
DAYS:  6
DAYS:  7
DAYS:  8
DAYS:  9
DAYS:  10


In [93]:
# Export the bagging results; run this only after the cell that computes RESULT_BAG.
# The sheet label now says 500K: df_sample is drawn with n = 500000, matching the 'BNB, 500K' sheet.
RESULT_BAG.to_excel('/Users/thoma/Documents/seminar_data/base_classifier_results.xlsx', sheet_name = 'BAG, 500K')

In [69]:
# Bagging with 10 estimators and a fixed seed: 11 day offsets, REP = 3
bagging_estimator = BaggingClassifier(n_estimators = 10, random_state = 0)
RESULT_BAG = classificationPerformanceOverTime(
    df_sample, DATE, X_col, y_col, DAYS = 11, estimator = bagging_estimator, REP = 3
)

DAYS:  0
DAYS:  1
DAYS:  2
DAYS:  3
DAYS:  4
DAYS:  5
DAYS:  6
DAYS:  7
DAYS:  8
DAYS:  9
DAYS:  10


In [70]:
# Random forest with 10 trees, balanced class weights, fixed seed: 11 day offsets, REP = 3
rf_estimator = RandomForestClassifier(
    n_estimators = 10,
    random_state = 0,
    class_weight = 'balanced',
)
RESULT_RF = classificationPerformanceOverTime(
    df_sample, DATE, X_col, y_col, DAYS = 11, estimator = rf_estimator, REP = 3
)

DAYS:  0
DAYS:  1
DAYS:  2
DAYS:  3
DAYS:  4
DAYS:  5
DAYS:  6
DAYS:  7
DAYS:  8
DAYS:  9
DAYS:  10


In [95]:
# Export the random-forest results.
# The sheet label now says 500K: df_sample is drawn with n = 500000, matching the 'BNB, 500K' sheet.
RESULT_RF.to_excel('/Users/thoma/Documents/seminar_data/base_classifier_results.xlsx', sheet_name = 'RF, 500K')

In [71]:
# 3-nearest-neighbours: 11 day offsets, REP = 3.
# NOTE(review): the manual KNN loop earlier in this notebook passes scale = 'MinMax' to
# classifyLabelsNew; no scale is passed here — confirm whether unscaled features are intended.
knn_estimator = neighbors.KNeighborsClassifier(n_neighbors = 3)
RESULT_KNN = classificationPerformanceOverTime(df_sample, DATE, X_col, y_col, 11, knn_estimator, 3)

DAYS:  0
DAYS:  1
DAYS:  2
DAYS:  3
DAYS:  4
DAYS:  5
DAYS:  6
DAYS:  7
DAYS:  8
DAYS:  9
DAYS:  10


In [105]:
# Too slow to run (the saved output shows the run was interrupted manually)
# Support vector classifier with balanced class weights and a fixed seed.
svm_estimator = svm.SVC(random_state=0,class_weight='balanced')
RESULT_SVM = classificationPerformanceOverTime(df_sample, DATE, X_col, y_col, 11, svm_estimator, 3) 

KeyboardInterrupt: 

In [None]:
# KNN, SVM, LOG.REG., GAUSSIAN NB

### Function: classifyLabels(classifier, X, y, n, split = 'TimeSeries', smote = False, scale = None, days = 0)

In [242]:
# Naive Bayes Bernoulli
# NOTE(review): this comment used to say "1 day", but dataX is called with 5 — confirm which is intended.
# NOTE(review): dataX / classifyLabels are called without the `functions.` prefix and with a
# different argument list than functions.dataX above; presumably left over from an earlier version.
(X, y) = dataX(df,5)
(accuracy,class_report) = classifyLabels(BernoulliNB(), X, y, n = 3)
print(accuracy)
# Print the classification report stored for each entry of class_report
for item in class_report.values():
    print(item)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_['productTotalReturned'] = productTotalReturned
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_['orderDateX'] = df_['orderDate'] + timedelta(X)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_['daysFirstOrder'] = (df_['orderDate'] - df_['firstOrder']).dt.days + 1
A value is trying to be set

1


  return f(**kwargs)


2


  return f(**kwargs)


3
{1: 0.8126, 2: 0.83996, 3: 0.8458}
              precision    recall  f1-score   support

 KNOWN HAPPY       0.88      0.87      0.88     13647
     UNHAPPY       0.49      0.56      0.53      3112
     UNKNOWN       0.83      0.81      0.82      8241

    accuracy                           0.81     25000
   macro avg       0.74      0.75      0.74     25000
weighted avg       0.82      0.81      0.82     25000

              precision    recall  f1-score   support

 KNOWN HAPPY       0.86      0.92      0.89     15320
     UNHAPPY       0.87      0.43      0.57      2913
     UNKNOWN       0.78      0.84      0.81      6767

    accuracy                           0.84     25000
   macro avg       0.84      0.73      0.76     25000
weighted avg       0.84      0.84      0.83     25000

              precision    recall  f1-score   support

 KNOWN HAPPY       0.87      0.90      0.89     14006
     UNHAPPY       0.85      0.41      0.55      2841
     UNKNOWN       0.80      0.90     

In [None]:
# Suppress all warnings (the same blanket filter is already installed near the top of the notebook).
warnings.filterwarnings("ignore")

In [79]:
# Bernoulli Naive Bayes on the current X, y
accuracy, class_report = classifyLabels(BernoulliNB(), X, y, n = 3)
print(accuracy)
for report in class_report.values():
    print(report)

  return f(**kwargs)


1


  return f(**kwargs)
  _warn_prf(average, modifier, msg_start, len(result))


2


  return f(**kwargs)
  _warn_prf(average, modifier, msg_start, len(result))


3
{1: 0.588248, 2: 0.689048, 3: 0.675984}
              precision    recall  f1-score   support

 KNOWN HAPPY       0.77      0.61      0.68     68195
     UNHAPPY       0.15      0.29      0.20     15437
     UNKNOWN       0.66      0.66      0.66     41368

    accuracy                           0.59    125000
   macro avg       0.53      0.52      0.51    125000
weighted avg       0.66      0.59      0.62    125000

              precision    recall  f1-score   support

 KNOWN HAPPY       0.69      0.94      0.80     75861
     UNHAPPY       0.00      0.00      0.00     14564
     UNKNOWN       0.67      0.44      0.53     34575

    accuracy                           0.69    125000
   macro avg       0.45      0.46      0.44    125000
weighted avg       0.61      0.69      0.63    125000

              precision    recall  f1-score   support

 KNOWN HAPPY       0.66      0.94      0.78     70405
     UNHAPPY       0.00      0.00      0.00     14378
     UNKNOWN       0.72      0.46

In [83]:
# Gaussian Naive Bayes on the current X, y
accuracy, class_report = classifyLabels(GaussianNB(), X, y, n = 3)
print(accuracy)
for report in class_report.values():
    print(report)

  return f(**kwargs)


1


  return f(**kwargs)


2


  return f(**kwargs)


3
{1: 0.648288, 2: 0.646056, 3: 0.644936}
              precision    recall  f1-score   support

 KNOWN HAPPY       0.68      0.84      0.75     68195
     UNHAPPY       0.18      0.08      0.11     15437
     UNKNOWN       0.67      0.55      0.61     41368

    accuracy                           0.65    125000
   macro avg       0.51      0.49      0.49    125000
weighted avg       0.62      0.65      0.62    125000

              precision    recall  f1-score   support

 KNOWN HAPPY       0.71      0.83      0.76     75861
     UNHAPPY       0.18      0.10      0.13     14564
     UNKNOWN       0.59      0.48      0.53     34575

    accuracy                           0.65    125000
   macro avg       0.49      0.47      0.47    125000
weighted avg       0.61      0.65      0.62    125000

              precision    recall  f1-score   support

 KNOWN HAPPY       0.72      0.78      0.75     70405
     UNHAPPY       0.16      0.10      0.12     14378
     UNKNOWN       0.62      0.60

In [70]:
# K-NN with 3 neighbours and MinMax scaling
(accuracy,class_report) = classifyLabels(neighbors.KNeighborsClassifier(n_neighbors = 3), X, y, n = 3, scale = 'MinMax')
print(accuracy)
# Print the classification report stored for each entry of class_report
for item in class_report.values():
    print(item)

  clf = clf.fit(X_train,y_train)


1


  clf = clf.fit(X_train,y_train)


2


  clf = clf.fit(X_train,y_train)


3
{1: 0.776368, 2: 0.757728, 3: 0.762664}
              precision    recall  f1-score   support

       KNOWN       0.84      0.83      0.83     83632
     UNKNOWN       0.66      0.68      0.67     41368

    accuracy                           0.78    125000
   macro avg       0.75      0.75      0.75    125000
weighted avg       0.78      0.78      0.78    125000

              precision    recall  f1-score   support

       KNOWN       0.83      0.83      0.83     90425
     UNKNOWN       0.56      0.56      0.56     34575

    accuracy                           0.76    125000
   macro avg       0.70      0.70      0.70    125000
weighted avg       0.76      0.76      0.76    125000

              precision    recall  f1-score   support

       KNOWN       0.81      0.84      0.83     84783
     UNKNOWN       0.64      0.59      0.62     40217

    accuracy                           0.76    125000
   macro avg       0.73      0.72      0.72    125000
weighted avg       0.76      0.7

In [85]:
# Logistic regression (liblinear, balanced class weights, no intercept) with MinMax scaling
accuracy, class_report = classifyLabels(
    LogisticRegression(random_state=0, class_weight='balanced', fit_intercept=False, solver='liblinear'),
    X, y, n = 3, scale = 'MinMax',
)
print(accuracy)
for report in class_report.values():
    print(report)

  return f(**kwargs)


1


  return f(**kwargs)


2


  return f(**kwargs)


3
{1: 0.678176, 2: 0.685968, 3: 0.7108}
              precision    recall  f1-score   support

 KNOWN HAPPY       0.77      0.75      0.76     68195
     UNHAPPY       0.20      0.03      0.05     15437
     UNKNOWN       0.59      0.80      0.68     41368

    accuracy                           0.68    125000
   macro avg       0.52      0.53      0.50    125000
weighted avg       0.64      0.68      0.65    125000

              precision    recall  f1-score   support

 KNOWN HAPPY       0.75      0.83      0.79     75861
     UNHAPPY       0.21      0.05      0.08     14564
     UNKNOWN       0.58      0.64      0.61     34575

    accuracy                           0.69    125000
   macro avg       0.52      0.51      0.49    125000
weighted avg       0.64      0.69      0.66    125000

              precision    recall  f1-score   support

 KNOWN HAPPY       0.77      0.81      0.79     70405
     UNHAPPY       0.21      0.03      0.05     14378
     UNKNOWN       0.64      0.79  

In [None]:
# SVM with balanced class weights and MinMax scaling (very slow!)
(accuracy,class_report) = classifyLabels(svm.SVC(random_state=0,
                                                 class_weight='balanced'), X, y, n = 3, scale = 'MinMax')
print(accuracy)
# Print the classification report stored for each entry of class_report
for item in class_report.values():
    print(item)

In [87]:
# Decision tree with balanced class weights and a fixed seed
accuracy, class_report = classifyLabels(
    DecisionTreeClassifier(random_state=0, class_weight='balanced'),
    X, y, n = 3,
)
print(accuracy)
for report in class_report.values():
    print(report)



1
2
3
{1: 0.603464, 2: 0.587064, 3: 0.603976}
              precision    recall  f1-score   support

 KNOWN HAPPY       0.74      0.67      0.70     68195
     UNHAPPY       0.14      0.18      0.16     15437
     UNKNOWN       0.63      0.64      0.64     41368

    accuracy                           0.60    125000
   macro avg       0.50      0.50      0.50    125000
weighted avg       0.63      0.60      0.61    125000

              precision    recall  f1-score   support

 KNOWN HAPPY       0.75      0.66      0.70     75861
     UNHAPPY       0.13      0.16      0.14     14564
     UNKNOWN       0.52      0.61      0.56     34575

    accuracy                           0.59    125000
   macro avg       0.47      0.48      0.47    125000
weighted avg       0.62      0.59      0.60    125000

              precision    recall  f1-score   support

 KNOWN HAPPY       0.72      0.69      0.71     70405
     UNHAPPY       0.13      0.18      0.15     14378
     UNKNOWN       0.64      

In [88]:
# AdaBoost on depth-3 decision trees, 50 estimators, fixed seed
accuracy, class_report = classifyLabels(
    AdaBoostClassifier(
        base_estimator=DecisionTreeClassifier(max_depth=3),
        n_estimators=50,
        random_state=0,
    ),
    X, y, n = 3,
)
print(accuracy)
for report in class_report.values():
    print(report)

  return f(**kwargs)


1


  return f(**kwargs)


2


  return f(**kwargs)


3
{1: 0.73088, 2: 0.740224, 3: 0.751032}
              precision    recall  f1-score   support

 KNOWN HAPPY       0.75      0.87      0.81     68195
     UNHAPPY       0.15      0.02      0.03     15437
     UNKNOWN       0.71      0.77      0.74     41368

    accuracy                           0.73    125000
   macro avg       0.54      0.55      0.52    125000
weighted avg       0.66      0.73      0.69    125000

              precision    recall  f1-score   support

 KNOWN HAPPY       0.76      0.90      0.83     75861
     UNHAPPY       0.25      0.01      0.01     14564
     UNKNOWN       0.69      0.70      0.69     34575

    accuracy                           0.74    125000
   macro avg       0.57      0.54      0.51    125000
weighted avg       0.68      0.74      0.69    125000

              precision    recall  f1-score   support

 KNOWN HAPPY       0.74      0.93      0.83     70405
     UNHAPPY       0.30      0.01      0.02     14378
     UNKNOWN       0.77      0.71 

In [89]:
# Gradient boosting with a fixed seed
accuracy, class_report = classifyLabels(GradientBoostingClassifier(random_state=0), X, y, n = 3)
print(accuracy)
for report in class_report.values():
    print(report)

  return f(**kwargs)


1


  return f(**kwargs)


2


  return f(**kwargs)


3
{1: 0.747288, 2: 0.75016, 3: 0.74272}
              precision    recall  f1-score   support

 KNOWN HAPPY       0.75      0.92      0.82     68195
     UNHAPPY       0.23      0.00      0.00     15437
     UNKNOWN       0.75      0.75      0.75     41368

    accuracy                           0.75    125000
   macro avg       0.58      0.55      0.52    125000
weighted avg       0.68      0.75      0.70    125000

              precision    recall  f1-score   support

 KNOWN HAPPY       0.76      0.93      0.83     75861
     UNHAPPY       0.45      0.00      0.00     14564
     UNKNOWN       0.72      0.68      0.70     34575

    accuracy                           0.75    125000
   macro avg       0.65      0.54      0.51    125000
weighted avg       0.71      0.75      0.70    125000

              precision    recall  f1-score   support

 KNOWN HAPPY       0.73      0.94      0.82     70405
     UNHAPPY       0.32      0.00      0.00     14378
     UNKNOWN       0.78      0.66  

In [90]:
# Hist Gradient Boosting: same evaluation call as the other model cells
# (classifyLabels with n = 3); print the accuracy result and then each
# entry of the returned report mapping.
accuracy, class_report = classifyLabels(
    HistGradientBoostingClassifier(random_state=0),
    X,
    y,
    n=3,
)
print(accuracy)
for fold_report in class_report.values():
    print(fold_report)

  return f(**kwargs)


1


  return f(**kwargs)


2


  return f(**kwargs)


3
{1: 0.75276, 2: 0.751056, 3: 0.757056}
              precision    recall  f1-score   support

 KNOWN HAPPY       0.75      0.92      0.83     68195
     UNHAPPY       0.25      0.00      0.01     15437
     UNKNOWN       0.75      0.76      0.76     41368

    accuracy                           0.75    125000
   macro avg       0.59      0.56      0.53    125000
weighted avg       0.69      0.75      0.70    125000

              precision    recall  f1-score   support

 KNOWN HAPPY       0.77      0.92      0.83     75861
     UNHAPPY       0.22      0.00      0.00     14564
     UNKNOWN       0.71      0.71      0.71     34575

    accuracy                           0.75    125000
   macro avg       0.56      0.54      0.52    125000
weighted avg       0.69      0.75      0.70    125000

              precision    recall  f1-score   support

 KNOWN HAPPY       0.75      0.94      0.83     70405
     UNHAPPY       0.26      0.00      0.00     14378
     UNKNOWN       0.79      0.71 

In [91]:
# Bagging: a 10-estimator BaggingClassifier with a fixed seed is passed to
# classifyLabels (n = 3); the accuracy result is printed first, then each
# entry of the returned report mapping.
accuracy, class_report = classifyLabels(
    BaggingClassifier(n_estimators=10, random_state=0),
    X,
    y,
    n=3,
)
print(accuracy)
for fold_report in class_report.values():
    print(fold_report)

  return f(**kwargs)


1


  return f(**kwargs)


2


  return f(**kwargs)


3
{1: 0.704248, 2: 0.691872, 3: 0.691832}
              precision    recall  f1-score   support

 KNOWN HAPPY       0.74      0.84      0.79     68195
     UNHAPPY       0.15      0.05      0.07     15437
     UNKNOWN       0.70      0.72      0.71     41368

    accuracy                           0.70    125000
   macro avg       0.53      0.54      0.52    125000
weighted avg       0.66      0.70      0.67    125000

              precision    recall  f1-score   support

 KNOWN HAPPY       0.76      0.82      0.79     75861
     UNHAPPY       0.16      0.06      0.09     14564
     UNKNOWN       0.62      0.68      0.65     34575

    accuracy                           0.69    125000
   macro avg       0.51      0.52      0.51    125000
weighted avg       0.65      0.69      0.67    125000

              precision    recall  f1-score   support

 KNOWN HAPPY       0.74      0.82      0.78     70405
     UNHAPPY       0.14      0.07      0.10     14378
     UNKNOWN       0.70      0.68

In [92]:
# Random Forest: 10 trees, fixed seed, and class_weight='balanced'.
# Evaluated through classifyLabels with n = 3; the accuracy result is printed
# first, then each entry of the returned report mapping.
accuracy, class_report = classifyLabels(
    RandomForestClassifier(
        n_estimators=10,
        random_state=0,
        class_weight='balanced',
    ),
    X,
    y,
    n=3,
)
print(accuracy)
for fold_report in class_report.values():
    print(fold_report)

  clf = clf.fit(X_train,y_train)


1


  clf = clf.fit(X_train,y_train)


2


  clf = clf.fit(X_train,y_train)


3
{1: 0.741472, 2: 0.743208, 3: 0.752912}
              precision    recall  f1-score   support

 KNOWN HAPPY       0.75      0.90      0.82     68195
     UNHAPPY       0.18      0.03      0.05     15437
     UNKNOWN       0.75      0.74      0.75     41368

    accuracy                           0.74    125000
   macro avg       0.56      0.56      0.54    125000
weighted avg       0.68      0.74      0.70    125000

              precision    recall  f1-score   support

 KNOWN HAPPY       0.77      0.91      0.83     75861
     UNHAPPY       0.17      0.03      0.05     14564
     UNKNOWN       0.72      0.69      0.70     34575

    accuracy                           0.74    125000
   macro avg       0.55      0.54      0.53    125000
weighted avg       0.68      0.74      0.70    125000

              precision    recall  f1-score   support

 KNOWN HAPPY       0.75      0.93      0.83     70405
     UNHAPPY       0.18      0.03      0.05     14378
     UNKNOWN       0.79      0.71