In [733]:
import pandas as pd
import numpy as np
import os, sys

In [734]:
# GLOBALS
LOCAL_ROOT = "/Users/varunn/Documents/ML"
PROJ_DIR = os.path.join(LOCAL_ROOT, "AV_JanataHack_April10")
DATA_DIR = os.path.join(PROJ_DIR, "data")
TRAIN_FN = os.path.join(DATA_DIR, "train_8wry4cB.csv")
TEST_FN = os.path.join(DATA_DIR, "test_Yix80N0.csv")
SUBMISSION_FN = os.path.join(DATA_DIR, "sample_submission_opxHi4g.csv")

In [735]:
assert os.path.isfile(TRAIN_FN)
assert os.path.isfile(TEST_FN)
assert os.path.isfile(SUBMISSION_FN)

In [736]:
# read data
train_df = pd.read_csv(TRAIN_FN)
print(train_df.shape)
print(train_df.head())

test_df = pd.read_csv(TEST_FN)
print(test_df.shape)
print(test_df.head())

(10500, 5)
  session_id       startTime         endTime  \
0     u16159  15/12/14 18:11  15/12/14 18:12   
1     u10253  16/12/14 14:35  16/12/14 14:41   
2     u19037  01/12/14 15:58  01/12/14 15:58   
3     u14556   23/11/14 2:57   23/11/14 3:00   
4     u24295  17/12/14 16:44  17/12/14 16:46   

                                         ProductList  gender  
0  A00002/B00003/C00006/D28435/;A00002/B00003/C00...  female  
1  A00001/B00009/C00031/D29404/;A00001/B00009/C00...    male  
2                       A00002/B00001/C00020/D16944/  female  
3  A00002/B00004/C00018/D10284/;A00002/B00004/C00...  female  
4  A00001/B00001/C00012/D30805/;A00001/B00001/C00...    male  
(4500, 4)
  session_id       startTime         endTime  \
0     u12112  08/12/14 13:36  08/12/14 13:36   
1     u19725  19/12/14 13:52  19/12/14 13:52   
2     u11795  01/12/14 10:44  01/12/14 10:44   
3     u22639  08/12/14 20:19  08/12/14 20:22   
4     u18034  15/12/14 19:33  15/12/14 19:33   

                       

In [737]:
# Train Stats
print('# sessions: ', train_df['session_id'].nunique(), '\n')
assert train_df['session_id'].nunique() == train_df.shape[0]
print(train_df['gender'].value_counts(), '\n')
print('Missing values')
print(train_df.isnull().sum(), '\n')
print('dtype')
print(train_df.dtypes)

# sessions:  10500 

female    8192
male      2308
Name: gender, dtype: int64 

Missing values
session_id     0
startTime      0
endTime        0
ProductList    0
gender         0
dtype: int64 

dtype
session_id     object
startTime      object
endTime        object
ProductList    object
gender         object
dtype: object


In [738]:
# GLOBALS
ID_COL = 'session_id'
DATE_COLS = ['startTime', 'endTime']
DV_COL = 'gender'
FEAT_PREFIX = 'JH_'

In [739]:
# Label encoding
train_df[DV_COL] = train_df[DV_COL].apply(
    lambda x: 1 if x=='male' else 0)

In [740]:
print(train_df.gender.mean())

0.21980952380952382


In [741]:
# datetime conversion
for col in DATE_COLS:
    print(col)
    train_df[col] = train_df[col].apply(lambda x: pd.to_datetime(x))
    test_df[col] = test_df[col].apply(lambda x: pd.to_datetime(x))

startTime
endTime


In [742]:
def get_part_of_day(hour):
    return (
        "morning" if 5 <= hour <= 11
        else
        "afternoon" if 12 <= hour <= 17
        else
        "evening" if 18 <= hour <= 22
        else
        "night"
    )

In [743]:
# datetime features

# session duration
train_df['session_duration_sec'] = list(map(lambda x, y: (x-y).seconds,
                                            train_df['endTime'],
                                            train_df['startTime']))
test_df['session_duration_sec'] = list(map(lambda x, y: (x-y).seconds,
                                           test_df['endTime'],
                                           test_df['startTime']))

# month of year
train_df['session_start_month'] = train_df['startTime'].dt.month
test_df['session_start_month'] = test_df['startTime'].dt.month
train_df['session_end_month'] = train_df['endTime'].dt.month
test_df['session_end_month'] = test_df['endTime'].dt.month

# day of week
train_df['session_start_dow'] = train_df['startTime'].dt.dayofweek
test_df['session_start_dow'] = test_df['startTime'].dt.dayofweek
train_df['session_end_dow'] = train_df['endTime'].dt.dayofweek
test_df['session_end_dow'] = test_df['endTime'].dt.dayofweek

# week of year
train_df['session_start_woy'] = train_df['startTime'].dt.weekofyear
test_df['session_start_woy'] = test_df['startTime'].dt.weekofyear
train_df['session_end_woy'] = train_df['endTime'].dt.weekofyear
test_df['session_end_woy'] = test_df['endTime'].dt.weekofyear

# hour of the day
train_df['session_start_hod'] = train_df['startTime'].dt.hour
test_df['session_start_hod'] = test_df['startTime'].dt.hour
train_df['session_end_hod'] = train_df['endTime'].dt.hour
test_df['session_end_hod'] = test_df['endTime'].dt.hour

# minute of the hour
train_df['session_start_moh'] = train_df['startTime'].dt.minute
test_df['session_start_moh'] = test_df['startTime'].dt.minute
train_df['session_end_moh'] = train_df['endTime'].dt.minute
test_df['session_end_moh'] = test_df['endTime'].dt.minute

# part of the day
train_df['session_start_part_of_day'] = train_df['session_start_hod'].apply(
    lambda x: get_part_of_day(x))
test_df['session_start_part_of_day'] = test_df['session_start_hod'].apply(
    lambda x: get_part_of_day(x))
train_df['session_end_part_of_day'] = train_df['session_end_hod'].apply(
    lambda x: get_part_of_day(x))
test_df['session_end_part_of_day'] = test_df['session_end_hod'].apply(
    lambda x: get_part_of_day(x))

In [744]:
from sklearn.metrics import roc_auc_score

In [745]:
# productlist parsing

def parse_product_list(productList):
    out = [x.split('/') for x in productList.split(';')]
    prods = list(set([x[0] for x in out]))
    cats = list(set([x[1] for x in out]))
    sub_cats = list(set([x[2] for x in out]))
    sub_sub_cats = list(set([x[3] for x in out]))
    return prods, cats, sub_cats, sub_sub_cats


def get_prod_cat_lst(productlist):
    return list(set(['/'.join(x.split('/')[:2])
                     for x in productlist.split(';')]))


def get_prod_cat_str(productlist):
    return ";".join(set(['/'.join(x.split('/')[:2])
                         for x in productlist.split(';')]))


def get_prod_cat_sub_cat_lst(productlist):
    return list(set(['/'.join(x.split('/')[:3])
                     for x in productlist.split(';')]))


def get_prod_cat_sub_cat_str(productlist):
    return ";".join(set(['/'.join(x.split('/')[:3])
                         for x in productlist.split(';')]))


def get_prod_cat_sub_sub_cat_lst(productlist):
    return list(set(['/'.join(x.split('/')[:4])
                     for x in productlist.split(';')]))


def get_prod_cat_sub_sub_cat_str(productlist):
    return ";".join(set(['/'.join(x.split('/')[:4])
                         for x in productlist.split(';')]))


def parse_product_list_df(data):
    
    parsed_prod_lists = data['ProductList'].apply(
        lambda x: parse_product_list(x))
    for idx, col in [(0, 'prod_lst'), (1, 'cat_lst'), (2, 'sub_cat_lst'),
                     (3, 'sub_sub_cat_lst')]:
        data[col] = parsed_prod_lists.apply(lambda x: x[idx])
    
    data['prod_str'] = data['prod_lst'].apply(lambda x: '/'.join(x))
    data['cat_str'] = data['cat_lst'].apply(lambda x: '/'.join(x))
    data['sub_cat_str'] = data['sub_cat_lst'].apply(lambda x: '/'.join(x))
    data['sub_sub_cat_str'] = data['sub_sub_cat_lst'].apply(lambda x: '/'.join(x))
    data['prod_cat_lst'] = data['ProductList'].apply(get_prod_cat_lst)
    data['prod_cat_str'] = data['ProductList'].apply(get_prod_cat_str)
    data['prod_cat_sub_cat_lst'] = data['ProductList'].apply(
        get_prod_cat_sub_cat_lst)
    data['prod_cat_sub_cat_str'] = data['ProductList'].apply(
        get_prod_cat_sub_cat_str)
    data['prod_cat_sub_sub_cat_lst'] = data['ProductList'].apply(
        get_prod_cat_sub_sub_cat_lst)
    data['prod_cat_sub_sub_cat_str'] = data['ProductList'].apply(
        get_prod_cat_sub_sub_cat_str)
    
    return data

In [746]:
train_df = parse_product_list_df(train_df)
test_df = parse_product_list_df(test_df)

In [747]:
print(train_df.shape)
print(train_df.head())

(10500, 32)
  session_id           startTime             endTime  \
0     u16159 2014-12-15 18:11:00 2014-12-15 18:12:00   
1     u10253 2014-12-16 14:35:00 2014-12-16 14:41:00   
2     u19037 2014-01-12 15:58:00 2014-01-12 15:58:00   
3     u14556 2014-11-23 02:57:00 2014-11-23 03:00:00   
4     u24295 2014-12-17 16:44:00 2014-12-17 16:46:00   

                                         ProductList  gender  \
0  A00002/B00003/C00006/D28435/;A00002/B00003/C00...       0   
1  A00001/B00009/C00031/D29404/;A00001/B00009/C00...       1   
2                       A00002/B00001/C00020/D16944/       0   
3  A00002/B00004/C00018/D10284/;A00002/B00004/C00...       0   
4  A00001/B00001/C00012/D30805/;A00001/B00001/C00...       1   

   session_duration_sec  session_start_month  session_end_month  \
0                    60                   12                 12   
1                   360                   12                 12   
2                     0                    1                  1  

In [748]:
print(train_df['prod_cat_str'].nunique())
print(test_df['prod_cat_str'].nunique())

497
329


In [749]:
from math import log
from sklearn.model_selection import KFold, StratifiedKFold


def _find_cats(data, id_col, cat_col, freq_thresh):
    """
    identifies values of discrete variables that satisfy the frequency threshold
    id_col: deal_id or the entity of choice
    cat_col: discrete variable
    """
    cats = data.groupby(cat_col)[id_col].nunique().to_dict()
    freq_thresh = int(freq_thresh*data[id_col].nunique())
    good_cats = [k for k, v in cats.items() if v >= freq_thresh]

    return good_cats


def _group_cats(data, cat_col, id_col=ID_COL, dv_col=DV_COL,
                freq_thresh=0.003):

    df = pd.DataFrame({id_col: np.repeat(data[id_col].values,
                                         data[cat_col].str.len()),
                       dv_col: np.repeat(data[dv_col].values,
                                         data[cat_col].str.len()),
                       cat_col: np.concatenate(data[cat_col].values)})
    grouped_cats = _find_cats(df, id_col, cat_col, freq_thresh)
    mask = df[cat_col].isin(grouped_cats)
    df.loc[~mask, cat_col] = 'others'

    return grouped_cats, df


def _apply_grouping(test_data, cat_col, grouped_cats, id_col=ID_COL,
                    dv_col=DV_COL):
    
    df = pd.DataFrame({id_col: np.repeat(test_data[id_col].values,
                                         test_data[cat_col].str.len()),
                       cat_col: np.concatenate(test_data[cat_col].values)})
    mask = df[cat_col].isin(grouped_cats)
    df.loc[~mask, cat_col] = 'others'
    return df


def get_woe(data, cat_col, id_col=ID_COL, dv_col=DV_COL):
    table = pd.pivot_table(data, index=cat_col, columns=dv_col,
                           values=id_col,
                           aggfunc=lambda x: len(x.unique()),
                           margins=True)
    table.fillna(value=0, inplace=True)
    table.reset_index(inplace=True)
    table.rename(columns={0: '#good', 1: '#bad'}, inplace=True)
    total_good = table.loc[len(table)-1, '#good']
    total_bad = table.loc[len(table)-1, '#bad']
    table['perc_good'] = table['#good'].apply(lambda x: 1.*x/total_good)
    table['perc_bad'] = table['#bad'].apply(lambda x: 1.*x/total_bad)
    mask = (table['perc_good'] != 0) & (table['perc_bad'] != 0)
    table.loc[mask, 'WOE'] = list(map(
     lambda x, y: log(x / float(y)), table.loc[mask, 'perc_good'],
     table.loc[mask, 'perc_bad']))
    return table


def get_counts(data, cat_col, id_col=ID_COL):
    table = pd.DataFrame(data.groupby(cat_col)[id_col].nunique())
    table.reset_index(inplace=True)
    table.rename(columns={id_col: 'num_sessions'}, inplace=True)
    return table


def getDVEncodeVar(compute_df, target_df, cat_col, dv_col=DV_COL,
                   encode_type='woe'):

    if encode_type == 'woe':
        assert dv_col in target_df
        grouped_df = get_woe(target_df, cat_col)
        return_col = 'WOE'
    elif encode_type == 'count':
        grouped_df = get_counts(target_df, cat_col)
        return_col = 'num_sessions'
    merged_df = pd.merge(compute_df, grouped_df, how="left", on=cat_col)
    merged_df.fillna(-999999, inplace=True)
    return merged_df[return_col].tolist()


def _aggregation(data, cat_col, id_col=ID_COL, dv_col=DV_COL,
                 enc_prefix='_enc'):
    if dv_col in data:
        group_cols = [id_col, dv_col]
    else:
        group_cols = [id_col]
    df = pd.DataFrame(data.groupby(group_cols).agg({
        cat_col: lambda x: list(x), cat_col+enc_prefix: lambda x: list(x)}))
    df.reset_index(inplace=True)
    assert df.shape[0] == df[id_col].nunique()
    return df


def do_target_encode_woe(train_data, test_data, discrete_cols,
                         id_col=ID_COL, dv_col=DV_COL,
                         grouping_flag=True, agg_flag=True):
    
    kf = KFold(n_splits=5, shuffle=True, random_state=2020)
    train_final = train_data.copy()
    test_final = test_data.copy()
    for col in discrete_cols:
        print('Cat Col: ', col)
        
        if grouping_flag:
            print('grouping')
            grouped_cats, train_tmp = _group_cats(train_data, col)
            test_tmp = _apply_grouping(test_data, col, grouped_cats)
        else:
            train_tmp = train_data.copy()
            test_tmp = test_data.copy()
        
        print('encoding')
        train_enc_values = np.zeros(train_tmp.shape[0])
        test_enc_values = 0
        for dev_index, val_index in kf.split(train_tmp):
            dev_X, val_X = train_tmp.iloc[dev_index, :], train_tmp.iloc[val_index, :]
            train_enc_values[val_index] =  np.array( 
                getDVEncodeVar(val_X[[col]], dev_X, col))
            test_enc_values += np.array( 
                getDVEncodeVar(test_tmp[[col]], dev_X, col))
        test_enc_values /= 5.
        train_tmp[col + "_enc"] = train_enc_values
        test_tmp[col + "_enc"] = test_enc_values
        print(train_tmp[col + "_enc"].describe())
        print(test_tmp[col + "_enc"].describe())
        
        if agg_flag:
            print('aggregating')
            train_tmp = _aggregation(train_tmp, col)
            test_tmp = _aggregation(test_tmp, col)
        
        print('merge')
        train_final.drop([col], axis=1, inplace=True)
        group_cols = [id_col, dv_col]
        needed_cols = [col, col+'_enc']
        train_final = pd.merge(train_final,
                               train_tmp[group_cols+needed_cols],
                               on=group_cols, how='left')
        test_final.drop([col], axis=1, inplace=True)
        group_cols = [id_col]
        test_final = pd.merge(test_final,
                              test_tmp[group_cols+needed_cols],
                              on=group_cols, how='left')
    return train_final, test_final


def do_target_encode_count(train_data, test_data, discrete_cols,
                           id_col=ID_COL, prod_flag=True):
    
    kf = KFold(n_splits=5, shuffle=True, random_state=2020)
    train_final = train_data.copy()
    test_final = test_data.copy()
    
    for col in discrete_cols:
        print('Cat Col: ', col)
        
        if prod_flag:
            print('product col')
            train_tmp = pd.DataFrame(
                {id_col: np.repeat(train_data[id_col].values,
                                   train_data[col].str.len()),
                 col: np.concatenate(train_data[col].values)})
            test_tmp = pd.DataFrame(
                {id_col: np.repeat(test_data[id_col].values,
                                   test_data[col].str.len()),
                 col: np.concatenate(test_data[col].values)})
        else:
            train_tmp = train_data.copy()
            test_tmp = test_data.copy()
        
        print('encoding')
        train_enc_values = np.zeros(train_tmp.shape[0])
        test_enc_values = 0
        for dev_index, val_index in kf.split(train_tmp):
            dev_X, val_X = train_tmp.iloc[dev_index, :], train_tmp.iloc[val_index, :]
            train_enc_values[val_index] =  np.array( 
                getDVEncodeVar(val_X[[col]], dev_X, col,
                               encode_type='count'))
            test_enc_values += np.array( 
                getDVEncodeVar(test_tmp[[col]], dev_X, col,
                               encode_type='count'))

        test_enc_values = test_enc_values/5.
        train_tmp[col + "_count_enc"] = train_enc_values
        test_tmp[col + "_count_enc"] = test_enc_values
        print(train_tmp[col + "_count_enc"].describe())
        print(test_tmp[col + "_count_enc"].describe())
        
        if prod_flag:
            print('aggregating')
            train_tmp = _aggregation(train_tmp, col,
                                     enc_prefix='_count_enc')
            test_tmp = _aggregation(test_tmp, col,
                                    enc_prefix='_count_enc')
        
        print('merge')
        train_final.drop([col], axis=1, inplace=True)
        group_cols = [id_col]
        needed_cols = [col, col+'_count_enc']
        train_final = pd.merge(train_final,
                               train_tmp[group_cols+needed_cols],
                               on=group_cols, how='left')
        test_final.drop([col], axis=1, inplace=True)
        test_final = pd.merge(test_final,
                              test_tmp[group_cols+needed_cols],
                              on=group_cols, how='left')
    return train_final, test_final

In [750]:
print('WOE encoding for prod_cols: \n')
prod_cols = ['prod_lst', 'cat_lst', 'sub_cat_lst', 'sub_sub_cat_lst']
train_df1, test_df1 = do_target_encode_woe(train_df, test_df,
                                           prod_cols)

print('WOE encoding for time_cols: \n')
time_cols = ['session_start_month', 'session_end_month',
             'session_start_dow', 'session_end_dow',
             'session_start_woy', 'session_end_woy',
             'session_start_hod', 'session_end_hod',
             'session_start_moh', 'session_end_moh',
             'session_start_part_of_day', 'session_end_part_of_day',
             'prod_str', 'cat_str', 'sub_cat_str', 'sub_sub_cat_str']
train_df1, test_df1 = do_target_encode_woe(train_df1, test_df1,
                                           time_cols,
                                           grouping_flag=False,
                                           agg_flag=False)

print('WOE encoding for prod_cat_lst: \n')
train_df1, test_df1 = do_target_encode_woe(train_df1, test_df1,
                                           ['prod_cat_lst',
                                            'prod_cat_sub_cat_lst',
                                            'prod_cat_sub_sub_cat_lst'])

print('WOE encoding for prod_cat_str: \n')
train_df1, test_df1 = do_target_encode_woe(train_df1, test_df1,
                                           ['prod_cat_str',
                                            'prod_cat_sub_cat_str',
                                            'prod_cat_sub_sub_cat_str'],
                                           grouping_flag=False,
                                           agg_flag=False)

print('Count encoding for prod_cols: \n')
train_df1, test_df1 = do_target_encode_count(train_df1, test_df1,
                                             prod_cols)

print('Count encoding for time_cols: \n')
train_df1, test_df1 = do_target_encode_count(train_df1, test_df1,
                                             time_cols, prod_flag=False)

print('Count encoding for prod_cat_lst: \n')
train_df1, test_df1 = do_target_encode_count(train_df1, test_df1,
                                             ['prod_cat_lst',
                                              'prod_cat_sub_cat_lst',
                                              'prod_cat_sub_sub_cat_lst'])

print('Count encoding for prod_cat_str: \n')
train_df1, test_df1 = do_target_encode_count(train_df1, test_df1,
                                             ['prod_cat_str',
                                              'prod_cat_sub_cat_str',
                                              'prod_cat_sub_sub_cat_str'],
                                             prod_flag=False)

WOE encoding for prod_cols: 

Cat Col:  prod_lst
grouping
encoding
count    10913.000000
mean         0.275415
std          1.222529
min         -2.454873
25%          0.811109
50%          0.831041
75%          0.833999
max          1.103338
Name: prod_lst_enc, dtype: float64
count    4684.000000
mean        0.301727
std         1.204740
min        -2.438714
25%         0.826442
50%         0.826442
75%         0.826442
max         1.032331
Name: prod_lst_enc, dtype: float64
aggregating
merge
Cat Col:  cat_lst
grouping
encoding
count     12071.000000
mean       -911.004159
std       30174.801360
min     -999999.000000
25%          -0.577802
50%           0.813316
75%           1.147519
max           2.045460
Name: cat_lst_enc, dtype: float64
count      5234.000000
mean      -1298.891215
std       16068.558742
min     -199998.834773
25%          -0.521762
50%           0.777545
75%           1.150714
max           1.826808
Name: cat_lst_enc, dtype: float64
aggregating
merge
Cat Col:  s

count     10500.000000
mean     -67999.635520
std      251757.720729
min     -999999.000000
25%          -0.677409
50%           0.809038
75%           1.200807
max           2.382296
Name: cat_str_enc, dtype: float64
count      4500.000000
mean     -72666.254699
std      246368.455288
min     -999999.000000
25%          -0.657818
50%           0.811030
75%           1.234052
max           1.951826
Name: cat_str_enc, dtype: float64
merge
Cat Col:  sub_cat_str
encoding
count     10500.000000
mean    -251999.455076
std      434181.672035
min     -999999.000000
25%     -999999.000000
50%           0.444606
75%           1.130618
max           2.850057
Name: sub_cat_str_enc, dtype: float64
count      4500.000000
mean    -254710.543859
std      423705.533843
min     -999999.000000
25%     -200001.507281
50%           0.328750
75%           1.113286
max           2.099923
Name: sub_cat_str_enc, dtype: float64
merge
Cat Col:  sub_sub_cat_str
encoding
count     10500.000000
mean    -989903.778

Cat Col:  session_start_woy
encoding
count    10500.000000
mean      1085.043048
std        658.388318
min         62.000000
25%        354.000000
50%       1157.500000
75%       1248.000000
max       2082.000000
Name: session_start_woy_count_enc, dtype: float64
count    4500.000000
mean     1076.533867
std       654.154100
min        67.200000
25%       352.000000
50%      1180.000000
75%      1229.600000
max      2068.000000
Name: session_start_woy_count_enc, dtype: float64
merge
Cat Col:  session_end_woy
encoding
count    10500.000000
mean      1086.201714
std        663.958600
min         61.000000
25%        355.000000
50%       1122.000000
75%       1236.000000
max       2093.000000
Name: session_end_woy_count_enc, dtype: float64
count    4500.000000
mean     1078.175289
std       659.325398
min        67.200000
25%       353.600000
50%      1176.000000
75%      1216.800000
max      2080.800000
Name: session_end_woy_count_enc, dtype: float64
merge
Cat Col:  session_start_hod
enco

Cat Col:  prod_cat_sub_sub_cat_str
encoding
count     10500.000000
mean    -851998.685333
std      355117.651677
min     -999999.000000
25%     -999999.000000
50%     -999999.000000
75%     -999999.000000
max          23.000000
Name: prod_cat_sub_sub_cat_str_count_enc, dtype: float64
count      4500.000000
mean    -847465.327289
std      340110.890235
min     -999999.000000
25%     -999999.000000
50%     -999999.000000
75%     -999999.000000
max          20.000000
Name: prod_cat_sub_sub_cat_str_count_enc, dtype: float64
merge


In [751]:
print(train_df1.shape)
print(test_df1.shape)
print(train_df1.head())
print(train_df1.isnull().sum())

(10500, 84)
(4500, 83)
  session_id           startTime             endTime  \
0     u16159 2014-12-15 18:11:00 2014-12-15 18:12:00   
1     u10253 2014-12-16 14:35:00 2014-12-16 14:41:00   
2     u19037 2014-01-12 15:58:00 2014-01-12 15:58:00   
3     u14556 2014-11-23 02:57:00 2014-11-23 03:00:00   
4     u24295 2014-12-17 16:44:00 2014-12-17 16:46:00   

                                         ProductList  gender  \
0  A00002/B00003/C00006/D28435/;A00002/B00003/C00...       0   
1  A00001/B00009/C00031/D29404/;A00001/B00009/C00...       1   
2                       A00002/B00001/C00020/D16944/       0   
3  A00002/B00004/C00018/D10284/;A00002/B00004/C00...       0   
4  A00001/B00001/C00012/D30805/;A00001/B00001/C00...       1   

   session_duration_sec           prod_lst_enc            cat_lst_enc  \
0                    60   [0.8339991858704539]   [0.8195565322658352]   
1                   360  [-2.4236070247273154]   [-2.250655947210555]   
2                     0   [0.8111093

In [752]:
def calc_ratio_feats(numerator, denominator, err_value=0):
    if (denominator) and (denominator != 0):
        return 1.*numerator/denominator
    else:
        return err_value
    
    
def mean_encode_mapping(data, group_col, agg_col):
    table = pd.DataFrame(data.groupby(group_col)[agg_col].mean())
    table.reset_index(inplace=True)
    map_dct = dict(zip(table[group_col], table[agg_col]))
    return map_dct

In [753]:
def calc_features(train_data, test_data, id_col=ID_COL, dv_col=DV_COL,
                  feat_prefix=FEAT_PREFIX):
    
    train_df = train_data.copy()
    test_df = test_data.copy()
    
    prod_cols = ['prod_lst', 'cat_lst', 'sub_cat_lst', 'sub_sub_cat_lst']
    prod_cols_str = ['prod_str', 'cat_str', 'sub_cat_str',
                     'sub_sub_cat_str']
    prod_cols_enc = [x+'_enc' for x in prod_cols]
    prod_cols_count_enc = [x+'_count_enc' for x in prod_cols]
    prod_cat_cols = ['prod_cat_lst', 'prod_cat_sub_cat_lst',
                     'prod_cat_sub_sub_cat_lst',
                     'prod_cat_str', 'prod_cat_sub_cat_str',
                     'prod_cat_sub_sub_cat_str']
    prod_cat_cols_enc = ['prod_cat_lst_enc', 'prod_cat_sub_cat_lst_enc',
                         'prod_cat_sub_sub_cat_lst_enc']
    prod_cat_cols_count_enc = ['prod_cat_lst_count_enc',
                               'prod_cat_sub_cat_lst_count_enc',
                               'prod_cat_sub_sub_cat_lst_count_enc']
    drop_cols = [id_col, dv_col, 'startTime', 'endTime', 'ProductList']
    time_cols = ['session_start_month', 'session_end_month',
                 'session_start_dow', 'session_end_dow',
                 'session_start_woy', 'session_end_woy',
                 'session_start_hod', 'session_end_hod',
                 'session_start_moh', 'session_end_moh',
                 'session_start_part_of_day', 'session_end_part_of_day']
    drop_cols = (drop_cols + prod_cols + prod_cols_str + prod_cols_enc +
                 prod_cols_count_enc + prod_cat_cols + prod_cat_cols_enc +
                 prod_cat_cols_count_enc + time_cols)
    
    print('count of prod_cols')
    for col in prod_cols + ['prod_cat_lst', 'prod_cat_sub_cat_lst']:
        train_df['num_'+col] = train_df[col].apply(lambda x: len(set(x)))
        test_df['num_'+col] = test_df[col].apply(lambda x: len(set(x)))
    
    print('interaction between time_cols')
    p1, p2, p3 = 'session_start_', 'session_end_', 'session_equal_'
    for _name, col1, col2 in [
        (p3+'part_of_day', p1+'part_of_day', p2+'part_of_day'),
        (p3+'dow', p1+'dow', p2+'dow'),
        (p3+'woy', p1+'woy', p2+'woy'),
        (p3+'month', p1+'month', p2+'month')]:
        train_df[_name] = list(
            map(lambda x,y: 1 if x!=y else 0,
                train_df[col1],
                train_df[col2]))
        test_df[_name] = list(
            map(lambda x,y: 1 if x!=y else 0,
                test_df[col1],
                test_df[col2]))
    
    print('aggregating product encodings')
    for col in prod_cols_enc:
        print(col)
        for _name, _func in [('_min', lambda x: np.nanmin(x)),
                             ('_max', lambda x: np.nanmax(x)),
                             ('_mean', lambda x: np.nanmean(x)),
                             ('_std', lambda x: np.nanstd(x))]:
            train_df[col+_name] = train_df[col].apply(lambda x: _func(x))
            test_df[col+_name] = test_df[col].apply(lambda x: _func(x))
            
    for col in prod_cols_count_enc:
        print(col)
        for _name, _func in [('_min', lambda x: np.nanmin(x)),
                             ('_max', lambda x: np.nanmax(x)),
                             ('_mean', lambda x: np.nanmean(x)),
                             ('_std', lambda x: np.nanstd(x))]:
            train_df[col+_name] = train_df[col].apply(lambda x: _func(x))
            test_df[col+_name] = test_df[col].apply(lambda x: _func(x))
            
    for col in prod_cat_cols_enc:
        print(col)
        for _name, _func in [('_min', lambda x: np.nanmin(x)),
                             ('_max', lambda x: np.nanmax(x)),
                             ('_mean', lambda x: np.nanmean(x)),
                             ('_std', lambda x: np.nanstd(x))]:
            train_df[col+_name] = train_df[col].apply(lambda x: _func(x))
            test_df[col+_name] = test_df[col].apply(lambda x: _func(x))
            
    for col in prod_cat_cols_count_enc:
        print(col)
        for _name, _func in [('_min', lambda x: np.nanmin(x)),
                             ('_max', lambda x: np.nanmax(x)),
                             ('_mean', lambda x: np.nanmean(x)),
                             ('_std', lambda x: np.nanstd(x))]:
            train_df[col+_name] = train_df[col].apply(lambda x: _func(x))
            test_df[col+_name] = test_df[col].apply(lambda x: _func(x))
            
    print('Ratio of encodings')
    for _name, num, denom in [
        ('ratio_prod_lst_enc_mean_min', 'prod_lst_enc_mean',
         'prod_lst_enc_min'),
        ('ratio_prod_lst_enc_mean_max', 'prod_lst_enc_mean',
         'prod_lst_enc_max'),
        ('ratio_cat_lst_enc_mean_min', 'cat_lst_enc_mean',
         'cat_lst_enc_min'),
        ('ratio_cat_lst_enc_mean_max', 'cat_lst_enc_mean',
         'cat_lst_enc_max'),
        ('ratio_sub_cat_lst_enc_mean_min', 'sub_cat_lst_enc_mean',
         'sub_cat_lst_enc_min'),
        ('ratio_sub_cat_lst_enc_mean_max', 'sub_cat_lst_enc_mean',
         'sub_cat_lst_enc_max'),
        ('ratio_sub_sub_cat_lst_enc_mean_min', 'sub_sub_cat_lst_enc_mean',
         'sub_sub_cat_lst_enc_min'),
        ('ratio_sub_sub_cat_lst_enc_mean_max', 'sub_sub_cat_lst_enc_mean',
         'sub_sub_cat_lst_enc_max'),
        ('ratio_prod_cat_lst_enc_mean_min', 'prod_cat_lst_enc_mean',
         'prod_cat_lst_enc_min'),
        ('ratio_prod_cat_lst_enc_mean_max', 'prod_cat_lst_enc_mean',
         'prod_cat_lst_enc_max'),
        ('ratio_prod_cat_sub_cat_lst_enc_mean_min', 'prod_cat_sub_cat_lst_enc_mean',
         'prod_cat_sub_cat_lst_enc_min'),
        ('ratio_prod_cat_sub_cat_lst_enc_mean_max', 'prod_cat_sub_cat_lst_enc_mean',
         'prod_cat_sub_cat_lst_enc_max')]:
        print(_name)
        train_df[_name] = list(
            map(lambda x, y: calc_ratio_feats(x, y),
                train_df[num], train_df[denom]))
        test_df[_name] = list(
            map(lambda x, y: calc_ratio_feats(x, y),
                test_df[num], test_df[denom]))
    
    print('MI of encodings')
    for _name, col1, col2, col3 in [
        ('prod_cat_lst_enc_mi', 'prod_cat_lst_enc_mean', 'prod_lst_enc_mean',
         'cat_lst_enc_mean'),
        ('prod_cat_sub_cat_lst_enc_mi', 'prod_cat_sub_cat_lst_enc_mean',
         'prod_cat_lst_enc_mean', 'sub_cat_lst_enc_mean'),
        ('prod_cat_sub_sub_cat_lst_enc_mi', 'prod_cat_sub_sub_cat_lst_enc_mean',
         'prod_cat_sub_cat_lst_enc_mean', 'sub_sub_cat_lst_enc_mean')]:
        print(_name)
        train_df[_name] = list(map(lambda x,y,z: 1.*x/(y*z),
                                      train_df[col1],
                                      train_df[col2],
                                      train_df[col3]))
        test_df[_name] = list(map(lambda x,y,z: 1.*x/(y*z),
                                      test_df[col1],
                                      test_df[col2],
                                      test_df[col3]))
        
    print('mean encodings by time_cols')
    for col1 in ['session_start_part_of_day', 'session_end_part_of_day']:
        for col2 in ['prod_str_enc', 'cat_str_enc', 'sub_cat_str_enc',
                     'sub_sub_cat_str_enc', 'prod_cat_lst_enc_mean',
                     'prod_cat_sub_cat_lst_enc_mean']:
            map_dct = mean_encode_mapping(train_df, col1, col2)
            train_df[col1+'_meanenc_'+col2] = train_df[col1].apply(
                lambda x: map_dct[x])
            
            map_dct = mean_encode_mapping(test_df, col1, col2)
            test_df[col1+'_meanenc_'+col2] = test_df[col1].apply(
                lambda x: map_dct[x])
    
    
    feat_cols = [x for x in list(train_df.columns)
                 if x not in drop_cols]
    print(feat_cols)
    feat_cols_renamed = [feat_prefix+x for x in feat_cols]
    rename_dct = dict(zip(feat_cols, feat_cols_renamed))
    train_df.rename(columns=rename_dct, inplace=True)
    test_df.rename(columns=rename_dct, inplace=True)
    
    return train_df, test_df

In [754]:
train_final, test_final = calc_features(train_df1, test_df1)

count of prod_cols
interaction between time_cols
aggregating product encodings
prod_lst_enc
cat_lst_enc
sub_cat_lst_enc
sub_sub_cat_lst_enc
prod_lst_count_enc
cat_lst_count_enc
sub_cat_lst_count_enc
sub_sub_cat_lst_count_enc
prod_cat_lst_enc
prod_cat_sub_cat_lst_enc
prod_cat_sub_sub_cat_lst_enc
prod_cat_lst_count_enc
prod_cat_sub_cat_lst_count_enc
prod_cat_sub_sub_cat_lst_count_enc
Ratio of encodings
ratio_prod_lst_enc_mean_min
ratio_prod_lst_enc_mean_max
ratio_cat_lst_enc_mean_min
ratio_cat_lst_enc_mean_max
ratio_sub_cat_lst_enc_mean_min
ratio_sub_cat_lst_enc_mean_max
ratio_sub_sub_cat_lst_enc_mean_min
ratio_sub_sub_cat_lst_enc_mean_max
ratio_prod_cat_lst_enc_mean_min
ratio_prod_cat_lst_enc_mean_max
ratio_prod_cat_sub_cat_lst_enc_mean_min
ratio_prod_cat_sub_cat_lst_enc_mean_max
MI of encodings
prod_cat_lst_enc_mi
prod_cat_sub_cat_lst_enc_mi
prod_cat_sub_sub_cat_lst_enc_mi
mean encodings by time_cols
['session_duration_sec', 'session_start_month_enc', 'session_end_month_enc', 'session_

In [755]:
print(train_final.shape)
print(test_final.shape)
feat_cols = [x for x in list(train_final.columns) if x.startswith(FEAT_PREFIX)]
print(len(feat_cols))

(10500, 177)
(4500, 176)
132


In [765]:
from sklearn import metrics
from sklearn import ensemble
from sklearn.metrics import roc_auc_score, confusion_matrix
import xgboost as xgb
import lightgbm as lgb
import operator, joblib, os
import pandas as pd
import numpy as np


def create_feature_map(features, fmap_fn):
    outfile = open(fmap_fn, 'w')
    for i, feat in enumerate(features):
        outfile.write('{0}\t{1}\tq\n'.format(i, feat))
    outfile.close()


def runXGB(fmap_fn, imp_fn, train_X, train_y, test_X, test_y=None,
           test_X2=None, feature_names=None, seed_val=0, rounds=500, dep=8,
           eta=0.05):
    # define hyperparameters
    params = {}
    params["objective"] = "binary:logistic"
    params['eval_metric'] = 'auc'
    params["eta"] = eta
    params["subsample"] = 0.7
    params["min_child_weight"] = 1
    params["colsample_bytree"] = 0.7
    params["max_depth"] = dep

    params["silent"] = 1
    params["seed"] = seed_val
    # params["max_delta_step"] = 2
    # params["gamma"] = 0.5
    num_rounds = rounds

    plst = list(params.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y)

    if test_y is not None:
        xgtest = xgb.DMatrix(test_X, label=test_y)
        watchlist = [(xgtrain, 'train'), (xgtest, 'test')]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist,
                          early_stopping_rounds=100, verbose_eval=20)
    else:
        xgtest = xgb.DMatrix(test_X)
        model = xgb.train(plst, xgtrain, num_rounds)

    if feature_names is not None:
        create_feature_map(feature_names, fmap_fn)
        importance = model.get_fscore()
        importance = sorted(importance.items(), key=operator.itemgetter(1),
                            reverse=True)
        imp_df = pd.DataFrame(importance, columns=['feature', 'fscore'])
        imp_df['fscore'] = imp_df['fscore'] / imp_df['fscore'].sum()
        imp_df.to_csv(imp_fn, index=False, encoding='utf-8')

    pred_test_y = model.predict(xgtest, ntree_limit=model.best_ntree_limit)

    if test_X2 is not None:
        pred_test_y2 = model.predict(xgb.DMatrix(test_X2),
                                     ntree_limit=model.best_ntree_limit)
    else:
        pred_test_y2 = None

    loss = 0
    if test_y is not None:
        loss = metrics.roc_auc_score(test_y, pred_test_y)

    return pred_test_y, loss, pred_test_y2, model


def runLGB(train_X, train_y, test_X, test_y=None, test_X2=None,
           feature_names=None, seed_val=0, rounds=500, dep=8, eta=0.05):
    params = {}
    params['boosting_type'] = 'gbdt'
    params["objective"] = "binary"
    params['metric'] = 'binary_error'
    params["max_depth"] = dep
    params["min_data_in_leaf"] = 10
    params['subsample'] = 0.8
    params['reg_alpha'] = 0.1
    params["learning_rate"] = eta
    params["bagging_fraction"] = 0.8
    params["feature_fraction"] = 0.8
    params["bagging_freq"] = 5
    params["bagging_seed"] = seed_val
    params["verbosity"] = 0
    num_rounds = rounds

    lgtrain = lgb.Dataset(train_X, label=train_y)

    if test_y is not None:
        lgtest = lgb.Dataset(test_X, label=test_y)
        model = lgb.train(params, lgtrain, num_rounds, valid_sets=[lgtest],
                          early_stopping_rounds=300, verbose_eval=20)
    else:
        lgtest = lgb.DMatrix(test_X)
        model = lgb.train(params, lgtrain, num_rounds)

    pred_test_y = model.predict(test_X, num_iteration=model.best_iteration)
    pred_test_y2 = model.predict(test_X2, num_iteration=model.best_iteration)
    pred_class_y = np.where(pred_test_y > 0.5, 1, 0)
    loss = 0
    if test_y is not None:
        loss = get_accuracy(test_y, pred_class_y)

    return pred_test_y, pred_class_y, loss, pred_test_y2, model


def runRF(train_X, train_y, test_X, test_y=None, test_X2=None,
          dep=20, num_trees=1000, leaf=10, feat=0.2):
    model = ensemble.RandomForestClassifier(
            n_estimators = num_trees,
                    max_depth = dep,
                    min_samples_split = 2,
                    min_samples_leaf = leaf,
                    max_features =  feat,
                    n_jobs = 4,
                    random_state = 0)
    model.fit(train_X, train_y)
    pred_train_class_y = model.predict(train_X)
    pred_test_class_y = model.predict(test_X)
    pred_test_y = model.predict_proba(test_X)[:,1]
    pred_test_class_y2 = model.predict(test_X2)
    pred_test_y2 = model.predict_proba(test_X2)[:,1]
    test_loss = 0
    
    train_loss = get_accuracy(train_y, pred_train_class_y)
    test_loss = get_accuracy(test_y, pred_test_class_y)
    print("Train and Test accuracy: ", train_loss, test_loss)
    return pred_test_y, pred_test_class_y, test_loss, pred_test_y2, pred_test_class_y2, model


def xgb_predict(data, model):
    xgtest = xgb.DMatrix(data)
    probs = list(model.predict(xgtest, ntree_limit=model.best_ntree_limit))
    return probs


def xgb_persist(model, model_fn):
    joblib.dump(model, model_fn)


def xgb_load(model_fn):
    return joblib.load(model_fn)


def get_auc(actual, pred):
    auc = roc_auc_score(actual, pred)
    return max(auc, 1-auc)


def get_accuracy(actual, pred):
    return 100.*(actual==pred).mean()


def get_prec_rec_f_score(actual, pred, pos_class=1):

    if pos_class == 1:
        tn, fp, fn, tp = confusion_matrix(actual, pred, labels=[0,1]).ravel()
    elif pos_class == 0:
        tp, fn, fp, tn = confusion_matrix(actual, pred, labels=[0,1]).ravel()
    prec = 100.*(tp/float(tp+fp+0.000000001))
    rec = 100.*(tp/float(tp+fn+0.000000001))
    f_score = 2.*prec*rec/(prec+rec+0.000000001)
    return prec, rec, f_score


def calc_metrics(data, dv_col, score_col, pred_class_col, id_col='deal_id'):

    df = data.copy()

    assert df.shape[0] == df[id_col].nunique()

    feats = ['num_deals', 'dv_rate', 'auc', 'accuracy',
             'prec_win', 'rec_win', 'f_score_win', 'prec_lost',
             'rec_lost', 'f_score_lost']
    actual = df[dv_col]
    pred = df[pred_class_col]
    score = df[score_col]
    prec_good, rec_good, f_score_good = get_prec_rec_f_score(actual, pred, 1)
    prec_bad, rec_bad, f_score_bad = get_prec_rec_f_score(actual, pred, 0)
    feat_values = [df.shape[0], round(df[dv_col].mean(), 4),
                   get_auc(actual, score),
                   get_accuracy(actual, pred),
                   prec_good, rec_good, f_score_good,
                   prec_bad, rec_bad, f_score_bad]
    return dict(zip(feats, feat_values))


def find_optimal_threshold(data, dv_col, score_col, prob_thresholds,
                           id_col='deal_id'):

    df = data[[id_col, dv_col, score_col]]

    accs = []
    max_acc = 0
    for prob_threshold in prob_thresholds:
        print(prob_threshold)
        df['pred_class'] = df[score_col].apply(
            lambda x: 1 if x >= prob_threshold else 0)
        acc = get_accuracy(df[dv_col], df['pred_class'])
        accs.append(acc)
        if acc > max_acc:
            max_acc = acc

    max_idx = np.argmax(accs)
    return prob_thresholds[max_idx], max_acc

In [778]:
# feature selection
from sklearn.feature_selection import RFE
# create the RFE model and select 10 attributes
gbm = lgb.LGBMClassifier()
rfe = RFE(gbm, 40)
rfe = rfe.fit(train_final[feat_cols], train_final[DV_COL])

# summarize the selection of the attributes
print(rfe.support_)

# summarize the ranking of the attributes
fea_rank_ = pd.DataFrame({'cols':feat_cols,
                          'feat_rank':rfe.ranking_})

[False  True False  True False  True  True  True  True  True  True  True
 False False False False False  True  True False  True False  True False
  True False  True  True  True  True False False False False  True  True
 False False False False False False False False False False False False
 False  True False False False  True False False False  True  True False
 False False False  True False False False False False  True False False
 False  True  True False False False False  True False False  True  True
 False  True  True False False False False False False  True False False
 False  True False False False False False False False False False False
 False False False  True  True False False False False  True  True  True
 False False False False False False False False False False False False]


AttributeError: 'DataFrame' object has no attribute 'fea_rank'

In [782]:
fea_rank_.sort_values(by=['feat_rank'], ascending = True, inplace=True)
fea_rank_.reset_index(drop=True, inplace=True)
selected_feat_cols = fea_rank_['cols'][:40]
print(len(selected_feat_cols))

40


In [783]:
MODEL_NAME = 'lgb'

In [784]:
print("Model building..")
feat_cols = [x for x in list(train_final.columns)
             if (x.startswith(FEAT_PREFIX))]
kf = KFold(n_splits=5, shuffle=True, random_state=2020)
cv_scores = []
models = []
pred_test_full = 0
pred_val_full = np.zeros(train_final.shape[0])
pred_class_val_full = np.zeros(train_final.shape[0])
count = 0
for dev_index, val_index in kf.split(train_final):
    print('Iteration: ', count+1)
    dev_X, val_X = (train_final.loc[dev_index, selected_feat_cols],
                    train_final.loc[val_index, selected_feat_cols])
    dev_y, val_y = (train_final.loc[dev_index, DV_COL],
                    train_final.loc[val_index, DV_COL])

    if MODEL_NAME == 'xgb':
        pred_val, loss, pred_test, model = runXGB(
            fmap_fn=os.path.join(DATA_DIR, 'xgb_feat_map_fname'),
            imp_fn=os.path.join(DATA_DIR, 'xgb_feat_imp_fname'),
            train_X=dev_X, train_y=dev_y, test_X=val_X, test_y=val_y,
            test_X2=test_final[selected_feat_cols], rounds=5000, dep=8,
            feature_names=selected_feat_cols)
    elif MODEL_NAME == 'lgb':
        pred_val, pred_class, loss, pred_test, model = runLGB(
            train_X=dev_X, train_y=dev_y, test_X=val_X, test_y=val_y,
            test_X2=test_final[selected_feat_cols], rounds=5000, dep=6,
            feature_names=selected_feat_cols)
    elif MODEL_NAME == 'rf':
        pred_val, pred_class, loss, pred_test, pred_test_class, model = runRF(
            train_X=dev_X, train_y=dev_y, test_X=val_X, test_y=val_y,
            test_X2=test_final[selected_feat_cols], num_trees=1000, dep=20,
            feat=0.7)
    pred_val_full[val_index] = pred_val
    pred_class_val_full[val_index] = pred_class
    pred_test_full = pred_test_full + pred_test
    cv_scores.append(loss)
    models.append(model)
    print(cv_scores)
    print('mean cv score: ', np.mean(cv_scores))
    count += 1

pred_test_full /= 5.
print(get_accuracy(train_final[DV_COL], pred_class_val_full))
train_final['pred_prob'] = pred_val_full
train_final['pred_class'] = pred_class_val_full

test_final['pred_prob'] = pred_test_full

Model building..
Iteration:  1
Training until validation scores don't improve for 300 rounds
[20]	valid_0's binary_error: 0.120952
[40]	valid_0's binary_error: 0.112381
[60]	valid_0's binary_error: 0.11
[80]	valid_0's binary_error: 0.112381
[100]	valid_0's binary_error: 0.111429
[120]	valid_0's binary_error: 0.112381
[140]	valid_0's binary_error: 0.112381
[160]	valid_0's binary_error: 0.112857
[180]	valid_0's binary_error: 0.112381
[200]	valid_0's binary_error: 0.113333
[220]	valid_0's binary_error: 0.110476
[240]	valid_0's binary_error: 0.110476
[260]	valid_0's binary_error: 0.111905
[280]	valid_0's binary_error: 0.114762
[300]	valid_0's binary_error: 0.112381
[320]	valid_0's binary_error: 0.112857
[340]	valid_0's binary_error: 0.110476
Early stopping, best iteration is:
[57]	valid_0's binary_error: 0.109048
[89.09523809523809]
mean cv score:  89.09523809523809
Iteration:  2
Training until validation scores don't improve for 300 rounds
[20]	valid_0's binary_error: 0.131429
[40]	valid_

In [785]:
print('optimal threshold - optimized for accuracy')
PROB_THRESHOLDS = [x/100. for x in range(71)]
best_threshold, max_acc = find_optimal_threshold(train_final, dv_col=DV_COL,
                                        score_col='pred_prob',
                                        prob_thresholds=PROB_THRESHOLDS,
                                        id_col=ID_COL)

optimal threshold - optimized for accuracy
0.0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


0.01
0.02
0.03
0.04
0.05
0.06
0.07
0.08
0.09
0.1
0.11
0.12
0.13
0.14
0.15
0.16
0.17
0.18
0.19
0.2
0.21
0.22
0.23
0.24
0.25
0.26
0.27
0.28
0.29
0.3
0.31
0.32
0.33
0.34
0.35
0.36
0.37
0.38
0.39
0.4
0.41
0.42
0.43
0.44
0.45
0.46
0.47
0.48
0.49
0.5
0.51
0.52
0.53
0.54
0.55
0.56
0.57
0.58
0.59
0.6
0.61
0.62
0.63
0.64
0.65
0.66
0.67
0.68
0.69
0.7


In [786]:
best_threshold, max_acc

(0.5, 87.67619047619047)

In [787]:
test_final['pred_class'] = test_final['pred_prob'].apply(
    lambda x: 1 if x >= best_threshold else 0)

In [788]:
print(test_final.shape)
test_final['pred_class'].mean()

(4500, 178)


0.16577777777777777

In [789]:
out_df = test_final[[ID_COL, 'pred_class']]
out_df[DV_COL] = out_df['pred_class'].apply(lambda x: "male" if x==1 else
                                            "female")
print(out_df.shape)
print(out_df.head())
print(out_df[DV_COL].value_counts())
print(out_df['pred_class'].value_counts())
print(out_df['pred_class'].mean())

(4500, 3)
  session_id  pred_class  gender
0     u12112           0  female
1     u19725           0  female
2     u11795           0  female
3     u22639           1    male
4     u18034           0  female
female    3754
male       746
Name: gender, dtype: int64
0    3754
1     746
Name: pred_class, dtype: int64
0.16577777777777777


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [790]:
# save
out_df.drop('pred_class', axis=1, inplace=True)
out_fn = os.path.join(DATA_DIR, "lgb_Merror_v10.csv")
out_df.to_csv(out_fn, index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)
