In [1]:
import sys; sys.path.append('..')
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
df = pd.read_csv('../data/train.csv').drop_duplicates()
df.shape

(114765, 13)

In [2]:
from custom.metrics import *
df_fp = df[df.feedback == 0]
df_tp = df[df.feedback == 1]
score = f2_score(df.feedback.values, np.ones_like(df.feedback.values))
accuracy = len(df_tp) / len(df)
accuracy, score

(0.8772012373110268, 0.9727646762121391)

In [3]:
group_columns = ['action_recommendation_id', 'action_recommendation_category', 'equipment_category', 'usage_type']
group_columns

['action_recommendation_id',
 'action_recommendation_category',
 'equipment_category',
 'usage_type']

In [4]:
groups = df[group_columns].value_counts().to_frame('total_count').merge(
    df[df.feedback == 1][group_columns].value_counts().to_frame('tp_count'),
    how='right',
    left_index=True,
    right_index=True
).merge(
    df[df.feedback == 0][group_columns].value_counts().to_frame('fp_count'),
    how='right',
    left_index=True,
    right_index=True
)
groups = groups.assign(accuracy=(1 - groups.fp_count / groups.total_count))
groups

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,total_count,tp_count,fp_count,accuracy
action_recommendation_id,action_recommendation_category,equipment_category,usage_type,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ar00000209,arc02,tp002,ut011,2858.0,2034.0,824,0.711686
ar00000293,arc02,tp002,ut011,3613.0,2994.0,619,0.828674
ar00000174,arc03,tp006,ut007,1497.0,1039.0,458,0.694055
ar00000048,arc02,tp002,ut011,1034.0,670.0,364,0.647969
ar00000293,arc02,tp007,ut011,1311.0,1019.0,292,0.777269
...,...,...,...,...,...,...,...
ar00000193,arc03,tp004,ut005,43.0,42.0,1,0.976744
ar00000193,arc03,tp004,ut008,26.0,25.0,1,0.961538
ar00000193,arc03,tp005,ut001,6.0,5.0,1,0.833333
ar00000193,arc03,tp005,ut005,15.0,14.0,1,0.933333


In [10]:
groups.index.names

FrozenList(['action_recommendation_id', 'action_recommendation_category', 'equipment_category', 'usage_type'])

In [5]:
l_bound = 0.65
h_bound = 0.95
lower_group = groups[groups.accuracy < l_bound].index
upper_group = groups[groups.accuracy >= h_bound].index
middle_group = groups[(groups.accuracy >= l_bound) & (groups.accuracy < h_bound)].index
lower_group, upper_group, middle_group

(MultiIndex([('ar00000048', 'arc02', 'tp002', 'ut011'),
             ('ar00000209', 'arc02', 'tp007', 'ut011'),
             ('ar00000174', 'arc03', 'tp012', 'ut007'),
             ('ar00000209', 'arc02', 'tp007', 'ut009'),
             ('ar00000048', 'arc02', 'ec020', 'ut011'),
             ('ar00000048', 'arc02', 'tp007', 'ut011'),
             ('ar00000220', 'arc03', 'tp011', 'ut011'),
             ('ar00000221', 'arc02', 'tp002', 'ut011'),
             ('ar00000294', 'arc02', 'tp002', 'ut011'),
             ('ar00000209', 'arc02', 'tp007', 'ut008'),
             ...
             ('ar00000183', 'arc04', 'tp007', 'ut011'),
             ('ar00000198', 'arc03', 'tp016', 'ut007'),
             ('ar00000199', 'arc01', 'tp017', 'ut009'),
             ('ar00000201', 'arc03', 'tp005', 'ut010'),
             ('ar00000201', 'arc03', 'tp005', 'ut013'),
             ('ar00000049', 'arc02', 'ec020', 'ut008'),
             ('ar00000027', 'arc03', 'tp009', 'ut005'),
             ('ar00000027', 'ar

In [6]:
def assign_group_values(x, *groups):
    group_values = []
    for i, row in enumerate(x[group_columns].values):
        group_values.append(0)
        for j, group in enumerate(groups):
            for value in group:
                if tuple(row.tolist()) == value:
                    group_values[i] = j + 1
    return group_values

In [7]:
%%time
group_values = assign_group_values(df.copy(), middle_group, lower_group, upper_group)

CPU times: user 1min, sys: 41.1 ms, total: 1min
Wall time: 1min


In [8]:
df = df.assign(group=group_values)
df

Unnamed: 0,case_id,equipment_id,completion_date,action_recommendation_id,action_recommendation_type,action_recommendation_category,equipment_area,usage_type,speed_category,load_category,floors_category,equipment_category,feedback,group
0,b7c775ad-4ebe-4848-9c53-37e7c5658e21,ele0000754,2018-10-03,ar00000174,art01,arc03,ga00000803,ut012,7,6,8,tp006,1,3
1,b177eefd-3946-4949-9699-0a91879350f9,ele0000789,2018-10-03,ar00000248,art02,arc03,ga00000269,ut005,4,6,2,tp001,1,0
2,fc2d568a-c53c-43ef-8871-a49ec22ab3b1,ele0001227,2018-10-03,ar00000158,art02,arc03,ga00000064,ut011,6,6,7,tp006,1,1
3,8e645922-1268-4c6b-ae6b-7b1605689cca,ele0001754,2018-10-03,ar00000105,art01,arc03,ga00000662,ut005,8,7,8,tp014,1,1
4,b44f10b1-9238-44db-8f0c-2d68e8c015a7,ele0002087,2018-10-03,ar00000148,art02,arc03,ga00001256,ut011,7,6,8,tp013,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115470,8eda4489-c34c-4e8a-b592-ad57b5fd842a,ele0029759,2020-01-30,ar00000124,art01,arc03,ga00000817,ut011,6,7,8,tp005,1,1
115471,0113eba6-6928-461a-b994-35a0b2eb9f4e,ele0029783,2020-01-30,ar00000291,art02,arc01,ga00001027,ut001,4,3,1,tp015,1,0
115472,c2eab0dc-218a-4a95-ab28-47449f42f660,ele0029785,2020-01-30,ar00000048,art01,arc02,ga00000355,ut011,2,1,4,tp002,1,2
115473,c2eab0dc-218a-4a95-ab28-47449f42f660,ele0029785,2020-01-30,ar00000293,art01,arc02,ga00000355,ut011,2,1,4,tp002,1,1
