In [1]:
%matplotlib inline
from utils import *
import xgboost as xgb
from time import time

Using Theano backend.


In [2]:
act_train_data = pd.read_csv("./act_train.csv", 
                             dtype={'people_id': np.str, 'activity_id': np.str, 'outcome': np.int8}, 
                             parse_dates=['date'])
act_test_data  = pd.read_csv("./act_test.csv", 
                             dtype={'people_id': np.str, 'activity_id': np.str}, parse_dates=['date'])
people_data    = pd.read_csv("./people.csv", 
                             dtype={'people_id': np.str, 'activity_id': np.str, 'char_38': np.int32}, 
                             parse_dates=['date'])

In [3]:
act_train_data[:3]

Unnamed: 0,people_id,activity_id,date,activity_category,char_1,char_2,char_3,char_4,char_5,char_6,char_7,char_8,char_9,char_10,outcome
0,ppl_100,act2_1734928,2023-08-26,type 4,,,,,,,,,,type 76,0
1,ppl_100,act2_2434093,2022-09-27,type 2,,,,,,,,,,type 1,0
2,ppl_100,act2_3404049,2022-09-27,type 2,,,,,,,,,,type 1,0


The goal of the problem is to identify whether an activity is valuable (1/0 in the column 'outcome').  
From the data description, only type 1 activity has many characters. That is, most entries in char_x are NaN (excepting char_10), i.e. char_1~9 are imbalanced.

In [4]:
print('First see if the data is balanced.\n')
print('Number of 0\'s: {0}, with {1:.3f}%'.format(len(np.where(act_train_data['outcome']==0)[0]), 100.*(1-act_train_data['outcome'].mean())) )
print('Number of 1\'s: {0}, with {1:.3f}%'.format(len(np.where(act_train_data['outcome']==1)[0]), 100.*(act_train_data['outcome'].mean())) )

First see if the data is balanced.

Number of 0's: 1221794, with 55.605%
Number of 1's: 975497, with 44.395%


In [5]:
act_train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2197291 entries, 0 to 2197290
Data columns (total 15 columns):
people_id            object
activity_id          object
date                 datetime64[ns]
activity_category    object
char_1               object
char_2               object
char_3               object
char_4               object
char_5               object
char_6               object
char_7               object
char_8               object
char_9               object
char_10              object
outcome              int8
dtypes: datetime64[ns](1), int8(1), object(13)
memory usage: 236.8+ MB


In [6]:
{col:act_train_data[col].nunique() for col in act_train_data.columns}

{'activity_category': 7,
 'activity_id': 2197291,
 'char_1': 51,
 'char_10': 6515,
 'char_2': 32,
 'char_3': 11,
 'char_4': 7,
 'char_5': 7,
 'char_6': 5,
 'char_7': 8,
 'char_8': 18,
 'char_9': 19,
 'date': 411,
 'outcome': 2,
 'people_id': 151295}

In [7]:
people_data[:3]

Unnamed: 0,people_id,char_1,group_1,char_2,date,char_3,char_4,char_5,char_6,char_7,...,char_29,char_30,char_31,char_32,char_33,char_34,char_35,char_36,char_37,char_38
0,ppl_100,type 2,group 17304,type 2,2021-06-29,type 5,type 5,type 5,type 3,type 11,...,False,True,True,False,False,True,True,True,False,36
1,ppl_100002,type 2,group 8688,type 3,2021-01-06,type 28,type 9,type 5,type 3,type 11,...,False,True,True,True,True,True,True,True,False,76
2,ppl_100003,type 2,group 33592,type 3,2022-06-10,type 4,type 8,type 5,type 2,type 5,...,False,False,True,True,True,True,False,True,True,99


In [8]:
people_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 189118 entries, 0 to 189117
Data columns (total 41 columns):
people_id    189118 non-null object
char_1       189118 non-null object
group_1      189118 non-null object
char_2       189118 non-null object
date         189118 non-null datetime64[ns]
char_3       189118 non-null object
char_4       189118 non-null object
char_5       189118 non-null object
char_6       189118 non-null object
char_7       189118 non-null object
char_8       189118 non-null object
char_9       189118 non-null object
char_10      189118 non-null bool
char_11      189118 non-null bool
char_12      189118 non-null bool
char_13      189118 non-null bool
char_14      189118 non-null bool
char_15      189118 non-null bool
char_16      189118 non-null bool
char_17      189118 non-null bool
char_18      189118 non-null bool
char_19      189118 non-null bool
char_20      189118 non-null bool
char_21      189118 non-null bool
char_22      189118 non-null bool
char_23

In [9]:
[people_data[col].nunique() for col in people_data.columns]

[189118,
 2,
 34224,
 3,
 1196,
 43,
 25,
 9,
 7,
 25,
 8,
 9,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 101]

In [10]:
print("Train data shape: " + format(act_train_data.shape))
print("Test data shape: " + format(act_test_data.shape))
print("People data shape: " + format(people_data.shape))

Train data shape: (2197291, 15)
Test data shape: (498687, 14)
People data shape: (189118, 41)


The data contains 189,118 people and 2,197,291 activities.

### Merge people file and activity file

In [11]:
def act_data_treatment(df):
    for col in list(df.columns):
        if col not in ['people_id', 'activity_id', 'date', 'char_38', 'outcome']:
            if df[col].dtype == 'object':
                # regard NA as a category
                df[col].fillna('type 0', inplace=True)
                df[col] = df[col].apply(lambda x: x.split(' ')[1]).astype(np.int32)
            elif df[col].dtype == 'bool':
                # change binary feature to type int (0/1)
                df[col] = df[col].astype(np.int8)
    
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month
    df['day'] = df['date'].dt.day
    df['isweekend'] = (df['date'].dt.weekday >= 5).astype('int8')
    df = df.drop('date', axis=1)
    
    return df

In [13]:
act_train_data = act_data_treatment(act_train_data)
act_test_data  = act_data_treatment(act_test_data)
people_data    = act_data_treatment(people_data)

In [14]:
print("Train data shape: " + format(act_train_data.shape))
print("Test data shape: " + format(act_test_data.shape))
print("People data shape: " + format(people_data.shape))

Train data shape: (2197291, 18)
Test data shape: (498687, 17)
People data shape: (189118, 44)


In [16]:
act_train_data.columns

Index([u'people_id', u'activity_id', u'activity_category', u'char_1',
       u'char_2', u'char_3', u'char_4', u'char_5', u'char_6', u'char_7',
       u'char_8', u'char_9', u'char_10', u'outcome', u'year', u'month', u'day',
       u'isweekend'],
      dtype='object')

In [15]:
[act_train_data[col].nunique() for col in act_train_data.columns]

[151295, 2197291, 7, 52, 33, 12, 8, 8, 6, 9, 19, 20, 6516, 2, 2, 12, 31, 2]

In [17]:
act_train_data[:10]

Unnamed: 0,people_id,activity_id,activity_category,char_1,char_2,char_3,char_4,char_5,char_6,char_7,char_8,char_9,char_10,outcome,year,month,day,isweekend
0,ppl_100,act2_1734928,4,0,0,0,0,0,0,0,0,0,76,0,2023,8,26,1
1,ppl_100,act2_2434093,2,0,0,0,0,0,0,0,0,0,1,0,2022,9,27,0
2,ppl_100,act2_3404049,2,0,0,0,0,0,0,0,0,0,1,0,2022,9,27,0
3,ppl_100,act2_3651215,2,0,0,0,0,0,0,0,0,0,1,0,2023,8,4,0
4,ppl_100,act2_4109017,2,0,0,0,0,0,0,0,0,0,1,0,2023,8,26,1
5,ppl_100,act2_898576,4,0,0,0,0,0,0,0,0,0,1727,0,2023,8,4,0
6,ppl_100002,act2_1233489,2,0,0,0,0,0,0,0,0,0,1,1,2022,11,23,0
7,ppl_100002,act2_1623405,2,0,0,0,0,0,0,0,0,0,1,1,2022,11,23,0
8,ppl_100003,act2_1111598,2,0,0,0,0,0,0,0,0,0,1,1,2023,2,7,0
9,ppl_100003,act2_1177453,2,0,0,0,0,0,0,0,0,0,1,1,2023,6,28,0


In [18]:
act_train_data.loc[act_train_data['activity_category']==1][:10]

Unnamed: 0,people_id,activity_id,activity_category,char_1,char_2,char_3,char_4,char_5,char_6,char_7,char_8,char_9,char_10,outcome,year,month,day,isweekend
52,ppl_100025,act1_9923,1,3,5,1,1,6,3,3,6,8,0,0,2022,11,25,0
105,ppl_100033,act1_198174,1,36,11,5,1,6,1,1,4,1,0,0,2022,7,26,0
106,ppl_100033,act1_214090,1,24,6,6,3,1,3,4,5,1,0,0,2023,6,15,0
107,ppl_100033,act1_230588,1,2,2,3,3,5,2,2,4,2,0,0,2023,2,28,0
108,ppl_100033,act1_271874,1,2,5,3,2,6,1,1,6,8,0,0,2022,7,26,0
124,ppl_100035,act1_104259,1,5,2,7,3,1,3,5,4,7,0,1,2023,7,28,0
125,ppl_100035,act1_188526,1,5,2,8,3,1,2,6,9,13,0,1,2023,2,3,0
126,ppl_100035,act1_212220,1,3,2,8,3,1,2,3,9,13,0,1,2023,2,2,0
127,ppl_100035,act1_313621,1,5,2,8,3,1,2,2,9,13,0,1,2023,2,3,0
128,ppl_100035,act1_336085,1,5,2,8,3,1,2,2,9,13,0,1,2023,2,3,0


In [19]:
train = act_train_data.merge(people_data, on='people_id', how='left')
test  = act_test_data.merge(people_data, on='people_id', how='left')
act_id = act_test_data['activity_id']

del act_train_data, act_test_data, people_data

In [20]:
print("Train data shape: " + format(train.shape))
print("Test data shape: " + format(test.shape))
train.columns

Train data shape: (2197291, 61)
Test data shape: (498687, 60)


Index([u'people_id', u'activity_id', u'activity_category', u'char_1_x',
       u'char_2_x', u'char_3_x', u'char_4_x', u'char_5_x', u'char_6_x',
       u'char_7_x', u'char_8_x', u'char_9_x', u'char_10_x', u'outcome',
       u'year_x', u'month_x', u'day_x', u'isweekend_x', u'char_1_y',
       u'group_1', u'char_2_y', u'char_3_y', u'char_4_y', u'char_5_y',
       u'char_6_y', u'char_7_y', u'char_8_y', u'char_9_y', u'char_10_y',
       u'char_11', u'char_12', u'char_13', u'char_14', u'char_15', u'char_16',
       u'char_17', u'char_18', u'char_19', u'char_20', u'char_21', u'char_22',
       u'char_23', u'char_24', u'char_25', u'char_26', u'char_27', u'char_28',
       u'char_29', u'char_30', u'char_31', u'char_32', u'char_33', u'char_34',
       u'char_35', u'char_36', u'char_37', u'char_38', u'year_y', u'month_y',
       u'day_y', u'isweekend_y'],
      dtype='object')

In [24]:
%%time
{col:train[col].nunique() for col in train.columns}

CPU times: user 1.44 s, sys: 876 ms, total: 2.31 s
Wall time: 2.34 s


{'activity_category': 7,
 'activity_id': 2197291,
 'char_10_x': 6516,
 'char_10_y': 2,
 'char_11': 2,
 'char_12': 2,
 'char_13': 2,
 'char_14': 2,
 'char_15': 2,
 'char_16': 2,
 'char_17': 2,
 'char_18': 2,
 'char_19': 2,
 'char_1_x': 52,
 'char_1_y': 2,
 'char_20': 2,
 'char_21': 2,
 'char_22': 2,
 'char_23': 2,
 'char_24': 2,
 'char_25': 2,
 'char_26': 2,
 'char_27': 2,
 'char_28': 2,
 'char_29': 2,
 'char_2_x': 33,
 'char_2_y': 3,
 'char_30': 2,
 'char_31': 2,
 'char_32': 2,
 'char_33': 2,
 'char_34': 2,
 'char_35': 2,
 'char_36': 2,
 'char_37': 2,
 'char_38': 101,
 'char_3_x': 12,
 'char_3_y': 43,
 'char_4_x': 8,
 'char_4_y': 25,
 'char_5_x': 8,
 'char_5_y': 9,
 'char_6_x': 6,
 'char_6_y': 7,
 'char_7_x': 9,
 'char_7_y': 25,
 'char_8_x': 19,
 'char_8_y': 8,
 'char_9_x': 20,
 'char_9_y': 9,
 'day_x': 31,
 'day_y': 31,
 'group_1': 29899,
 'isweekend_x': 2,
 'isweekend_y': 2,
 'month_x': 12,
 'month_y': 12,
 'outcome': 2,
 'people_id': 151295,
 'year_x': 2,
 'year_y': 4}

In [25]:
%%time
train.apply(lambda x: x.nunique(), axis=0)  # slower

CPU times: user 9.68 s, sys: 5.7 s, total: 15.4 s
Wall time: 17 s


people_id             151295
activity_id          2197291
activity_category          7
char_1_x                  52
char_2_x                  33
char_3_x                  12
char_4_x                   8
char_5_x                   8
char_6_x                   6
char_7_x                   9
char_8_x                  19
char_9_x                  20
char_10_x               6516
outcome                    2
year_x                     2
month_x                   12
day_x                     31
isweekend_x                2
char_1_y                   2
group_1                29899
char_2_y                   3
char_3_y                  43
char_4_y                  25
char_5_y                   9
char_6_y                   7
char_7_y                  25
char_8_y                   8
char_9_y                   9
char_10_y                  2
char_11                    2
                      ...   
char_13                    2
char_14                    2
char_15                    2
char_16       

In [26]:
%%time
{col:test[col].nunique() for col in test.columns}

CPU times: user 427 ms, sys: 251 ms, total: 678 ms
Wall time: 677 ms


{'activity_category': 7,
 'activity_id': 498687,
 'char_10_x': 3962,
 'char_10_y': 2,
 'char_11': 2,
 'char_12': 2,
 'char_13': 2,
 'char_14': 2,
 'char_15': 2,
 'char_16': 2,
 'char_17': 2,
 'char_18': 2,
 'char_19': 2,
 'char_1_x': 49,
 'char_1_y': 2,
 'char_20': 2,
 'char_21': 2,
 'char_22': 2,
 'char_23': 2,
 'char_24': 2,
 'char_25': 2,
 'char_26': 2,
 'char_27': 2,
 'char_28': 2,
 'char_29': 2,
 'char_2_x': 32,
 'char_2_y': 3,
 'char_30': 2,
 'char_31': 2,
 'char_32': 2,
 'char_33': 2,
 'char_34': 2,
 'char_35': 2,
 'char_36': 2,
 'char_37': 2,
 'char_38': 101,
 'char_3_x': 12,
 'char_3_y': 41,
 'char_4_x': 8,
 'char_4_y': 25,
 'char_5_x': 7,
 'char_5_y': 9,
 'char_6_x': 6,
 'char_6_y': 7,
 'char_7_x': 9,
 'char_7_y': 25,
 'char_8_x': 19,
 'char_8_y': 8,
 'char_9_x': 20,
 'char_9_y': 9,
 'day_x': 31,
 'day_y': 31,
 'group_1': 11640,
 'isweekend_x': 2,
 'isweekend_y': 2,
 'month_x': 12,
 'month_y': 12,
 'people_id': 37823,
 'year_x': 2,
 'year_y': 4}

Features 'group_1' and 'char_10_x' are with the most number of categories.

In [33]:
# see intersection between train/test
len(set(train['group_1']) & set(test['group_1']))

7315

In [34]:
len(set(train['char_10_x']) & set(test['char_10_x']))

3508

In [21]:
train['people_id'].nunique()

151295

In [22]:
test['people_id'].nunique()

37823

train/test split by people_id

### Some observations

Now let us observe the number of samples for each category. Take feature 'group_1' first.

In [45]:
%%time
num2idx = [(len(np.where(train['group_1']==i)[0]),i) for i in train['group_1'].unique()]

CPU times: user 1min 2s, sys: 275 ms, total: 1min 3s
Wall time: 1min 3s


In [40]:
idx2num = [(i,n) for n,i in sorted(num2idx)]

In [54]:
print(idx2num[23950:24000])

[(30320, 50), (31494, 50), (31570, 50), (31591, 50), (31653, 50), (32075, 50), (32099, 50), (32131, 50), (33547, 50), (34014, 50), (36759, 50), (36890, 50), (37155, 50), (37380, 50), (37477, 50), (37648, 50), (38745, 50), (39519, 50), (39547, 50), (39653, 50), (39771, 50), (40015, 50), (40599, 50), (40828, 50), (41062, 50), (41613, 50), (45346, 50), (48069, 50), (40, 51), (524, 51), (640, 51), (758, 51), (1417, 51), (1639, 51), (1705, 51), (2061, 51), (2771, 51), (2947, 51), (3254, 51), (3768, 51), (4993, 51), (5211, 51), (5900, 51), (6140, 51), (7446, 51), (8542, 51), (8584, 51), (10330, 51), (10893, 51), (11952, 51)]


In [41]:
print(idx2num[-10:])

[(1482, 3272), (7256, 3759), (450, 3867), (17899, 4767), (9280, 5492), (20472, 7053), (8386, 7929), (667, 15865), (27940, 53668), (17304, 799125)]


In [62]:
%%time
cat_with_zeros = []
cat_with_ones = []
for i,n in idx2num[24000:]:
    if train['outcome'].loc[train['group_1']==i].sum() == 0:
        cat_with_zeros.append(i)
    if train['outcome'].loc[train['group_1']==i].sum() == n:
        cat_with_ones.append(i)

CPU times: user 33.3 s, sys: 242 ms, total: 33.6 s
Wall time: 33.7 s


In [64]:
[dict(idx2num)[i] for i in cat_with_zeros[-10:]]

[672, 815, 817, 859, 967, 1003, 1061, 1184, 1399, 799125]

In [66]:
[dict(idx2num)[i] for i in cat_with_ones[-10:]]

[2874, 2967, 2968, 3007, 3759, 3867, 4767, 5492, 7929, 15865]

Or

In [35]:
idx2num = train.groupby('group_1')['outcome'].count().sort_values(ascending=False)
idx2num.name = 'count'
idx2num

group_1
17304    799125
27940     53668
667       15865
8386       7929
20472      7053
9280       5492
17899      4767
450        3867
7256       3759
1482       3272
9702       3094
1270       3007
3598       2968
142        2967
15723      2874
19048      2631
3229       2597
11143      2586
7124       2574
12079      2538
5393       2447
1816       2401
1490       2355
5149       2263
17544      2177
418        2166
88         2096
3250       1917
11659      1892
3461       1770
          ...  
22816         1
22861         1
22885         1
22889         1
23017         1
23026         1
23062         1
22586         1
22546         1
21945         1
22240         1
21997         1
22060         1
22092         1
22096         1
22097         1
22124         1
22205         1
22228         1
22249         1
22499         1
22250         1
22251         1
22272         1
22295         1
22304         1
22405         1
22413         1
22473         1
23401         1
Name: count, dty

In [36]:
idx2outcome = train.groupby('group_1')['outcome'].mean()
idx2outcome.name = 'mean'
idx2outcome

group_1
1        0.000000
2        1.000000
3        1.000000
4        1.000000
5        1.000000
6        1.000000
7        0.878378
8        1.000000
9        1.000000
10       0.153846
11       1.000000
12       0.000000
13       1.000000
14       0.000000
17       0.973684
20       0.852273
21       1.000000
24       1.000000
25       1.000000
26       1.000000
27       1.000000
28       1.000000
29       1.000000
30       1.000000
31       1.000000
32       1.000000
33       1.000000
34       1.000000
35       0.973333
36       1.000000
           ...   
51396    1.000000
51399    1.000000
51403    1.000000
51405    0.600000
51409    1.000000
51414    0.000000
51415    1.000000
51416    1.000000
51417    1.000000
51422    1.000000
51424    0.000000
51428    0.000000
51429    1.000000
51434    1.000000
51435    1.000000
51437    1.000000
51439    1.000000
51441    1.000000
51442    1.000000
51444    1.000000
51446    0.000000
51448    1.000000
51449    1.000000
51450    1.000000
51

In [39]:
df = pd.concat([idx2num, idx2outcome], axis=1).sort_values('count', axis=0, ascending=False)
df

Unnamed: 0_level_0,count,mean
group_1,Unnamed: 1_level_1,Unnamed: 2_level_1
17304,799125,0.000000
27940,53668,0.000224
667,15865,1.000000
8386,7929,1.000000
20472,7053,0.999858
9280,5492,1.000000
17899,4767,1.000000
450,3867,1.000000
7256,3759,1.000000
1482,3272,0.991137


We can see the category '17304' of 'group_1' are with its 799125 samples all zeros, which is a huge part with ~30% of all train data.

Now try 'char_10_x'.

In [46]:
%%time
num2idx = [(len(np.where(train['char_10_x']==i)[0]), i) for i in train['char_10_x'].unique()]

CPU times: user 15.3 s, sys: 165 ms, total: 15.5 s
Wall time: 15.7 s


In [47]:
idx2num = [(i,n) for n,i in sorted(num2idx)]

In [48]:
print(idx2num[4850:4900])

[(1221, 50), (1378, 50), (1457, 50), (2760, 50), (3200, 50), (3771, 50), (3912, 50), (4174, 50), (4271, 50), (4688, 50), (5170, 50), (79, 51), (632, 51), (688, 51), (929, 51), (1033, 51), (2164, 51), (2474, 51), (2517, 51), (2660, 51), (2670, 51), (2706, 51), (2729, 51), (3517, 51), (3563, 51), (3696, 51), (4181, 51), (4308, 51), (4850, 51), (5000, 51), (7251, 51), (7312, 51), (7611, 51), (7949, 51), (8205, 51), (8421, 51), (188, 52), (806, 52), (870, 52), (926, 52), (1082, 52), (1934, 52), (1936, 52), (2501, 52), (2531, 52), (2596, 52), (3526, 52), (3672, 52), (3859, 52), (4206, 52)]


In [49]:
print(idx2num[-10:])

[(433, 17282), (481, 18019), (52, 19515), (489, 23471), (452, 23513), (61, 35417), (2, 116191), (0, 157615), (23, 200408), (1, 904683)]


In [51]:
%%time
cat_with_zeros = []
cat_with_ones = []
for i,n in idx2num[4900:]:
    if train['outcome'].loc[train['char_10_x']==i].sum() == 0:
        cat_with_zeros.append(i)
    if train['outcome'].loc[train['char_10_x']==i].sum() == n:
        cat_with_ones.append(i)

CPU times: user 8.55 s, sys: 1.11 s, total: 9.66 s
Wall time: 9.66 s


In [52]:
[dict(idx2num)[i] for i in cat_with_zeros[-10:]]

[183, 195, 271, 293, 330, 399, 399, 459, 471, 1061]

In [53]:
[dict(idx2num)[i] for i in cat_with_ones[-10:]]

[93, 99, 104, 104, 106, 107, 108, 147, 173, 504]

Or

In [40]:
idx2num = train.groupby('char_10_x')['outcome'].count().sort_values(ascending=False)
idx2num.name = 'count'
idx2num

char_10_x
1       904683
23      200408
0       157615
2       116191
61       35417
452      23513
489      23471
52       19515
481      18019
433      17282
8        16112
3        14139
450      12824
649      11630
899      11427
400      10569
464      10368
55        8072
248       7860
257       7349
420       6719
201       6574
297       5145
600       4998
1058      4993
143       4760
1069      4624
110       4253
230       3875
1251      3798
         ...  
1436         1
3132         1
4420         1
7268         1
3450         1
4556         1
4964         1
4405         1
3436         1
2881         1
7876         1
6757         1
3437         1
7886         1
3438         1
3439         1
7892         1
7893         1
7297         1
2560         1
4952         1
7898         1
3441         1
2556         1
4401         1
7904         1
7905         1
7907         1
7294         1
6952         1
Name: count, dtype: int64

In [41]:
idx2outcome = train.groupby('char_10_x')['outcome'].mean()
idx2outcome.name = 'mean'
idx2outcome

char_10_x
0       0.411325
1       0.510324
2       0.434569
3       0.413396
4       0.004622
5       0.041204
6       0.041084
7       0.009259
8       0.276192
9       0.496203
10      0.252078
11      0.329949
12      0.211009
13      0.210526
14      0.157895
15      0.540741
16      0.565217
17      0.190000
18      0.291339
19      0.241379
20      0.350254
21      0.531434
22      0.396624
23      0.046231
24      0.010217
25      0.500000
26      0.555755
27      0.020921
28      0.020115
29      0.007105
          ...   
9181    0.000000
9182    0.000000
9183    0.000000
9187    1.000000
9188    1.000000
9189    0.000000
9192    1.000000
9193    1.000000
9194    1.000000
9199    0.000000
9202    1.000000
9204    1.000000
9205    1.000000
9207    1.000000
9208    0.000000
9210    1.000000
9214    1.000000
9215    0.000000
9218    1.000000
9219    1.000000
9220    1.000000
9226    1.000000
9227    1.000000
9228    0.000000
9235    1.000000
9237    0.000000
9241    0.000000
9243

In [42]:
df = pd.concat([idx2num, idx2outcome], axis=1).sort_values('count', axis=0, ascending=False)
df

Unnamed: 0_level_0,count,mean
char_10_x,Unnamed: 1_level_1,Unnamed: 2_level_1
1,904683,0.510324
23,200408,0.046231
0,157615,0.411325
2,116191,0.434569
61,35417,0.513906
452,23513,0.505635
489,23471,0.530740
52,19515,0.446631
481,18019,0.633276
433,17282,0.503298


The most number of samples of one category in 'char_10_x' with all 1's or 0's is 1061, which is rather small part of the data.

Now try 'activity_category'

In [57]:
train.groupby('activity_category')['outcome'].size()  # or .count()

activity_category
1    157615
2    904683
3    429408
4    207465
5    490710
6      4253
7      3157
dtype: int64

In [79]:
print('Percentage of zeros')
print(1-train.groupby('activity_category')['outcome'].mean())
print('\nPercentage of ones')
print(train.groupby('activity_category')['outcome'].mean())

Percentage of zeros
activity_category
1    0.588675
2    0.489676
3    0.748011
4    0.510795
5    0.519757
6    0.444157
7    0.600253
Name: outcome, dtype: float64

Percentage of ones
activity_category
1    0.411325
2    0.510324
3    0.251989
4    0.489205
5    0.480243
6    0.555843
7    0.399747
Name: outcome, dtype: float64


No category's with all 1's or 0's.

In [54]:
train[['people_id', 'activity_category', 'char_1_x', 'char_2_x', 'char_3_x', 'char_4_x', 'char_5_x', \
       'char_6_x', 'char_7_x', 'char_8_x', 'char_9_x', 'char_10_x', \
       'outcome']].loc[train['activity_category']==1][:100]

Unnamed: 0,people_id,activity_category,char_1_x,char_2_x,char_3_x,char_4_x,char_5_x,char_6_x,char_7_x,char_8_x,char_9_x,char_10_x,outcome
52,ppl_100025,1,3,5,1,1,6,3,3,6,8,0,0
105,ppl_100033,1,36,11,5,1,6,1,1,4,1,0,0
106,ppl_100033,1,24,6,6,3,1,3,4,5,1,0,0
107,ppl_100033,1,2,2,3,3,5,2,2,4,2,0,0
108,ppl_100033,1,2,5,3,2,6,1,1,6,8,0,0
124,ppl_100035,1,5,2,7,3,1,3,5,4,7,0,1
125,ppl_100035,1,5,2,8,3,1,2,6,9,13,0,1
126,ppl_100035,1,3,2,8,3,1,2,3,9,13,0,1
127,ppl_100035,1,5,2,8,3,1,2,2,9,13,0,1
128,ppl_100035,1,5,2,8,3,1,2,2,9,13,0,1


In [55]:
train[['people_id', 'activity_category', 'char_10_x', 'outcome']].loc[train['activity_category']==2][:100]

Unnamed: 0,people_id,activity_category,char_10_x,outcome
1,ppl_100,2,1,0
2,ppl_100,2,1,0
3,ppl_100,2,1,0
4,ppl_100,2,1,0
6,ppl_100002,2,1,1
7,ppl_100002,2,1,1
8,ppl_100003,2,1,1
9,ppl_100003,2,1,1
12,ppl_100003,2,1,1
15,ppl_100003,2,1,1


Now see isweekend features 'isweekend_x' and 'isweekend_y'

In [77]:
print('Percentage of zeros')
print(1-train.groupby('isweekend_x')['outcome'].mean())
print('\nPercentage of ones')
print(train.groupby('isweekend_x')['outcome'].mean())

Percentage of zeros
isweekend_x
0    0.556175
1    0.555624
Name: outcome, dtype: float64

Percentage of ones
isweekend_x
0    0.443825
1    0.444376
Name: outcome, dtype: float64


In [78]:
print('Percentage of zeros')
print(1-train.groupby('isweekend_y')['outcome'].mean())
print('\nPercentage of ones')
print(train.groupby('isweekend_y')['outcome'].mean())

Percentage of zeros
isweekend_y
0    0.539729
1    0.613425
Name: outcome, dtype: float64

Percentage of ones
isweekend_y
0    0.460271
1    0.386575
Name: outcome, dtype: float64


### Try something
From the above we found one category '17304' of 'group_1' is with obvious signal, that all corresponding outcome are 0's. So we could make a guess for the outcome of the testset (with 0's of the category, yet others with 1's).

In [37]:
pred = np.ones(test.shape[0])
pred[np.where(test['group_1']==17304)[0]] = 0
submit = pd.DataFrame({'activity_id': act_id, 'outcome': pred})
submit.to_csv('group_1_guess.csv', index=False)

We got test AUC score ~0.828, a not bad result!

In [56]:
trn_data = train.loc[train['group_1']!=17304]

In [57]:
trn_data.shape

(1398166, 61)

In [58]:
trn_data['group_1'].nunique()

29898

In [83]:
print('Number of 0\'s: {0}, with {1:.3f}%'.format(len(np.where(trn_data['outcome']==0)[0]), 100.*(1-trn_data['outcome'].mean())) )
print('Number of 1\'s: {0}, with {1:.3f}%'.format(len(np.where(trn_data['outcome']==1)[0]), 100.*(trn_data['outcome'].mean())) )

Number of 0's: 422669, with 30.230%
Number of 1's: 975497, with 69.770%


Now we can see it's a relatively unbalanced data.

In [59]:
tt_data = test.loc[test['group_1']!=17304]

In [60]:
tt_data.shape

(333083, 60)

In [61]:
tt_data['group_1'].nunique()

11639

Now try to see 'char_10_x' again.

In [62]:
%%time
num2idx = [(len(np.where(trn_data['char_10_x']==i)[0]), i) for i in trn_data['char_10_x'].unique()]

CPU times: user 7.62 s, sys: 72.2 ms, total: 7.69 s
Wall time: 7.71 s


In [63]:
idx2num = [(i,n) for n,i in sorted(num2idx)]

In [64]:
print(idx2num[4200:4250])

[(1086, 51), (1242, 51), (1617, 51), (2396, 51), (3534, 51), (4865, 51), (5000, 51), (5450, 51), (5681, 51), (7251, 51), (7312, 51), (7611, 51), (7949, 51), (8205, 51), (8421, 51), (24, 52), (28, 52), (705, 52), (1095, 52), (1208, 52), (1439, 52), (1575, 52), (1884, 52), (1887, 52), (2223, 52), (3052, 52), (4056, 52), (4206, 52), (7054, 52), (7408, 52), (90, 53), (316, 53), (342, 53), (357, 53), (603, 53), (740, 53), (804, 53), (960, 53), (1092, 53), (1605, 53), (1750, 53), (1894, 53), (2308, 53), (2472, 53), (2483, 53), (2520, 53), (2853, 53), (4782, 53), (4937, 53), (5206, 53)]


In [65]:
print(idx2num[-10:])

[(433, 11858), (52, 12102), (481, 14085), (452, 16390), (489, 16736), (61, 25552), (23, 63997), (2, 65435), (0, 90293), (1, 648115)]


In [66]:
%%time
cat_with_zeros = []
cat_with_ones = []
for i,n in idx2num[4900:]:
    if trn_data['outcome'].loc[trn_data['char_10_x']==i].sum() == 0:
        cat_with_zeros.append(i)
    if trn_data['outcome'].loc[trn_data['char_10_x']==i].sum() == n:
        cat_with_ones.append(i)

CPU times: user 1.81 s, sys: 25.1 ms, total: 1.84 s
Wall time: 1.84 s


In [67]:
[dict(idx2num)[i] for i in cat_with_zeros[-10:]]

[183, 331]

In [68]:
[dict(idx2num)[i] for i in cat_with_ones[-10:]]

[158, 160, 173, 206, 306, 504]

Now also see 'activity_category' again.

In [38]:
trn_data.groupby('activity_category')['outcome'].size()  # or .count()

activity_category
1     90293
2    648115
3    198150
4    140769
5    315849
6      3248
7      1742
dtype: int64

In [82]:
print('Percentage of zeros')
print(1-trn_data.groupby('activity_category')['outcome'].mean())
print('\nPercentage of ones')
print(trn_data.groupby('activity_category')['outcome'].mean())

Percentage of zeros
activity_category
1    0.281993
2    0.287656
3    0.453919
4    0.279010
5    0.253884
6    0.272167
7    0.275545
Name: outcome, dtype: float64

Percentage of ones
activity_category
1    0.718007
2    0.712344
3    0.546081
4    0.720990
5    0.746116
6    0.727833
7    0.724455
Name: outcome, dtype: float64


In [69]:
trn_data[['people_id', 'activity_category', 'char_10_x', 'outcome']].loc[trn_data['activity_category']==2][:100]

Unnamed: 0,people_id,activity_category,char_10_x,outcome
6,ppl_100002,2,1,1
7,ppl_100002,2,1,1
8,ppl_100003,2,1,1
9,ppl_100003,2,1,1
12,ppl_100003,2,1,1
15,ppl_100003,2,1,1
16,ppl_100003,2,1,1
17,ppl_100003,2,1,1
18,ppl_100003,2,1,1
20,ppl_100003,2,1,1


Now see isweekend features 'isweekend_x' and 'isweekend_y'

In [91]:
print('Percentage of zeros')
print(1-trn_data.groupby('isweekend_x')['outcome'].mean())
print('\nPercentage of ones')
print(trn_data.groupby('isweekend_x')['outcome'].mean())

Percentage of zeros
isweekend_x
0    0.283734
1    0.356614
Name: outcome, dtype: float64

Percentage of ones
isweekend_x
0    0.716266
1    0.643386
Name: outcome, dtype: float64


In [92]:
print('Percentage of zeros')
print(1-trn_data.groupby('isweekend_y')['outcome'].mean())
print('\nPercentage of ones')
print(trn_data.groupby('isweekend_y')['outcome'].mean())

Percentage of zeros
isweekend_y
0    0.309054
1    0.272541
Name: outcome, dtype: float64

Percentage of ones
isweekend_y
0    0.690946
1    0.727459
Name: outcome, dtype: float64


Conclusions:
1. Train/test split by people_id
2. Outcomes of group_1 category 17304 tend to be with all 0's.
3. The outcomes of the rest data tend to be with more 1's.