### feature selection

In [1]:
import os, sys
import pandas
import numpy as np
# Directory that holds the competition CSV files. It can be overridden with the
# FACEBOOK_RESOURCE_DIR environment variable so the notebook runs on other
# machines without editing this cell; the original absolute path stays the default.
# os.path.join(..., "") guarantees a trailing separator because later cells build
# file names by plain string concatenation (resource_dir + "train.csv").
resource_dir = os.path.join(
    os.environ.get(
        "FACEBOOK_RESOURCE_DIR",
        "/home/hadoop/deepy/secrets/facebook/resources/",
    ),
    "",
)

---

In [2]:
# Read the labelled bidder table and keep an independent NumPy copy of its values.
# Later cells index it by position: column 0 is matched against the bidder column
# of the bids table and column 3 carries the 0.0 / 1.0 label.
train_csv_path = os.path.join(resource_dir, "train.csv")
train_raw = pandas.read_csv(train_csv_path).values.copy()

In [3]:
# Split the bidder identifiers (column 0) by the label in column 3:
# 0.0 selects the "good" group, 1.0 the "bad" group.
train_labels = train_raw[:, 3]
train_bidder_ids = train_raw[:, 0]
good_bidders = train_bidder_ids[train_labels == 0.0]
bad_bidders = train_bidder_ids[train_labels == 1.0]

In [4]:
# Read the bids table and keep an independent NumPy copy of its values.
# Later cells index it by position: column 1 is matched against the bidder ids
# taken from the training table, and other columns are counted as features.
bids_csv_path = os.path.join(resource_dir, "bids.csv")
bid_raw = pandas.read_csv(bids_csv_path).values.copy()

In [5]:
# Row indices of bids whose bidder id (bids column 1) belongs to a good bidder.
# A set gives constant-time membership checks; `in` against the NumPy array
# rescanned every known bidder for each bid row. enumerate() also replaces the
# Python-2-only xrange(), so the cell runs on Python 2 and 3.
good_bidder_ids = set(good_bidders)
good_bids = [row_idx for row_idx, bid in enumerate(bid_raw) if bid[1] in good_bidder_ids]

In [6]:
# Row indices of bids whose bidder id (bids column 1) belongs to a bad bidder.
# A set gives constant-time membership checks; `in` against the NumPy array
# rescanned every known bidder for each bid row. enumerate() also replaces the
# Python-2-only xrange(), so the cell runs on Python 2 and 3.
bad_bidder_ids = set(bad_bidders)
bad_bids = [row_idx for row_idx, bid in enumerate(bid_raw) if bid[1] in bad_bidder_ids]

In [33]:
from collections import Counter

def get_feature_counters(feature_n, bids=None, good_rows=None, bad_rows=None):
    """Count the values of one bid column separately for good and bad bids.

    Args:
        feature_n: Positional index of the column to count within each bid row.
        bids: Indexable collection of bid rows. Defaults to the notebook-level
            ``bid_raw`` array.
        good_rows: Row indices (into ``bids``) of bids placed by good bidders.
            Defaults to the notebook-level ``good_bids`` list.
        bad_rows: Row indices (into ``bids``) of bids placed by bad bidders.
            Defaults to the notebook-level ``bad_bids`` list.

    Returns:
        A ``(good_counter, bad_counter)`` tuple of ``collections.Counter``
        objects mapping each observed column value to the number of selected
        rows that contain it.
    """
    # Fall back to the notebook globals only when no explicit data is supplied,
    # so existing one-argument calls keep working unchanged.
    if bids is None:
        bids = bid_raw
    if good_rows is None:
        good_rows = good_bids
    if bad_rows is None:
        bad_rows = bad_bids

    # Feed generators straight into Counter: no throwaway lists holding one
    # element per bid row.
    good_counter = Counter(bids[i][feature_n] for i in good_rows)
    bad_counter = Counter(bids[i][feature_n] for i in bad_rows)
    return good_counter, bad_counter

def get_feature_diff(good_counter, bad_counter):
    """Rank feature values by how differently they are shared between bad and good bids.

    For every value present in ``bad_counter`` the function computes its share
    of all bad counts and its share of all good counts, then records the
    difference ``bad_ratio - good_ratio``.

    Note: values that occur only in ``good_counter`` are not reported, because
    only the keys of ``bad_counter`` are iterated.

    Args:
        good_counter: ``collections.Counter`` of feature values for good bids.
        bad_counter: ``collections.Counter`` of feature values for bad bids.

    Returns:
        A list of ``(abs_diff, diff, value)`` tuples sorted in descending tuple
        order, i.e. largest absolute difference first. A positive ``diff``
        means the value is relatively more frequent among bad bids.
    """
    good_sum, bad_sum = sum(good_counter.values()), sum(bad_counter.values())
    diffs = []
    for k, bad_v in bad_counter.most_common():
        good_v = good_counter.get(k, 0)
        # A counter whose total is 0 contributes a share of 0.0 instead of
        # raising ZeroDivisionError (e.g. no good bids were matched at all).
        bad_ratio = float(bad_v) / bad_sum if bad_sum else 0.0
        good_ratio = float(good_v) / good_sum if good_sum else 0.0
        diff = bad_ratio - good_ratio
        diffs.append((abs(diff), diff, k))
    diffs.sort(reverse=True)
    return diffs

In [46]:
# Country (bids column 6): rank values by the gap between their share of bad
# bids and their share of good bids.
country_good, country_bad = get_feature_counters(6)
diffs = get_feature_diff(country_good, country_bad)
diffs

[(0.1340413547843862, 0.1340413547843862, 'us'),
 (0.11329768322707913, -0.11329768322707913, 'ng'),
 (0.04349996844827638, -0.04349996844827638, 'gh'),
 (0.041975727674589694, -0.041975727674589694, 'ke'),
 (0.04100529440370175, 0.04100529440370175, 'de'),
 (0.020993438338995453, -0.020993438338995453, 'za'),
 (0.019384546730291916, 0.019384546730291916, 'ca'),
 (0.01901903337178193, 0.01901903337178193, 'jp'),
 (0.018018075569331313, 0.018018075569331313, 'au'),
 (0.016542549762282595, 0.016542549762282595, 'th'),
 (0.015134374716208593, 0.015134374716208593, 'kr'),
 (0.013231659581839823, -0.013231659581839823, 'tr'),
 (0.012669059297825174, -0.012669059297825174, 'vn'),
 (0.010826911359592361, 0.010826911359592361, 'ru'),
 (0.010064886108794647, -0.010064886108794647, 'et'),
 (0.009689543095607036, -0.009689543095607036, 'bd'),
 (0.008837816693841444, 0.008837816693841444, 'mx'),
 (0.008320278123129795, -0.008320278123129795, 'lk'),
 (0.008125303152176709, -0.008125303152176709, 'i

In [47]:
# "auc" (bids column 2, presumably the auction id; confirm against the CSV
# header): rank values by the gap between their share of bad and good bids.
auc_good, auc_bad = get_feature_counters(2)
diffs = get_feature_diff(auc_good, auc_bad)
diffs

[(0.12239911519860061, 0.12239911519860061, 'jqx39'),
 (0.027609763121885282, 0.027609763121885282, '1f9t0'),
 (0.017335238651926876, 0.017335238651926876, 'xsj5g'),
 (0.012378363938725283, -0.012378363938725283, 'opnq4'),
 (0.012126649873933602, 0.012126649873933602, 'kxsj8'),
 (0.010190791850991763, 0.010190791850991763, 's3k40'),
 (0.00974965624329209, 0.00974965624329209, '5umtc'),
 (0.008302161177731464, 0.008302161177731464, 'rimjm'),
 (0.008108422501150464, -0.008108422501150464, 'hy5xd'),
 (0.00810767947696097, -0.00810767947696097, 'no958'),
 (0.007882895712682967, -0.007882895712682967, 'ihm9a'),
 (0.007856808898271998, -0.007856808898271998, 'jefix'),
 (0.007412039604328948, -0.007412039604328948, 'pyqrm'),
 (0.007181493499378049, -0.007181493499378049, 'w05vh'),
 (0.006895487858515134, -0.006895487858515134, 'du967'),
 (0.00640833576266436, -0.00640833576266436, 'enf1x'),
 (0.006214605279164588, -0.006214605279164588, 'zem2w'),
 (0.00587785178071079, -0.00587785178071079, '

In [40]:
# Merchandise (bids column 3): rank values by the gap between their share of
# bad bids and their share of good bids.
merchandise_good, merchandise_bad = get_feature_counters(3)
diffs = get_feature_diff(merchandise_good, merchandise_bad)
diffs

[(0.20516431670700125, 0.20516431670700125, 'sporting goods'),
 (0.1190184726265049, -0.1190184726265049, 'jewelry'),
 (0.10103785063730858, -0.10103785063730858, 'home goods'),
 (0.06975494688748579, 0.06975494688748579, 'mobile'),
 (0.04111183814398192, -0.04111183814398192, 'office equipment'),
 (0.0246287341398103, 0.0246287341398103, 'computers'),
 (0.0015061696307495844, -0.0015061696307495844, 'books and music')]

In [42]:
# "dev" (bids column 4, presumably the device; confirm against the CSV header):
# rank values by the gap between their share of bad and good bids.
dev_good, dev_bad = get_feature_counters(4)
diffs = get_feature_diff(dev_good, dev_bad)
diffs

[(0.07596701368306234, -0.07596701368306234, 'phone4'),
 (0.034844180555392366, 0.034844180555392366, 'phone119'),
 (0.02924069751160254, 0.02924069751160254, 'phone17'),
 (0.028836798692044335, 0.028836798692044335, 'phone46'),
 (0.020922521698096993, -0.020922521698096993, 'phone35'),
 (0.020207783497767737, 0.020207783497767737, 'phone62'),
 (0.019088437692453213, -0.019088437692453213, 'phone101'),
 (0.017529945702557597, 0.017529945702557597, 'phone13'),
 (0.01687583854311393, -0.01687583854311393, 'phone45'),
 (0.012962179152355379, -0.012962179152355379, 'phone2'),
 (0.011058373440810796, 0.011058373440810796, 'phone115'),
 (0.011054467529776733, 0.011054467529776733, 'phone122'),
 (0.010224305981718348, -0.010224305981718348, 'phone5'),
 (0.009401422173349237, 0.009401422173349237, 'phone28'),
 (0.009196344980029518, -0.009196344980029518, 'phone25'),
 (0.009175408907004249, 0.009175408907004249, 'phone237'),
 (0.009130353307505413, -0.009130353307505413, 'phone80'),
 (0.009001

In [45]:
# URL (bids column 8): rank values by the gap between their share of bad bids
# and their share of good bids.
url_good, url_bad = get_feature_counters(8)
diffs = get_feature_diff(url_good, url_bad)
diffs

[(0.23729595983481105, 0.23729595983481105, 'vasstdc27m7nks3'),
 (0.0049519919796867625, 0.0049519919796867625, 'lacduz3i6mjlfkd'),
 (0.003210350713842334, 0.003210350713842334, 'rmdm1f1lak48s6g'),
 (0.003148484173816942, 0.003148484173816942, '4dd8ei0o5oqsua3'),
 (0.0028007510418468586, -0.0028007510418468586, '96ky12gxeqflpwz'),
 (0.002271977808814401, 0.002271977808814401, 'hzsvpefhf94rnlb'),
 (0.0017870305710738672, 0.0017870305710738672, '8golk6oetcgd6wm'),
 (0.0017264972635063166, -0.0017264972635063166, '1bltvi87id7pau1'),
 (0.0016876163873370577, 0.0016876163873370577, 'ds6j090wqr4tmmf'),
 (0.001590626939788951, 0.001590626939788951, '41hbwy7lobmyerr'),
 (0.0015300085350713841, 0.0015300085350713841, 'mp0g18rqmfrqctc'),
 (0.0012128723518717108, 0.0012128723518717108, 'n7hs0kmoakimcyr'),
 (0.0011287644781115946, -0.0011287644781115946, 'at6pnrc7xip63yk'),
 (0.0011008302296710117, 0.0011008302296710117, 'mnq94u1xp9s57wi'),
 (0.0010814323401613904, 0.0010814323401613904, 'kwhc56qj

In [71]:
# Compare the first 32 characters of training column 1 between good (label 0.0)
# and bad (label 1.0) bidders; the label is read from column 3.
col1_of_good = train_raw[train_raw[:, 3] == 0.0][:, 1]
col1_of_bad = train_raw[train_raw[:, 3] == 1.0][:, 1]
good_sc1 = Counter(value[:32] for value in col1_of_good)
bad_sc1 = Counter(value[:32] for value in col1_of_bad)
diffs = get_feature_diff(good_sc1, bad_sc1)
diffs

[(0.04492959894271331,
  -0.04492959894271331,
  'a3d2de7675556553a5f08e4c88d2c228'),
 (0.02912621359223301,
  0.02912621359223301,
  '0875307e1731af94b3b64725ad0deb7d'),
 (0.019417475728155338,
  0.019417475728155338,
  'b0a4197bc483e81400b84d4d5ada8331'),
 (0.019417475728155338,
  0.019417475728155338,
  '05c67ba7faaefff08b0a3c97742f1bc0'),
 (0.0141818736339145, 0.0141818736339145, '3e9073fb9219ceb4a1dc9dbb9e1acbe9'),
 (0.009708737864077669,
  0.009708737864077669,
  'f8bb349197cc1af99d8f677ccc7e6045'),
 (0.009708737864077669,
  0.009708737864077669,
  'f33942502654584b5e7c6fc8cdc5ca6a'),
 (0.009708737864077669,
  0.009708737864077669,
  'ee3295c7b31a1e9116ebf0cf2df68ed1'),
 (0.009708737864077669,
  0.009708737864077669,
  'ee20219e9d6b897b115119011ce3d9f1'),
 (0.009708737864077669,
  0.009708737864077669,
  'ebf462c4467717ab00122916e1cde7a0'),
 (0.009708737864077669,
  0.009708737864077669,
  'e35a74340ac6d90fcb4fe9b84c6eaa21'),
 (0.009708737864077669,
  0.009708737864077669,
  'deb

In [72]:
# Compare the first 32 characters of training column 2 between good (label 0.0)
# and bad (label 1.0) bidders; the label is read from column 3.
# NOTE(review): the result names still say "sc1" although this cell reads
# column 2; they are kept unchanged in case later cells rely on them.
col2_of_good = train_raw[train_raw[:, 3] == 0.0][:, 2]
col2_of_bad = train_raw[train_raw[:, 3] == 1.0][:, 2]
good_sc1 = Counter(value[:32] for value in col2_of_good)
bad_sc1 = Counter(value[:32] for value in col2_of_bad)
diffs = get_feature_diff(good_sc1, bad_sc1)
diffs

[(0.06359985767295279,
  -0.06359985767295279,
  'a3d2de7675556553a5f08e4c88d2c228'),
 (0.03569359019976617,
  0.03569359019976617,
  'ca8d4b018cb62966eebb2974f5a83b4f'),
 (0.019417475728155338,
  0.019417475728155338,
  'eff9d6ada4c268930ee05a81ec4e9ebf'),
 (0.014705433843338583,
  0.014705433843338583,
  '4d90f2e709f1fc0810e5aef472dd3935'),
 (0.009708737864077669,
  0.009708737864077669,
  'fcec7ba7b352f0a5e62ca742391e8ab3'),
 (0.009708737864077669,
  0.009708737864077669,
  'eab8700fadb60099a2e0cdd62eeb902e'),
 (0.009708737864077669,
  0.009708737864077669,
  'e8832d1597661a09fc47c8ad4514176e'),
 (0.009708737864077669,
  0.009708737864077669,
  'e35f11bb4e5882069f7dc3765e410167'),
 (0.009708737864077669,
  0.009708737864077669,
  'dee44a87c5db48c9f6ca23e37d6f6107'),
 (0.009708737864077669,
  0.009708737864077669,
  'dc465fb916d7cab3c086ebbcdcc1ed13'),
 (0.009708737864077669,
  0.009708737864077669,
  'db147bf6056d00428b1bbf250c6e9759'),
 (0.009708737864077669,
  0.009708737864077669