In [0]:
from numpy import array, hstack
from sklearn import metrics, linear_model
from sklearn import naive_bayes
from sklearn import preprocessing
from scipy import sparse
from itertools import combinations
from sklearn.model_selection import train_test_split

# from sets import Set
import numpy as np
import pandas as pd
import sys

In [0]:
SEED = 55
# SEED = int(sys.argv[2])

def group_data(data, degree=3, hash=hash):
    """ 
    numpy.array -> numpy.array
    
    Groups all columns of data into all combinations of triples
    """
    new_data = []
    m,n = data.shape
    for indicies in combinations(range(n), degree):
      if (5 in indicies) and (7 in indicies):
          print ("feature Xd")
      elif (2 in indicies) and (3 in indicies):
          print ("feature Xd")
      else:
                new_data.append([hash(tuple(v)) for v in data[:,indicies]])
    return array(new_data).T

def OneHotEncoder(data, keymap=None):
     """
     OneHotEncoder takes data matrix with categorical columns and
     converts it to a sparse binary matrix.
     
     Returns sparse binary matrix and keymap mapping categories to indicies.
     If a keymap is supplied on input it will be used instead of creating one
     and any categories appearing in the data that are not in the keymap are
     ignored
     """
     if keymap is None:
          keymap = []
          for col in data.T:
               uniques = set(list(col))
               keymap.append(dict((key, i) for i, key in enumerate(uniques)))
     total_pts = data.shape[0]
     outdat = []
     for i, col in enumerate(data.T):
          km = keymap[i]
          num_labels = len(km)
          spmat = sparse.lil_matrix((total_pts, num_labels))
          for j, val in enumerate(col):
               if val in km:
                    spmat[j, km[val]] = 1
          outdat.append(spmat)
     outdat = sparse.hstack(outdat).tocsr()
     return outdat, keymap

def create_test_submission(filename, prediction):
    content = ['id,ACTION']
    for i, p in enumerate(prediction):
        content.append('%i,%f' %(i+1,p))
    f = open(filename, 'w')
    f.write('\n'.join(content))
    f.close()
    print ('Saved')

# This loop essentially from Paul's starter code
# I (Ben) increased the size of train at the expense of test, because
# when train is small many features will not be found in train.
def cv_loop(X, y, model, N):
    mean_auc = 0.
    for i in range(N):
        X_train, X_cv, y_train, y_cv = train_test_split(
                                       X, y, test_size=1.0/float(N), 
                                       random_state = i*SEED)
        model.fit(X_train, y_train)
        preds = model.predict_proba(X_cv)[:,1]
        auc = metrics.roc_auc_score(y_cv, preds)
        #print "AUC (fold %d/%d): %f" % (i + 1, N, auc)
        mean_auc += auc
    return mean_auc/N

In [15]:
learner = sys.argv[1]
print ("Reading dataset...")
train_data = pd.read_csv('D:/applied/amazon/train.csv')
test_data = pd.read_csv('D:/applied/amazon/test.csv')
submit=learner + str(SEED) + '.csv'
all_data = np.vstack((train_data.ix[:,1:-1], test_data.ix[:,1:-1]))
num_train = np.shape(train_data)[0]
# Transform data
print ("Transforming data...")
# Relabel the variable values to smallest possible so that I can use bincount
# on them later.
relabler = preprocessing.LabelEncoder()
for col in range(len(all_data[0,:])):
    relabler.fit(all_data[:, col])
    all_data[:, col] = relabler.transform(all_data[:, col])

Reading dataset...
Transforming data...


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  


In [16]:
########################## 2nd order features ################################
dp = group_data(all_data, degree=2) 
for col in range(len(dp[0,:])):
    relabler.fit(dp[:, col])
    dp[:, col] = relabler.transform(dp[:, col])
    uniques = len(set(dp[:,col]))
    maximum = max(dp[:,col])
    print (col)
    if maximum < 65534:
        count_map = np.bincount((dp[:, col]).astype('uint16'))
        for n,i in enumerate(dp[:, col]):
            if count_map[i] <= 1:
                dp[n, col] = uniques
            elif count_map[i] == 2:
                dp[n, col] = uniques+1
    else:
        for n,i in enumerate(dp[:, col]):
            if (dp[:, col] == i).sum() <= 1:
                dp[n, col] = uniques
            elif (dp[:, col] == i).sum() == 2:
                dp[n, col] = uniques+1
    print( uniques) # unique values
    uniques = len(set(dp[:,col]))
    print (uniques)
    relabler.fit(dp[:, col])
    dp[:, col] = relabler.transform(dp[:, col])

feature Xd
feature Xd
0
59168
7849
1
14683
5926
2
19469
6853
3
27849
8498
4
33130
7198
5
47424
5994
6
18885
6393
7
5247
4233
8
5463
4385
9
6044
4769
10
9431
6909
11
10522
7120
12
6694
5086
13
1311
1143
14
1794
1373
15
4225
2938
16
854
695
17
1569
1344
18
2486
1971
19
5008
3554
20
1178
983
21
3395
2847
22
5644
4211
23
1827
1558
24
3805
2758
25
3253
2362


In [18]:
########################## 3rd order features ################################
dt = group_data(all_data, degree=3)
for col in range(len(dt[0,:])):
    relabler.fit(dt[:, col])
    dt[:, col] = relabler.transform(dt[:, col])
    uniques = len(set(dt[:,col]))
    maximum = max(dt[:,col])
    print (col)
    if maximum < 65534:
        count_map = np.bincount((dt[:, col]).astype('uint16'))
        for n,i in enumerate(dt[:, col]):
            if count_map[i] <= 1:
                dt[n, col] = uniques
            elif count_map[i] == 2:
                dt[n, col] = uniques+1
    else:
        for n,i in enumerate(dt[:, col]):
            if (dt[:, col] == i).sum() <= 1:
                dt[n, col] = uniques
            elif (dt[:, col] == i).sum() == 2:
                dt[n, col] = uniques+1
    print (uniques)
    uniques = len(set(dt[:,col]))
    print (uniques)
    relabler.fit(dt[:, col])
    dt[:, col] = relabler.transform(dt[:, col])

feature Xd
feature Xd
feature Xd
feature Xd
feature Xd
feature Xd
feature Xd
feature Xd
feature Xd
feature Xd
feature Xd
feature Xd
0
60000
7633
1
60960
7372
2
63587
6820
3
79826
1668
4
81233
1567
5
65386
6114
6
34514
8723
7
41365
7330
8
52307
5712
9
26699
7163
10
36375
8671
11
48645
7261
12
58054
5479
13
32817
7679
14
55768
6305
15
63319
5010
16
39331
7999
17
56040
5185
18
48756
5782
19
6326
4916
20
9682
7020
21
10712
7205
22
6977
5222
23
6490
5022
24
9826
7106
25
10838
7277
26
7150
5345
27
10277
7359
28
11245
7486
29
7627
5639
30
11867
8033
31
10709
7199
32
4858
3867
33
6791
4891
34
2989
2447
35
5067
3516
36
4465
3092
37
5195
4089
38
7079
5071
39
3280
2650
40
5937
4202
41
5228
3691
42
6723
4999
43
5838
4304


In [0]:
########################## 1st order features ################################
for col in range(len(all_data[0,:])):
    relabler.fit(all_data[:, col])
    all_data[:, col] = relabler.transform(all_data[:, col])
    uniques = len(set(all_data[:,col]))
    maximum = max(all_data[:,col])
    # print col
    if maximum < 65534:
        count_map = np.bincount((all_data[:, col]).astype('uint16'))
        for n,i in enumerate(all_data[:, col]):
            if count_map[i] <= 1:
                all_data[n, col] = uniques
            elif count_map[i] == 2:
                all_data[n, col] = uniques+1
    else:
        for n,i in enumerate(all_data[:, col]):
            if (all_data[:, col] == i).sum() <= 1:
                all_data[n, col] = uniques
            elif (all_data[:, col] == i).sum() == 2:
                all_data[n, col] = uniques+1
    # print uniques
    uniques = len(set(all_data[:,col]))
    # print uniques
    relabler.fit(all_data[:, col])
    all_data[:, col] = relabler.transform(all_data[:, col])

In [0]:
# Collect the training features together
y = array(train_data.ACTION)
X = all_data[:num_train]
X_2 = dp[:num_train]
X_3 = dt[:num_train]

# Collect the testing features together
X_test = all_data[num_train:]
X_test_2 = dp[num_train:]
X_test_3 = dt[num_train:]

X_train_all = np.hstack((X, X_2, X_3))
X_test_all = np.hstack((X_test, X_test_2, X_test_3))
num_features = X_train_all.shape[1]
    
if learner == 'NB':
    model = naive_bayes.BernoulliNB(alpha=0.03)
else:
    model = linear_model.LogisticRegression(class_weight='balanced', penalty='l2')
    
# Xts holds one hot encodings for each individual feature in memory
# speeding up feature selection 
Xts = [OneHotEncoder(X_train_all[:,[i]])[0] for i in range(num_features)]

In [31]:
print ("Performing greedy feature selection...")
score_hist = []
N = 10
good_features = set([])
# Greedy feature selection loop
while len(score_hist) < 2 or score_hist[-1][0] > score_hist[-2][0]:
    scores = []
    for f in range(len(Xts)):
        if f not in good_features:
            feats = list(good_features) + [f]
            Xt = sparse.hstack([Xts[j] for j in feats]).tocsr()
            score = cv_loop(Xt, y, model, N)
            scores.append((score, f))
            # print ("Feature: %i Mean AUC: %f" % (f, score))
    good_features.add(sorted(scores)[-1][1])
    score_hist.append(sorted(scores)[-1])
    # print ("Current features: %s" % sorted(list(good_features)))

Performing greedy feature selection...


[1;30;43mStreaming output truncated to the last 5000 lines.[0m


In [38]:
# Remove last added feature from good_features
good_features.remove(score_hist[-1][1])
good_features = sorted(list(good_features))
print ("Selected features %s" % good_features)
gf = open("feats" + submit, 'w')
# print (>>gf, good_features)
gf.close()
print(len(good_features), " features")
    
print("Performing hyperparameter selection...")

Selected features [0, 7, 8, 9, 28, 36, 40, 60, 62, 64, 65, 69]
12  features
Performing hyperparameter selection...


In [41]:
# Hyperparameter selection loop
score_hist = []
Xt = sparse.hstack([Xts[j] for j in good_features]).tocsr()
if learner == 'NB':
    Cvals = [0.001, 0.003, 0.006, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.1]
else:
    Cvals = np.logspace(-4, 4, 15, base=2)  # for logistic
for C in Cvals:
    if learner == 'NB':
        model.alpha = C
    else:
        model.C = C
    score = cv_loop(Xt, y, model, N)
    score_hist.append((score,C))
    print ("C: %f Mean AUC: %f" %(C, score))
bestC = sorted(score_hist)[-1][1]
print ("Best C value: %f" % (bestC))
    
print ("Performing One Hot Encoding on entire dataset...")
Xt = np.vstack((X_train_all[:,good_features], X_test_all[:,good_features]))
Xt, keymap = OneHotEncoder(Xt)
X_train = Xt[:num_train]
X_test = Xt[num_train:]



C: 0.062500 Mean AUC: 0.900382




C: 0.092875 Mean AUC: 0.904509




C: 0.138011 Mean AUC: 0.907123




C: 0.205084 Mean AUC: 0.908503




C: 0.304753 Mean AUC: 0.909146




C: 0.452862 Mean AUC: 0.909240




C: 0.672950 Mean AUC: 0.908972




C: 1.000000 Mean AUC: 0.908373




C: 1.485994 Mean AUC: 0.907396




C: 2.208179 Mean AUC: 0.906117




C: 3.281341 Mean AUC: 0.904500




C: 4.876055 Mean AUC: 0.902561




C: 7.245789 Mean AUC: 0.900351




C: 10.767202 Mean AUC: 0.897949




C: 16.000000 Mean AUC: 0.895381
Best C value: 0.452862
Performing One Hot Encoding on entire dataset...


In [53]:
if learner == 'NB':
    model.alpha = bestC
else:
    model.C = bestC

print ("Training full model...")
print ("Making prediction and saving results...")
model.fit(X_train, y)

# save result
preds = model.predict_proba(X_test)[:,1]
testx = pd.read_csv("D:/applied/amazon/test.csv") 
submit = pd.DataFrame()
submit["id"] = range(1, testx.shape[0]+1)
print(preds.shape)
submit["ACTION"] =  preds
submit.to_csv("D:/applied/amazon/logistic_result_1.csv", index = False)

# preds = model.predict_proba(X_train)[:,1]



Training full model...
Making prediction and saving results...
(58921,)
