In [1]:
import pandas as pd
import numpy as np
import time
import gc

In [2]:
path = '/home/kai/data/kaggle/talkingdata/data/'
train = pd.read_csv(path + 'train_cleaned_final.csv')
test = pd.read_csv(path + 'test_cleaned_final.csv')

# Feature Col

In [7]:
target = 'is_attributed'
feature_count =  [
                    'app_count',
                    'ip_count',
                    'os_count',
                    'channel_count',
                    'device_count']

feature_mean = ['app_mean',
                    'ip_mean',
                    'os_mean',
                    'channel_mean',
                    'device_mean']



                         

feature_ori = ['app', 'channel', 'device', 'os', 'hour', 'ip']

feature_cols = []
added_feature = []

added_feature.extend(feature_count)
added_feature.extend(feature_mean)
feature_cols.extend(added_feature)
feature_cols.extend(feature_ori)

train_cols = feature_cols.copy()
train_cols.append(target)

In [8]:
feature_cols

['app_count',
 'ip_count',
 'os_count',
 'channel_count',
 'device_count',
 'app_mean',
 'ip_mean',
 'os_mean',
 'channel_mean',
 'device_mean',
 'app',
 'channel',
 'device',
 'os',
 'hour',
 'ip']

# Define functin

In [9]:
def count(df_history, df_train, cols, target=None):
    """
    Purpose: add a new feature to training df.count the number of records for each feature combination (eg, artist_name_composer)) 
    """
    
    group = get_group(df_train, cols)
    group_all = get_group(df_history, cols)
    
    count_map = group_all.value_counts()
    
    return group.map(count_map).fillna(0)

def countsort(df_history, df_train, cols, target=None):
    """
    Purpose: add a new feature to training df.count the number of records for each feature combination (eg, artist_name_composer)) 
    """
    
    group = get_group(df_train, cols)
    group_all = get_group(df_history, cols)
    
    count_map = group_all.value_counts().iloc[::-1]
    count_map.iloc[:] = list(range(1, len(count_map) + 1))
    
    return group.map(count_map).fillna(-1)



def mean(df_history, df_train, cols, target):
    """
    Purpose: add a new feature to training df. conditional probability P(replay (target) | feature combination (eg, artist_name_composer)) 
    """
  

    group = get_group(df_train, cols)
    group_history = get_group(df_history, cols)
    mean_map = df_history.groupby(group_history)[target].mean()
    return group.map(mean_map).fillna(-0.01)


def reversemean(df_history, df_train, cols, target):
    """
    Purpose: add a new feature to training df. conditional probability P(replay (target) | feature combination (eg, artist_name_composer)) 
    """
  
    # encoding df's cols into a new series
    group = get_group(df_train, cols)
    # encoding df_history's cols into a new series
    group_history = get_group(df_history, cols)
    # get the conditional probability p(target| feature combination. eg, artist_name_composer) 
    positive = group_history[df_history[target] == 1]
    negative = group_history[df_history[target] == 0]
    index_p = set(positive.unique())
    index_n = set(negative.unique())
    index_n.difference_update(index_p)
    map_reverse_p = positive.groupby(positive).count() / len(positive)
    map_reverse_n = pd.Series(np.zeros(len(index_n)), index=index_n)
    map_reverse = pd.concat([map_reverse_p, map_reverse_n])
    return group.map(map_reverse).fillna(-1)


def time2nextclick(df_history, df_train, cols, target, timecol='timestamp'):
    
    result = []
    df_reverse = df_train.sort_index(ascending=False)
    group = get_group(df_reverse,  cols)
    
    next_heard = {}
    for g, t in zip(group, df_reverse[timecol]):
        if g in next_heard:
            result.append(next_heard[g] - t)
        else:
            result.append(-1)
        next_heard[g] = t
    
    result.reverse()
    return result

def time2previousclick(df_history, df_train, cols, target, timecol='timestamp'):
    
    result = []
    group = get_group(df_train, cols)

    last_heard = {}
    for t, g in zip(df_train[timecol], group):
        if g in last_heard:
            result.append(t - last_heard[g])
        else:
            result.append(-1)
        last_heard[g] = t
        
    return result

# def time2nextclick(df_history, df_train, cols, target, timecol='timestamp'):
#     # normalization
#     group = get_group(df_train, cols)
#     result = pd.Series(np.zeros(len(df_train)), index=df_train.index)
#     grouping = df_train.groupby(group)
#     for each_group in grouping:
#         df_each = each_group[1]
#         click = moving_diff(df_each[timecol], mode='next', norm=True)
#         result.loc[click.index] = click.values
#     return result



# def time2previousclick(df_history, df_train, cols, target, timecol='timestamp'):
#     # normalization
#     group = get_group(df_train, cols)
#     result = pd.Series(np.zeros(len(df_train)), index=df_train.index)
#     grouping = df_train.groupby(group)
#     for each_group in grouping:
#         df_each = each_group[1]
#         click = moving_diff(df_each[timecol], mode='previous', norm=True)
#         result.loc[click.index] = click.values
        
    return result

def moving_diff(ser, mode='next', norm=False):
    tmp = ser.copy()
    if mode == 'next':
        tmp.iloc[:-1] = ser[1:].values
        result = tmp - ser
    elif mode == 'previous':
        tmp.iloc[1:] = ser[:-1].values
        result = ser - tmp
    if norm:
        ave = np.mean(result)
#         std = np.std(result)
#         if std != 0:
#             result = (result - ave) / std
        if ave != 0:
            result = result / ave
    return result

def countfrompast(df_history, df_train, cols, target, timecol='timestamp'):
    
    group = get_group(df_train, cols)
    
    count = {}
    result = []
    for g in group.values:
        if g not in count:
            count[g] = 0
        else:
            count[g] += 1
        result.append(count[g])
        
    return result

def countfromfuture(df_history, df_train, cols, target, timecol='timestamp'):
    
    result = []
    df_reverse = df_train.sort_index(ascending=False)
    group = get_group(df_reverse,  cols)
    
    count = {}
    for g in group.values:
        if g in count:
            result.append(count[g])
            count[g] += 1 
        else:
            result.append(0)
            count[g] = 1
    
    result.reverse()
    return result

def lasttimediff(df_history, df_train, cols, target, timecol='timestamp'):
    
    group = get_group(df_train, cols)
        
    last_time = df_train.groupby(group)[timecol].last()
    
    return group.map(last_time) - df_train[timecol]

def firsttimediff(df_history, df_train, cols, target, timecol='timestamp'):
    
    group = get_group(df_train, cols)
        
    first_time = df_train.groupby(group)[timecol].first()
    
    return  df_train[timecol] - group.map(first_time)


def col_name(cols, func=None):
    if func is None:
        return '_'.join(cols)
    else:
        return '_'.join(cols) + '_' + func.__name__
    
    
from lightfm import LightFM
import scipy.sparse as sp
from scipy.sparse import coo_matrix
from lightfm import LightFM
from sklearn.preprocessing import LabelEncoder

def get_var(df_history, df, group_col, agg_col):
    group = get_group(df, group_col)
    group_history = get_group(df_history, group_col)
    df_temp = pd.DataFrame()
    df_temp['group'] = group_history.values
    df_temp['agg'] = df_history[agg_col].values
    group_map =df_temp.groupby('group')['agg'].var()
    result = group.map(group_map).fillna(0)
    return result

def matrix_factorization(df_history, df, target, item_col, userid_col, userraw_col):
    """
    userid_col is unique user id
    item_col is unique itme id
    userraw_col is used to construct user feature. dim: user_id*userraw
    """
    dff = pd.DataFrame()
    dff_history = pd.DataFrame()


    #1. process item
    if item_col is None:
        dff['item'] = np.zeros(len(df))
        dff_history['item'] = np.zeros(len(df_history))
    else:
        encoder = LabelEncoder()
        group = get_group(df, item_col)
        group_history = get_group(df_history, item_col)
        encoder.fit(pd.concat([group, group_history]))
        dff['item'] = encoder.transform(group)
        dff_history['item'] = encoder.transform(group_history)
#     print('processing item done!')

    #2. user raw
    group = get_group(df, userraw_col)
    group_history = get_group(df_history, userraw_col)
    encoder = LabelEncoder()
    encoder.fit(pd.concat([group, group_history]))
    dff['userraw'] = encoder.transform(group)
    dff_history['userraw'] = encoder.transform(group_history)
#     print('processing user raw done')


    #3. user_id
    group = get_group(df, userid_col)
    group_history = get_group(df_history, userid_col)
    encoder = LabelEncoder()
    encoder.fit(pd.concat([group, group_history]))
    dff['user_id'] = encoder.transform(group)
    dff_history['user_id'] = encoder.transform(group_history)
#     print('processing user id done')



    num_users = max(dff.user_id.max(), dff_history.user_id.max()) + 1
    num_items = max(dff.item.max(), dff_history.item.max()) + 1
    num_userraw = max(dff.userraw.max(), dff_history.userraw.max()) + 1

    M = coo_matrix(
            (df_history[target], ( dff_history.user_id, dff_history.item)),
            shape=(num_users, num_items)
        )

    user_features = pd.concat([dff, dff_history])[['userraw', 'user_id']].drop_duplicates()

    user_features = coo_matrix(
        (np.ones(len(user_features)), (user_features.user_id, user_features.userraw)),
        shape=(num_users, num_userraw)
    )

    user_features = sp.hstack([sp.eye(num_users), user_features])

    model = LightFM(no_components=50, learning_rate=0.1)
    print('fitting lightFM')
    model.fit(
            M, 
            epochs=2, 
            num_threads=36, 
            user_features=user_features,
        )
    print('predicting lightFM')
    result = model.predict(
        dff.user_id.values, 
        dff.item.values, 
        user_features=user_features,
    )
    return result



def regression(df_history, df, cols, target= 'is_attributed', time_col='timestamp', shift=1500000000):
    df = df.copy()
    df_history = df_history.copy()
    df.loc[:,time_col] = df.loc[:,time_col] - shift
    df_history.loc[:,time_col] = df_history.loc[:,time_col] - shift
    group = get_group(df, cols)
    group_history = get_group(df_history, cols)

    targets = {}
    times = {}
    for (y, t), u in zip(df_history[[target, time_col]].values, group_history):
        if u not in targets:
            targets[u] = [y]
            times[u] = [t]
        else:
            targets[u].append(y)
            times[u].append(t)

    linal_user = {}
    for u in times:
        if len(times[u]) > 1:
            A = np.vstack([times[u], np.ones(len(times[u]))]).T
            linal_user[u] = np.linalg.inv(A.T.dot(A)).dot(A.T).dot(targets[u])

    result = []

    for t, u in zip(df[time_col], group):
        if u not in times:
            result.append(-0.5)
        else:
            if len(times[u]) < 2:
                result.append(-0.5)
            else:
                result.append(linal_user[u].dot([t, 1]))
    return result
    


In [6]:
orders = {}
feature_col = ['ip', 
              'app', 
              'device', 
              'os', 
              'channel',
              'day',
              'hour',]

# feature_col = ['ip', 
#               'app', 
#               'device', 
#               'os', 
#               'channel']
for col in feature_col:
    orders[col] = 10 ** (int(np.log(max(train[col].max(),test[col].max() ) + 1) / np.log(10)) + 1)
def get_group(df, cols):
    """
    define an encoding method which can ganrantee the adding value will be unique.
    eg: artist_name_composer will be a combination of (artist_name,composer) and the encoding will reflect the unqiue combination of those two
    """
    group = df[cols[0]].copy()
    for col in cols[1:]:
        group = group * orders[col] + df[col]
        
    return group

# Feature engineering on Train

In [10]:
import sys
from itertools import combinations
combine_col = ['ip', 
              'app', 
              'device', 
              'os', 
              'channel',
              'day',
              'hour',]





counter = 0
df_train = train.copy()
print('got train data')
df_history = train.copy()
print('got historical data')

###########################################################################
for func in [count, mean]:
            if func.__name__ == count.__name__:
                df_all = pd.concat([train, test])
            else:
                try:
                    del df_all
                    gc.collect()
                except Exception:
                    print('df_all does not exist')

            for num_col in [1,2,3,4,5]:
                for cols in combinations(combine_col, num_col):
                    feature_name = col_name(cols, func=func)
                    if feature_name not in added_feature:
                           continue
                    counter += 1
                    if func.__name__ == count.__name__:
                            print('count function')
                            df_train[feature_name] = func(df_all, df_train, cols, target='is_attributed')

                    elif func.__name__ == mean.__name__:
                            print('mean function')
                            df_train[feature_name] = func(df_history, df_train, cols, target='is_attributed')
                    elif func.__name__ == reversemean.__name__:
                            print('reverse mean function')
                            df_train[feature_name] = func(df_history, df_train, cols, target='is_attributed')

                    elif func.__name__ == regression.__name__:
                            print('regression function')
                            df_train[feature_name] = func(df_history, df_train, cols, target='is_attributed')

                    else:
                            print('time related function')
                            df_train[feature_name] = func(df_train, df_train, cols, target='is_attributed')

                    all_str = 'all {}:   {}   \t\t\t size: {} G.'.format(counter, feature_name, sys.getsizeof(df_train)/ 1024 **3)
                    print(all_str)
                    with open('feature_all.txt', 'w') as text_file:
                        text_file.write(all_str + '\n')

    
    

               
               
save_file_name = '{}_singCountMean.csv'.format('train')
save_file_path = '/home/kai/data/kaggle/talkingdata/wl/data/analysis/' + save_file_name
df_train = df_train[train_cols]
print('------')
print(df_train.columns.values)
df_train.to_csv(save_file_path, index=False)
print(save_file_path)
print('======================================================')

got train data
got historical data
count function
all 1:   ip_count   			 size: 17.90933745354414 G.
count function
all 2:   app_count   			 size: 19.286978788673878 G.
count function
all 3:   device_count   			 size: 20.664620123803616 G.
count function
all 4:   os_count   			 size: 22.042261458933353 G.
count function
all 5:   channel_count   			 size: 23.41990279406309 G.
mean function
all 6:   ip_mean   			 size: 24.79754412919283 G.
mean function
all 7:   app_mean   			 size: 26.175185464322567 G.
mean function
all 8:   device_mean   			 size: 27.552826799452305 G.
mean function
all 9:   os_mean   			 size: 28.930468134582043 G.
mean function
all 10:   channel_mean   			 size: 30.30810946971178 G.
------
['app_count' 'ip_count' 'os_count' 'channel_count' 'device_count'
 'app_mean' 'ip_mean' 'os_mean' 'channel_mean' 'device_mean' 'app'
 'channel' 'device' 'os' 'hour' 'ip' 'is_attributed']
/home/kai/data/kaggle/talkingdata/wl/data/analysis/train_singCountMean.csv


# Analysis

# load train 

In [2]:
import pandas as pd
df_train = pd.read_csv('/home/kai/data/kaggle/talkingdata/wl/data/analysis/train_singCountMean.csv')

# load test prediction

In [8]:
import numpy as np
path = '/home/kai/data/kaggle/talkingdata/data/'
test = pd.read_csv(path + 'test_cleaned_final.csv')
pred = np.load('/home/kai/data/kaggle/talkingdata/wl/data/analysis/sub0.9801.npy')
test['is_attributed'] = pred

# IP

In [9]:
white_list_count = df_train[df_train.ip_mean == 1].ip.value_counts()
white_list = white_list_count.index.values

In [10]:
len(white_list)

42014

In [11]:
def checkinlist(x, checklist):
    if x in checklist:
        return True
    else:
        return False
# test_check = test.ip.apply(checkinlist, args=(white_list,))
    

In [12]:
import multiprocessing
import pandas as pd
import numpy as np

def _apply_df(args):
    df, func, kwargs = args
    col = kwargs.pop('col')
    checklist = kwargs.pop('checklist')
    return df[col].apply(func, args=(checklist,))

def apply_by_multiprocessing(df, func, **kwargs):
    workers = kwargs.pop('workers')
    
    pool = multiprocessing.Pool(processes=workers)
    result = pool.map(_apply_df, [(d, func, kwargs)
            for d in np.array_split(df, workers)])
    pool.close()
    return pd.concat(list(result))
    

def func(x, checklist):
    return x in checklist

In [13]:
worker = 10
ip_test = apply_by_multiprocessing(test, func, axis=1, workers=worker, col='ip', checklist=white_list)  

In [14]:
ip_test_result = test[ip_test]
ip_test_result['counts_in_train'] = ip_test_result.ip.map(white_list_count)
ip_test_result['counts_in_test'] = ip_test_result.ip.map(ip_test_result.ip.value_counts())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [21]:
df_train[df_train.ip==5348].head(2)

Unnamed: 0,app_count,ip_count,os_count,channel_count,device_count,app_mean,ip_mean,os_mean,channel_mean,device_mean,app,channel,device,os,hour,ip,is_attributed
553,36112780,1421256,48516446,2929579,191690321,0.000303,0.001889,0.001803,0.000234,0.001758,3,480,1,19,16,5348,0
592,11044457,1421256,48516446,3873558,191690321,0.00025,0.001889,0.001803,0.000335,0.001758,14,379,1,19,16,5348,0


In [17]:
ip_test_result[ip_test_result.ip==18394]

Unnamed: 0,ip,app,device,os,channel,day,hour,timestamp,minute,second,intesthh,is_attributed,counts_in_train,counts_in_test
178588,18394,9,1,2,466,10,4,1510286592,3,12,1,0.004439,1,9
178859,18394,9,1,2,215,10,4,1510286593,3,13,1,0.267653,1,9
4343339,18394,45,1,61,465,10,5,1510291207,20,7,1,0.999666,1,9
16685810,18394,9,1,53,215,10,14,1510323647,20,47,1,0.023999,1,9
16686253,18394,9,1,53,215,10,14,1510323647,20,47,1,0.062168,1,9
18386388,18394,9,1,53,215,10,14,1510325526,52,6,1,0.003484,1,9
18391028,18394,9,1,53,215,10,14,1510325531,52,11,1,0.030722,1,9
18391112,18394,9,1,53,215,10,14,1510325531,52,11,1,0.002968,1,9
18392337,18394,9,1,53,215,10,14,1510325533,52,13,1,0.171313,1,9


In [47]:
ip_test_result

Unnamed: 0,ip,app,device,os,channel,day,hour,timestamp,minute,second,intesthh,is_attributed,counts_in_train,counts_in_test
61515,69815,10,1,32,113,10,4,1510286462,1,2,1,0.999692,7,2
80287,65235,2,1,2,477,10,4,1510286484,1,24,1,0.052781,1,8
162303,109195,10,1,11,113,10,4,1510286575,2,55,1,0.999695,1,3
178588,18394,9,1,2,466,10,4,1510286592,3,12,1,0.004439,1,9
178859,18394,9,1,2,215,10,4,1510286593,3,13,1,0.267653,1,9
207095,109195,5,1,37,113,10,4,1510286623,3,43,1,0.999750,1,3
228158,112656,18,1,3,449,10,4,1510286645,4,5,1,0.666274,3,7
229161,112656,1,1,3,101,10,4,1510286647,4,7,1,0.590764,3,7
229625,112656,9,1,3,253,10,4,1510286647,4,7,1,0.067814,3,7
233969,112656,9,1,3,466,10,4,1510286652,4,12,1,0.640133,3,7


In [46]:
ip_test_result.groupby('ip')['is_attributed'].mean()

ip
230       0.978422
756       0.999785
943       0.357548
4216      0.823206
4598      0.886523
4707      0.999778
5190      0.105944
5544      0.446632
5683      0.996611
5824      0.304803
6470      0.106532
6613      0.994029
7631      0.999649
7842      0.537648
8603      0.944009
9258      0.999787
9536      0.999484
9589      0.999500
10049     0.317179
10213     0.999795
10350     0.986964
11629     0.996327
11759     0.179084
12296     0.160868
12972     0.783149
13630     0.999833
14083     0.999730
15045     0.135479
18394     0.174046
19031     0.566692
            ...   
107076    0.998970
107925    0.172733
109195    0.949198
109717    0.997654
110411    0.998883
111325    0.999139
111614    0.999791
111957    0.286983
112090    0.947776
112656    0.549884
113247    0.457769
113341    0.953675
114217    0.999616
114512    0.942882
114542    0.814088
114758    0.825138
115287    0.998475
115569    0.997745
115876    0.249596
120059    0.066831
121417    0.998944
122137   