In [1]:
import pandas as pd
import numpy as np
import time
import gc
import matplotlib.pyplot as plt
from lightfm import LightFM
import scipy.sparse as sp
from scipy.sparse import coo_matrix
from lightfm import LightFM
from sklearn.preprocessing import LabelEncoder
import sys,os
import matplotlib.pyplot as plt
sys.path.append('../LIB/')
from env import ENV
import pickle
from sklearn.model_selection import KFold

In [2]:
X_Train = pd.read_pickle(ENV.application_train_cleaned.value)
print('Train shape: {}'.format(X_Train.shape))

X_Test = pd.read_pickle(ENV.application_test_cleaned.value)
print('Test shape: {}'.format(X_Test.shape))

X_Train_ori = pd.read_csv(ENV.application_train_ori.value)
print('Train shape: {}'.format(X_Train.shape))

X_Test_ori = pd.read_csv(ENV.application_test_ori.value)
print('Test shape: {}'.format(X_Test.shape))

# X_pre = pd.read_pickle(ENV.previous_application_cleaned.value)
# print('Previous App shape: {}'.format(X_pre.shape))

# X_bu_b = pd.read_pickle(ENV.bureau_balance_clean.value)
# print('Bureau Balance shape: {}'.format(X_bu_b.shape))

# X_bu = pd.read_pickle(ENV.bureau_cleaned.value)
# print('Bureau shape: {}'.format(X_bu.shape))

# X_ins = pd.read_pickle(ENV.installments_payments_clean.value)
# print('Installment shape: {}'.format(X_ins.shape))

# X_pos = pd.read_pickle(ENV.POS_CASH_balance_clean.value)
# print('POS CASH shape: {}'.format(X_pos.shape))

# X_cc = pd.read_pickle(ENV.credit_card_balance_clean.value)
# print('Credit Card shape: {}'.format(X_cc.shape))

cat_col = pickle.load(open(ENV.clean_categorical_col.value,'rb'))


Train shape: (307511, 122)
Test shape: (48744, 121)
Train shape: (307511, 122)
Test shape: (48744, 121)


In [3]:
all_feautures = list(X_Test_ori.columns)
X = pd.concat([X_Train_ori[all_feautures],X_Test_ori[all_feautures]])


In [4]:

def scoring(x,n,fillna=True):
    """
    x should be list or array
    if fillna is True, will use mean to fill na
    """
    try:
        s = x.values.copy()
    except:
        pass
    
    ori = pd.Series(data=s,index=range(len(x)))
    if fillna:
        ori = ori.fillna(ori.mean())
    not_null = ori[ori.notnull()]
    not_null = not_null.sort_values(ascending = True)
    kf = KFold(n_splits=n)
    score = 1
    for t,v in kf.split(not_null):
        not_null.iloc[v] = score
        score += 1
    ori.loc[not_null.index] = not_null.values
    ori = ori.fillna(-1)
    return ori

#get_group
def get_group(df,cols):
    groups = df[cols[0]].astype('str')
    for each in cols[1:]:
        groups += '_' + df[each].astype('str')
    return groups
# s = scoring(X_Train_ori.AMT_ANNUITY,5,True)

In [5]:
def matrix_factorization(df_history, df, target, item_col, userid_col, userraw_col):
    """
    userid_col is unique user id
    item_col is unique itme id
    userraw_col is used to construct user feature. dim: user_id*userraw
    """
    dff = pd.DataFrame()
    dff_history = pd.DataFrame()


    #1. process item
    if item_col is None:
        dff['item'] = np.zeros(len(df))
        dff_history['item'] = np.zeros(len(df_history))
    else:
        encoder = LabelEncoder()
        group = get_group(df, item_col)
        group_history = get_group(df_history, item_col)
        encoder.fit(pd.concat([group, group_history]))
        dff['item'] = encoder.transform(group)
        dff_history['item'] = encoder.transform(group_history)
#     print('processing item done!')

    #2. user raw
    group = get_group(df, userraw_col)
    group_history = get_group(df_history, userraw_col)
    encoder = LabelEncoder()
    encoder.fit(pd.concat([group, group_history]))
    dff['userraw'] = encoder.transform(group)
    dff_history['userraw'] = encoder.transform(group_history)
#     print('processing user raw done')


    #3. user_id
    group = get_group(df, userid_col)
    group_history = get_group(df_history, userid_col)
    encoder = LabelEncoder()
    encoder.fit(pd.concat([group, group_history]))
    dff['user_id'] = encoder.transform(group)
    dff_history['user_id'] = encoder.transform(group_history)
#     print('processing user id done')



    num_users = max(dff.user_id.max(), dff_history.user_id.max()) + 1
    num_items = max(dff.item.max(), dff_history.item.max()) + 1
    num_userraw = max(dff.userraw.max(), dff_history.userraw.max()) + 1

    M = coo_matrix(
            (df_history[target], ( dff_history.user_id, dff_history.item)),
            shape=(num_users, num_items)
        )

    user_features = pd.concat([dff, dff_history])[['userraw', 'user_id']].drop_duplicates()

    user_features = coo_matrix(
        (np.ones(len(user_features)), (user_features.user_id, user_features.userraw)),
        shape=(num_users, num_userraw)
    )

    user_features = sp.hstack([sp.eye(num_users), user_features])

    model = LightFM(no_components=50, learning_rate=0.1)
    print('fitting lightFM')
    model.fit(
            M, 
            epochs=2, 
            num_threads=36, 
            user_features=user_features,
        )
    print('predicting lightFM')
    result = model.predict(
        dff.user_id.values, 
        dff.item.values, 
        user_features=user_features,
    )
    return result

In [6]:
item_col = ['NAME_CONTRACT_TYPE','AMT_CREDIT','AMT_ANNUITY','AMT_GOODS_PRICE']
user_col = ['CODE_GENDER','FLAG_OWN_CAR','FLAG_OWN_REALTY','CNT_CHILDREN',
            'AMT_INCOME_TOTAL','NAME_TYPE_SUITE','NAME_INCOME_TYPE',
            'NAME_EDUCATION_TYPE','NAME_FAMILY_STATUS','NAME_HOUSING_TYPE',
            'DAYS_BIRTH','DAYS_EMPLOYED','DAYS_REGISTRATION','DAYS_ID_PUBLISH',
            'OWN_CAR_AGE','ORGANIZATION_TYPE','EXT_SOURCE_1','EXT_SOURCE_2','EXT_SOURCE_3',
            'CNT_FAM_MEMBERS','OCCUPATION_TYPE']
external_col = ['EXT_SOURCE_1','EXT_SOURCE_2','EXT_SOURCE_3']
contact_related = ['FLAG_MOBIL','FLAG_EMP_PHONE','FLAG_WORK_PHONE','FLAG_CONT_MOBILE',
                   'FLAG_PHONE','FLAG_EMAIL',]

living_related = ['REGION_POPULATION_RELATIVE','REGION_RATING_CLIENT','REGION_RATING_CLIENT_W_CITY',
                   'REG_REGION_NOT_LIVE_REGION','REG_REGION_NOT_WORK_REGION',
                   'LIVE_REGION_NOT_WORK_REGION','REG_CITY_NOT_LIVE_CITY',
                   'REG_CITY_NOT_WORK_CITY','LIVE_CITY_NOT_WORK_CITY']
house_related = ['APARTMENTS_AVG','BASEMENTAREA_AVG','YEARS_BEGINEXPLUATATION_AVG',
                 'YEARS_BUILD_AVG','COMMONAREA_AVG','ELEVATORS_AVG','ENTRANCES_AVG',
                 'FLOORSMAX_AVG','FLOORSMIN_AVG','LANDAREA_AVG','LIVINGAPARTMENTS_AVG',
                 'LIVINGAREA_AVG','NONLIVINGAPARTMENTS_AVG','NONLIVINGAREA_AVG',
                 'APARTMENTS_MODE','BASEMENTAREA_MODE','YEARS_BEGINEXPLUATATION_MODE',
                 'YEARS_BUILD_MODE','COMMONAREA_MODE','ELEVATORS_MODE','ENTRANCES_MODE',
                 'FLOORSMAX_MODE','FLOORSMIN_MODE','LANDAREA_MODE','LIVINGAPARTMENTS_MODE',
                 'LIVINGAREA_MODE','NONLIVINGAPARTMENTS_MODE','NONLIVINGAREA_MODE',
                 'APARTMENTS_MEDI','BASEMENTAREA_MEDI','YEARS_BEGINEXPLUATATION_MEDI','YEARS_BUILD_MEDI',
                 'COMMONAREA_MEDI','ELEVATORS_MEDI','ENTRANCES_MEDI','FLOORSMAX_MEDI',
                 'FLOORSMIN_MEDI','LANDAREA_MEDI','LIVINGAPARTMENTS_MEDI',
                 'LIVINGAREA_MEDI','NONLIVINGAPARTMENTS_MEDI','NONLIVINGAREA_MEDI',
                 'FONDKAPREMONT_MODE','HOUSETYPE_MODE','TOTALAREA_MODE','WALLSMATERIAL_MODE',
                 'EMERGENCYSTATE_MODE']
surounding_related = ['OBS_30_CNT_SOCIAL_CIRCLE',
              'DEF_30_CNT_SOCIAL_CIRCLE',
              'OBS_60_CNT_SOCIAL_CIRCLE','DEF_60_CNT_SOCIAL_CIRCLE','DAYS_LAST_PHONE_CHANGE']

application_related = ['WEEKDAY_APPR_PROCESS_START','HOUR_APPR_PROCESS_START']

document_comp = ['FLAG_DOCUMENT_10',
 'FLAG_DOCUMENT_11',
 'FLAG_DOCUMENT_12',
 'FLAG_DOCUMENT_13',
 'FLAG_DOCUMENT_14',
 'FLAG_DOCUMENT_15',
 'FLAG_DOCUMENT_16',
 'FLAG_DOCUMENT_17',
 'FLAG_DOCUMENT_18',
 'FLAG_DOCUMENT_19',
 'FLAG_DOCUMENT_2',
 'FLAG_DOCUMENT_20',
 'FLAG_DOCUMENT_21',
 'FLAG_DOCUMENT_3',
 'FLAG_DOCUMENT_4',
 'FLAG_DOCUMENT_5',
 'FLAG_DOCUMENT_6',
 'FLAG_DOCUMENT_7',
 'FLAG_DOCUMENT_8',
 'FLAG_DOCUMENT_9',]

bureau_comp = ['AMT_REQ_CREDIT_BUREAU_HOUR','AMT_REQ_CREDIT_BUREAU_DAY','AMT_REQ_CREDIT_BUREAU_WEEK',
               'AMT_REQ_CREDIT_BUREAU_MON','AMT_REQ_CREDIT_BUREAU_QRT','AMT_REQ_CREDIT_BUREAU_YEAR']


In [7]:
def get_group_score(df,cols,t_limit=80,n=5,ignore=[]):
    groups = df[cols[0]].astype('str')
    groups = groups.reset_index(drop=True)
    if groups.nunique() > t_limit:
        if cols[0] not in ignore:
            groups = scoring(groups,n,fillna=False)
            groups = groups.astype('str')
            print('unique process {}'.format(cols[0]))
    for each in cols[1:]:
        
        c = df[each].copy()
        c = c.reset_index(drop=True)
        if c.nunique() > t_limit:
            if each not in ignore:
                print('unique process {}'.format(each))
                c = scoring(c,n,fillna=False)
        groups += '_' + c.astype('str')
    groups.fillna('na')
    return groups

In [8]:
encoder = LabelEncoder()

# Process Bureau related

In [9]:
# 60
bur = get_group_score(X,bureau_comp,t_limit=10,n=6)
bur.loc[:] = encoder.fit_transform(bur)
print(bur.nunique())

unique process AMT_REQ_CREDIT_BUREAU_MON
unique process AMT_REQ_CREDIT_BUREAU_QRT
unique process AMT_REQ_CREDIT_BUREAU_YEAR
1211


# Process Housing related

In [10]:
# Processing Hoursing related
housing = get_group_score(X,house_related,t_limit=10,n=4)
housing.loc[:] = encoder.fit_transform(housing)
print(housing.nunique())

unique process APARTMENTS_AVG
unique process BASEMENTAREA_AVG
unique process YEARS_BEGINEXPLUATATION_AVG
unique process YEARS_BUILD_AVG
unique process COMMONAREA_AVG
unique process ELEVATORS_AVG
unique process ENTRANCES_AVG
unique process FLOORSMAX_AVG
unique process FLOORSMIN_AVG
unique process LANDAREA_AVG
unique process LIVINGAPARTMENTS_AVG
unique process LIVINGAREA_AVG
unique process NONLIVINGAPARTMENTS_AVG
unique process NONLIVINGAREA_AVG
unique process APARTMENTS_MODE
unique process BASEMENTAREA_MODE
unique process YEARS_BEGINEXPLUATATION_MODE
unique process YEARS_BUILD_MODE
unique process COMMONAREA_MODE
unique process ELEVATORS_MODE
unique process ENTRANCES_MODE
unique process FLOORSMAX_MODE
unique process FLOORSMIN_MODE
unique process LANDAREA_MODE
unique process LIVINGAPARTMENTS_MODE
unique process LIVINGAREA_MODE
unique process NONLIVINGAPARTMENTS_MODE
unique process NONLIVINGAREA_MODE
unique process APARTMENTS_MEDI
unique process BASEMENTAREA_MEDI
unique process YEARS_BEGIN

# process document_comp

In [11]:
# Processing Hoursing related
documents = get_group_score(X,document_comp,t_limit=4,n=3)
documents.loc[:] = encoder.fit_transform(documents)
print(documents.nunique())

72


# Process application

In [12]:
application_related
# Processing Hoursing related
application = get_group_score(X,application_related,t_limit=8,n=8,ignore=['HOUR_APPR_PROCESS_START'])
application.loc[:] = encoder.fit_transform(application)
print(application.nunique())

168


# Processing surround

In [13]:
# Processing Hoursing related
surounding = get_group_score(X,surounding_related,t_limit=10,n=5)
surounding.loc[:] = encoder.fit_transform(surounding)
print(surounding.nunique())

unique process OBS_30_CNT_SOCIAL_CIRCLE
unique process OBS_60_CNT_SOCIAL_CIRCLE
unique process DAYS_LAST_PHONE_CHANGE
368


# Processing living

In [14]:
# Processing Hoursing related
living = get_group_score(X,living_related,t_limit=10,n=5)
living.loc[:] = encoder.fit_transform(living)
print(living.nunique())

unique process REGION_POPULATION_RELATIVE
328


# Processing contact

In [15]:
# Processing Hoursing related
contact = get_group_score(X,contact_related,t_limit=10,n=10)
contact.loc[:] = encoder.fit_transform(contact)
print(contact.nunique())

25


# Processing user

In [16]:
# Processing Hoursing related
user = get_group_score(X,user_col,t_limit=20,n=10,ignore=['ORGANIZATION_TYPE','OCCUPATION_TYPE'])
user.loc[:] = encoder.fit_transform(user)
print(user.nunique())

unique process AMT_INCOME_TOTAL
unique process DAYS_BIRTH
unique process DAYS_EMPLOYED
unique process DAYS_REGISTRATION
unique process DAYS_ID_PUBLISH
unique process OWN_CAR_AGE
unique process EXT_SOURCE_1
unique process EXT_SOURCE_2
unique process EXT_SOURCE_3
355615


# Processing External Source 

In [17]:
# Processing Hoursing related
ext = get_group_score(X,external_col,t_limit=50,n=50,ignore=['ORGANIZATION_TYPE','OCCUPATION_TYPE'])
ext.loc[:] = encoder.fit_transform(ext)
print(ext.nunique())

unique process EXT_SOURCE_1
unique process EXT_SOURCE_2
unique process EXT_SOURCE_3
113475


# Processing item

In [18]:
item =  get_group_score(X,item_col,t_limit=10,n=10,ignore=[])
item.loc[:] = encoder.fit_transform(item)
print(item.nunique())

unique process AMT_CREDIT
unique process AMT_ANNUITY
unique process AMT_GOODS_PRICE
367


# concat new dataframe

In [19]:
new_df = pd.DataFrame({'bureau':bur.values, 
                       'house':housing.values,
                       'document':documents.values,
                       'application':application,
                       'surround':surounding,
                       'living':living,
                       'contact':contact,
                       'user':user,
                       'item':item,
                       'ext':ext})

df_history = new_df.iloc[:307511-20000].copy()
print(df_history.shape)
df_testtrain = new_df.iloc[307511-20000:307511].copy()
print(df_testtrain.shape)
df_test = new_df.iloc[307511:].copy()
print(df_test.shape)
targets = X_Train.iloc[:307511-20000]['TARGET'].values
df_history['TARGET'] = targets
df_train = new_df.iloc[:307511].copy()
print(df_train.shape)
df_train['TARGET'] = X_Train.iloc[:307511]['TARGET'].values


(287511, 10)
(20000, 10)
(48744, 10)
(307511, 10)


In [20]:
fold = 5
kf = KFold(n_splits=fold)
val_index=[]
train_index = []
for t,v in kf.split(df_history):
    train_index.append(t)
    val_index.append(v)

In [21]:
df_light = pd.DataFrame()
df_light['SK_ID_CURR'] = X.SK_ID_CURR
from itertools import combinations
fix_user = ['user','application']
user_ava = ['bureau','house','document','surround','living','contact',]
for combs in range(0,7):
    for each in combinations(user_ava,combs):
        feature_value = []
        userid_col = fix_user.copy()
        userid_col.extend(list(each))
        feature_name = 'LightFM_'+'_'.join(userid_col)
        print(feature_name)
        ####### generate features
        # step 1. generate kfolds
        for each_fold in range(fold):
            print('fold {}'.format(each_fold))
            t_index = train_index[each_fold]
            v_index = val_index[each_fold]
            fm_history_df = df_history.iloc[t_index]
            fm_val_df = df_history.iloc[v_index]
            r = matrix_factorization(fm_history_df,
                         fm_val_df,
                         target='TARGET',
                         item_col=['item',],
                         userid_col=userid_col,
                         userraw_col=['user'])
            r = list(r)
            feature_value.extend(r)
        # step 2. apply to traintest
        print('apply to train_test')
        r = matrix_factorization(df_history,
                         df_testtrain,
                         target='TARGET',
                         item_col=['item',],
                         userid_col=userid_col,
                         userraw_col=['user'])
        feature_value.extend(r)
        # step 3. apply to test
        print('apply to test')
        r = matrix_factorization(df_train,
                         df_test,
                         target='TARGET',
                         item_col=['item',],
                         userid_col=userid_col,
                         userraw_col=['user'])
        feature_value.extend(r)
        df_light[feature_name] = feature_value
        print('================================')
df_light.to_pickle(ENV.lightfm_v1.value)

LightFM_user_application
fold 0
fitting lightFM
predicting lightFM
fold 1
fitting lightFM
predicting lightFM
fold 2
fitting lightFM
predicting lightFM
fold 3
fitting lightFM
predicting lightFM
fold 4
fitting lightFM
predicting lightFM
apply to train_test
fitting lightFM
predicting lightFM
apply to test
fitting lightFM
predicting lightFM
LightFM_user_application_bureau
fold 0
fitting lightFM
predicting lightFM
fold 1
fitting lightFM
predicting lightFM
fold 2
fitting lightFM
predicting lightFM
fold 3
fitting lightFM
predicting lightFM
fold 4
fitting lightFM
predicting lightFM
apply to train_test
fitting lightFM
predicting lightFM
apply to test
fitting lightFM
predicting lightFM
LightFM_user_application_house
fold 0
fitting lightFM
predicting lightFM
fold 1
fitting lightFM
predicting lightFM
fold 2
fitting lightFM
predicting lightFM
fold 3
fitting lightFM
predicting lightFM
fold 4
fitting lightFM
predicting lightFM
apply to train_test
fitting lightFM
predicting lightFM
apply to test
fitti

fitting lightFM
predicting lightFM
fold 3
fitting lightFM
predicting lightFM
fold 4
fitting lightFM
predicting lightFM
apply to train_test
fitting lightFM
predicting lightFM
apply to test
fitting lightFM
predicting lightFM
LightFM_user_application_bureau_house_document
fold 0
fitting lightFM
predicting lightFM
fold 1
fitting lightFM
predicting lightFM
fold 2
fitting lightFM
predicting lightFM
fold 3
fitting lightFM
predicting lightFM
fold 4
fitting lightFM
predicting lightFM
apply to train_test
fitting lightFM
predicting lightFM
apply to test
fitting lightFM
predicting lightFM
LightFM_user_application_bureau_house_surround
fold 0
fitting lightFM
predicting lightFM
fold 1
fitting lightFM
predicting lightFM
fold 2
fitting lightFM
predicting lightFM
fold 3
fitting lightFM
predicting lightFM
fold 4
fitting lightFM
predicting lightFM
apply to train_test
fitting lightFM
predicting lightFM
apply to test
fitting lightFM
predicting lightFM
LightFM_user_application_bureau_house_living
fold 0
fit

fitting lightFM
predicting lightFM
fold 1
fitting lightFM
predicting lightFM
fold 2
fitting lightFM
predicting lightFM
fold 3
fitting lightFM
predicting lightFM
fold 4
fitting lightFM
predicting lightFM
apply to train_test
fitting lightFM
predicting lightFM
apply to test
fitting lightFM
predicting lightFM
LightFM_user_application_bureau_house_document_living
fold 0
fitting lightFM
predicting lightFM
fold 1
fitting lightFM
predicting lightFM
fold 2
fitting lightFM
predicting lightFM
fold 3
fitting lightFM
predicting lightFM
fold 4
fitting lightFM
predicting lightFM
apply to train_test
fitting lightFM
predicting lightFM
apply to test
fitting lightFM
predicting lightFM
LightFM_user_application_bureau_house_document_contact
fold 0
fitting lightFM
predicting lightFM
fold 1
fitting lightFM
predicting lightFM
fold 2
fitting lightFM
predicting lightFM
fold 3
fitting lightFM
predicting lightFM
fold 4
fitting lightFM
predicting lightFM
apply to train_test
fitting lightFM
predicting lightFM
apply

predicting lightFM
fold 3
fitting lightFM
predicting lightFM
fold 4
fitting lightFM
predicting lightFM
apply to train_test
fitting lightFM
predicting lightFM
apply to test
fitting lightFM
predicting lightFM
LightFM_user_application_bureau_house_document_surround_living_contact
fold 0
fitting lightFM
predicting lightFM
fold 1
fitting lightFM
predicting lightFM
fold 2
fitting lightFM
predicting lightFM
fold 3
fitting lightFM
predicting lightFM
fold 4
fitting lightFM
predicting lightFM
apply to train_test
fitting lightFM
predicting lightFM
apply to test
fitting lightFM
predicting lightFM


In [25]:
df_light.describe()

Unnamed: 0,SK_ID_CURR,LightFM_user_application,LightFM_user_application_bureau,LightFM_user_application_house,LightFM_user_application_document,LightFM_user_application_surround,LightFM_user_application_living,LightFM_user_application_contact,LightFM_user_application_bureau_house,LightFM_user_application_bureau_document,...,LightFM_user_application_house_document_living_contact,LightFM_user_application_house_surround_living_contact,LightFM_user_application_document_surround_living_contact,LightFM_user_application_bureau_house_document_surround_living,LightFM_user_application_bureau_house_document_surround_contact,LightFM_user_application_bureau_house_document_living_contact,LightFM_user_application_bureau_house_surround_living_contact,LightFM_user_application_bureau_document_surround_living_contact,LightFM_user_application_house_document_surround_living_contact,LightFM_user_application_bureau_house_document_surround_living_contact
count,356255.0,356255.0,356255.0,356255.0,356255.0,356255.0,356255.0,356255.0,356255.0,356255.0,...,356255.0,356255.0,356255.0,356255.0,356255.0,356255.0,356255.0,356255.0,356255.0,356255.0
mean,278128.0,-2.318111,-2.316917,-2.318488,-2.318547,-2.317743,-2.317476,-2.317462,-2.316972,-2.317414,...,-2.317596,-2.317591,-2.317243,-2.317158,-2.318019,-2.317458,-2.317328,-2.316596,-2.318378,-2.31773
std,102842.104413,0.4462,0.445879,0.446391,0.446214,0.445119,0.445036,0.445928,0.445459,0.445987,...,0.445753,0.445849,0.445482,0.445988,0.446818,0.445519,0.444792,0.445292,0.44685,0.446451
min,100001.0,-3.212132,-3.227758,-3.239988,-3.238286,-3.239985,-3.234018,-3.236585,-3.231636,-3.231741,...,-3.224111,-3.235216,-3.231487,-3.229947,-3.235679,-3.23595,-3.233476,-3.237295,-3.236491,-3.240269
25%,189064.5,-2.653245,-2.650077,-2.653837,-2.648435,-2.655917,-2.65178,-2.66079,-2.653198,-2.653671,...,-2.654869,-2.651645,-2.657917,-2.650331,-2.654299,-2.649465,-2.659331,-2.65654,-2.655887,-2.651863
50%,278128.0,-2.385657,-2.383821,-2.382123,-2.385908,-2.385818,-2.383078,-2.389554,-2.382727,-2.382074,...,-2.386658,-2.383433,-2.383018,-2.381519,-2.380725,-2.383067,-2.380026,-2.383453,-2.384151,-2.377466
75%,367191.5,-2.032625,-2.039033,-2.034185,-2.036518,-2.036032,-2.03202,-2.033225,-2.039342,-2.037795,...,-2.033375,-2.036202,-2.039467,-2.032474,-2.032272,-2.035452,-2.035258,-2.029261,-2.036317,-2.028435
max,456255.0,0.09158,0.09162,0.091871,0.091673,0.09178,0.091664,0.091414,0.091416,0.091741,...,0.091882,0.091931,0.091822,0.091912,0.091629,0.091636,0.091691,0.091849,0.091765,0.092246


In [22]:
len(feature_value)

356255

In [23]:
aaaa

NameError: name 'aaaa' is not defined

In [27]:
r = matrix_factorization(df_history,
                         df_testtrain,
                         target='TARGET',
                         item_col=['item',],
                         userid_col=['user','application'],
                         userraw_col=['user'])

fitting lightFM
predicting lightFM


In [28]:
list(r)

[-2.724619150161743,
 -2.3400514125823975,
 -2.5598483085632324,
 -2.7037322521209717,
 -2.9035565853118896,
 -2.7635505199432373,
 -2.4595601558685303,
 -2.4620614051818848,
 -2.455408811569214,
 -2.5653061866760254,
 -2.6394271850585938,
 -1.5449867248535156,
 -2.6109118461608887,
 -2.0682294368743896,
 -2.3947789669036865,
 -2.563316822052002,
 -2.0691730976104736,
 -2.071018695831299,
 -2.705087661743164,
 -2.6117305755615234,
 -2.5314505100250244,
 -1.817025065422058,
 -2.651716947555542,
 -2.2237486839294434,
 -2.228170156478882,
 -2.320091724395752,
 -2.161504030227661,
 -1.4164012670516968,
 -2.2625317573547363,
 -2.336378574371338,
 -1.5424875020980835,
 -1.6910752058029175,
 -2.5514485836029053,
 -2.484785556793213,
 -2.6474738121032715,
 -2.235687494277954,
 -2.9405508041381836,
 -2.016004800796509,
 -2.0842974185943604,
 -2.795483350753784,
 -1.0330761671066284,
 -2.002720832824707,
 -2.8627638816833496,
 -2.4337480068206787,
 -2.1612656116485596,
 -2.193995237350464,
 -3.1

In [None]:
np.corrcoef(r,df_history.TARGET.values)

In [None]:
np.corrcoef(r,X_Train_ori.TARGET.values[307511-20000:307511])

In [None]:
r[20:40]

In [None]:
 X_Train_ori.TARGET.values[20:40]

In [None]:
tail_target = X_Train_ori.TARGET.values[307511-20000:307511]
tail_target[20:40]

In [26]:
def matrix_factorization(df_history, df, target, item_col, userid_col, userraw_col):
    """
    userid_col is unique user id
    item_col is unique itme id
    userraw_col is used to construct user feature. dim: user_id*userraw
    """
    dff = pd.DataFrame()
    dff_history = pd.DataFrame()


    #1. process item
    if item_col is None:
        dff['item'] = np.zeros(len(df))
        dff_history['item'] = np.zeros(len(df_history))
    else:
        encoder = LabelEncoder()
        group = get_group(df, item_col)
        group_history = get_group(df_history, item_col)
        encoder.fit(pd.concat([group, group_history]))
        dff['item'] = encoder.transform(group)
        dff_history['item'] = encoder.transform(group_history)
#     print('processing item done!')

    #2. user raw
    group = get_group(df, userraw_col)
    group_history = get_group(df_history, userraw_col)
    encoder = LabelEncoder()
    encoder.fit(pd.concat([group, group_history]))
    dff['userraw'] = encoder.transform(group)
    dff_history['userraw'] = encoder.transform(group_history)
#     print('processing user raw done')


    #3. user_id
    group = get_group(df, userid_col)
    group_history = get_group(df_history, userid_col)
    encoder = LabelEncoder()
    encoder.fit(pd.concat([group, group_history]))
    dff['user_id'] = encoder.transform(group)
    dff_history['user_id'] = encoder.transform(group_history)
#     print('processing user id done')



    num_users = max(dff.user_id.max(), dff_history.user_id.max()) + 1
    num_items = max(dff.item.max(), dff_history.item.max()) + 1
    num_userraw = max(dff.userraw.max(), dff_history.userraw.max()) + 1

    M = coo_matrix(
            (df_history[target], ( dff_history.user_id, dff_history.item)),
            shape=(num_users, num_items)
        )

    user_features = pd.concat([dff, dff_history])[['userraw', 'user_id']].drop_duplicates()

    user_features = coo_matrix(
        (np.ones(len(user_features)), (user_features.user_id, user_features.userraw)),
        shape=(num_users, num_userraw)
    )

    user_features = sp.hstack([sp.eye(num_users), user_features])

    model = LightFM(no_components=5, learning_rate=0.1)
    print('fitting lightFM')
    model.fit(
            M, 
            epochs=2, 
            num_threads=36, 
            user_features=user_features,
        )
    print('predicting lightFM')
    result = model.predict(
        dff.user_id.values, 
        dff.item.values, 
        user_features=user_features,
    )
    return result

In [None]:
# def matrix_factorization(df_history, df, target, item_col, userid_col, userraw_col):
#     """
#     userid_col is unique user id
#     item_col is unique itme id
#     userraw_col is used to construct user feature. dim: user_id*userraw
#     """
#     dff = pd.DataFrame()


# usage:
#     df_train[feature_name] = matrix_factorization(df_history, df_train,target, item_col=['channel'], userid_col=['ip', 'app','device','os'], userraw_col=['ip'])


dff = pd.DataFrame()
dff_history = pd.DataFrame()

#1. process item
if item_col is None:
    dff['item'] = np.zeros(len(df))
    dff_history['item'] = np.zeros(len(df_history))
else:
    encoder = LabelEncoder()
    group = get_group(df, item_col)
    group_history = get_group(df_history, item_col)
    encoder.fit(pd.concat([group, group_history]))
    dff['item'] = encoder.transform(group)
    dff_history['item'] = encoder.transform(group_history)
#     print('processing item done!')

#2. user raw
group = get_group(df, userraw_col)
group_history = get_group(df_history, userraw_col)
encoder = LabelEncoder()
encoder.fit(pd.concat([group, group_history]))
dff['userraw'] = encoder.transform(group)
dff_history['userraw'] = encoder.transform(group_history)
#     print('processing user raw done')


#3. user_id
group = get_group(df, userid_col)
group_history = get_group(df_history, userid_col)
encoder = LabelEncoder()
encoder.fit(pd.concat([group, group_history]))
dff['user_id'] = encoder.transform(group)
dff_history['user_id'] = encoder.transform(group_history)
#     print('processing user id done')



num_users = max(dff.user_id.max(), dff_history.user_id.max()) + 1
num_items = max(dff.item.max(), dff_history.item.max()) + 1
num_userraw = max(dff.userraw.max(), dff_history.userraw.max()) + 1

M = coo_matrix(
            (df_history[target], ( dff_history.user_id, dff_history.item)),
            shape=(num_users, num_items)
        )

user_features = pd.concat([dff, dff_history])[['userraw', 'user_id']].drop_duplicates()

user_features = coo_matrix(
    (np.ones(len(user_features)), (user_features.user_id, user_features.userraw)),
    shape=(num_users, num_userraw)
)

user_features = sp.hstack([sp.eye(num_users), user_features])

model = LightFM(no_components=50, learning_rate=0.1)
print('fitting lightFM')
model.fit(
        M, 
        epochs=2, 
        num_threads=36, 
        user_features=user_features,
    )
print('predicting lightFM')
result = model.predict(
    dff.user_id.values, 
    dff.item.values, 
    user_features=user_features,
)