In [1]:
import numpy as np
import pandas as pd
from gensim.models import FastText
from sklearn.utils import shuffle
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer

In [2]:
# Directory (relative to the notebook's working directory) that every input and
# output path below is built from by plain string concatenation, hence the
# trailing slash.
base_dir = './dataset/'

In [4]:
# Load the per-user app usage table stored under the 'data' key of the HDF5 file,
# then display its (rows, columns) shape as the cell output.
user_app_usage = pd.read_hdf(
    base_dir + 'user_app_usage_df.h5',
    key='data',
)
user_app_usage.shape

(4020281, 2)

In [5]:
# Preview the first five rows: one row per user, with the apps they used joined
# by '#' in the `appIds` column.
user_app_usage.head(n=5)

Unnamed: 0,uId,appIds
0,1000009,a008224#a001055#a002392#a003386#a003659#a00548...
1,1000010,a006085#a003327#a001578#a007488#a00451#a001050...
2,1000011,a007629#a006461#a00604#a006400#a007439#a00326#...
3,1000012,a003570#a008587#a007439#a005893#a007447#a00588...
4,1000014,a001671#a007445#a007088#a008879#a001062


In [6]:
# Labelled users. Column names are supplied explicitly, so the first line of the
# file is read as data; compact unsigned dtypes keep the frame small.
train_set = pd.read_csv(
    base_dir + 'age_train.csv',
    names=['uId', 'age_group'],
    dtype={'uId': np.uint32, 'age_group': np.uint8},
)
train_set.shape

(4000000, 2)

In [7]:
# Unlabelled users to score: a single `uId` column, read with the same explicit
# naming and dtype as the training ids so the later merges compare like with like.
test_set = pd.read_csv(
    base_dir + 'age_test.csv',
    names=['uId'],
    dtype={'uId': np.uint32},
)
test_set.shape

(1000000, 1)

In [8]:
# Force every `appIds` value to a Python string so the '#'-split in the next step
# never hits a non-string value.
# NOTE(review): a missing value would presumably become the literal string 'nan'
# here and later be treated as an app token - confirm the column has no nulls.
app_ids_as_text = user_app_usage['appIds'].astype(str)
user_app_usage['appIds'] = app_ids_as_text
del app_ids_as_text

In [9]:
# Turn each user's '#'-separated app string into a list of app-id tokens and
# shuffle the tokens within each list before they are used as FastText
# "sentences".
#
# One seeded RandomState is shared by every row: the permutations still differ
# from row to row (the generator state advances on each call), but a fresh
# Restart & Run All now produces the same token order every time instead of a
# new random order per run.
shuffle_rng = np.random.RandomState(42)
app_usage_list = np.array(
    user_app_usage['appIds'].apply(
        lambda app_ids: shuffle(app_ids.strip().split('#'), random_state=shuffle_rng)
    )
)
len(app_usage_list)

4020281

In [10]:
# Train a FastText embedding over the per-user app lists, treating each user's
# list as one sentence and each app id as one word.
# NOTE(review): `size=` and `iter=` are the gensim 3.x keyword names; newer
# gensim releases renamed them, so confirm the installed version before upgrading.
usage_model = FastText(
    app_usage_list,
    size=128,      # embedding dimensionality
    window=5,      # context window
    min_count=5,   # ignore apps seen fewer than 5 times
    workers=12,    # training threads
    sg=1,          # skip-gram rather than CBOW
    iter=10,       # training epochs
)

In [11]:
# Persist the trained model next to the datasets so the embeddings can be
# reloaded without retraining.
usage_model.save(base_dir + 'app_usage_fasttext.model')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [12]:
# Build one fixed-length feature vector per user by summing the FastText
# embeddings of the apps that user has used.
#
# Only apps present in the trained vocabulary contribute to the sum; an app that
# is not in the vocabulary (e.g. filtered out by `min_count`) adds nothing, and a
# user with no known apps gets an all-zero vector.
#
# The lookup goes through `usage_model.wv` (indexing the model object directly is
# deprecated), and the vector length is read from the model instead of being
# hard-coded, so this cell stays correct if the embedding size changes.
# No try/except is used: every lookup is guarded by the vocabulary check, so
# nothing here is expected to raise, and a real error should surface instead of
# being silently swallowed.
word_vectors = usage_model.wv
embedding_dim = word_vectors.vector_size

result = {
    'sum': []
}
for app_ids in app_usage_list:
    # float32 accumulator matching the dtype used for the feature frame later on.
    user_usage_info = np.zeros(embedding_dim, dtype=np.float32)
    for app_id in app_ids:
        if app_id in word_vectors.vocab:
            user_usage_info += word_vectors[app_id]
    result['sum'].append(user_usage_info)

  import sys
  del sys.path[0]


In [13]:
# Stack the per-user vectors into a float32 frame (one row per user, one column
# per embedding dimension) and label the columns '0_sum', '1_sum', ...
result_sum = pd.DataFrame(result['sum'], dtype=np.float32)
result_sum.columns = [str(dim) + '_sum' for dim in result_sum.columns]

# Attach the user ids column-wise. The two frames are aligned on their row index,
# so this relies on `user_app_usage` having the same index labels (in the same
# order the vectors were built) as `result_sum`.
uid_column = user_app_usage[['uId']]
result_sum_with_uid = pd.concat([uid_column, result_sum], axis=1)
del uid_column

In [14]:
# Embedding features for the training users, in `train_set` row order. A left
# merge keeps every training user; users with no usage record get NaN features.
train_app_usage_emb_fasttext_sum = train_set[['uId']].merge(
    result_sum_with_uid,
    on='uId',
    how='left',
)
# Drop the join key so only the feature columns remain. The column is deleted
# directly rather than popped into `_`, because `_` is the name IPython uses for
# the last cell output and rebinding it stops that from being updated.
del train_app_usage_emb_fasttext_sum['uId']
train_app_usage_emb_fasttext_sum.shape

(4000000, 128)

In [15]:
# Write the training features to a blosc-compressed HDF5 file under the 'data'
# key. The store is opened as a context manager so the file handle is closed
# even if the write raises part-way through.
with pd.HDFStore(base_dir+'train_app_usage_emb_fasttext_sum.h5','w',complevel=4,complib='blosc') as h5:
    h5['data'] = train_app_usage_emb_fasttext_sum

In [16]:
# Embedding features for the test users, in `test_set` row order. A left merge
# keeps every test user; users with no usage record get NaN features.
test_app_usage_emb_fasttext_sum = test_set[['uId']].merge(
    result_sum_with_uid,
    on='uId',
    how='left',
)
# Drop the join key so only the feature columns remain. The column is deleted
# directly rather than popped into `_`, because `_` is the name IPython uses for
# the last cell output and rebinding it stops that from being updated.
del test_app_usage_emb_fasttext_sum['uId']
test_app_usage_emb_fasttext_sum.shape

(1000000, 128)

In [17]:
# Write the test features to a blosc-compressed HDF5 file under the 'data' key.
# The store is opened as a context manager so the file handle is closed even if
# the write raises part-way through.
with pd.HDFStore(base_dir+'test_app_usage_emb_fasttext_sum.h5','w',complevel=4,complib='blosc') as h5:
    h5['data'] = test_app_usage_emb_fasttext_sum