In [1]:
import numpy as np
import pandas as pd
from gensim.models import FastText
from sklearn.utils import shuffle
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer

In [2]:
base_dir = './dataset/'

In [3]:
user_app_actived = pd.read_csv(base_dir+'user_app_actived.csv',names=['uId','appId'],dtype={'uId':np.uint32,'appId':str})
user_app_actived.shape

(4999341, 2)

In [4]:
user_app_actived.head()

Unnamed: 0,uId,appId
0,1000110,a001048#a003072#a004443#a006024#a007087#a00743...
1,1000542,a001010#a00158#a001671#a002450#a003484#a003577...
2,1000866,a001048#a00108#a004622#a007104#a0075
3,1001028,a001012#a001055#a001062#a001275#a001403#a00158...
4,1001190,a001012#a00107#a001304#a001403#a001533#a001679...


In [2]:
user_app_usage = pd.read_csv(base_dir+'user_app_usage_df.csv')
user_app_usage.shape

(1992667, 2)

In [8]:
user_app_usage.head()

Unnamed: 0,appIds,uId
0,a00289826#a00290037#a00289511#a00184278#a00178...,1000001
1,a00361265#a00271777#a00135785#a0048562#a001205...,1000002
2,a00289826#a0048522#a00329061#a00290027#a002895...,1000003
3,a00289826#a0036423#a0036526#a00290015#a0028705...,1000004
4,a00289826#a00167245#a00481059#a00331594#a00290...,1000007


In [5]:
train_set = pd.read_csv(base_dir+'age_train.csv',names=['uId','age_group'],dtype={'uId':np.uint32,'age_group':np.uint8})
train_set.shape

(4000000, 2)

In [6]:
test_set = pd.read_csv(base_dir+'age_test.csv',names=['uId'],dtype={'uId':np.uint32})
test_set.shape

(1000000, 1)

### 激活app

In [7]:
app_actived_list=np.array(user_app_actived['appId'].apply(lambda x:shuffle(x.strip().split('#'))))
len(app_actived_list)

4999341

#### fasttext

In [8]:
model = FastText(app_actived_list, size=128, window=5, min_count=5, workers=20,sg=1,iter=10)

In [9]:
model.save(base_dir+'app_actived_fasttext.model')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [None]:
# model = FastText.load(fname)

In [10]:
result={
    'sum':[]
}
for i in app_actived_list:
    no_exist_count=0
    try:
        user_actived_info=np.array(model[i]).sum(aixs=1)
        result['sum'].append(user_actived_info)
    except:
        user_actived_info=np.array([0 for _ in range(128)],dtype=np.float32)
        for app_id in i:
            if app_id in model.wv.vocab:
                user_actived_info+=model[app_id]
            else:
                no_exist_count+=1
        result['sum'].append(user_actived_info)

  import sys
  del sys.path[0]


In [11]:
result_sum=pd.DataFrame(result['sum'],dtype=np.float32)
result_sum.rename(columns=lambda x:str(x)+'_sum',inplace=True)
result_sum_with_uid=pd.concat([user_app_actived[['uId']],result_sum],axis=1)

In [13]:
result_sum_with_uid.shape

(4999341, 129)

In [28]:
# h5 = pd.HDFStore('./dataset/app_actived_embedding_fasttext_sum.h5','w',complevel=4,complib='blosc')
# h5['data'] = result_sum_with_uid
# h5.close()

In [14]:
train_app_activted_emb_fasttext_sum = train_set[['uId']].merge(result_sum_with_uid,on='uId',how='left')
_ = train_app_activted_emb_fasttext_sum.pop('uId')
train_app_activted_emb_fasttext_sum.shape

(4000000, 128)

In [15]:
h5 = pd.HDFStore(base_dir+'train_app_activted_emb_fasttext_sum.h5','w',complevel=4,complib='blosc')
h5['data'] = train_app_activted_emb_fasttext_sum
h5.close()

In [16]:
test_app_activted_emb_fasttext_sum = test_set[['uId']].merge(result_sum_with_uid,on='uId',how='left')
_ = test_app_activted_emb_fasttext_sum.pop('uId')
test_app_activted_emb_fasttext_sum.shape

(1000000, 128)

In [17]:
h5 = pd.HDFStore(base_dir+'test_app_activted_emb_fasttext_sum.h5','w',complevel=4,complib='blosc')
h5['data'] = test_app_activted_emb_fasttext_sum
h5.close()

### 使用app

In [3]:
user_app_usage['appIds'] = user_app_usage['appIds'].astype(str)

In [5]:
app_usage_list=np.array(user_app_usage['appIds'].apply(lambda x:shuffle(x.strip().split('#'))))
len(app_usage_list)

1992667

In [6]:
usage_model = FastText(app_usage_list, size=128, window=5, min_count=5, workers=25,sg=1,iter=10)

In [7]:
usage_model.save(base_dir+'app_usage_fasttext.model')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [8]:
result={
    'sum':[]
}
for i in app_usage_list:
    no_exist_count=0
    try:
        user_usage_info=np.array(usage_model[i]).sum(aixs=1)
        result['sum'].append(user_usage_info)
    except:
        user_usage_info=np.array([0 for _ in range(128)],dtype=np.float32)
        for app_id in i:
            if app_id in usage_model.wv.vocab:
                user_usage_info+=usage_model[app_id]
            else:
                no_exist_count+=1
        result['sum'].append(user_usage_info)

  
  from ipykernel import kernelapp as app


In [15]:
result_sum=pd.DataFrame(result['sum'],dtype=np.float32)
result_sum.rename(columns=lambda x:str(x)+'_sum',inplace=True)
result_sum_with_uid=pd.concat([user_app_usage[['uId']],result_sum],axis=1)

In [16]:
# h5 = pd.HDFStore('./dataset/app_usage_embedding_fasttext_sum.h5','w',complevel=4,complib='blosc')
# h5['data'] = result_sum_with_uid
# h5.close()

In [27]:
train_app_usage_emb_fasttext_sum = train_set[['uId']].merge(result_sum_with_uid,on='uId',how='left')
_ = train_app_usage_emb_fasttext_sum.pop('uId')
train_app_usage_emb_fasttext_sum.shape

(2010000, 129)

In [30]:
h5 = pd.HDFStore(base_dir+'train_app_usage_emb_fasttext_sum.h5','w',complevel=4,complib='blosc')
h5['data'] = train_app_usage_emb_fasttext_sum
h5.close()

In [31]:
test_app_usage_emb_fasttext_sum = test_set[['uId']].merge(result_sum_with_uid,on='uId',how='left')
_ = test_app_usage_emb_fasttext_sum.pop('uId')
test_app_usage_emb_fasttext_sum.shape

(502500, 129)

In [33]:
h5 = pd.HDFStore(base_dir+'test_app_usage_emb_fasttext_sum.h5','w',complevel=4,complib='blosc')
h5['data'] = test_app_usage_emb_fasttext_sum
h5.close()

In [None]:
print(0)