# 用户数据处理
（只取训练集和测试集中出现的用户ID）

数据来源于Kaggle竞赛：Event Recommendation Engine Challenge，根据
events they’ve responded to in the past
user demographic information
what events they’ve seen and clicked on in our app
用户对某个活动是否感兴趣

竞赛官网：
https://www.kaggle.com/c/event-recommendation-engine-challenge/data

用户描述信息在users.csv文件：共7维特征
user_id
locale：地区，语言
birthyear：出身年
gender：性别
joinedAt：用户加入APP的时间，ISO-8601 UTC time
location：地点
timezone：时区

# 导入工具包

In [1]:
import pandas as pd

import numpy as np
import scipy.sparse as ss
import scipy.io as sio

#保存数据
import pickle

#event的特征需要编码
#from utils import FeatureEng
from sklearn.preprocessing import normalize
#相似度/距离
import scipy.spatial.distance as ssd

总的用户数目超过训练集和测试集中的用户，
为节省处理时间和内存，先去处理train和test，得到竞赛需要用到的事件和用户
然后对在训练集和测试集中出现过的事件和用户建立新的ID索引
先运行user_event.ipynb,
得到事件列表文件：PE_userIndex.pkl

# 读取之前算好的测试集和训练集中出现过的用户

In [2]:
# 数据地址
dpath = './data/'

In [3]:
#读取训练集和测试集中出现过的用户列表
userIndex = pickle.load(open(dpath+"PE_userIndex.pkl", 'rb'))
n_users = len(userIndex)

print("number of users in train & test :%d" % n_users)

number of users in train & test :3391


# 处理users.csv --> 特征编码、用户之间的相似度

In [4]:
#读取数据
users = pd.read_csv(dpath+"users.csv")
users.head()

Unnamed: 0,user_id,locale,birthyear,gender,joinedAt,location,timezone
0,3197468391,id_ID,1993,male,2012-10-02T06:40:55.524Z,Medan Indonesia,480.0
1,3537982273,id_ID,1992,male,2012-09-29T18:03:12.111Z,Medan Indonesia,420.0
2,823183725,en_US,1975,male,2012-10-06T03:14:07.149Z,Stratford Ontario,-240.0
3,1872223848,en_US,1991,female,2012-11-04T08:59:43.783Z,Tehran Iran,210.0
4,3429017717,id_ID,1995,female,2012-09-10T16:06:53.132Z,,420.0


In [5]:
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38209 entries, 0 to 38208
Data columns (total 7 columns):
user_id      38209 non-null int64
locale       38209 non-null object
birthyear    38209 non-null object
gender       38100 non-null object
joinedAt     38152 non-null object
location     32745 non-null object
timezone     37773 non-null float64
dtypes: float64(1), int64(1), object(5)
memory usage: 2.0+ MB


In [6]:
# 特征编码工具
#该事件涉及国家、城市、时间等信息的处理

#保存数据
import pickle

#特征编码
import datetime
import hashlib
# POSIX locale database and functionality
import locale
#国家的编码、名字、语言、货币等信息
import pycountry

import numpy as np
import scipy.io as sio
import scipy.sparse as ss

from collections import defaultdict
from sklearn.preprocessing import normalize

In [7]:
#类别型特征编码
#这里写成类的形式，因为编码字典要在不同的文件中使用
class FeatureEng(object):
    def __init__(self):
    
        # 载入 locales
        self.localeIdMap = defaultdict(int)
        for i, l in enumerate(locale.locale_alias.keys()):
            self.localeIdMap[l] = i + 1
        #print locale.locale_alias.keys()

        # 载入 countries
        self.countryIdMap = defaultdict(int)
        ctryIdx = defaultdict(int)
        for i, c in enumerate(pycountry.countries):
            self.countryIdMap[c.name.lower()] = i + 1
            if c.name.lower() == "usa":ctryIdx["US"] = i
            if c.name.lower() == "canada":ctryIdx["CA"] = i
        for cc in ctryIdx.keys():
            for s in pycountry.subdivisions.get(country_code=cc):
                self.countryIdMap[s.name.lower()] = ctryIdx[cc] + 1

        # 载入 gender id 字典
        ##缺失补0，性别未知
        self.genderIdMap = defaultdict(int, {'NaN': 0, "male":1, "female":2})

    def getLocaleId(self, locstr):
        return self.localeIdMap[locstr.lower()]

    def getGenderId(self, genderStr):
        return self.genderIdMap[genderStr]

    def getJoinedYearMonth(self, dateString):
        try:
            dttm = datetime.datetime.strptime(dateString, "%Y-%m-%dT%H:%M:%S.%fZ")
            return (dttm.year-2010)*12 + dttm.month
        except:  #缺失补0
            return 0

    def getCountryId(self, location):
        if (isinstance(location, str)
            and len(location.strip()) > 0
            and location.rfind("  ") > -1):
            return self.countryIdMap[location[location.rindex("  ") + 2:].lower()]
        else:
            return 0

    def getBirthYearInt(self, birthYear):
        try:
            return 0 if birthYear == "None" else int(birthYear)
        except:
            return 0

    def getTimezoneInt(self, timezone):
        try:
            return int(timezone)
        except:
            return 0

    def getFeatureHash(self, value):
        if len(value.strip()) == 0:
            return -1
        else:
            return int(hashlib.sha224(value.encode('utf-8')).hexdigest()[0:4], 16)

    def getFloatValue(self, value):
        if len(value.strip()) == 0:
            return 0.0
        else:
            return float(value)

In [8]:
FE = FeatureEng()

In [9]:
#locale	birthyear	gender	joinedAt	location	timezone
#去掉user_id列
n_cols = users.shape[1] - 1
cols = ['LocaleId', 'BirthYearInt', 'GenderId', 'JoinedYearMonth', 'CountryId', 'TimezoneInt']

#users编码后的特征
#userMatrix = np.zeros((n_users, n_cols), dtype=np.int)
userMatrix = ss.dok_matrix((n_users, n_cols))

for u in range(users.shape[0]): 
    userId = str(users.loc[u,'user_id'])
    
    if userId in userIndex.keys():  #在训练集或测试集中出现
        i = userIndex[userId]
    
        userMatrix[i, 0] = FE.getLocaleId(users.loc[u,'locale'])
        userMatrix[i, 1] = FE.getBirthYearInt(users.loc[u,'birthyear'])
        userMatrix[i, 2] = FE.getGenderId(users.loc[u,'gender'])
        userMatrix[i, 3] = FE.getJoinedYearMonth(users.loc[u,'joinedAt'])
        
        #由于地点的写法不规范，该编码似乎不起作用（所有样本的特征都被编码成0了）
        userMatrix[i, 4] = FE.getCountryId(users.loc[u,'location'])
        
        userMatrix[i, 5] = FE.getTimezoneInt(users.loc[u,'timezone'])

In [10]:
print(userMatrix)

  (2593, 0)	246.0
  (2593, 1)	1993.0
  (2593, 2)	1.0
  (2593, 3)	35.0
  (2593, 5)	-300.0
  (106, 0)	136.0
  (106, 1)	1993.0
  (106, 2)	2.0
  (106, 3)	31.0
  (106, 5)	-240.0
  (2898, 0)	246.0
  (2898, 1)	1994.0
  (2898, 2)	1.0
  (2898, 3)	35.0
  (2898, 5)	420.0
  (2934, 0)	162.0
  (2934, 1)	1994.0
  (2934, 2)	1.0
  (2934, 3)	35.0
  (2934, 4)	64.0
  (2934, 5)	60.0
  (2721, 0)	246.0
  (2721, 1)	1994.0
  (2721, 2)	1.0
  (2721, 3)	34.0
  :	:
  (2013, 3)	33.0
  (2013, 4)	103.0
  (2013, 5)	420.0
  (1703, 0)	246.0
  (1703, 1)	1994.0
  (1703, 2)	2.0
  (1703, 3)	34.0
  (1703, 4)	103.0
  (1703, 5)	420.0
  (1190, 0)	136.0
  (1190, 1)	1996.0
  (1190, 2)	2.0
  (1190, 3)	34.0
  (1190, 5)	420.0
  (1352, 0)	136.0
  (1352, 1)	1996.0
  (1352, 2)	2.0
  (1352, 3)	34.0
  (1352, 5)	420.0
  (2398, 0)	246.0
  (2398, 1)	1994.0
  (2398, 2)	2.0
  (2398, 3)	34.0
  (2398, 4)	103.0
  (2398, 5)	420.0


In [11]:
# 归一化用户矩阵
userMatrix = normalize(userMatrix, norm="l2", axis=0, copy=False)
sio.mmwrite(dpath+"US_userMatrix", userMatrix)

In [12]:
print(userMatrix)

  (2593, 0)	0.021628856085564065
  (106, 0)	0.011957416372506963
  (2898, 0)	0.021628856085564065
  (2934, 0)	0.014243393031956823
  (2721, 0)	0.021628856085564065
  (2224, 0)	0.011957416372506963
  (3282, 0)	0.021628856085564065
  (214, 0)	0.011957416372506963
  (1594, 0)	0.017144824945873955
  (36, 0)	0.021628856085564065
  (9, 0)	0.021628856085564065
  (1197, 0)	0.011957416372506963
  (1342, 0)	0.021628856085564065
  (326, 0)	0.005451175110995821
  (923, 0)	0.011166116759620473
  (2054, 0)	0.011957416372506963
  (2537, 0)	0.021628856085564065
  (1808, 0)	0.011957416372506963
  (1902, 0)	0.021628856085564065
  (171, 0)	0.025585354149996516
  (2710, 0)	0.011957416372506963
  (560, 0)	0.011957416372506963
  (1608, 0)	0.021628856085564065
  (2812, 0)	0.021628856085564065
  (2733, 0)	0.021628856085564065
  :	:
  (2413, 5)	-0.01303142053748795
  (2732, 5)	0.020850272859980718
  (2322, 5)	0.010425136429990359
  (1299, 5)	0.014334562591236744
  (1339, 5)	0.01824398875248313
  (86, 5)	0.0104

In [13]:
# 计算用户相似度矩阵，之后用户推荐系统
userSimMatrix = ss.dok_matrix((n_users, n_users))

#读取在测试集和训练集中出现的用户对
uniqueUserPairs = pickle.load(open(dpath+"FE_uniqueUserPairs.pkl", 'rb'))

#对角线元素
for i in range(0, n_users):
    userSimMatrix[i, i] = 1.0
    
#对称
for u1, u2 in uniqueUserPairs:
    #i = userIndex[u1]
    #j = userIndex[u2]
    i = u1
    j = u2
    if not (i, j) in userSimMatrix.keys():
        #Person相关系数做为相似度度量
        #特征：国家（locale、location）、年龄、性别、时区、地点
        #usim = ssd.correlation(userMatrix[i,:],
            #userMatrix[j,:])
    
        usim = ssd.correlation(userMatrix.getrow(i).todense(),
          userMatrix.getrow(j).todense())
        userSimMatrix[i, j] = usim
        userSimMatrix[j, i] = usim

In [14]:
dict(userSimMatrix.getrow(88))

{(0, 0): 0.0,
 (0, 1): 0.0,
 (0, 2): 0.0,
 (0, 3): 0.0,
 (0, 4): 0.0,
 (0, 5): 0.0,
 (0, 6): 0.0,
 (0, 7): 0.0,
 (0, 8): 0.0,
 (0, 9): 0.0,
 (0, 10): 0.0,
 (0, 11): 0.0,
 (0, 12): 0.0,
 (0, 13): 0.0,
 (0, 14): 0.0,
 (0, 15): 0.0,
 (0, 16): 0.0,
 (0, 17): 0.0,
 (0, 18): 0.0,
 (0, 19): 0.0,
 (0, 20): 0.0,
 (0, 21): 0.0,
 (0, 22): 0.0,
 (0, 23): 0.0,
 (0, 24): 0.0,
 (0, 25): 0.0,
 (0, 26): 0.0,
 (0, 27): 0.0,
 (0, 28): 0.0,
 (0, 29): 0.0,
 (0, 30): 0.0,
 (0, 31): 0.0,
 (0, 32): 0.0,
 (0, 33): 0.0,
 (0, 34): 0.0,
 (0, 35): 0.0,
 (0, 36): 0.0,
 (0, 37): 0.0,
 (0, 38): 0.0,
 (0, 39): 0.0,
 (0, 40): 0.0,
 (0, 41): 0.0,
 (0, 42): 0.0,
 (0, 43): 0.0,
 (0, 44): 0.0,
 (0, 45): 0.0,
 (0, 46): 0.0,
 (0, 47): 0.0,
 (0, 48): 0.0,
 (0, 49): 0.0,
 (0, 50): 0.0,
 (0, 51): 0.0,
 (0, 52): 0.0,
 (0, 53): 0.0,
 (0, 54): 0.0,
 (0, 55): 0.0,
 (0, 56): 0.0,
 (0, 57): 0.0,
 (0, 58): 0.0,
 (0, 59): 0.0,
 (0, 60): 0.0,
 (0, 61): 0.0,
 (0, 62): 0.0,
 (0, 63): 0.0,
 (0, 64): 0.0,
 (0, 65): 0.0,
 (0, 66): 0.0,
 (0, 

In [15]:
sio.mmwrite(dpath+"US_userSimMatrix", userSimMatrix)