# 用户和活动关联关系处理


整个数据集中活动数目（events.csv）太多，所以下面的处理我们找出只在训练集和测试集中出现的活动和用户集合，并对他们重新编制索引

In [1]:
#保存数据
import pickle

import itertools

#处理事件字符串
import datetime

import numpy as np
import scipy.io as sio
import scipy.sparse as ss

#相似度/距离
import scipy.spatial.distance as ssd

from collections import defaultdict
from sklearn.preprocessing import normalize

In [2]:
"""
我们只关心train和test中出现的user和event，因此重点处理这部分关联数据

train.csv 有6列：
user：用户ID
event：活动ID
invited：是否被邀请（0/1）
timestamp：ISO-8601 UTC格式时间字符串，表示用户看到该活动的时间
interested, and not_interested

Test.csv 除了没有interested, and not_interested，其余列与train相同
 """
# 数据地址
dpath = './data/'


# 统计训练集中有多少不同的用户的events
uniqueUsers = set()
uniqueEvents = set()

  
for filename in ["train.csv", "test.csv"]:
    f = open(dpath+filename, 'r')
    
    #忽略第一行（列名字）
    f.readline().strip().split(",")
    
    for line in f:    #对每条记录
        cols = line.strip().split(",")
        uniqueUsers.add(cols[0])   #第一列为用户ID
        uniqueEvents.add(cols[1])   #第二列为活动ID
        
        #eventsForUser[cols[0]].add(cols[1])    #该用户参加了这个活动
        #usersForEvent[cols[1]].add(cols[0])    #该活动被用户参加
    f.close()


n_uniqueUsers = len(uniqueUsers)
n_uniqueEvents = len(uniqueEvents)

print("number of uniqueUsers :%d" % n_uniqueUsers)
print("number of uniqueEvents :%d" % n_uniqueEvents)

number of uniqueUsers :3391
number of uniqueEvents :13418


In [3]:
type(uniqueUsers),type(uniqueEvents)

(set, set)

In [4]:

#用户关系矩阵表，可用于后续LFM/SVD++处理的输入
#这是一个稀疏矩阵，记录用户对活动感兴趣
userEventScores = ss.dok_matrix((n_uniqueUsers, n_uniqueEvents))
userIndex = dict()
eventIndex = dict()

#倒排表
#统计每个用户参加的活动   / 每个活动参加的用户
#设计为字典型, values是set, 不重复
eventsForUser = defaultdict(set)
usersForEvent = defaultdict(set)

#重新编码用户索引字典
#enumerate(),返回可迭代对象的索引和值
for i, u in enumerate(uniqueUsers):
    userIndex[u] = i
    
#重新编码活动索引字典    
for i, e in enumerate(uniqueEvents):
    eventIndex[e] = i

In [5]:
#保存用户索引表
pickle.dump(userIndex, open(dpath+"PE_userIndex.pkl", 'wb'))
#保存活动索引表
pickle.dump(eventIndex, open(dpath+"PE_eventIndex.pkl", 'wb'))


In [6]:
eventIndex

{'3347988500': 0,
 '280784670': 1,
 '188836861': 2,
 '2768687948': 3,
 '268816201': 4,
 '95076039': 5,
 '3874363313': 6,
 '1560792011': 7,
 '1914806770': 8,
 '4195924644': 9,
 '3175914811': 10,
 '3618085464': 11,
 '2659777896': 12,
 '3539439890': 13,
 '1944992146': 14,
 '3292387762': 15,
 '188428993': 16,
 '477878289': 17,
 '2867570803': 18,
 '2892429018': 19,
 '712567298': 20,
 '406602676': 21,
 '1529117437': 22,
 '2491530958': 23,
 '1410302557': 24,
 '946418622': 25,
 '4035812423': 26,
 '2253311961': 27,
 '3038741300': 28,
 '2450647909': 29,
 '2324728487': 30,
 '3638176256': 31,
 '1692359541': 32,
 '1373783311': 33,
 '4193697263': 34,
 '1774186113': 35,
 '1952482771': 36,
 '231567578': 37,
 '940258698': 38,
 '1512828747': 39,
 '1520803271': 40,
 '1767211808': 41,
 '2608593001': 42,
 '1397608202': 43,
 '2731345509': 44,
 '792876040': 45,
 '2660205855': 46,
 '452939638': 47,
 '3738186641': 48,
 '1371103064': 49,
 '1748310329': 50,
 '61104529': 51,
 '790687673': 52,
 '3988717856': 53,
 

In [7]:
userIndex

{'4246010516': 0,
 '3231004341': 1,
 '3421026299': 2,
 '4063804162': 3,
 '4190165036': 4,
 '3936585198': 5,
 '4202112938': 6,
 '964184183': 7,
 '968336394': 8,
 '48222018': 9,
 '2848778083': 10,
 '2271075773': 11,
 '1023717643': 12,
 '1554993130': 13,
 '3890676452': 14,
 '3333431092': 15,
 '2489943153': 16,
 '1953311106': 17,
 '666534021': 18,
 '2722968909': 19,
 '3279061945': 20,
 '1566021834': 21,
 '3029988578': 22,
 '336493147': 23,
 '990116823': 24,
 '4007974189': 25,
 '2882325905': 26,
 '304983697': 27,
 '661907327': 28,
 '1824088561': 29,
 '2259554717': 30,
 '555631167': 31,
 '2023721073': 32,
 '2869467989': 33,
 '2674968860': 34,
 '901207935': 35,
 '3554343061': 36,
 '3908243396': 37,
 '2902724458': 38,
 '3772044787': 39,
 '2357605380': 40,
 '1641960627': 41,
 '1945364081': 42,
 '1544537243': 43,
 '2771438232': 44,
 '310296684': 45,
 '1586029420': 46,
 '3811264923': 47,
 '3474165907': 48,
 '4156591803': 49,
 '3556989632': 50,
 '253594379': 51,
 '3926920467': 52,
 '4051807703': 5

In [8]:
#添加数据
n_records = 0
ftrain = open(dpath+"train.csv", 'r')
ftrain.readline()

for line in ftrain:
    cols = line.strip().split(",")
    i = userIndex[cols[0]]  #用户索引
    j = eventIndex[cols[1]] #活动索引
    
    eventsForUser[i].add(j)    #该用户参加了这个活动
    usersForEvent[j].add(i)    #该活动被用户参加
        
    #userEventScores[i, j] = int(cols[4]) - int(cols[5])   #interested - not_interested
    score = int(cols[4])
    #if score == 0:  #0在稀疏矩阵中表示该元素不存在，因此借用-1表示interested=0
    #userEventScores[i, j] = -1
    #else:
    userEventScores[i, j] = score
ftrain.close()


In [9]:
##统计每个用户参加的活动，后续用于将用户朋友参加的活动影响到用户
pickle.dump(eventsForUser, open(dpath+"PE_eventsForUser.pkl", 'wb'))
##统计活动参加的用户
pickle.dump(usersForEvent, open(dpath+"PE_usersForEvent.pkl", 'wb'))

In [10]:
eventsForUser

defaultdict(set,
            {1548: {2830, 6100, 7874, 10875, 12902, 12929},
             769: {1443, 2879, 9375, 9953, 10365, 11766, 12164},
             269: {2588, 5942, 8237, 8852, 11169, 11415},
             251: {7769, 8402, 8633, 9166, 11409, 13236},
             2833: {82, 2213, 7122, 10159, 10661, 12091},
             111: {2089,
              5106,
              5918,
              7087,
              8308,
              8675,
              9372,
              10024,
              10058,
              10081,
              11095,
              11144,
              11854,
              12563},
             2763: {3363, 4920, 5026, 5541, 8193, 8489},
             1429: {712, 954, 1276, 11144, 11591, 12570},
             1304: {1068, 5517, 7816, 9491, 11823, 11986, 12984},
             3295: {239, 281, 3001, 3577, 4226, 9763},
             2073: {1772, 2860, 3500, 4542, 10641, 10957},
             2671: {2286, 2456, 4014, 6635, 7811, 8997},
             1023: {370, 2758, 4232, 11

In [11]:
usersForEvent

defaultdict(set,
            {6100: {1543, 1548},
             12902: {1548, 1580, 1654, 1920},
             2830: {4,
              12,
              32,
              47,
              57,
              60,
              72,
              85,
              123,
              138,
              156,
              158,
              217,
              227,
              241,
              284,
              336,
              337,
              355,
              368,
              371,
              394,
              411,
              420,
              437,
              455,
              477,
              483,
              534,
              538,
              559,
              563,
              592,
              609,
              610,
              615,
              619,
              625,
              651,
              682,
              706,
              712,
              713,
              722,
              797,
              809,
              855,
              

In [12]:
#保存用户-活动关系矩阵R，以备后用
sio.mmwrite(dpath+"PE_userEventScores", userEventScores)

In [13]:
userEventScores.keys()
#(userIndex, eventIndex)

dict_keys([(1548, 2830), (769, 11766), (769, 9953), (769, 1443), (269, 11415), (251, 8402), (2833, 82), (111, 11854), (111, 9372), (111, 11095), (111, 12563), (2763, 8193), (2763, 5541), (1429, 12570), (1304, 9491), (1304, 12984), (1304, 5517), (3295, 4226), (2073, 3500), (2671, 6635), (1023, 2758), (534, 7598), (534, 7874), (534, 2830), (534, 4200), (534, 12285), (534, 10875), (534, 3234), (534, 7308), (534, 184), (534, 11144), (534, 10497), (1423, 2304), (2417, 10497), (2417, 11144), (2417, 10875), (2417, 184), (2217, 6299), (1794, 7754), (1479, 13347), (1386, 42), (310, 6609), (336, 2830), (336, 11144), (336, 10497), (2203, 5274), (2203, 7084), (1978, 6747), (1102, 9953), (488, 2504), (2915, 10304), (1099, 13244), (2223, 4973), (452, 7339), (452, 2903), (2997, 11144), (2997, 1721), (2997, 10497), (2997, 9762), (1454, 7543), (1454, 4175), (3308, 13052), (1742, 5002), (1742, 4489), (1742, 7131), (1742, 1584), (1742, 5392), (3205, 7508), (3205, 3475), (86, 13232), (958, 7219), (138, 63

In [14]:
userEventScores.values()
#interested=1

dict_values([1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.

In [15]:
# 为了防止不必要的计算，我们找出来所有关联的用户 或者 关联的event
# 所谓的关联用户，指的是至少在同一个event上有行为的用户pair
# 关联的event指的是至少同一个user有行为的event pair

# combinations(iterable, r)  创建一个迭代器，返回iterable中所有长度为r的子序列，
# 返回的子序列中的项按输入iterable中的顺序排序

uniqueUserPairs = set()
uniqueEventPairs = set()
for event in uniqueEvents:
    i = eventIndex[event]
    users = usersForEvent[i]
    if len(users) > 2:
        uniqueUserPairs.update(itertools.combinations(users, 2))
                                        #combinations('ABCD',   2) --> AB AC AD BC BD CD
                                        #combinations(range(4), 3) --> 012 013 023 123
for user in uniqueUsers:
    u = userIndex[user]
    events = eventsForUser[u]
    if len(events) > 2:
        uniqueEventPairs.update(itertools.combinations(events, 2))

In [16]:
a=list(itertools.combinations(usersForEvent[123], 2))
print(a)

[]


In [17]:
pickle.dump(uniqueUserPairs, open(dpath+"FE_uniqueUserPairs.pkl", 'wb'))
pickle.dump(uniqueEventPairs, open(dpath+"PE_uniqueEventPairs.pkl", 'wb'))

In [18]:
#训练集和测试集中出现的用户数目和事件数目远小于users.csv出现的用户数和events.csv出现的事件数
len(uniqueEventPairs), len(uniqueUserPairs)

(57527, 81005)

In [19]:
(11845, 11928) in uniqueEventPairs, (11928, 11845) in uniqueEventPairs, (11845, 11928)==(11928, 11845)

(False, False, False)

In [20]:
uniqueEventPairs

{(8280, 6428),
 (8240, 7235),
 (12613, 9836),
 (9060, 9198),
 (6681, 729),
 (2985, 8575),
 (9029, 6262),
 (4662, 2879),
 (3945, 11949),
 (3470, 6014),
 (4526, 2546),
 (10470, 12202),
 (3262, 13130),
 (10787, 8604),
 (6625, 1289),
 (8613, 6869),
 (162, 4502),
 (514, 10447),
 (5046, 8936),
 (12023, 2266),
 (10246, 12763),
 (12385, 1918),
 (13200, 2791),
 (1593, 5501),
 (11022, 7344),
 (8465, 11644),
 (11382, 11542),
 (8237, 8270),
 (3425, 5337),
 (8002, 7157),
 (11048, 5367),
 (4116, 11454),
 (9992, 7423),
 (2258, 13338),
 (11144, 9246),
 (11109, 11028),
 (7977, 9995),
 (3399, 3326),
 (9541, 4921),
 (7330, 5839),
 (4662, 796),
 (10433, 8989),
 (9955, 11377),
 (8455, 10990),
 (10489, 12286),
 (11095, 6236),
 (11688, 7606),
 (3096, 6341),
 (7236, 9979),
 (4102, 11701),
 (3902, 9073),
 (12547, 7110),
 (12417, 12618),
 (2187, 1561),
 (618, 6236),
 (8275, 789),
 (10497, 9846),
 (4835, 4990),
 (1068, 10328),
 (12092, 7106),
 (10149, 762),
 (9955, 551),
 (12259, 6212),
 (3905, 7892),
 (6150, 12

In [21]:
uniqueUserPairs

{(284, 2948),
 (328, 748),
 (1403, 2351),
 (303, 2273),
 (1204, 576),
 (3093, 895),
 (1976, 3309),
 (1807, 2887),
 (32, 1988),
 (2494, 1496),
 (2309, 3336),
 (57, 2423),
 (440, 2409),
 (534, 2787),
 (853, 1935),
 (2226, 406),
 (2597, 229),
 (1812, 502),
 (797, 2419),
 (1337, 2002),
 (543, 2419),
 (617, 2159),
 (1824, 2808),
 (1143, 482),
 (1678, 1307),
 (3072, 1041),
 (520, 1992),
 (1706, 2292),
 (2336, 809),
 (1152, 457),
 (105, 760),
 (3037, 111),
 (534, 1429),
 (1564, 1106),
 (1582, 2335),
 (1110, 856),
 (317, 435),
 (3390, 2932),
 (2564, 3044),
 (1562, 1251),
 (2997, 3165),
 (600, 3365),
 (3320, 484),
 (2128, 1462),
 (2787, 1961),
 (596, 2409),
 (932, 1452),
 (1933, 1864),
 (153, 751),
 (844, 2192),
 (1750, 1788),
 (706, 389),
 (526, 168),
 (2582, 2688),
 (1420, 2997),
 (2803, 895),
 (85, 2324),
 (3333, 1380),
 (2183, 2383),
 (2662, 134),
 (722, 1391),
 (1056, 371),
 (57, 2867),
 (534, 2863),
 (1913, 2726),
 (3178, 2765),
 (1106, 2766),
 (2540, 978),
 (3333, 985),
 (2516, 2524),
 (