In [1]:
import os, csv
import pandas as pd

In [2]:
os.chdir('/nfs/zixuan/MMRec/Amazon_Elec_Dataset')
os.getcwd()

'/nfs/zixuan/MMRec/Amazon_Elec_Dataset'

In [3]:
df = pd.read_csv('ratings_Electronics.csv', names=['userID', 'itemID', 'rating', 'timestamp'], header=None)
print(f'shape: {df.shape}')
df[:5]

shape: (7824482, 4)


Unnamed: 0,userID,itemID,rating,timestamp
0,AKM1MP6P0OYPR,132793040,5.0,1365811200
1,A2CX7LUOHB2NDG,321732944,5.0,1341100800
2,A2NWSAGRHCP8N5,439886341,1.0,1367193600
3,A2WNBOD3WNDNKT,439886341,3.0,1374451200
4,A1GI0U4ZRJA8WN,439886341,1.0,1334707200


## 先5-core过滤
## 5-core filtering

In [4]:
df = pd.read_csv('ratings_Electronics.csv', names=['userID', 'itemID', 'rating', 'timestamp'], header=None)
print(f'shape: {df.shape}')
df[:5]

shape: (7824482, 4)


Unnamed: 0,userID,itemID,rating,timestamp
0,AKM1MP6P0OYPR,132793040,5.0,1365811200
1,A2CX7LUOHB2NDG,321732944,5.0,1341100800
2,A2NWSAGRHCP8N5,439886341,1.0,1367193600
3,A2WNBOD3WNDNKT,439886341,3.0,1374451200
4,A1GI0U4ZRJA8WN,439886341,1.0,1334707200


In [5]:
k_core = 5
learner_id, course_id, tmstmp_str = 'userID', 'itemID', 'timestamp'

df.dropna(subset=[learner_id, course_id, tmstmp_str], inplace=True)
df.drop_duplicates(subset=[learner_id, course_id, tmstmp_str], inplace=True)
print(f'After dropped: {df.shape}')
df[:3]

After dropped: (7824482, 4)


Unnamed: 0,userID,itemID,rating,timestamp
0,AKM1MP6P0OYPR,132793040,5.0,1365811200
1,A2CX7LUOHB2NDG,321732944,5.0,1341100800
2,A2NWSAGRHCP8N5,439886341,1.0,1367193600


In [6]:
from collections import Counter
import numpy as np

min_u_num, min_i_num = 5, 5

def get_illegal_ids_by_inter_num(df, field, max_num=None, min_num=None):
    if field is None:
        return set()
    if max_num is None and min_num is None:
        return set()

    max_num = max_num or np.inf
    min_num = min_num or -1

    ids = df[field].values
    inter_num = Counter(ids)
    ids = {id_ for id_ in inter_num if inter_num[id_] < min_num or inter_num[id_] > max_num}
    print(f'{len(ids)} illegal_ids_by_inter_num, field={field}')

    return ids


def filter_by_k_core(df):
    while True:
        ban_users = get_illegal_ids_by_inter_num(df, field=learner_id, max_num=None, min_num=min_u_num)
        ban_items = get_illegal_ids_by_inter_num(df, field=course_id, max_num=None, min_num=min_i_num)
        if len(ban_users) == 0 and len(ban_items) == 0:
            return

        dropped_inter = pd.Series(False, index=df.index)
        if learner_id:
            dropped_inter |= df[learner_id].isin(ban_users)
        if course_id:
            dropped_inter |= df[course_id].isin(ban_items)
        print(f'{len(dropped_inter)} dropped interactions')
        df.drop(df.index[dropped_inter], inplace=True)



## k-core

In [7]:
filter_by_k_core(df)
print(f'k-core shape: {df.shape}')
print(f'shape after k-core: {df.shape}')
df[:2]

3947632 illegal_ids_by_inter_num, field=userID
318219 illegal_ids_by_inter_num, field=itemID
7824482 dropped interactions
27241 illegal_ids_by_inter_num, field=userID
74577 illegal_ids_by_inter_num, field=itemID
2109869 dropped interactions
29071 illegal_ids_by_inter_num, field=userID
3183 illegal_ids_by_inter_num, field=itemID
1847652 dropped interactions
2034 illegal_ids_by_inter_num, field=userID
3667 illegal_ids_by_inter_num, field=itemID
1727114 dropped interactions
2648 illegal_ids_by_inter_num, field=userID
318 illegal_ids_by_inter_num, field=itemID
1705108 dropped interactions
218 illegal_ids_by_inter_num, field=userID
366 illegal_ids_by_inter_num, field=itemID
1693333 dropped interactions
294 illegal_ids_by_inter_num, field=userID
28 illegal_ids_by_inter_num, field=itemID
1691006 dropped interactions
23 illegal_ids_by_inter_num, field=userID
45 illegal_ids_by_inter_num, field=itemID
1689720 dropped interactions
38 illegal_ids_by_inter_num, field=userID
6 illegal_ids_by_inter_n

Unnamed: 0,userID,itemID,rating,timestamp
13,AO94DHGC771SJ,528881469,5.0,1370131200
14,AMO214LNFCEI4,528881469,1.0,1290643200


## Re-index

In [8]:
df.reset_index(drop=True, inplace=True)

In [9]:

i_mapping_file = 'i_id_mapping.csv'
u_mapping_file = 'u_id_mapping.csv'

splitting = [0.7, 0.1, 0.2]
uid_field, iid_field = learner_id, course_id

uni_users = pd.unique(df[uid_field])
uni_items = pd.unique(df[iid_field])

# start from 0
u_id_map = {k: i for i, k in enumerate(uni_users)}
i_id_map = {k: i for i, k in enumerate(uni_items)}

df[uid_field] = df[uid_field].map(u_id_map)
df[iid_field] = df[iid_field].map(i_id_map)
df[uid_field] = df[uid_field].astype(int)
df[iid_field] = df[iid_field].astype(int)

# dump
rslt_dir = './'
u_df = pd.DataFrame(list(u_id_map.items()), columns=['user_id', 'userID'])
i_df = pd.DataFrame(list(i_id_map.items()), columns=['asin', 'itemID'])

u_df.to_csv(os.path.join(rslt_dir, u_mapping_file), sep='\t', index=False)
i_df.to_csv(os.path.join(rslt_dir, i_mapping_file), sep='\t', index=False)
print(f'mapping dumped...')

mapping dumped...


In [10]:

# =========2. splitting
print(f'splitting ...')
tot_ratio = sum(splitting)
# remove 0.0 in ratios
ratios = [i for i in splitting if i > .0]
ratios = [_ / tot_ratio for _ in ratios]
split_ratios = np.cumsum(ratios)[:-1]

#df[tmstmp_str] = df[tmstmp_str].map(lambda x: datetime.strptime(x, "%Y-%m-%dT%H:%M:%SZ"))
split_ratios

splitting ...


array([0.7, 0.8])

In [11]:
ts_id = 'timestamp'

split_timestamps = list(np.quantile(df[ts_id], split_ratios))
# get df training dataset unique users/items
df_train = df.loc[df[ts_id] < split_timestamps[0]].copy()
df_val = df.loc[(split_timestamps[0] <= df[ts_id]) & (df[ts_id] < split_timestamps[1])].copy()
df_test = df.loc[(split_timestamps[1] <= df[ts_id])].copy()

x_label, rslt_file = 'x_label', 'elec-indexed.inter'
df_train[x_label] = 0
df_val[x_label] = 1
df_test[x_label] = 2
temp_df = pd.concat([df_train, df_val, df_test])
temp_df = temp_df[[learner_id, course_id, 'rating', ts_id, x_label]]
print(f'columns: {temp_df.columns}')

temp_df.columns = [learner_id, course_id, 'rating', ts_id, x_label]

temp_df.to_csv(os.path.join(rslt_dir, rslt_file), sep='\t', index=False)
temp_df[:5]
#print('done!')

columns: Index(['userID', 'itemID', 'rating', 'timestamp', 'x_label'], dtype='object')


Unnamed: 0,userID,itemID,rating,timestamp,x_label
0,0,0,5.0,1370131200,0
1,1,0,1.0,1290643200,0
2,2,0,3.0,1283990400,0
3,3,0,2.0,1290556800,0
4,4,0,1.0,1317254400,0


## Reload

In [12]:
indexed_df = pd.read_csv(rslt_file, sep='\t')
print(f'shape: {indexed_df.shape}')
indexed_df[:4]

shape: (1689188, 5)


Unnamed: 0,userID,itemID,rating,timestamp,x_label
0,0,0,5.0,1370131200,0
1,1,0,1.0,1290643200,0
2,2,0,3.0,1283990400,0
3,3,0,2.0,1290556800,0


In [13]:
u_uni = indexed_df[learner_id].unique()
c_uni = indexed_df[course_id].unique()

print(f'# of unique learners: {len(u_uni)}')
print(f'# of unique courses: {len(c_uni)}')

print('min/max of unique learners: {0}/{1}'.format(min(u_uni), max(u_uni)))
print('min/max of unique courses: {0}/{1}'.format(min(c_uni), max(c_uni)))


# of unique learners: 192403
# of unique courses: 63001
min/max of unique learners: 0/192402
min/max of unique courses: 0/63000
