In [1]:
import numpy as np
import datetime
import torch
import torch.utils
from datetime import datetime, timezone


class EventsDataset(torch.utils.data.Dataset):
    '''
    Base class for event datasets.

    The constructor only stores the timezone. __len__ and __getitem__ read the
    attributes below, none of which is assigned in this class, so a subclass
    has to provide them:

        all_events       sequence of (u, v, rel, time_cur) tuples, with time_cur
                         a datetime
        n_events         value reported by len()
        time_bar         numpy array indexed by node id holding the timestamp
                         (seconds) of the most recent event of every node
        FIRST_DATE       datetime; a stored time_bar value that falls on an
                         earlier day makes __getitem__ raise ValueError
        event_types_num  dict mapping rel to the integer k returned per event
    '''

    def __init__(self, TZ=None):
        # Timezone handed to datetime.fromtimestamp() in __getitem__
        # (None means naive local time).
        self.TZ = TZ

    def get_Adjacency(self, multirelations=False):
        '''Placeholder: the base class has no adjacency information.'''
        return None, None, None

    def __len__(self):
        '''Number of events, as stored in self.n_events.'''
        return self.n_events

    def __getitem__(self, index):
        '''
        Return the event at `index` and advance the per-node "last seen" times.

        Returns a tuple (u, v, time_delta_uv, k, time_bar, time_cur):
            u, v           node ids of the event
            time_delta_uv  2 x 4 float array; row 0 is for u and row 1 for v, the
                           columns are days / hours / minutes / seconds elapsed
                           since that node's previous event
            k              integer code of the event type
            time_bar       copy of self.time_bar taken BEFORE this event
            time_cur       one-element double tensor with the event timestamp

        Side effect: self.time_bar[u] and self.time_bar[v] are overwritten with
        the timestamp of this event.
        '''
        tpl = self.all_events[index]
        u, v, rel, time_cur = tpl
        assert u != v, (tpl, rel)

        # Snapshot of the most recent previous time of all nodes; taken before
        # the entries of u and v are advanced below.
        time_bar = self.time_bar.copy()

        earliest_day = self.FIRST_DATE.toordinal()
        time_delta_uv = np.zeros((2, 4))  # two nodes x 4 values
        for row, node in enumerate((u, v)):
            previous = datetime.fromtimestamp(self.time_bar[node], tz=self.TZ)
            if previous.toordinal() < earliest_day:
                # no event is expected before FIRST_DATE
                raise ValueError('unexpected result', previous, self.FIRST_DATE)

            elapsed = time_cur - previous
            hours, leftover = divmod(elapsed.seconds, 3600)
            minutes, seconds = divmod(leftover, 60)
            # days can still be a large number; the rest is bounded by 24/60/60
            time_delta_uv[row] = np.array([elapsed.days, hours, minutes, seconds], np.float64)

            # remember this event as the node's latest one
            self.time_bar[node] = time_cur.timestamp()

        k = self.event_types_num[rel]

        # sanity check: the timestamp must survive the conversion to float64
        stamp = time_cur.timestamp()
        assert np.float64(stamp) == stamp, (np.float64(stamp), stamp)

        time_bar = time_bar.astype(np.float64)
        time_cur = torch.from_numpy(np.array([np.float64(stamp)])).double()

        # events have to be visited in chronological order
        if time_bar.max() > time_cur:
            print(f"Assertion Error Details: index={index}, tpl={tpl}, u={u}, v={v}, rel={rel}, time_cur={time_cur}, time_bar={time_bar}")
        assert time_bar.max() <= time_cur, (time_bar.max(), time_cur)
        return u, v, time_delta_uv, k, time_bar, time_cur

In [2]:
import os
import numpy as np
import datetime
import pickle
import torch
import pandas
import itertools
from datetime import datetime, timezone
import dateutil.parser

class CSVReader:
    '''
    General class to read any relationship csv in this dataset.

    After construction the instance exposes:
        data         dict: column name -> list of parsed values, plus
                     '<column>_unique' -> sorted unique values among valid rows
        time_column  name of the single column containing 'time' or 'date'
        prob_column  list of column names containing 'prob' (may be empty)
        valid_ids    indices of the rows that passed all filters, ordered by time
        subjects     user ids of the valid rows (first user column, then second)
        time_stamps  event times aligned with `subjects`
        Adj          (only if N_subjects is given) symmetric matrix counting the
                     valid rows per pair of users
        tuples       (only filled if N_subjects is given) de-duplicated list of
                     (u, v, event_type, time) sorted by time, with 0-based u, v
    '''

    def __init__(self, csv_path, split, MIN_EVENT_PROB, event_type=None, N_subjects=None, test_slot=1):
        '''
        Args:
            csv_path: path of the csv file to read.
            split: 'train' keeps rows up to 2009-04-30, 'test' keeps rows from
                2009-05-01 to 2009-06-30, any other value keeps all dates.
            MIN_EVENT_PROB: if the file has exactly one 'prob' column, only rows
                whose probability is strictly greater than this value are kept
                (nan counts as 0).
            event_type: label stored as third element of every tuple.
            N_subjects: number of users; when None neither `Adj` nor `tuples`
                is built.
            test_slot: only 1 is supported for the 'test' split.

        Raises:
            NotImplementedError: split == 'test' and test_slot != 1.
        '''
        self.csv_path = csv_path
        print(os.path.basename(csv_path))

        if split == 'train':
            time_start = 0
            time_end = datetime(2009, 4, 30).toordinal()
        elif split == 'test':
            if test_slot != 1:
                raise NotImplementedError('test on time slot 1 for now')
            time_start = datetime(2009, 5, 1).toordinal()
            time_end = datetime(2009, 6, 30).toordinal()
        else:
            time_start = 0
            time_end = np.inf  # `np.Inf` no longer exists in NumPy >= 2.0

        csv = pandas.read_csv(csv_path)
        columns = list(csv.keys())
        self.data = {}

        user_columns = [c for c in columns if c.find('user') >= 0 or c.find('id') >= 0]
        assert len(user_columns) == 2, (columns, user_columns)
        self.time_column = [c for c in columns if c.find('time') >= 0 or c.find('date') >= 0]
        assert len(self.time_column) == 1, (columns, self.time_column)
        self.time_column = self.time_column[0]

        self.prob_column = [c for c in columns if c.find('prob') >= 0]

        for column in columns:
            self.data[column] = self._parse_column(csv[column].tolist())

        n_rows = len(self.data[self.time_column])

        # Builtin `int` / `bool` replace the `np.int` / `np.bool` aliases that
        # were removed in NumPy 1.24.
        time_stamp_days = np.array([d.toordinal() for d in self.data[self.time_column]], dtype=int)

        # skip data where one of users is missing (nan) or interacting with itself or timestamp not in range
        conditions = [~np.isnan(self.data[user_columns[0]]),
                      ~np.isnan(self.data[user_columns[1]]),
                      np.array(self.data[user_columns[0]]) != np.array(self.data[user_columns[1]]),
                      time_stamp_days >= time_start,
                      time_stamp_days <= time_end]

        if len(self.prob_column) == 1:
            print(split, event_type, self.prob_column)
            # skip data if the probability of event is 0 or nan (available for some event types)
            conditions.append(np.nan_to_num(np.array(self.data[self.prob_column[0]])) > MIN_EVENT_PROB)

        valid_ids = np.ones(n_rows, dtype=bool)
        for cond in conditions:
            valid_ids = valid_ids & cond

        self.valid_ids = np.where(valid_ids)[0]

        # order the surviving rows chronologically
        time_stamps_sec = [self.data[self.time_column][i].timestamp() for i in self.valid_ids]
        self.valid_ids = self.valid_ids[np.argsort(time_stamps_sec)]

        print(split, len(self.valid_ids), n_rows)

        # unique values of every column, restricted to the valid rows
        for column in columns:
            values = self.data[column]
            key = column + '_unique'
            values_valid = [values[i] for i in self.valid_ids]
            self.data[key] = np.unique(values_valid)
            print(key, type(values[0]), len(self.data[key]), self.data[key])

        self.subjects, self.time_stamps = [], []
        for usr_col in user_columns:
            self.subjects.extend([self.data[usr_col][i] for i in self.valid_ids])
            self.time_stamps.extend([self.data[self.time_column][i] for i in self.valid_ids])

        # set O={(u, v, k, t)}
        self.tuples = []
        if N_subjects is not None:
            # Compute frequency of communication between users
            print('user_columns', user_columns)
            self.Adj = np.zeros((N_subjects, N_subjects))
            for row in self.valid_ids:
                subj1 = self.data[user_columns[0]][row]
                subj2 = self.data[user_columns[1]][row]

                assert subj1 != subj2, (subj1, subj2)
                assert subj1 > 0 and subj2 > 0, (subj1, subj2)
                try:
                    # ids in the csv are 1-based, matrix indices are 0-based
                    self.Adj[int(subj1) - 1, int(subj2) - 1] += 1
                    self.Adj[int(subj2) - 1, int(subj1) - 1] += 1
                except Exception:
                    # show the offending pair, then let the error propagate
                    print(subj1, subj2)
                    raise

                self.tuples.append((int(subj1) - 1,
                                    int(subj2) - 1,
                                    event_type,
                                    self.data[self.time_column][row]))

        n1 = len(self.tuples)
        self.tuples = sorted(set(self.tuples), key=lambda t: t[3].timestamp())
        n2 = len(self.tuples)
        print('%d/%d duplicates removed' % (n1 - n2, n1))

    @staticmethod
    def _parse_column(values):
        '''
        Convert a whole column with the first converter that accepts every value.

        Converters are tried in the order int, float, '%Y-%m-%d' date,
        '%Y-%m-%d %H:%M:%S' date; if none of them succeeds on all values the
        column is returned unchanged.
        '''
        to_date1 = lambda s: datetime.strptime(s, '%Y-%m-%d')
        to_date2 = lambda s: datetime.strptime(s, '%Y-%m-%d %H:%M:%S')
        for fn in (int, float, to_date1, to_date2):
            try:
                return list(map(fn, values))
            except Exception:
                continue
        return values


class SubjectsReader:
    '''
    Reader for Subjects.csv.

    Every column whose name does not contain 'user' is treated as a categorical
    feature (values are compared as strings) and one-hot encoded; the encodings
    of all such columns are concatenated side by side into `features_onehot`,
    an array with one row per subject.
    '''
    def __init__(self, csv_path):
        self.csv_path = csv_path
        print(os.path.basename(csv_path))

        table = pandas.read_csv(csv_path)
        column_names = list(table.keys())

        # the first column mentioning 'user' defines the list of subjects
        user_column = [name for name in column_names if name.find('user') >= 0][0]
        n_subjects = len(table[user_column].tolist())
        print('Number of subjects', n_subjects)

        encoded_columns = []
        for name in column_names:
            if name.find('user') >= 0:
                continue
            labels = [str(value) for value in table[name].tolist()]
            # `codes[i]` is the position of labels[i] inside the sorted categories
            categories, codes = np.unique(labels, return_inverse=True)
            onehot = np.zeros((n_subjects, len(categories)))
            onehot[np.arange(n_subjects), codes] = 1
            encoded_columns.append(onehot)

        self.features_onehot = np.concatenate(encoded_columns, axis=1)
        print('features', self.features_onehot.shape)


class SocialEvolution():
    '''
    Class to read all csv in this dataset.

    Attributes set by the constructor:
        relations    CSVReader over 'RelationshipsFromSurveys.csv'; additionally
                     receives `subject_ids`, the unique ids found in its 'id.A'
                     and 'id.B' columns
        N_subjects   number of those unique ids
        EVENT_TYPES  dict: event type name -> CSVReader over '<name>.csv'
        Adj          dict: survey date -> relationship name -> symmetric 0/1
                     matrix of shape (N_subjects, N_subjects)
    '''
    def __init__(self, data_dir, split, MIN_EVENT_PROB):
        '''
        Args:
            data_dir: directory containing the csv files.
            split: forwarded to CSVReader to select the date range.
            MIN_EVENT_PROB: forwarded to CSVReader as probability threshold.
        '''
        self.data_dir = data_dir
        self.split = split
        self.MIN_EVENT_PROB = MIN_EVENT_PROB

        # os.path.join replaces `pjoin`, an alias that no visible cell defines or
        # imports, so the old call failed with NameError whenever the data had
        # to be rebuilt from the csv files.
        self.relations = CSVReader(os.path.join(data_dir, 'RelationshipsFromSurveys.csv'),
                                   split=split, MIN_EVENT_PROB=MIN_EVENT_PROB)
        # both operands are lists, so `+` concatenates the two id columns
        self.relations.subject_ids = np.unique(self.relations.data['id.A'] + self.relations.data['id.B'])
        self.N_subjects = len(self.relations.subject_ids)
        print('Number of subjects', self.N_subjects)

        # Read communicative events
        self.EVENT_TYPES = {}
        for t in SocialEvolutionDataset.EVENT_TYPES:
            self.EVENT_TYPES[t] = CSVReader(os.path.join(data_dir, '%s.csv' % t),
                                            split=split,
                                            MIN_EVENT_PROB=MIN_EVENT_PROB,
                                            event_type=t,
                                            N_subjects=self.N_subjects)

        # Compute adjacency matrices for associative relationship data
        self.Adj = {}
        dates = self.relations.data['survey.date']
        rels = self.relations.data['relationship']
        ids_a = self.relations.data['id.A']
        ids_b = self.relations.data['id.B']
        for date in self.relations.data['survey.date_unique']:
            self.Adj[date] = {}
            # rows of the survey taken on `date`
            ind = np.where(np.array([d == date for d in dates]))[0]
            for rel in self.relations.data['relationship_unique']:
                # positions inside `ind` whose relationship is `rel`
                ind_rel = np.where(np.array([r == rel for r in [rels[i] for i in ind]]))[0]
                A = np.zeros((self.N_subjects, self.N_subjects))
                for j in ind_rel:
                    row = ind[j]
                    # ids are 1-based in the csv; the relation is stored symmetrically
                    A[ids_a[row] - 1, ids_b[row] - 1] = 1
                    A[ids_b[row] - 1, ids_a[row] - 1] = 1
                self.Adj[date][rel] = A
                # sanity check: every matching csv row must be present in the matrix
                for row in range(len(dates)):
                    if rels[row] == rel and dates[row] == date:
                        assert self.Adj[dates[row]][rels[row]][ids_a[row] - 1, ids_b[row] - 1] == 1
                        assert self.Adj[dates[row]][rels[row]][ids_b[row] - 1, ids_a[row] - 1] == 1


class SocialEvolutionDataset(EventsDataset):
    '''
    Class to load batches for training and testing.

    Merges the communication events of `data` (types listed in EVENT_TYPES,
    numbered k = 1, 2, ...) with association events (k = 0) derived from changes
    of the `MainAssociation` adjacency matrix between consecutive surveys, and
    exposes them, sorted by time, through the EventsDataset interface.
    '''

    FIRST_DATE = datetime(2008, 9, 11)  # consider events starting from this time
    EVENT_TYPES =  ['SMS', 'Proximity', 'Calls']

    def __init__(self, subj_features, data, MainAssociation, data_train=None, verbose=False):
        '''
        Args:
            subj_features: array with one row per node; only its first dimension
                is used here (as N_nodes).
            data: object providing `split`, `EVENT_TYPES[t].tuples` and `Adj`
                (date -> relation -> adjacency matrix).
            MainAssociation: name of the relation that generates association events.
            data_train: if given, the adjacency of its last survey date is used
                as the initial topology instead of the first survey of `data`.
            verbose: print additional statistics.
        '''
        super(SocialEvolutionDataset, self).__init__()

        self.subj_features = subj_features
        self.data = data
        self.verbose = verbose
        self.all_events = []
        self.event_types_num = {}
        self.MainAssociation = MainAssociation
        self.TEST_TIMESLOTS = [datetime(2009, 5, 10), datetime(2009, 5, 20), datetime(2009, 5, 31),
                               datetime(2009, 6, 10), datetime(2009, 6, 20), datetime(2009, 6, 30)]
        self.FIRST_DATE = SocialEvolutionDataset.FIRST_DATE
        self.event_types = SocialEvolutionDataset.EVENT_TYPES

        k = 1  # k >= 1 for communication events
        print(data.split.upper())
        for t in self.event_types:
            print('Event type={}, k={}, number of events={}'.format(t, k, len(data.EVENT_TYPES[t].tuples)))

            # drop everything that happened before FIRST_DATE
            events = list(filter(lambda x: x[3].toordinal() >= self.FIRST_DATE.toordinal(),
                                 data.EVENT_TYPES[t].tuples))
            self.all_events.extend(events)
            self.event_types_num[t] = k
            k += 1

        n = len(self.all_events)
        self.N_nodes = subj_features.shape[0]

        # Size of the H_train matrix built at the end. It is replaced by the
        # adjacency size below when survey data exist; without this default the
        # name was undefined (NameError) for a dataset with an empty `data.Adj`.
        N = self.N_nodes

        if data.split == 'train':
            Adj_all, keys, Adj_all_last = self.get_Adjacency()

            if self.verbose:
                print('initial and final associations', self.MainAssociation, Adj_all.sum(), Adj_all_last.sum(),
                      np.allclose(Adj_all, Adj_all_last))

        # Initial topology
        if len(list(data.Adj.keys())) > 0:
            keys = sorted(list(data.Adj[list(data.Adj.keys())[0]].keys()))  # relation keys
            keys.remove(MainAssociation)
            keys = [MainAssociation] + keys  # to make sure CloseFriend goes first

            k = 0  # k <= 0 for association events
            for rel in keys:
                if rel != MainAssociation:
                    continue
                if data_train is None:
                    date = sorted(list(data.Adj.keys()))[0]  # first date
                    Adj_prev = data.Adj[date][rel]
                else:
                    date = sorted(list(data_train.Adj.keys()))[-1]  # last date of the training set
                    Adj_prev = data_train.Adj[date][rel]
                self.event_types_num[rel] = k

                N = Adj_prev.shape[0]

                # Associative events: a link that exists in a survey but not in
                # the previous adjacency becomes an event dated at that survey
                for date_id, date in enumerate(sorted(list(data.Adj.keys()))):  # start from the second survey
                    if date.toordinal() >= self.FIRST_DATE.toordinal():
                        assert data.Adj[date][rel].shape[0] == N
                        for u in range(N):
                            for v in range(u + 1, N):
                                if data.Adj[date][rel][u, v] > 0 and Adj_prev[u, v] == 0:
                                    assert u != v, (u, v, k)
                                    self.all_events.append((u, v, rel, date))
                    Adj_prev = data.Adj[date][rel]

                print('Event type={}, k={}, number of events={}'.format(rel, k, len(self.all_events) - n))
                n = len(self.all_events)
                k -= 1

        # chronological order, at a resolution of whole seconds
        self.all_events = sorted(self.all_events, key=lambda x: int(x[3].timestamp()))

        if self.verbose:
            print('%d events' % len(self.all_events))
            print('last 10 events:')
            for event in self.all_events[-10:]:
                print(event)

        self.n_events = len(self.all_events)

        # symmetric count of events per pair of nodes
        H_train = np.zeros((N, N))
        c = 0
        for e in self.all_events:
            H_train[e[0], e[1]] += 1
            H_train[e[1], e[0]] += 1
            c += 1
        if self.verbose:
            print('H_train', c, H_train.max(), H_train.min(), H_train.std())
        self.H_train = H_train

        # every node starts with FIRST_DATE as the time of its "previous" event
        self.time_bar = np.full(self.N_nodes, self.FIRST_DATE.timestamp())

    @staticmethod
    def load_data(data_dir, prob, dump=True):
        '''
        Load the preprocessed dataset, building it from the csv files if needed.

        Args:
            data_dir: directory holding the csv files and the cache file.
            prob: probability threshold passed to SocialEvolution as
                MIN_EVENT_PROB; also part of the cache file name.
            dump: when the data had to be rebuilt, write the cache file.

        Returns:
            dict with keys 'initial_embeddings' (one-hot subject features),
            'train' and 'test' (SocialEvolution objects).
        '''
        data_file = os.path.join(data_dir, 'data_prob%s.pkl' % prob)
        if os.path.isfile(data_file):
            print('loading data from %s' % data_file)
            # NOTE(security): unpickling executes code stored in the file; only
            # load cache files that were produced locally by this notebook.
            with open(data_file, 'rb') as f:
                data = pickle.load(f)
        else:
            data = {'initial_embeddings': SubjectsReader(os.path.join(data_dir, 'Subjects.csv')).features_onehot}
            for split in ['train', 'test']:
                data.update(
                    {split: SocialEvolution(data_dir, split=split, MIN_EVENT_PROB=prob)})
            if dump:
                # dump data files to avoid their generation again
                print('saving data to %s' % data_file)
                with open(data_file, 'wb') as f:
                    pickle.dump(data, f, protocol=2)  # for compatibility
        return data

    def get_Adjacency(self, multirelations=False):
        '''
        Return the adjacency of the first and of the last survey date.

        Args:
            multirelations: if True, stack the matrices of all relations along
                a third axis (MainAssociation first); otherwise return only the
                MainAssociation matrix.

        Returns:
            (Adj_all, keys, Adj_all_last): copies of the first-date and
            last-date adjacency and the list of relation names they cover.
        '''
        dates = sorted(list(self.data.Adj.keys()))
        Adj_all = self.data.Adj[dates[0]]
        Adj_all_last = self.data.Adj[dates[-1]]
        if multirelations:
            keys = sorted(list(Adj_all.keys()))
            keys.remove(self.MainAssociation)
            keys = [self.MainAssociation] + keys  # to make sure CloseFriend goes first
            Adj_all = np.stack([Adj_all[rel].copy() for rel in keys], axis=2)
            Adj_all_last = np.stack([Adj_all_last[rel].copy() for rel in keys], axis=2)
        else:
            keys = [self.MainAssociation]
            Adj_all = Adj_all[self.MainAssociation].copy()
            Adj_all_last = Adj_all_last[self.MainAssociation].copy()

        return Adj_all, keys, Adj_all_last

#### Format of the dataset

Each event is a tuple (u, v, k, t):
- u, v: the two nodes involved in the event
- k: the name of the event type (action)
- t: the timestamp at which the event happened (year-month-day hour:minute:second)

In [3]:
# Paths to the dataset files.
# The directory can be overridden with the SOCIAL_EVOLUTION_DIR environment
# variable so the notebook also runs on other machines; when the variable is
# not set, the original location is used.
data_dir = os.environ.get('SOCIAL_EVOLUTION_DIR',
                          '/Users/amberrrrrr/Desktop/trials/dyrep_torch-main/SocialEvolution/')
prob = 0.8  # minimum event probability (passed on as MIN_EVENT_PROB)
association = 'CloseFriend'  # relation that generates association events

# Load the data
data = SocialEvolutionDataset.load_data(data_dir, prob)

# Initialize train and test sets; the test set takes its initial topology from
# the last survey of the training data
train_set = SocialEvolutionDataset(data['initial_embeddings'], data['train'], association, verbose=False)
test_set = SocialEvolutionDataset(data['initial_embeddings'], data['test'], association, data_train=data['train'], verbose=False)

# Preview the first few lines of the train set and test set
print("Train set preview (first 5 events):")
for event in train_set.all_events[:5]:
    print(event)

print("\nTest set preview (first 5 events):")
for event in test_set.all_events[:5]:
    print(event)

loading data from /Users/amberrrrrr/Desktop/trials/dyrep_torch-main/SocialEvolution/data_prob0.8.pkl
TRAIN
Event type=SMS, k=1, number of events=4319
Event type=Proximity, k=2, number of events=31011
Event type=Calls, k=3, number of events=8187
Event type=CloseFriend, k=0, number of events=365
TEST
Event type=SMS, k=1, number of events=288
Event type=Proximity, k=2, number of events=9094
Event type=Calls, k=3, number of events=1080
Event type=CloseFriend, k=0, number of events=73
Train set preview (first 5 events):
(42, 50, 'Calls', datetime.datetime(2008, 9, 11, 3, 16, 14))
(42, 50, 'Calls', datetime.datetime(2008, 9, 19, 0, 31, 33))
(42, 21, 'Calls', datetime.datetime(2008, 9, 19, 0, 58, 2))
(42, 54, 'Calls', datetime.datetime(2008, 9, 19, 1, 21, 4))
(42, 50, 'Calls', datetime.datetime(2008, 9, 19, 18, 20, 43))

Test set preview (first 5 events):
(0, 60, 'Proximity', datetime.datetime(2009, 5, 1, 0, 3, 29))
(60, 0, 'Proximity', datetime.datetime(2009, 5, 1, 0, 3, 51))
(59, 66, 'Proxi

In [4]:
# Work on a copy so the cached feature matrix inside `data` is left untouched.
initial_embeddings = data['initial_embeddings'].copy()
# get_Adjacency() returns (first-survey adjacency, relation keys, last-survey
# adjacency); only the first element is needed here.
A_initial = train_set.get_Adjacency(multirelations=False)[0]

In [5]:
from torch.utils.data import DataLoader
# Events must be served in order: __getitem__ updates the per-node time_bar and
# asserts that no stored time is later than the current event, so shuffling
# stays disabled.
train_loader = DataLoader(dataset=train_set, shuffle=False, batch_size=200)
test_loader = DataLoader(dataset=test_set, shuffle=False, batch_size=200)

In [6]:
import datetime
from datetime import datetime, timezone
# Iterate once over the test batches and cast the time-related entries.
# Each batch follows the order returned by __getitem__:
# (u, v, time_delta_uv, k, time_bar, time_cur).
# NOTE(review): the loop variable `data` rebinds the name that cell In [3]
# assigned to the loaded dataset dict; after this cell runs, re-running cell
# In [4] (`data['initial_embeddings']`) no longer sees that dict. Consider
# renaming the loop variable (e.g. `batch`) once it is confirmed that no later
# cell relies on `data` holding the last batch.
# NOTE(review): fetching items also advances `test_set.time_bar`, because
# __getitem__ updates it for both nodes of every event.
for batch_idx, data in enumerate(test_loader):
    data[2] = data[2].float()  # time_delta_uv
    data[4] = data[4].double()  # time_bar snapshot
    data[5] = data[5].double()  # time_cur