# Przygotowanie środowiska Colab

In [2]:
# Mount Google Drive; the dataset paths used in this notebook live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Install lzop: unpack_to_df shells out to `lzop -dc` to decompress the .lzo parts.
!apt-get install lzop --quiet

Reading package lists...
Building dependency tree...
Reading state information...
lzop is already the newest version (1.03-4).
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 34 not upgraded.


# GraphRec demo

## Data

In [83]:
import pickle
from functools import reduce

# Load the twelve objects stored in the GraphRec toy dataset pickle.
# The file is opened in a `with` block so the handle is closed after loading.
# NOTE: pickle.load can execute arbitrary code embedded in the file; only load
# pickles from a trusted source.
with open('/content/drive/Shareddrives/RecSys21/GraphRec-WWW19/data/toy_dataset.pickle', 'rb') as handle:
    (
        history_u_lists,
        history_ur_lists,
        history_v_lists,
        history_vr_lists,
        train_u,
        train_v,
        train_r,
        test_u,
        test_v,
        test_r,
        social_adj_lists,
        ratings_list,
    ) = pickle.load(handle)

In [50]:
# Add up the lengths of history_u_lists[0] .. history_u_lists[len - 1].
c = sum(len(history_u_lists[i]) for i in range(len(history_u_lists)))

print("Total amount of purchased items in the dataset: %d" % c)

Total amount of purchased items in the dataset: 14091


In [57]:
# Display the rating lookup that was loaded from the toy dataset pickle.
ratings_list

{0.5: 7, 1.0: 1, 1.5: 6, 2.0: 0, 2.5: 4, 3.0: 2, 3.5: 5, 4.0: 3}

## Training

In [7]:
%%time
import os
# Switch into the GraphRec project directory before launching the script
# (presumably so the script's relative paths resolve -- confirm).
os.chdir('/content/drive/Shareddrives/RecSys21/GraphRec-WWW19/')

# Execute the example training script through IPython's %run magic.
%run run_GraphRec_example.py

[1,     0] loss: 0.084, The best rmse/mae: 9999.000000 / 9999.000000
[1,   100] loss: 6.090, The best rmse/mae: 9999.000000 / 9999.000000
rmse: 2.0166, mae:1.8560 
[2,     0] loss: 0.030, The best rmse/mae: 2.016613 / 1.856008
[2,   100] loss: 2.717, The best rmse/mae: 2.016613 / 1.856008
rmse: 1.2570, mae:1.0672 
[3,     0] loss: 0.018, The best rmse/mae: 1.257041 / 1.067247
[3,   100] loss: 1.656, The best rmse/mae: 1.257041 / 1.067247
rmse: 0.9673, mae:0.7598 
[4,     0] loss: 0.015, The best rmse/mae: 0.967343 / 0.759752
[4,   100] loss: 1.360, The best rmse/mae: 0.967343 / 0.759752
rmse: 0.9709, mae:0.7629 
[5,     0] loss: 0.012, The best rmse/mae: 0.967343 / 0.759752
[5,   100] loss: 1.174, The best rmse/mae: 0.967343 / 0.759752
rmse: 0.8622, mae:0.6641 
[6,     0] loss: 0.010, The best rmse/mae: 0.862157 / 0.664132


KeyboardInterrupt: ignored

CPU times: user 5min 21s, sys: 641 ms, total: 5min 22s
Wall time: 5min 23s


# Parsowanie danych do formatu .pickle na potrzeby GraphRec

In [1]:
import subprocess
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import itertools
import seaborn as sns
import pickle
import random
import os
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split

In [2]:
# Field order of one raw record: the feature columns first, the label columns after them.
all_features = [
    "text_tokens",
    "hashtags",
    "tweet_id",
    "present_media",
    "present_links",
    "present_domains",
    "tweet_type",
    "language",
    "tweet_timestamp",
    "engaged_with_user_id",
    "engaged_with_user_follower_count",
    "engaged_with_user_following_count",
    "engaged_with_user_is_verified",
    "engaged_with_user_account_creation",
    "enaging_user_id",
    "enaging_user_follower_count",
    "enaging_user_following_count",
    "enaging_user_is_verified",
    "enaging_user_account_creation",
    "engagee_follows_engager",
]
labels = [
    "reply_timestamp",
    "retweet_timestamp",
    "retweet_with_comment_timestamp",
    "like_timestamp",
]

# Name -> field-position lookups; label positions continue right after the features.
all_features_to_idx = {name: position for position, name in enumerate(all_features)}
labels_to_idx = {
    name: position for position, name in enumerate(labels, start=len(all_features))
}

In [9]:
def unpack_to_df(package_name, sslice=None):
    """Decompress an lzop archive of engagement records into a DataFrame.

    The archive is piped through ``lzop -dc``. Every line of the decompressed
    text is one record whose fields are separated by the SOH (0x01) character
    and are looked up by position through ``all_features_to_idx`` and
    ``labels_to_idx``.

    Parameters
    ----------
    package_name : str
        Path of the ``.lzo`` file to decompress.
    sslice : slice, optional
        Applied to the list of records before parsing (for example
        ``slice(0, 1000)``) to work on a subset only.

    Returns
    -------
    pandas.DataFrame
        One row per record with the columns ``tweet_id``,
        ``engaged_with_user_id``, ``enaging_user_id`` and
        ``engagee_follows_engager`` (raw field text, not converted) plus
        ``reaction``: a list of four booleans, one per label in the order
        reply, retweet, retweet-with-comment, like, each True when the
        corresponding timestamp field is non-empty.

    Raises
    ------
    subprocess.CalledProcessError
        If ``lzop`` exits with a non-zero status.
    IndexError
        If a record has fewer fields than the requested positions.
    """
    # check=True surfaces a failed decompression instead of continuing with
    # whatever partial output was produced.
    completed = subprocess.run(
        ['lzop', '-dc', package_name],
        stdout=subprocess.PIPE,
        # stderr=subprocess.DEVNULL,
        check=True,
    )
    lines = completed.stdout.decode('utf-8').split('\n')
    del completed  # drop the raw bytes as early as possible

    # When the text ends with a newline, split() leaves one empty trailing
    # entry. Remove it here so that every remaining element is a real record
    # and the slice below is never applied to a placeholder.
    if lines and lines[-1] == '':
        lines.pop()

    if sslice is not None:
        lines = lines[sslice]

    id_positions = [
        all_features_to_idx[name]
        for name in ('tweet_id', 'engaged_with_user_id', 'enaging_user_id', 'engagee_follows_engager')
    ]
    label_positions = [
        labels_to_idx[name]
        for name in ('reply_timestamp', 'retweet_timestamp', 'retweet_with_comment_timestamp', 'like_timestamp')
    ]

    # Parse in place (every element, including the last one) to avoid holding
    # a second full-size list in memory.
    for i, raw_record in enumerate(lines):
        fields = raw_record.split("\x01")
        parsed = [fields[position] for position in id_positions]
        # An empty timestamp field means the engagement did not happen.
        parsed.append([bool(fields[position]) for position in label_positions])
        lines[i] = parsed

    lines = pd.DataFrame(lines, columns=[
        'tweet_id',
        'engaged_with_user_id',
        'enaging_user_id',
        'engagee_follows_engager',
        'reaction'
    ])
    # Optional bit-packing of the four flags into one integer (kept disabled):
    # lines['reaction'] = lines["reaction"].apply(lambda r: reduce(lambda out, bit: (out << 1) | bit, r))
    return lines

In [4]:
# Parse the first training part and report per-column memory use;
# deep=True also counts the Python objects stored in object-dtype columns.
df = unpack_to_df("/content/drive/Shareddrives/RecSys21/training/part-00000.lzo")
df.memory_usage(deep=True)

## Write pickle

In [None]:
%time

random.seed(4)

p = 0.8
parts = ["part-00000.lzo", "part-00001.lzo", "part-00002.lzo", "part-00003.lzo", "part-00004.lzo"]

history_u_lists = {}
history_ur_lists = {}
history_v_lists = {}
history_vr_lists = {}
train_u  = []
test_u  = []
train_v  = []
test_v  = []
train_r = [] 
test_r = []
social_adj_lists = {}
ratings = list(itertools.product([False, True], repeat=4))
ratings_list = dict(zip(ratings, np.arange(16)))

for part in parts:
    df = unpack_to_df("/content/drive/Shareddrives/RecSys21/training/" + part)
    n = len(df)
    train_samples = np.ones(n, dtype=bool)
    train_samples[int(n*p):] = False
    random.shuffle(train_samples)
    pbar = tqdm(total=n)

    for i, row in df.loc[:n-1, :].iterrows():
        v_id = row["tweet_id"]
        u_id = row["enaging_user_id"]
        n_id = row["engaged_with_user_id"]
        undirected = row["engagee_follows_engager"]
        r_value = row["reaction"]
        is_train_sample = train_samples[i]
        
        if is_train_sample:
            # build user-item graph
            if u_id not in history_u_lists:
                history_u_lists[u_id] = []
                history_ur_lists[u_id] = []
            history_u_lists[u_id].append(v_id)
            history_ur_lists[u_id].append(r_value)

            # build item-user graph
            if v_id not in history_v_lists:
                history_v_lists[v_id] = []
                history_vr_lists[v_id] = []
            history_v_lists[v_id].append(u_id)
            history_vr_lists[v_id].append(r_value)

            # build train set
            train_u.append(u_id)
            train_v.append(v_id)
            train_r.append(r_value)
        else:
            # build test set
            test_u.append(u_id)
            test_v.append(v_id)
            test_r.append(r_value)
        
        # build social graph
        if u_id not in social_adj_lists:
            social_adj_lists[u_id] = set()
        social_adj_lists[u_id].add(n_id)

        if undirected:
            if n_id not in social_adj_lists:
                social_adj_lists[n_id] = set()

            social_adj_lists[n_id].add(u_id)
        
        pbar.update(1)
    
    pbar.close()
    del df

    with open('dataset.pickle', 'wb') as handle:
        pickle.dump(
            [history_u_lists, history_ur_lists, history_v_lists, history_vr_lists, train_u, train_v, train_r, test_u, test_v, test_r, social_adj_lists, ratings_list]
        , handle, protocol=pickle.HIGHEST_PROTOCOL)


CPU times: user 0 ns, sys: 2 µs, total: 2 µs
Wall time: 5.48 µs


HBox(children=(FloatProgress(value=0.0, max=3011286.0), HTML(value='')))




In [None]:
# !cp part1.pickle /content/drive/Shareddrives/RecSys21/pickle/part-00000.pickle

## Read pickle

In [7]:
# Restore the twelve dataset objects from the saved pickle, unpacking them
# straight into their names without keeping a temporary container around.
with open('/content/drive/Shareddrives/RecSys21/pickle/part-00000.pickle', 'rb') as handle:
    (
        history_u_lists,
        history_ur_lists,
        history_v_lists,
        history_vr_lists,
        train_u,
        train_v,
        train_r,
        test_u,
        test_v,
        test_r,
        social_adj_lists,
        ratings_list,
    ) = pickle.load(handle)

## Memory test

In [44]:
# Compare the footprint of two encodings of `size` reactions: four boolean
# flags per row versus a single integer code per row, both as numpy arrays
# and as plain Python lists.
import sys  # imported here so the cell also runs on a fresh kernel

size = 10000
a = np.random.choice(a=[False, True], size=(size,4))
# Four flags give 16 combinations (codes 0..15); randint's upper bound is exclusive.
b = np.random.randint(0,16,size)
# Note: for the lists, getsizeof measures only the outer list object, not its elements.
sys.getsizeof(a), sys.getsizeof(b), sys.getsizeof(a.tolist()), sys.getsizeof(b.tolist())

(40112, 80096, 80072, 80072)