# Przygotowanie środowiska Colab

In [None]:
# Mount Google Drive at /content/drive; later cells read and write files under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Install the lzop command-line tool; unpack_to_df invokes `lzop -dc` to read the .lzo parts.
!apt-get install lzop --quiet

# GraphRec demo

## Data

In [None]:
import pickle
from functools import reduce

# Load the GraphRec toy dataset: a pickled sequence of twelve objects that is
# unpacked into the names below.
# NOTE: pickle.load can execute arbitrary code, so only load files from a trusted source.
# The `with` block guarantees the file handle is closed after loading.
with open('/content/drive/Shareddrives/RecSys21/GraphRec-WWW19/data/toy_dataset.pickle', 'rb') as toy_dataset_file:
    (
        history_u_lists,
        history_ur_lists,
        history_v_lists,
        history_vr_lists,
        train_u,
        train_v,
        train_r,
        test_u,
        test_v,
        test_r,
        social_adj_lists,
        ratings_list,
    ) = pickle.load(toy_dataset_file)

In [None]:
# Add up the lengths of the per-key item lists stored under keys 0 .. len-1.
c = sum(len(history_u_lists[idx]) for idx in range(len(history_u_lists)))

print("Total amount of purchased items in the dataset: %d" % c)

Total amount of purchased items in the dataset: 14091


In [None]:
# Display the rating-value -> index mapping loaded from the toy dataset.
ratings_list

{0.5: 7, 1.0: 1, 1.5: 6, 2.0: 0, 2.5: 4, 3.0: 2, 3.5: 5, 4.0: 3}

## Training

In [None]:
%%time
# Time the whole cell (cell magic, so it must stay on the first line).
import os
# Switch into the GraphRec checkout before launching the script.
os.chdir('/content/drive/Shareddrives/RecSys21/GraphRec-WWW19/')

# Execute the example training script inside this kernel.
%run run_GraphRec_example.py

# LZOP

Parsowanie danych do postaci .pickle dla potrzeb GraphRec, korzystając z paczek lzop


In [None]:
import subprocess
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import itertools
import seaborn as sns
import pickle
import random
import os
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split

In [None]:
# Column names of one record, in field order, followed by the engagement label columns.
# (Spelling such as "enaging_user_id" is kept as-is because other cells use these exact keys.)
all_features = [
    "text_tokens", "hashtags", "tweet_id", "present_media", "present_links", "present_domains",
    "tweet_type", "language", "tweet_timestamp",
    "engaged_with_user_id", "engaged_with_user_follower_count",
    "engaged_with_user_following_count", "engaged_with_user_is_verified",
    "engaged_with_user_account_creation",
    "enaging_user_id", "enaging_user_follower_count", "enaging_user_following_count",
    "enaging_user_is_verified", "enaging_user_account_creation",
    "engagee_follows_engager",
]
labels = [
    "reply_timestamp",
    "retweet_timestamp",
    "retweet_with_comment_timestamp",
    "like_timestamp",
]

# Name -> positional index; label indices continue right after the feature indices.
all_features_to_idx = {name: position for position, name in enumerate(all_features)}
labels_to_idx = {name: position for position, name in enumerate(labels, start=len(all_features))}

In [None]:
def unpack_to_df(package_name, sslice=None):
    """Decompress an lzop archive and return the columns GraphRec needs.

    The archive is decompressed with ``lzop -dc``; every non-empty output line
    is one record whose fields are separated by ``\\x01``.

    Parameters
    ----------
    package_name : str
        Path of the ``.lzo`` file to read.
    sslice : slice, optional
        Applied to the list of records before parsing, e.g. ``slice(0, 1000)``
        to load only a sample.

    Returns
    -------
    pandas.DataFrame
        Columns ``tweet_id``, ``engaged_with_user_id``, ``enaging_user_id``,
        ``engagee_follows_engager`` (the raw field text, not converted to a
        boolean) and ``reaction``: a list of four booleans
        ``[reply, retweet, retweet_with_comment, like]`` that are true when
        the corresponding timestamp field is non-empty.

    Raises
    ------
    subprocess.CalledProcessError
        If ``lzop`` exits with a non-zero status.
    """
    # run() waits for lzop to finish; check=True turns a failed decompression
    # into an error instead of silently yielding an empty or truncated frame.
    completed = subprocess.run(['lzop', '-dc', package_name],
                               stdout=subprocess.PIPE,
                               check=True)
    lines = completed.stdout.decode('utf-8').split('\n')
    del completed  # release the raw bytes as early as possible

    # The output ends with a newline, so split() leaves an empty trailing
    # element; drop blanks so they cannot turn into all-None rows.
    lines = [raw for raw in lines if raw]

    if sslice is not None:
        lines = lines[sslice]

    id_columns = [
        all_features_to_idx['tweet_id'],
        all_features_to_idx['engaged_with_user_id'],
        all_features_to_idx['enaging_user_id'],
        all_features_to_idx['engagee_follows_engager'],
    ]
    label_columns = [
        labels_to_idx['reply_timestamp'],
        labels_to_idx['retweet_timestamp'],
        labels_to_idx['retweet_with_comment_timestamp'],
        labels_to_idx['like_timestamp'],
    ]

    # Replace each raw line in place to avoid holding a second full-size list.
    for i, raw in enumerate(lines):
        fields = raw.split("\x01")
        record = [fields[col] for col in id_columns]
        record.append([bool(fields[col]) for col in label_columns])
        lines[i] = record

    return pd.DataFrame(lines, columns=[
        'tweet_id',
        'engaged_with_user_id',
        'enaging_user_id',
        'engagee_follows_engager',
        'reaction'
    ])

In [None]:
# Parse the first training part and show the per-column memory footprint
# (deep=True also counts the Python objects held in object columns).
df = unpack_to_df("/content/drive/Shareddrives/RecSys21/training/part-00000.lzo")
df.memory_usage(deep=True)

## Write pickle

In [None]:
%time

random.seed(4)

p = 0.8
parts = ["part-00000.lzo", "part-00001.lzo"]

history_u_lists = {}
history_ur_lists = {}
history_v_lists = {}
history_vr_lists = {}
train_u  = []
test_u  = []
train_v  = []
test_v  = []
train_r = [] 
test_r = []
social_adj_lists = {}
ratings = list(itertools.product([False, True], repeat=4))
ratings_list = dict(zip(ratings, np.arange(16)))

for part in parts:
    df = unpack_to_df("/content/drive/Shareddrives/RecSys21/training/" + part)
    n = len(df)
    train_samples = np.ones(n, dtype=bool)
    train_samples[int(n*p):] = False
    random.shuffle(train_samples)
    pbar = tqdm(total=n)

    for i, row in df.loc[:n-1, :].iterrows():
        v_id = row["tweet_id"]
        u_id = row["enaging_user_id"]
        n_id = row["engaged_with_user_id"]
        undirected = row["engagee_follows_engager"]
        r_value = row["reaction"]
        is_train_sample = train_samples[i]
        
        if is_train_sample:
            # build user-item graph
            if u_id not in history_u_lists:
                history_u_lists[u_id] = []
                history_ur_lists[u_id] = []
            history_u_lists[u_id].append(v_id)
            history_ur_lists[u_id].append(r_value)

            # build item-user graph
            if v_id not in history_v_lists:
                history_v_lists[v_id] = []
                history_vr_lists[v_id] = []
            history_v_lists[v_id].append(u_id)
            history_vr_lists[v_id].append(r_value)

            # build train set
            train_u.append(u_id)
            train_v.append(v_id)
            train_r.append(r_value)
        else:
            # build test set
            test_u.append(u_id)
            test_v.append(v_id)
            test_r.append(r_value)
        
        # build social graph
        if u_id not in social_adj_lists:
            social_adj_lists[u_id] = set()
        social_adj_lists[u_id].add(n_id)

        if undirected:
            if n_id not in social_adj_lists:
                social_adj_lists[n_id] = set()

            social_adj_lists[n_id].add(u_id)
        
        pbar.update(1)
    
    pbar.close()
    del df

    with open('dataset.pickle', 'wb') as handle:
        pickle.dump(
            [history_u_lists, history_ur_lists, history_v_lists, history_vr_lists, train_u, train_v, train_r, test_u, test_v, test_r, social_adj_lists, ratings_list]
        , handle, protocol=pickle.HIGHEST_PROTOCOL)
    del handle


CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 9.54 µs


HBox(children=(FloatProgress(value=0.0, max=3011125.0), HTML(value='')))

In [15]:
# !cp dataset.pickle "/content/drive/Shareddrives/RecSys21/pickle/part-0000[0_1].pickle"

## Read pickle

In [20]:
# Alternative source on the shared drive:
# with open('/content/drive/Shareddrives/RecSys21/pickle/part-00000.pickle', 'rb') as handle:
with open('dataset.pickle', 'rb') as handle:
    # The pickle holds twelve objects in the same order in which they were dumped.
    (
        history_u_lists,
        history_ur_lists,
        history_v_lists,
        history_vr_lists,
        train_u,
        train_v,
        train_r,
        test_u,
        test_v,
        test_r,
        social_adj_lists,
        ratings_list,
    ) = pickle.load(handle)

## Memory test

In [None]:
# Compare the memory cost of storing reactions as four booleans per row versus
# a single integer code per row, both as NumPy arrays and as Python lists.
import sys  # not imported anywhere else in the notebook; without it this cell raises NameError

size = 10000
a = np.random.choice(a=[False, True], size=(size,4))
b = np.random.randint(0,15,size)
# sys.getsizeof is shallow: for the lists it counts only the list object itself,
# not the elements it refers to.
sys.getsizeof(a), sys.getsizeof(b), sys.getsizeof(a.tolist()), sys.getsizeof(b.tolist())

(40112, 80096, 80072, 80072)

# NEO4J API
Wyciągnięcie danych dla potrzeb GraphRec, korzystając z API neo4j

In [107]:
# Install the Neo4j Python driver used by the following cells.
!pip install neo4j --quiet

In [108]:
from neo4j import GraphDatabase

In [111]:
# Open a Bolt connection to the Neo4j server.
# Credentials must not live in the notebook: they are read from the environment
# (NEO4J_HOST, NEO4J_USER, NEO4J_PASSWORD), and the password is prompted for
# when the variable is not set. A password that was committed earlier should be rotated.
import getpass
import os

host = os.environ.get('NEO4J_HOST', '46.101.246.118')
user = os.environ.get('NEO4J_USER', 'neo4j')
password = os.environ.get('NEO4J_PASSWORD') or getpass.getpass('Neo4j password: ')
uri = f'bolt://{host}:7687'
driver = GraphDatabase.driver(uri, auth=(user, password))
db = driver.session()

In [112]:
# Maximum number of (user, tweet, engagement) rows fetched from Neo4j (used as the query LIMIT).
DATASET_SIZE = 10000

In [114]:
# ==== u, v, r ====
query = 'MATCH (u:User)-[r:Reply|Retweet|RetweetComment|Like]->(t:Tweet)\
         RETURN u.id as user, t.id as tweet, [type(r)="Reply", type(r)="Retweet", type(r)="RetweetComment",type(r)="Like"] as engagement \
         LIMIT %d' % (DATASET_SIZE)
results = db.run(query)

u, v, r = np.array([]), np.array([]), np.empty((0, 4), bool)

pbar = tqdm(total=DATASET_SIZE)
for res in results:
    values = res.values()
    u = np.append(u, values[0])
    v = np.append(v, values[1])
    r = np.append(r, [values[2]], axis=0)
    pbar.update(1)
    
pbar.close()

# ==== SPLIT TO TRAIN AND TEST ====
train_u, test_u, train_v, test_v, train_r, test_r = train_test_split(u, v, r, test_size=0.33, random_state=42)
del u, v, r

HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))




In [115]:
# ==== history_u_lists, history_ur_lists ====
# For every user in the training split, fetch the tweets the user engaged with
# and the matching one-hot engagement vectors [Reply, Retweet, RetweetComment, Like].

# NOTE: the ratings are one-hot vectors; vectors that belong to the same id still need to be OR-ed together.
# NOTE(review): the query reads the whole graph, so the history may include
# interactions that were held out in the test split - confirm this is intended.
# The id is passed as a query parameter rather than interpolated into the Cypher text.
query_template = (
    'MATCH (u:User {id: $id})-[r:Like|Reply|Retweet|RetweetComment]->(t:Tweet) '
    'RETURN collect(t.id), collect([type(r)="Reply", type(r)="Retweet", type(r)="RetweetComment",type(r)="Like"])'
)

history_u_lists = {}
history_ur_lists = {}

ids = set(train_u)

for u_id in tqdm(ids):
    # str() turns NumPy string scalars into plain Python strings for the driver
    results = db.run(query_template, {"id": str(u_id)})

    for res in results:
        v_list, r_list = res.values()

        history_u_lists[u_id] = v_list
        history_ur_lists[u_id] = r_list

HBox(children=(FloatProgress(value=0.0, max=1262.0), HTML(value='')))




In [None]:
# ==== history_v_lists, history_vr_lists ====
# For every tweet in the training split, fetch the users that engaged with it
# and the matching one-hot engagement vectors [Reply, Retweet, RetweetComment, Like].

# The id is passed as a query parameter rather than interpolated into the Cypher text.
query_template = (
    'MATCH (u:User)-[r:Like|Reply|Retweet|RetweetComment]->(t:Tweet {id: $id}) '
    'RETURN collect(u.id), collect([type(r)="Reply", type(r)="Retweet", type(r)="RetweetComment", type(r)="Like"])'
)

history_v_lists = {}
history_vr_lists = {}

ids = set(train_v)

for v_id in tqdm(ids):
    # str() turns NumPy string scalars into plain Python strings for the driver
    results = db.run(query_template, {"id": str(v_id)})

    for res in results:
        u_list, r_list = res.values()

        # Store the users and their reactions under the tweet id. The previous
        # version stored a stale `v_list` and wrote the reactions into
        # history_ur_lists[u_id], leaving history_vr_lists empty.
        history_v_lists[v_id] = u_list
        history_vr_lists[v_id] = r_list

HBox(children=(FloatProgress(value=0.0, max=6407.0), HTML(value='')))

In [77]:
# ==== social_adj_lists ====
# For every user in the training split, fetch the ids of the users they follow.

# The id is passed as a query parameter rather than interpolated into the Cypher text.
query_template = 'MATCH (u:User {id: $id})-[:Follow]->(n: User) RETURN collect(n.id)'

social_adj_lists = {}

ids = set(train_u)

for u_id in tqdm(ids):
    # str() turns NumPy string scalars into plain Python strings for the driver
    results = db.run(query_template, {"id": str(u_id)})

    for res in results:
        n_list = res.values()[0]
        # Stored as a set, matching the structure built by the LZOP pipeline,
        # so duplicate follow edges collapse into one neighbour.
        social_adj_lists[u_id] = set(n_list)

In [None]:
# ==== ratings_list ====

# All 2**4 combinations of four boolean engagement flags, in itertools.product
# order, each paired with a NumPy integer index 0..15.
ratings = [combo for combo in itertools.product([False, True], repeat=4)]
ratings_list = {combo: index for combo, index in zip(ratings, np.arange(16))}

## Write pickle

In [None]:
# Persist the twelve GraphRec structures, in the order the read cell unpacks them.
with open('/content/drive/Shareddrives/RecSys21/pickle/neo4j_%d.pickle' % (DATASET_SIZE), 'wb') as handle:
    pickle.dump(
        [
            history_u_lists,
            history_ur_lists,
            history_v_lists,
            history_vr_lists,
            train_u,
            train_v,
            train_r,
            test_u,
            test_v,
            test_r,
            social_adj_lists,
            ratings_list,
        ],
        handle,
        protocol=pickle.HIGHEST_PROTOCOL,
    )
del handle

In [None]:
# Release the session opened alongside the driver first, then the driver itself.
db.close()
driver.close()