# Import necessary libraries:

In [1]:
import numpy as np
import pandas as pd
import scipy
import matplotlib as plt
import networkx as nx
import itertools
import collections
from pandas.errors import SettingWithCopyWarning
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import warnings

# Silence every warning for the rest of the session.
# NOTE(review): this also hides pandas/scikit-learn deprecation warnings, so
# API removals in newer library versions will surface as hard errors only.
warnings.filterwarnings("ignore")

# Define functions:

In [3]:
def convert_categorical(df_X, _X):
    """Replace one categorical column with one-hot indicator columns.

    The distinct values of ``df_X[_X]`` are sorted and numbered from 1; for
    the k-th value a float column named ``f"{_X}{k}"`` is created holding 1.0
    where the row has that value and 0.0 elsewhere. The original column is
    dropped and the indicator columns are inserted starting at position 1
    (i.e. right after the first remaining column).

    Parameters
    ----------
    df_X : pandas.DataFrame
        Input frame; it is not modified.
    _X : hashable
        Name of the column to encode.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``_X`` and with the indicator columns added.

    Notes
    -----
    Implemented with ``np.unique``/``np.eye`` instead of scikit-learn's
    ``LabelEncoder``/``OneHotEncoder`` so that it does not depend on the
    ``sparse``/``sparse_output`` keyword, which differs between scikit-learn
    versions. Missing values are not treated specially.
    """
    values = np.asarray(df_X[_X])
    # Sorted unique labels -> integer codes 0..k-1 (same ordering a label
    # encoder would produce).
    _, codes = np.unique(values, return_inverse=True)
    codes = codes.ravel()
    n_categories = int(codes.max()) + 1 if codes.size else 0
    # Row i of the identity matrix is the one-hot vector for code i.
    onehot_encoded = np.eye(n_categories, dtype=float)[codes]
    # Keyword form: positional ``axis`` is rejected by pandas 2.x.
    df_X = df_X.drop(columns=_X)
    for j in range(n_categories):
        df_X.insert(loc=j + 1, column=f"{_X}{j + 1}", value=onehot_encoded[:, j])
    return df_X

# Read datasets:

In [4]:
dataPath = 'datasets/seen/'

print("reading datasets...")

# User-Item-Rating
# '::' is a multi-character separator, so pandas treats it as a regular
# expression and needs the python parsing engine. No backslash is required:
# ':' has no special meaning in a regex, and '\:' is an invalid string escape.
df = pd.read_csv(dataPath + 'ratings_before_5941.dat', sep='::', engine='python',
                 names=['UID', 'MID', 'rate', 'time'])

# User-Side-Information
df_user = pd.read_csv(dataPath + 'users_before_5941.dat', sep='::', engine='python',
                      names=['UID', 'gender', 'age', 'job', 'zip'])

print("reading datasets done.\n")

print("handling dataset datasets...")
# Each call inserts its indicator columns right after 'UID', so the column
# encoded last ends up first in the final layout.
df_user = convert_categorical(df_user, 'job')
df_user = convert_categorical(df_user, 'gender')

# Bucket the raw age into six labelled ranges, in place of the original column,
# before one-hot encoding it.
df_user['age'] = pd.cut(df_user['age'], [0, 10, 20, 30, 40, 50, 100],
                        labels=['1', '2', '3', '4', '5', '6'])
df_user = convert_categorical(df_user, 'age')

# The zip code is not used as a feature. Keyword form: positional ``axis`` is
# rejected by pandas 2.x.
df_user = df_user.drop(columns='zip')
print(df_user.columns.values)

print("handling dataset datasets done.\n")


reading datasets...
reading datasets done.

handling dataset datasets...
['UID' 'age1' 'age2' 'age3' 'age4' 'age5' 'age6' 'gender1' 'gender2'
 'job1' 'job2' 'job3' 'job4' 'job5' 'job6' 'job7' 'job8' 'job9' 'job10'
 'job11' 'job12' 'job13' 'job14' 'job15' 'job16' 'job17' 'job18' 'job19'
 'job20' 'job21']
handling dataset datasets done.



In [8]:
# Preview the first rows of the one-hot encoded user table.
df_user.head()

Unnamed: 0,UID,age1,age2,age3,age4,age5,age6,gender1,gender2,job1,...,job12,job13,job14,job15,job16,job17,job18,job19,job20,job21
0,1,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


# Create graph:

In [5]:
alpha_coefs = [0.005]  # [0.01, 0.015, 0.02, 0.025, 0.03, 0.035, 0.04, 0.045]
n_items = 3883  # "i_no" in the threshold formula alpha = param * i_no

for alpha_coef in alpha_coefs:
    print(f'using coef {alpha_coef}')
    # Count, for every pair of users, how many (movie, rating) groups they
    # share, i.e. how often both gave the same movie the same rating.
    counter = collections.Counter()
    for _, group in df.groupby(['MID', 'rate']):
        # Sorting makes each pair canonical (smaller UID first), so the
        # undirected pair {u, v} is never split into (u, v) and (v, u) counts.
        # Updating the counter per group avoids holding every pair in memory.
        counter.update(itertools.combinations(sorted(group['UID']), 2))
    alpha = alpha_coef * n_items  # param*i_no
    # Keep only pairs that co-rated at least `alpha` times. This is a real
    # list (not a one-shot iterator), so the edge-adding cell can be re-run.
    edge_list = [list(pair) for pair, n_shared in counter.items() if n_shared >= alpha]
    G = nx.Graph()

using coef 0.005


In [6]:
print("add edges")
for u, v in edge_list:
    # Link the two users, then give each endpoint a self-loop; every edge
    # carries unit weight.
    for a, b in ((u, v), (u, u), (v, v)):
        G.add_edge(a, b, weight=1)


add edges


In [7]:
alpha_coef = 0.005  # coefficient the graph was built with; used in the saved file name

# (progress label, output column, function mapping the graph to {node: score})
centrality_measures = [
    ("pagerank", "PR", lambda graph: nx.pagerank(graph.to_directed())),
    ("degree_centrality", "CD", nx.degree_centrality),
    ("closeness_centrality", "CC", nx.closeness_centrality),
    ("betweenness_centrality", "CB", nx.betweenness_centrality),
    ("load_centrality", "LC", nx.load_centrality),
    ("average_neighbor_degree", "AND",
     lambda graph: nx.average_neighbor_degree(graph, weight='weight')),
]

for label, column, compute in centrality_measures:
    print(f"start {label}")
    # Look up each user's score by UID; users absent from the graph get NaN.
    scores = df_user['UID'].map(compute(G))
    # Scale by the column maximum so the largest score becomes 1.
    df_user[column] = scores / float(scores.max())

# Feature matrix: every column except 'UID'. Missing scores (users not in the
# graph) become 0. A non-in-place fillna avoids mutating a frame derived from
# df_user, which is what triggers SettingWithCopyWarning.
X_train = df_user[df_user.columns[1:]].fillna(0)

start pagerank
start degree_centrality
start closeness_centrality
start betweenness_centrality
start load_centrality
start average_neighbor_degree
file [0.005] done.
******************



# Save datasets:

In [None]:
# Write the feature matrix to a pickle whose name records the coefficient used.
output_path = f"datasets/combined-u/x_train_alpha({alpha_coef}).pkl"
X_train.to_pickle(output_path)
print(f'file {alpha_coef} done.\n******************\n')