# Rating Profiles
## Network Analysis - University of Helsinki
* Teemu Koivisto (teemu.koivisto@helsinki.fi)
* Tomáš Vopat (tomas.vopat@helsinki.fi)

Dataset: http://www.occamslab.com/petricek/data/

## Data Preprocessing

In [None]:
# imports
import numpy as np
import pandas as pd
import networkx as nx
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# loading files
# `names=` is passed without `header=`, so pandas treats the first line of each
# CSV as data and uses the names given here as column labels.
PATH_RATINGS = "data/ratings.csv"
PATH_GENDER = "data/gender.csv"

ratings = pd.read_csv(PATH_RATINGS, names=['user_id', 'profile_id', 'rating'])
gender = pd.read_csv(PATH_GENDER, names=['user_id', 'gender'])

# merging datasets
# Each rating row gets the gender of the rater (user_gender) and the gender of
# the rated profile (rating_gender). Columns are renamed *before* merging so the
# result does not depend on pandas' automatic `_x` / `_y` suffixes or on a
# positional column overwrite.
rater_gender = gender.rename(columns={"gender": "user_gender"})
rated_gender = gender.rename(columns={"user_id": "rating_profile", "gender": "rating_gender"})

df = ratings.rename(columns={"profile_id": "rating_profile", "rating": "rating_weight"})
df = pd.merge(df, rater_gender, how="left", on="user_id")
df = pd.merge(df, rated_gender, how="left", on="rating_profile")
df = df[["user_id", "user_gender", "rating_profile", "rating_gender", "rating_weight"]]

# converting gender
df["user_gender"] = df["user_gender"].astype("category")
df["rating_gender"] = df["rating_gender"].astype("category")
print("gender vals:", df["user_gender"].unique())

# DataFrame.info() prints its report itself and returns None, so wrapping it in
# display() only added a stray "None" to the cell output.
df.info()
print(df.nunique())
display(df)

In [None]:
# Authors' note: the dataset holds 135,359 profiles, yet 168,791 distinct ids
# show up as rated profiles, i.e. some ratings point at accounts that do not
# exist. Those rows are removed by keeping only rated ids that do not exceed
# the largest rater id.

max_id = df["user_id"].max()
within_known_ids = df["rating_profile"].le(max_id)
df = df.loc[within_known_ids]

In [None]:
# Per-user statistics built from the edge list `df`:
#   * INCOMING - ratings a profile received (grouped by rating_profile)
#   * OUTGOING - ratings a user gave (grouped by user_id)
# Each is computed once overall and once split by the counterpart's gender.
# The column labels below are assigned positionally after unstack(), so they
# rely on the gender levels coming out in the order female, male, unknown.

# statistics about INCOMING ratings
group_profile = df.groupby(by=["rating_profile"])
agg_profile = group_profile.agg({"rating_weight": "mean", "user_id": "nunique"}).reset_index()
agg_profile.columns = ["user_id", "inRating_mean", "inRating_count"]

group_profile_gender = df.groupby(by=["rating_profile", "user_gender"])
agg_profile_gender = (
    group_profile_gender
    .agg({"rating_weight": "mean", "user_id": "nunique"})
    .unstack()
    .reset_index()
)
agg_profile_gender.columns = [
    "user_id",
    "inRating_mean_female",
    "inRating_mean_male",
    "inRating_mean_unknown",
    "inRating_count_female",
    "inRating_count_male",
    "inRating_count_unknown",
]

# statistics about OUTGOING ratings
group_user = df.groupby(by=["user_id"])
agg_user = group_user.agg({"rating_weight": "mean", "rating_profile": "nunique"}).reset_index()
agg_user.columns = ["user_id", "outRating_mean", "outRating_count"]

group_user_gender = df.groupby(by=["user_id", "rating_gender"])
agg_user_gender = (
    group_user_gender
    .agg({"rating_weight": "mean", "rating_profile": "nunique"})
    .unstack()
    .reset_index()
)
agg_user_gender.columns = [
    "user_id",
    "outRating_mean_female",
    "outRating_mean_male",
    "outRating_mean_unknown",
    "outRating_count_female",
    "outRating_count_male",
    "outRating_count_unknown",
]

# merging aggregations
# Start from the outgoing statistics and left-join the other tables on user_id,
# so only users that appear as raters are kept.
df_detailed = agg_user
for stats_table in (agg_profile, agg_user_gender, agg_profile_gender):
    df_detailed = pd.merge(df_detailed, stats_table, how="left", on="user_id")

# Attach each user's own gender; validate= makes pandas raise if a user_id
# occurs more than once on either side.
user_genders = df[["user_id", "user_gender"]].drop_duplicates()
df_detailed = pd.merge(df_detailed, user_genders, how="left", on="user_id", validate="one_to_one")

print(df_detailed["user_id"].nunique())
display(df_detailed)

In [None]:
# filling NaN
# A missing count means "no such ratings", so it becomes 0. The mean columns
# are deliberately left as NaN.
cols = [
    "inRating_count",
    "inRating_count_female",
    "inRating_count_male",
    "inRating_count_unknown",
    "outRating_count",
    "outRating_count_female",
    "outRating_count_male",
    "outRating_count_unknown",
]
df_detailed = df_detailed.fillna(dict.fromkeys(cols, 0))
display(df_detailed)

In [None]:
# reordering columns + creating concise dataset

# Detailed layout: user_id, user_gender, then for outgoing and incoming ratings
# the count block followed by the mean block; each block lists the overall
# value first and then the male / female / unknown breakdown.
detailed_order = ["user_id", "user_gender"]
for direction in ("outRating", "inRating"):
    for stat in ("count", "mean"):
        overall = "{}_{}".format(direction, stat)
        detailed_order.append(overall)
        detailed_order.extend(
            "{}_{}".format(overall, gender_name)
            for gender_name in ("male", "female", "unknown")
        )
df_detailed = df_detailed[detailed_order]

# Concise layout: only the overall counts and means.
concise_order = [
    "user_id",
    "user_gender",
    "inRating_count",
    "outRating_count",
    "inRating_mean",
    "outRating_mean",
]
df_concise = df_detailed[concise_order]

display(df_concise)

In [None]:
# saving datasets
# Persist the edge list and both per-user tables without the index column.
for csv_path, frame in (
    ("data/df.csv", df),
    ("data/df_detailed.csv", df_detailed),
    ("data/df_concise.csv", df_concise),
):
    frame.to_csv(csv_path, index=False)

In [None]:
# loading from files
# Re-read the three CSVs written by the saving cell.
df, df_detailed, df_concise = (
    pd.read_csv(csv_path)
    for csv_path in ("data/df.csv", "data/df_detailed.csv", "data/df_concise.csv")
)

# Data Analysis

In [None]:
# Size of the network: one node per user in df_concise, one edge per row of df.
print("Total nodes:", df_concise["user_id"].nunique())
node_gender = df_concise["user_gender"]
for label, code in (("females", "F"), ("males", "M"), ("unknown", "U")):
    print("\t{}:".format(label), (node_gender == code).sum())
print('Total edges:', len(df))

In [None]:
# Pie chart of how many distinct users fall into each gender group.
gender_portion = df_concise.groupby("user_gender")["user_id"].nunique()

# Build the wedge labels from the group keys instead of hard-coding their
# order: each label is then guaranteed to sit on its own wedge, and a missing
# group cannot cause a label/wedge count mismatch. Unknown codes are shown as-is.
gender_names = {"F": "Female", "M": "Male", "U": "Unknown"}
wedge_labels = [gender_names.get(code, code) for code in gender_portion.index]

plt.figure(figsize=(5, 5))
plt.pie(gender_portion,
        labels=wedge_labels,
        autopct="%1.1f%%")
plt.title("Users' gender")
plt.show()

In [None]:
# number of ratings (general)
# Histograms of how many ratings each user received (left) and gave (right),
# drawn with a log-scaled y axis that both panels share.
plt.figure(figsize=(15, 5))
plt.suptitle("Number of ratings (log-scaled)")

ax1 = plt.subplot(121)
plt.hist(df_concise["inRating_count"], bins=25, log=True)
plt.title("Received")
plt.xlabel("Number of ratings")
plt.ylabel("Number of users")

plt.subplot(122, sharey=ax1)
plt.hist(df_concise["outRating_count"], bins=25, log=True)
plt.title("Given")
plt.xlabel("Number of ratings")

plt.show()

# The spread is computed with Series.std(), i.e. it is a standard deviation;
# it was previously printed under the misleading label "variance".
print("Received ratings")
print("\tmean: {0:.2f}".format(df_concise["inRating_count"].mean()))
print("\tstd: {0:.2f}".format(df_concise["inRating_count"].std()))
print("Given ratings")
print("\tmean: {0:.2f}".format(df_concise["outRating_count"].mean()))
print("\tstd: {0:.2f}".format(df_concise["outRating_count"].std()))

# number of ratings (by gender)
# One row of panels per gender code: received on the left, given on the right.
# Every panel shares the y axis of the first one.
plt.figure(figsize=(15, 15))
plt.suptitle("Number of ratings by gender (log-scaled)")

idx = 1
ax1 = None
for i in ["M", "F", "U"]:
    inRating = df_concise[df_concise["user_gender"] == i]["inRating_count"]
    outRating = df_concise[df_concise["user_gender"] == i]["outRating_count"]

    # The first panel created becomes the reference axis for sharey.
    if ax1 is None:
        ax1 = plt.subplot(3, 2, idx)
    else:
        plt.subplot(3, 2, idx, sharey=ax1)
    plt.hist(inRating, bins=25, log=True)
    plt.title("Received ({})".format(i))
    plt.xlabel("Number of ratings")
    plt.ylabel("Number of users")

    plt.subplot(3, 2, idx + 1, sharey=ax1)
    plt.hist(outRating, bins=25, log=True)
    plt.title("Given ({})".format(i))
    plt.xlabel("Number of ratings")
    plt.ylabel("Number of users")

    print("Received ratings ({})".format(i))
    print("\tmean: {0:.2f}".format(inRating.mean()))
    print("\tstd: {0:.2f}".format(inRating.std()))
    print("Given ratings ({})".format(i))
    print("\tmean: {0:.2f}".format(outRating.mean()))
    print("\tstd: {0:.2f}".format(outRating.std()))

    # Two panels were used for this gender; move on to the next row.
    idx += 2

plt.show()

In [None]:
# Distribution of the per-account mean rating, received vs. given.
# Accounts without any rating in a direction have a NaN mean and are skipped.
inRating = df_concise[df_concise["inRating_mean"].notna()]["inRating_mean"]
outRating = df_concise[df_concise["outRating_mean"].notna()]["outRating_mean"]

plt.figure(figsize=(15, 5))
plt.suptitle("Mean of ratings")

ax1 = plt.subplot(121)
plt.hist(inRating, bins=25)
plt.title("Received")
plt.xlabel("Mean of the rating")
plt.ylabel("Number of accounts")

plt.subplot(122, sharey=ax1)
plt.hist(outRating, bins=25)
plt.title("Given")
plt.xlabel("Mean of the rating")
plt.ylabel("Number of accounts")

plt.show()

# The spread is computed with Series.std(), i.e. it is a standard deviation;
# it was previously printed under the misleading label "variance".
print("Received ratings")
print("\tmean: {0:.2f}".format(inRating.mean()))
print("\tstd: {0:.2f}".format(inRating.std()))
print("Given ratings")
print("\tmean: {0:.2f}".format(outRating.mean()))
print("\tstd: {0:.2f}".format(outRating.std()))

# separated by gender
# One row of panels per gender code: received on the left, given on the right.
# Every panel shares the y axis of the first one.
plt.figure(figsize=(15, 15))
plt.suptitle("Mean of ratings by gender")

idx = 1
ax1 = None
for i in ["M", "F", "U"]:
    inRating = df_concise[(df_concise["user_gender"] == i) & (df_concise["inRating_mean"].notna())]["inRating_mean"]
    outRating = df_concise[(df_concise["user_gender"] == i) & (df_concise["outRating_mean"].notna())]["outRating_mean"]

    # The first panel created becomes the reference axis for sharey.
    if ax1 is None:
        ax1 = plt.subplot(3, 2, idx)
    else:
        plt.subplot(3, 2, idx, sharey=ax1)
    plt.hist(inRating, bins=25)
    plt.title("Received ({})".format(i))
    plt.xlabel("Mean of the rating")
    plt.ylabel("Number of accounts")

    plt.subplot(3, 2, idx + 1, sharey=ax1)
    plt.hist(outRating, bins=25)
    plt.title("Given ({})".format(i))
    plt.xlabel("Mean of the rating")
    # Same y label as every other panel in this cell (was "Number of users").
    plt.ylabel("Number of accounts")

    print("Received ratings ({})".format(i))
    print("\tmean: {0:.2f}".format(inRating.mean()))
    print("\tstd: {0:.2f}".format(inRating.std()))
    print("Given ratings ({})".format(i))
    print("\tmean: {0:.2f}".format(outRating.mean()))
    print("\tstd: {0:.2f}".format(outRating.std()))

    # Two panels were used for this gender; move on to the next row.
    idx += 2

plt.show()

In [None]:
# Given ratings
def get_title(name, df):
    """Build a plot title of the form "<name> - mu = <mean>, sd = <std>".

    `df` is whatever values the caller wants summarised (the callers in this
    notebook pass a pandas Series); its `mean()` and `std()` are rounded to
    two decimals before being put into the title.
    """
    rounded_stats = [np.round(stat, 2) for stat in (df.mean(), df.std())]
    return "{} - mu = {}, sd = {}".format(name, *rounded_stats)

# Rating values of all edges, and of the edges whose rater has the given gender.
data = [
    df["rating_weight"],
    df.loc[df["user_gender"] == "F", "rating_weight"],
    df.loc[df["user_gender"] == "M", "rating_weight"],
    df.loc[df["user_gender"] == "U", "rating_weight"],
]

# Panel titles, in the same order as `data`.
titles = ["All", "Females", "Males", "Unknown"]

# 2x2 grid: one 10-bin histogram per selection, titled with its mean and sd.
plt.figure(figsize=(15, 10))
for panel, (title, weights) in enumerate(zip(titles, data), start=1):
    plt.subplot(2, 2, panel)
    plt.hist(weights, bins=10)
    plt.title(get_title(title, weights))

plt.show()

In [None]:
# Ten users with the most given ratings, then ten with the most received ratings.
for count_column in ("outRating_count", "inRating_count"):
    ranked = df_concise.sort_values(count_column, ascending=False)
    display(ranked.head(10))

In [None]:
# Build a directed graph from 1000 randomly sampled ratings: one edge from the
# rater to the rated profile, carrying the rater's gender as an edge attribute.
# The sample is not seeded, so H differs between runs.
df_sample = df.sample(1000)
H = nx.from_pandas_edgelist(
    df_sample,
    source="user_id",
    target="rating_profile",
    edge_attr="user_gender",
    create_using=nx.DiGraph(),
)
avg_clustering = nx.average_clustering(H)
print("average clustering:", avg_clustering)

## Graph Analysis

In [None]:
# centralities
# Node centralities of the sampled graph H, one column per measure, shown as a
# pairwise scatter-plot matrix.
# Betweenness centrality is currently disabled:
#   betw = nx.betweenness_centrality(H)
centrality = pd.DataFrame({
    "degree": nx.degree_centrality(H),
    "closeness": nx.closeness_centrality(H),
    "eigenvector": nx.eigenvector_centrality_numpy(H),
})
sns.pairplot(centrality)

In [None]:
# connected components
# Report the connectivity of the sampled graph H and keep the strongly
# connected components (as a list of node sets) for later cells.
scc_count = nx.number_strongly_connected_components(H)
print("number of strongly connected components:", scc_count)
weakly_connected = nx.is_weakly_connected(H)
print("Is weakly connected:", weakly_connected)
components = list(nx.strongly_connected_components(H))

In [None]:
# A notebook cell only echoes its *last* bare expression, so the node count
# used to be computed and silently discarded. Print both sizes explicitly:
# first the number of nodes in H, then the number of strongly connected
# components.
print(len(H.nodes))
print(len(components))

In [None]:
# Spring-layout drawing of the sampled graph H with small nodes and thin edges.
# Authors' note: takes 5-10 minutes (NOTE(review): presumably measured on a
# larger sample than the current one - confirm).

plt.figure(num=3, figsize=(20, 20))
nx.draw_spring(H, node_size=6, width=0.08)
plt.show()
# Saving is currently disabled; NOTE(review): if re-enabled it probably has to
# run before plt.show() to capture the drawing - confirm.
#plt.savefig('ratings_sub100.png')