In [None]:
import gdown
import networkx as nx
import pandas as pd
from sklearn.metrics import mean_absolute_error

In [None]:
# Display options: show up to 100 columns, but cap printed frames at 10 rows
for option_name, option_value in (
    ("display.max_columns", 100),
    ("display.max_rows", 10),
):
    pd.set_option(option_name, option_value)

# Graph projection

In [None]:
# Fetch the train/test review tables from Google Drive, then load them.
dataset_sources = [
    (
        "https://drive.google.com/uc?id=195ltI0Jgg9zsWatxUP4UlPf5sgsY9Rzw",
        "df_train.parquet",
    ),
    (
        "https://drive.google.com/uc?id=1-65WSGzYhFToxureZ-Rhg9POZNzfJSWJ",
        "df_test.parquet",
    ),
]
for drive_url, local_path in dataset_sources:
    gdown.download(drive_url, local_path)

df_train = pd.read_parquet("df_train.parquet")
df_test = pd.read_parquet("df_test.parquet")

In [None]:
# Keep a reproducible 30% random sample of each split for faster computation.
# NOTE: this cell rebinds df_train/df_test, so re-running it samples the
# already-sampled frames again.
sampling_options = {"frac": 0.3, "random_state": 42}
df_train = df_train.sample(**sampling_options)
df_test = df_test.sample(**sampling_options)

In [None]:
# Preview the first two rows of the training frame
df_train.head(2)

In [None]:
# (rows, columns) of the training frame, then distinct reviewer and item counts
(
    df_train.shape,
    df_train["reviewerID"].nunique(),
    df_train["asin"].nunique(),
)

In [None]:
# Build the reviewer-item bipartite graph: one edge per review, with the rating
# stored on the edge as the 'overall' attribute. from_pandas_edgelist walks the
# columns directly instead of materialising a Series per row via iterrows();
# as with repeated add_edge calls, a repeated (reviewer, item) pair keeps the
# rating of its last row.
G = nx.from_pandas_edgelist(
    df_train,
    source="reviewerID",
    target="asin",
    edge_attr="overall",
)

# Assign the 'bipartite' attribute to nodes (0 = reviewers, 1 = items)
reviewers = set(df_train["reviewerID"])
items = set(df_train["asin"])

G.add_nodes_from(reviewers, bipartite=0)
G.add_nodes_from(items, bipartite=1)

In [None]:
# Sanity check: whether G is bipartite, followed by its node and edge counts
nx.is_bipartite(G), G.number_of_nodes(), G.number_of_edges()

In [None]:
def _project_and_save(bipartite_graph, nodes, parquet_path):
    """Project the bipartite graph onto `nodes` and save the edge list as parquet.

    Returns the projected graph together with its edge-list DataFrame.
    """
    projection = nx.bipartite.weighted_projected_graph(bipartite_graph, nodes)
    edgelist = nx.to_pandas_edgelist(projection)
    edgelist.to_parquet(parquet_path)
    return projection, edgelist


# Projection onto the 'items' nodes
items_projection, df_items_projection = _project_and_save(
    G, items, "df_items_projection.parquet"
)

# Projection onto the 'reviewers' nodes
reviewers_projection, df_reviewers_projection = _project_and_save(
    G, reviewers, "df_reviewers_projection.parquet"
)

In [None]:
# # Optional: instead of computing the projections above, download the precomputed files from the links below (uncomment this cell to use it)
# gdown.download(
#     "https://drive.google.com/uc?id=1zjXlCs5iuVvub2ZtflPgD7iG_HVdsxIp",
#     "df_items_projection.parquet",
# )
# gdown.download(
#     "https://drive.google.com/uc?id=1xWopHfR5_PAuPh9RPctCORmwwlA-7uNo",
#     "df_reviewers_projection.parquet",
# )
# df_items_projection = pd.read_parquet("df_items_projection.parquet")
# df_reviewers_projection = pd.read_parquet("df_reviewers_projection.parquet")

In [None]:
# Rebuild the item-item graph from its edge list; every non-endpoint column
# becomes an edge attribute.
items_projection = nx.from_pandas_edgelist(
    df_items_projection,
    source="source",
    target="target",
    edge_attr=True,
)
# Preview three edges together with their attribute dictionaries
first_item_edges = list(items_projection.edges(data=True))[:3]
first_item_edges

In [None]:
# Rebuild the reviewer-reviewer graph from its edge list; every non-endpoint
# column becomes an edge attribute.
reviewers_projection = nx.from_pandas_edgelist(
    df_reviewers_projection,
    source="source",
    target="target",
    edge_attr=True,
)
# Preview three edges together with their attribute dictionaries
first_reviewer_edges = list(reviewers_projection.edges(data=True))[:3]
first_reviewer_edges

In [None]:
# Spot-check whether one specific reviewer is present in the reviewer projection
node_to_check = "A3F12VYDGU9M70"
if not reviewers_projection.has_node(node_to_check):
    print(f"The node {node_to_check} does not exist in the graph.")
else:
    print(f"The node {node_to_check} exists in the graph.")

# Community Detection Algorithms
## 1. Louvain Method

In [None]:
# Louvain partition of the reviewer projection. weight="weight" is the library
# default, spelled out here so it is visible that edge weights are used.
louvain_partition = nx.community.louvain_communities(
    reviewers_projection,
    weight="weight",
    seed=123,
)
community_users = list(louvain_partition)

In [None]:
# Map a running community index to the set of reviewers in that community
community_user_dict = dict(enumerate(community_users))

## 2. Clauset-Newman-Moore greedy modularity maximization Method

In [None]:
# Clauset-Newman-Moore partition of the reviewer projection. weight=None is the
# library default, spelled out here: this method treats the graph as unweighted.
# NOTE(review): Louvain defaults to weight="weight" - confirm that ignoring the
# projection weights here is intended.
greedy_partition = nx.community.greedy_modularity_communities(
    reviewers_projection,
    weight=None,
)
community_users = list(greedy_partition)

In [None]:
# Map a running community index to the set of reviewers in that community
community_user_dict = dict(enumerate(community_users))

## 3. Label Propagation Method

In [None]:
# Asynchronous label propagation on the reviewer projection. weight=None is the
# library default, spelled out here: every edge counts equally.
# NOTE(review): Louvain defaults to weight="weight" - confirm that ignoring the
# projection weights here is intended.
label_propagation_partition = nx.community.asyn_lpa_communities(
    reviewers_projection,
    weight=None,
    seed=123,
)
community_users = list(label_propagation_partition)

In [None]:
# Map a running community index to the set of reviewers in that community
community_user_dict = dict(enumerate(community_users))

# Inference from Community

In [None]:
# Transform the dictionary into a list of dictionaries
list_of_dicts = [
    {"Community_ID": community_id, "Customer_ID": product_id}
    for community_id, product_ids in community_user_dict.items()
    for product_id in product_ids
]

# Create a DataFrame
df_community_user = pd.DataFrame(list_of_dicts)

In [None]:
# Index the community table by customer so a customer's community is a direct
# .loc lookup. Guarded on the index name so that re-running this cell is a no-op
# instead of a KeyError (after the first run 'Customer_ID' is no longer a column).
if df_community_user.index.name != "Customer_ID":
    df_community_user = df_community_user.set_index("Customer_ID")

In [None]:
# Membership sets handed to `inference` for its known-customer / known-item checks
set_item_ids = set(df_train["asin"].tolist())
set_cus_ids = set(df_community_user.index.tolist())

In [None]:
def inference(
    customer_id,
    product_id,
    df_community_user,
    df_train,
    set_cus_ids,
    set_item_ids,
):
    """Predict a rating as the mean rating the customer's community gave the product.

    Args:
        customer_id: Reviewer the prediction is made for.
        product_id: Item (``asin``) the prediction is made for.
        df_community_user: Frame indexed by customer ID with a
            ``Community_ID`` column.
        df_train: Training reviews with ``reviewerID``, ``asin`` and
            ``overall`` columns.
        set_cus_ids: Customer IDs considered known.
        set_item_ids: Product IDs considered known.

    Returns:
        ``None`` when the customer or the product is unknown, or when a lookup
        raises ``KeyError``. Otherwise the mean ``overall`` of the training rows
        for this product written by members of the customer's community; this
        is NaN when no such row exists.
    """
    if not (customer_id in set_cus_ids and product_id in set_item_ids):
        return None

    try:
        own_community = df_community_user.loc[customer_id, "Community_ID"]
        in_same_community = df_community_user["Community_ID"] == own_community
        community_members = df_community_user.loc[in_same_community].index

        is_product = df_train["asin"] == product_id
        by_community_member = df_train["reviewerID"].isin(community_members)
        matching_reviews = df_train[is_product & by_community_member]
    except KeyError:
        # Report the customer whose lookup failed and fall back to "no prediction"
        print(f"KeyError: {customer_id}")
        return None

    return matching_reviews["overall"].mean()


# Assuming df_test is a DataFrame with columns "reviewerID" and "asin".
# Iterate the two key columns directly rather than using a row-wise
# DataFrame.apply: it avoids building a Series per row, and it yields an empty
# list (not column labels) when df_test has no rows.
predict_result = [
    [
        reviewer_id,
        product_id,
        inference(
            reviewer_id,
            product_id,
            df_community_user,
            df_train,
            set_cus_ids,
            set_item_ids,
        ),
    ]
    for reviewer_id, product_id in zip(df_test["reviewerID"], df_test["asin"])
]

# Remove rows with None values (NaN predictions are kept at this stage)
predict_result = [result for result in predict_result if result[2] is not None]

# Convert to DataFrame
predict_result_df = pd.DataFrame(
    predict_result, columns=["reviewerID", "asin", "predicted_overall"]
)

In [None]:
# Persist the predictions to parquet.
# NOTE(review): the file name says "label_propagation", but this saves predictions
# from whichever community_user_dict was built last - confirm it matches the method run.
predict_result_df.to_parquet("label_propagation_result_df.parquet")

# Evaluation

In [None]:
# Reload the saved predictions and keep only rows that actually have one.
# dropna(subset=...) replaces the `.isna() == False` mask comparison.
predict_result_df = pd.read_parquet("label_propagation_result_df.parquet")
predict_result_df = predict_result_df.dropna(subset=["predicted_overall"])

In [None]:
# Attach the ground-truth rating to every prediction. Repeated
# (reviewerID, asin) predictions are collapsed first; otherwise a pair that
# appears k times on both sides would produce k*k merged rows.
df_evaluate = pd.merge(
    predict_result_df.drop_duplicates(subset=["reviewerID", "asin"]),
    df_test,
    on=["reviewerID", "asin"],
    how="inner",
)
# Round to the nearest whole rating before casting: a bare astype(int) truncates
# toward zero (4.9 -> 4), which biases every prediction downwards.
df_evaluate["predicted_overall"] = (
    df_evaluate["predicted_overall"].round().astype(int)
)
df_evaluate.head(2)

In [None]:
# Compare the true ratings with the predicted ones
ground_truth = df_evaluate["overall"].values
predicted_values = df_evaluate["predicted_overall"].values

mae = mean_absolute_error(y_true=ground_truth, y_pred=predicted_values)

print(f"Mean Absolute Error (MAE): {mae}")