In [None]:
# Provision Java 8 and a Spark 3.1.1 distribution on a Debian/Ubuntu host
# (the JAVA_HOME / SPARK_HOME paths used by this notebook suggest Colab).
# Shell commands need the `!` prefix to run from a notebook cell; apt-get is
# already present on such hosts, so no Homebrew step is required.

# Refresh the apt package index.
!apt-get update
# Install Java (-qq also implies -y, so no interactive prompt appears).
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# Download Apache Spark.
!wget -q https://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz
# Unpack the tgz file into ./spark-3.1.1-bin-hadoop3.2.
!tar xf spark-3.1.1-bin-hadoop3.2.tgz
# Install findspark, which adds PySpark to the system path at runtime.
# %pip (rather than a bare pip) installs into the running kernel's environment.
%pip install -q findspark
# Pin PySpark to the version of the Spark distribution downloaded above so the
# Python bindings and the JVM side stay compatible.
%pip install -q pyspark==3.1.1

In [None]:
# Set environment variables
import os
import sys

os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.1-bin-hadoop3.2"

# Append the directory containing the config module to the Python path.
# A notebook kernel does not define __file__, so "../config" is resolved
# against the kernel's current working directory instead.
config_dir = os.path.abspath(os.path.join(os.getcwd(), "..", "config"))
# Guard against duplicates so re-running the cell leaves sys.path unchanged.
if config_dir not in sys.path:
    sys.path.append(config_dir)


In [None]:
import os
import sys

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, collect_list, explode, size, array, sort_array, struct

# Append the directory containing the config module to the Python path.
# A notebook kernel does not define __file__, so "../config" is resolved
# against the kernel's current working directory instead.
config_dir = os.path.abspath(os.path.join(os.getcwd(), "..", "config"))
# Guard against duplicates so re-running the cell leaves sys.path unchanged.
if config_dir not in sys.path:
    sys.path.append(config_dir)

# Create the Spark session, or reuse the active one if it already exists.
# spark.jars.packages asks Spark to fetch the MySQL JDBC connector at start-up;
# NOTE(review): none of the visible cells read from MySQL - confirm it is needed.
spark = (
    SparkSession.builder
    .appName("Friend Recommendations")
    .config("spark.jars.packages", "mysql:mysql-connector-java:8.0.11")
    .getOrCreate()
)


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, collect_list, explode, expr, size, array, sort_array, struct
from pyspark.sql.window import Window

# Load the friendship edge list from CSV, force both ids to int, and drop rows
# where either id is missing or could not be cast.
df = (
    spark.read.csv("User_Friends.csv", header=True, inferSchema=True)
    .selectExpr("cast(user_id as int)", "cast(friend_id as int)")
    .dropna()
)

# Create symmetric pairs (bi-directional relationships).
# distinct() prevents double counting when the file already lists both
# directions of a friendship; self-loops are dropped because a user is not
# their own friend.
friends = (
    df.union(df.select(col("friend_id").alias("user_id"), col("user_id").alias("friend_id")))
    .where(col("user_id") != col("friend_id"))
    .distinct()
)

# Self-join on the shared friend to find friends of friends; each resulting row
# represents one mutual friend between user_id and fof_id.
connections = (
    friends.alias("f1")
    .join(friends.alias("f2"), col("f1.friend_id") == col("f2.user_id"))
    .where(col("f1.user_id") != col("f2.friend_id"))
    .select(col("f1.user_id").alias("user_id"), col("f2.friend_id").alias("fof_id"))
)

# Someone who is already a direct friend is not a useful recommendation, so
# remove those pairs with an anti-join against the friendship table.
existing_friendships = friends.select(
    col("user_id").alias("known_user"),
    col("friend_id").alias("known_friend"),
)
candidates = connections.join(
    existing_friendships,
    (col("user_id") == col("known_user")) & (col("fof_id") == col("known_friend")),
    "left_anti",
)

# Count mutual friends per (user, candidate) pair.
mutual_friends = candidates.groupBy("user_id", "fof_id").count()

# collect_list does not preserve the order of a preceding orderBy once the data
# is shuffled by groupBy, so the ranking is done inside the aggregated array:
# sorting structs of (-count, fof_id) ascending yields the highest mutual-friend
# count first, with ties broken by the smaller fof_id.
ranked_recommendations = mutual_friends.groupBy("user_id").agg(
    sort_array(
        collect_list(struct((-col("count")).alias("neg_mutual"), col("fof_id")))
    ).alias("ranked")
)

# Rebuild each element as struct(fof_id, mutual_friends) so downstream code sees
# the same field names and order as before.
final_recommendations = ranked_recommendations.select(
    "user_id",
    expr(
        "transform(ranked, r -> "
        "named_struct('fof_id', r.fof_id, 'mutual_friends', -r.neg_mutual))"
    ).alias("recommendations"),
).orderBy("user_id")

# Show results to verify
final_recommendations.show(truncate=False)

In [None]:
from pyspark.sql.functions import col, concat_ws, udf
from pyspark.sql.types import StringType

# Convert the array of structs to a string for better readability
def format_recommendations(recs):
    """Render a list of recommendation records as "(fof_id, mutual_friends), ...".

    Args:
        recs: Iterable of records that support lookup by the keys 'fof_id' and
            'mutual_friends', or None.

    Returns:
        One "(fof_id, mutual_friends)" entry per record, joined with ", ".
        An empty string is returned when recs is None or empty, because a null
        array column reaches a Python UDF as None.
    """
    if not recs:
        return ""
    return ", ".join(f"({rec['fof_id']}, {rec['mutual_friends']})" for rec in recs)

import os

# Wrap the formatter as a Spark UDF (User Defined Function) returning a string.
format_udf = udf(format_recommendations, StringType())

# Apply the UDF to convert the array of structs to a formatted string.
formatted_df = final_recommendations.withColumn("recommendations", format_udf(col("recommendations")))

# Keep only the columns that are written out; user_id stays an integer.
final_df = formatted_df.select("user_id", "recommendations")

# toPandas() collects the entire result onto the driver before writing.
pandas_df = final_df.toPandas()

output_file_path = "output/friends_rec.csv"
# to_csv does not create missing parent directories, so create it up front.
os.makedirs(os.path.dirname(output_file_path), exist_ok=True)
pandas_df.to_csv(output_file_path, index=False)

# Release the Spark session once the export is done.
spark.stop()
