In [22]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StringType, DateType, FloatType, IntegerType, StructField, ArrayType
from pyspark.sql.functions import col, split, explode

In [2]:
# Obtain the Spark session for this notebook (an already-running session is
# reused by getOrCreate), registered under the application name 'UsersExplore'.
spark = (
    SparkSession.builder
    .appName('UsersExplore')
    .getOrCreate()
)

In [3]:
# Explicit schema for the users file. Column reference:
# user_id: The id of the user
# user_url: The url of the user on myanimelist
# last_online_date: Date of the last time the user logged into myanimelist.net
# num_watching: Number of animes the user is watching
# num_completed: Number of animes the user completed
# num_on_hold: Number of animes the user has on hold
# num_dropped: Number of animes the user has dropped
# num_plan_to_watch: Number of animes the user plans to watch
# num_days: Number of days the user has spent watching anime
# mean_score: Mean score the user has given to animes (read as a string here)
# clubs: Pipe-separated list of MAL club ids the user is member of
#
# Every field is declared nullable. The DataFrame produced by the CSV read
# reports all columns as nullable regardless of the flag given here, and the
# data does contain missing values (e.g. users with no clubs), so declaring
# nullable=False would only be misleading.
users_schema = StructType([
    StructField('user_id', StringType(), True),
    StructField('user_url', StringType(), True),
    StructField('last_online_date', DateType(), True),
    StructField('num_watching', IntegerType(), True),
    StructField('num_completed', IntegerType(), True),
    StructField('num_on_hold', IntegerType(), True),
    StructField('num_dropped', IntegerType(), True),
    StructField('num_plan_to_watch', IntegerType(), True),
    StructField('num_days', FloatType(), True),
    StructField('mean_score', StringType(), True),
    StructField('clubs', StringType(), True)
])

In [4]:
# Load the users file with the explicit schema defined above. The file is
# tab-delimited despite its .csv extension, and its first line is a header.
df_users = (
    spark.read
    .schema(users_schema)
    .option('header', True)
    .option('delimiter', '\t')
    .csv('../data/unziped/user.csv')
)

In [5]:
# Show the column names, types and nullability of the loaded users DataFrame.
df_users.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- user_url: string (nullable = true)
 |-- last_online_date: date (nullable = true)
 |-- num_watching: integer (nullable = true)
 |-- num_completed: integer (nullable = true)
 |-- num_on_hold: integer (nullable = true)
 |-- num_dropped: integer (nullable = true)
 |-- num_plan_to_watch: integer (nullable = true)
 |-- num_days: float (nullable = true)
 |-- mean_score: string (nullable = true)
 |-- clubs: string (nullable = true)



In [6]:
# Peek at the first three user rows to sanity-check the parsed values.
print(df_users.head(3))

[Row(user_id='kir1yama', user_url='https://myanimelist.net/profile/Kir1yama', last_online_date=datetime.date(2021, 8, 27), num_watching=28, num_completed=606, num_on_hold=48, num_dropped=48, num_plan_to_watch=113, num_days=228.39999389648438, mean_score='6.69', clubs=None), Row(user_id='smatster', user_url='https://myanimelist.net/profile/smatster', last_online_date=datetime.date(2022, 2, 25), num_watching=46, num_completed=1188, num_on_hold=209, num_dropped=31, num_plan_to_watch=2759, num_days=212.10000610351562, mean_score='8.38', clubs='29299|70446|379|36473|907'), Row(user_id='suzuhrevv', user_url='https://myanimelist.net/profile/SuzuhRevv', last_online_date=datetime.date(2022, 2, 21), num_watching=47, num_completed=517, num_on_hold=180, num_dropped=52, num_plan_to_watch=211, num_days=216.89999389648438, mean_score='6.04', clubs=None)]


In [20]:
# Inspect the clubs column for the first three users.
# NOTE(review): the saved output of this cell shows clubs as arrays, which means
# it was last executed after the split cell that follows it; on a fresh
# top-to-bottom run this cell displays the raw pipe-delimited strings instead.
df_users.select('clubs').take(3)

[Row(clubs=None),
 Row(clubs=['29299', '70446', '379', '36473', '907']),
 Row(clubs=None)]

In [19]:
# Turn the pipe-delimited clubs string into an array of club ids.
# The split pattern is a regular expression, so the pipe must be escaped; a raw
# string keeps the backslash without relying on Python passing an unknown
# escape sequence through unchanged.
# NOTE: this cell rebinds df_users and is not idempotent. After it has run,
# clubs is an array rather than a string, so re-run the load cell before
# executing this cell again.
df_users = df_users.withColumn('clubs', split(col('clubs'), r'\|'))

In [27]:
# Build the user -> club membership table: one output row per element of the
# clubs array. explode produces no row for a null array, so users whose clubs
# value is null do not appear in the result.
df_users_clubs = df_users.select(
    col('user_id'),
    explode('clubs').alias('club_id'),
)

In [28]:
# Confirm the membership table has exactly the two expected string columns.
df_users_clubs.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- club_id: string (nullable = true)



In [31]:
# Sanity check: list any membership rows whose club_id is null (expected: none).
df_users_clubs.where(col('club_id').isNull()).collect()

[]

In [33]:
# Persist the membership table as parquet.
# The writer's default save mode raises an error when the target path already
# exists, which makes a second full run of the notebook fail on this cell.
# Overwrite mode makes the export repeatable; the previous contents of the
# target directory are replaced.
df_users_clubs.write.mode('overwrite').parquet('../data/refined/users_clubs')