#### Installation

In [1]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# if download returns error, check the more recent version in the link below
# https://dlcdn.apache.org/spark/
!wget -q https://dlcdn.apache.org/spark/spark-3.3.2/spark-3.3.2-bin-hadoop3.tgz
!tar xf spark-3.3.2-bin-hadoop3.tgz
!pip install -q findspark

In [2]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

# don't forget to change these if download return errors
os.environ["SPARK_HOME"] = "/content/spark-3.3.2-bin-hadoop3"

In [3]:
import findspark
findspark.init()

In [4]:
from pyspark.sql import SparkSession

spark = (SparkSession
            .builder
            .master("local")
            .appName("Colab")
            .config('spark.ui.port', '4050')
            # .config("spark.executor.memory", "6g")
            .config("spark.driver.memory", "10g")
            .getOrCreate()
)

In [5]:
spark

In [None]:
'''
!wget https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip
!unzip ngrok-stable-linux-amd64.zip
get_ipython().system_raw('./ngrok http 4050 &')
!curl -s http://localhost:4040/api/tunnels
'''

#### Download Dataset

In [6]:
!gdown "https://drive.google.com/uc?id=1_2GmGcd_m75yqJRRQmzQsvHUMnqWKtOi"

Downloading...
From: https://drive.google.com/uc?id=1_2GmGcd_m75yqJRRQmzQsvHUMnqWKtOi
To: /content/favs.tar.gz
  0% 0.00/2.24M [00:00<?, ?B/s]100% 2.24M/2.24M [00:00<00:00, 146MB/s]


In [7]:
!tar xvf 'favs.tar.gz'
!mkdir favs-data
!mv *.csv favs-data

./
./81.csv
./102.csv
./104.csv
./82.csv
./31.csv
./41.csv
./01.csv
./103.csv
./61.csv
./11.csv
./83.csv
./101.csv
./72.csv
./71.csv
./21.csv
./91.csv
./51.csv


#### Data Loading

In [8]:
import pyspark.sql.functions as F
from pyspark.ml.fpm import FPGrowth

In [9]:
df = (spark
        .read
        .option("header", "true")
        .option("inferSchema", "true")
        .csv("favs-data")
      )

print("Number of records in the dataset: ", df.count())

Number of records in the dataset:  267924


In [10]:
df.printSchema()

root
 |-- username: string (nullable = true)
 |-- type: string (nullable = true)
 |-- name/title: string (nullable = true)



#### Simple Cleaning

In [10]:
df = (df
        .withColumnRenamed('type', 'section')
        .withColumnRenamed('name/title', 'name')
)
df.printSchema()

root
 |-- username: string (nullable = true)
 |-- section: string (nullable = true)
 |-- name: string (nullable = true)



In [11]:
df = df.distinct()

In [12]:
df = (df
        .withColumn("type_name", 
                    F.concat_ws('_', F.col("section"), F.col("name"))
                )
)
df.printSchema()

root
 |-- username: string (nullable = true)
 |-- section: string (nullable = true)
 |-- name: string (nullable = true)
 |-- type_name: string (nullable = false)



In [17]:
df.show(10)

+-------------+----------+--------------------+--------------------+
|     username|   section|                name|           type_name|
+-------------+----------+--------------------+--------------------+
|        IN1RA|characters|              Senkuu|   characters_Senkuu|
|    INERSIA03|     anime|Youkoso Jitsuryok...|anime_Youkoso Jit...|
|    INERSIA03|characters|       Mihate, Hiura|characters_Mihate...|
|  InfernaPuma|characters|       Walker, Allen|characters_Walker...|
|    Inrageous|characters|     Oreki, Houtarou|characters_Oreki,...|
|     Irene002|characters|              Madara|   characters_Madara|
|         Iskk|characters|     Kanbaru, Suruga|characters_Kanbar...|
|itsTomokocchi|     manga|          Lucky☆Star|    manga_Lucky☆Star|
|     Ivan007_|     anime|Log Horizon 2nd S...|anime_Log Horizon...|
|       Ivryxn|characters|     Miyazono, Kaori|characters_Miyazo...|
+-------------+----------+--------------------+--------------------+
only showing top 10 rows



##### Grouped per-section

In [13]:
anime = df.filter(F.col("section") == "anime")
manga = df.filter(F.col("section") == "manga")
characters = df.filter(F.col("section") == "characters")
people = df.filter(F.col("section") == "people")

In [40]:
anime.show(10), anime.count()

+------------+-------+--------------------+--------------------+
|    username|section|                name|           type_name|
+------------+-------+--------------------+--------------------+
|   INERSIA03|  anime|Youkoso Jitsuryok...|anime_Youkoso Jit...|
|    Ivan007_|  anime|Log Horizon 2nd S...|anime_Log Horizon...|
|     J4kub07|  anime|Tengen Toppa Gurr...|anime_Tengen Topp...|
|      jackkl|  anime|     No Game No Life|anime_No Game No ...|
|Jackmoose123|  anime|         Steins;Gate|   anime_Steins;Gate|
|       Jannn|  anime|            Odd Taxi|      anime_Odd Taxi|
|    JensenM_|  anime|  Shingeki no Kyojin|anime_Shingeki no...|
|       jmp27|  anime|        Cowboy Bebop|  anime_Cowboy Bebop|
|kamehamehaha|  anime|Hunter x Hunter (...|anime_Hunter x Hu...|
|      kaz_77|  anime|Shouwa Genroku Ra...|anime_Shouwa Genr...|
+------------+-------+--------------------+--------------------+
only showing top 10 rows



(None, 80653)

In [29]:
manga.show(10), manga.count()

+---------------+-------+--------------------+--------------------+
|       username|section|                name|           type_name|
+---------------+-------+--------------------+--------------------+
|  itsTomokocchi|  manga|          Lucky☆Star|    manga_Lucky☆Star|
|        Ixilora|  manga|     Seirei Gensouki|manga_Seirei Gens...|
|       JapaBase|  manga|       Solo Leveling| manga_Solo Leveling|
|     Jasonifier|  manga|Ijiranaide, Nagat...|manga_Ijiranaide,...|
|         Jiet91|  manga|               Fuuka|         manga_Fuuka|
|   jonniedonnie|  manga|Kaguya-sama wa Ko...|manga_Kaguya-sama...|
|joyousawakening|  manga| Fullmetal Alchemist|manga_Fullmetal A...|
|       kaah_m_s|  manga|           Ten Count|     manga_Ten Count|
|       Katzurra|  manga|             Gintama|       manga_Gintama|
|        Kickyuz|  manga|5-toubun no Hanayome|manga_5-toubun no...|
+---------------+-------+--------------------+--------------------+
only showing top 10 rows



(None, 40141)

In [30]:
characters.show(10), characters.count()

+------------+----------+---------------+--------------------+
|    username|   section|           name|           type_name|
+------------+----------+---------------+--------------------+
|       IN1RA|characters|         Senkuu|   characters_Senkuu|
|   INERSIA03|characters|  Mihate, Hiura|characters_Mihate...|
| InfernaPuma|characters|  Walker, Allen|characters_Walker...|
|   Inrageous|characters|Oreki, Houtarou|characters_Oreki,...|
|    Irene002|characters|         Madara|   characters_Madara|
|        Iskk|characters|Kanbaru, Suruga|characters_Kanbar...|
|      Ivryxn|characters|Miyazono, Kaori|characters_Miyazo...|
|Jackmoose123|characters|Mashima, Taichi|characters_Mashim...|
|     JacobPJ|characters|  Oumae, Kumiko|characters_Oumae,...|
| jassycatx33|characters|   Honda, Tooru|characters_Honda,...|
+------------+----------+---------------+--------------------+
only showing top 10 rows



(None, 92132)

In [31]:
people.show(10), people.count()

+---------------+-------+--------------------+--------------------+
|       username|section|                name|           type_name|
+---------------+-------+--------------------+--------------------+
| jacklikesmanga| people|      Urasawa, Naoki|people_Urasawa, N...|
|Jamesbond000142| people|       Hikasa, Youko|people_Hikasa, Youko|
|         Jiet91| people|Matsuoka, Yoshitsugu|people_Matsuoka, ...|
|        Jinzaki| people|     Kamiya, Hiroshi|people_Kamiya, Hi...|
|        JRKNoff| people|     Shinkou, Shotou|people_Shinkou, S...|
|      Kamishoto| people|          Maeda, Jun|   people_Maeda, Jun|
|         Kevten| people|         Baker, Troy|  people_Baker, Troy|
|        Khukuri| people|    Hokazono, Masaya|people_Hokazono, ...|
|     lilcapalot| people|     Tomatsu, Haruka|people_Tomatsu, H...|
|lockemsockem123| people|   Sugiyama, Noriaki|people_Sugiyama, ...|
+---------------+-------+--------------------+--------------------+
only showing top 10 rows



(None, 52731)

#### Initial Run

In [None]:
"""
Support
The support of a set of items is the proportion of transactions in the dataset
that contain that set of items. In other words, it measures how frequently the
items appear together. A high support value indicates that the itemset is popular
or frequent in the dataset.

Confidence
Confidence measures the strength of the association between two items.
It is the proportion of transactions containing the antecedent (the first item)
that also contain the consequent (the second item). A high confidence value
indicates that the two items are strongly associated.

Lift
Lift measures the strength of the association between two items relative to the
expected frequency of their co-occurrence. A lift value greater than 1 indicates
a positive association (the items are more likely to occur together than expected),
while a value less than 1 indicates a negative association (the items are less
likely to occur together than expected). A lift value of 1 indicates no association.

In general, higher support and confidence values indicate stronger relationships
between items, while higher lift values indicate more significant associations.

The number of data points in the dataset can affect the value of support and confidence,
as more data points can result in smaller support and confidence values. However,
lift is less sensitive to the number of data points and is more dependent on the
strength of the association between the items. In general, as the number of data
points increases, the support and confidence values may decrease, while the lift
value may stay the same or increase if the association between the items is strong.
"""

##### All Sections

In [18]:
favs_data = (df
                .groupBy("username")
                .agg(F.collect_list("type_name"))
                .sort('username')
            )

In [20]:
fpGrowth = FPGrowth(itemsCol="collect_list(type_name)",
                    minSupport=0.001,
                    minConfidence=0.5)

model = fpGrowth.fit(favs_data)

In [None]:
"""
From FP-Growth's docs
https://spark.apache.org/docs/latest/ml-frequent-pattern-mining.html

The FPGrowthModel provides:

- freqItemsets:
frequent itemsets in the format of a DataFrame with the following columns:
    - items: array: A given itemset.
    - freq: long: A count of how many times this itemset was seen, given the configured model parameters.

- associationRules:
association rules generated with confidence above minConfidence,
in the format of a DataFrame with the following columns:
    - antecedent: array: The itemset that is the hypothesis of the association rule.
    - consequent: array: An itemset that always contains a single element representing
                         the conclusion of the association rule.
    - confidence: double: Refer to minConfidence above for a definition of confidence.
    - lift: double: A measure of how well the antecedent predicts the consequent, calculated as:
                    support(antecedent U consequent) / (support(antecedent) x support(consequent))
    - support: double: Refer to minSupport above for a definition of support.

- transform: For each transaction in itemsCol, the transform method will compare
its items against the antecedents of each association rule. If the record contains
all the antecedents of a specific association rule, the rule will be considered as
applicable and its consequents will be added to the prediction result. The transform
method will summarize the consequents from all the applicable rules as prediction.
The prediction column has the same data type as itemsCol and does not contain existing
items in the itemsCol.
"""

In [21]:
# Display frequent itemsets.
items = model.freqItemsets
items.count()

59635

In [22]:
items.show(20, False)

+--------------------------------------------------------------------------------------------+----+
|items                                                                                       |freq|
+--------------------------------------------------------------------------------------------+----+
|[people_Kusanagi, Mizuho]                                                                   |14  |
|[anime_Sukitte Ii na yo.]                                                                   |17  |
|[people_Fairouz, Ai]                                                                        |17  |
|[people_Miyamoto, Shigeru]                                                                  |20  |
|[characters_Nakano, Miku]                                                                   |292 |
|[characters_Nakano, Miku, anime_Kono Subarashii Sekai ni Shukufuku wo!]                     |16  |
|[characters_Nakano, Miku, anime_Shingeki no Kyojin: The Final Season]                       |15  |


In [23]:
# Display generated association rules.
rules = model.associationRules
rules.count()

30797

In [24]:
rules.show(20, False)

+-------------------------------------------------------------------------------------------------------------------+----------------------------------------+------------------+------------------+---------------------+
|antecedent                                                                                                         |consequent                              |confidence        |lift              |support              |
+-------------------------------------------------------------------------------------------------------------------+----------------------------------------+------------------+------------------+---------------------+
|[characters_Yeager, Eren, anime_Shingeki no Kyojin Season 3 Part 2, anime_Code Geass: Hangyaku no Lelouch R2]      |[characters_Lamperouge, Lelouch]        |0.9               |8.341114058355437 |0.0012879221522610188|
|[characters_Nishimiya, Shouko, manga_Koe no Katachi]                                                               |[anime_

In [25]:
# transform examines the input items against all the association rules and summarize the consequents as prediction
transformed = model.transform(favs_data)
transformed.count()

13976

In [26]:
transformed.show(20, False)

+----------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

##### Utility Function

In [14]:
def run_fpGrowth(data, n=15, min_confidence=.5):
    '''data must either be anime, manga, characters, or people df'''
    favs = data.groupBy("username") \
                    .agg(F.collect_list("name")) \
                    .sort('username')

    min_support = n / data.count()

    fpGrowth = FPGrowth(itemsCol="collect_list(name)",
                    minSupport=min_support,
                    minConfidence=min_confidence)

    model = fpGrowth.fit(favs)

    transformed = model.transform(favs)

    return model.freqItemsets, model.associationRules, transformed


def print_result(sets, key=None, n_rows=20):
    print(f'Total: {sets.count()}\n')

    if key:
        sets.sort(F.col(key).desc()).show(n_rows, truncate=False)
    else:
        sets.show(n_rows, truncate=False)

##### Try Run Groups

In [None]:
items, rules, transformed = run_fpGrowth(anime)

In [None]:
print_result(items, 'freq')

Total: 51671

+---------------------------------------+----+
|items                                  |freq|
+---------------------------------------+----+
|[Steins;Gate]                          |1694|
|[Fullmetal Alchemist: Brotherhood]     |1681|
|[Hunter x Hunter (2011)]               |1633|
|[One Piece]                            |1523|
|[Death Note]                           |1230|
|[Shingeki no Kyojin]                   |1125|
|[Code Geass: Hangyaku no Lelouch]      |893 |
|[Neon Genesis Evangelion]              |858 |
|[Koe no Katachi]                       |797 |
|[Clannad: After Story]                 |783 |
|[Gintama]                              |763 |
|[Code Geass: Hangyaku no Lelouch R2]   |741 |
|[Naruto: Shippuuden]                   |729 |
|[Shigatsu wa Kimi no Uso]              |718 |
|[Kimetsu no Yaiba]                     |716 |
|[Tengen Toppa Gurren Lagann]           |691 |
|[Kimi no Na wa.]                       |676 |
|[Re:Zero kara Hajimeru Isekai Seikatsu]|619 |

In [None]:
print_result(rules, 'confidence', 30)

Total: 19737

+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------+----------+------------------+---------------------+
|antecedent                                                                                                                                                                               |consequent                                             |confidence|lift              |support              |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------+----------+------------------+---------------------+
|[Shingeki no Kyojin Season 3, Ansatsu Kyoushitsu 2nd Season, Ansatsu Kyoushitsu]                 

In [None]:
print_result(transformed)

Total: 13976

+----------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|username        |collect_list(name)                                                                                                                                                                                  

#### Dump

In [15]:
def dump(sets, name):
    tmp = sets.select("*").toPandas()
    tmp.to_excel(f'{name}.xlsx', index=False)

In [16]:
SAVE_PATH = 'favs-rules'
!mkdir $SAVE_PATH

##### Save Joined Favs

In [37]:
for result, name in [(items, "items"), (rules, "rules"), (transformed, 'transformed')]:
    fn = f'all__{name}'
    fpath = os.path.join(SAVE_PATH, fn)
    
    print(f'Got {result.count()} of {fn}')
    print(f'Saving {fpath}')
    
    dump(result, fpath)

Got 59635 of all__items
Saving favs-rules/all__items
Got 30797 of all__rules
Saving favs-rules/all__rules
Got 13976 of all__transformed
Saving favs-rules/all__transformed


##### Save Groups

In [19]:
from collections import namedtuple

datasets = [
            (anime, 'anime', 15), 
            (manga, 'manga', 15),
            (characters, 'characters', 20),
            (people, 'people', 15)
        ]

for data, section, n_similar_people in datasets:
    print(f'Processing {section}')

    items, rules, transformed = run_fpGrowth(data, n_similar_people)
    results = [
               (items, 'items'),
               (rules, 'rules'),
               (transformed, 'transformed')
            ]
    
    for result, name in results:
        fn = f'{section}__{name}'
        fpath = os.path.join(SAVE_PATH, fn)

        print(f'Got {result.count()} of {fn}')
        print(f'Saving {fpath}')
        
        dump(result, fpath)
    
    print("Done!")

Processing characters
Got 40169 of characters__items
Saving favs-rules/characters__items
Got 14333 of characters__rules
Saving favs-rules/characters__rules
Got 12011 of characters__transformed
Saving favs-rules/characters__transformed
Done!
Processing people
Got 42595 of people__items
Saving favs-rules/people__items
Got 36498 of people__rules
Saving favs-rules/people__rules
Got 8861 of people__transformed
Saving favs-rules/people__transformed
Done!
