# Comics Rx
## [A comic book recommendation system](https://github.com/MangrobanGit/comics_rx)
<img src="https://images.unsplash.com/photo-1514329926535-7f6dbfbfb114?ixlib=rb-1.2.1&ixid=eyJhcHBfaWQiOjEyMDd9&auto=format&fit=crop&w=2850&q=80" width="400" align='left'>

---

# Libraries

In [2]:
%matplotlib inline
%load_ext autoreload
# %autoreload 1 #would be where you need to specify the files
# %aimport comic_recs

import pandas as pd # dataframes
import os
import pickle

# Data storage
from sqlalchemy import create_engine # SQL helper
#import psycopg2 as psql #PostgreSQL DBs

# import necessary libraries
import pyspark
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator
# from pyspark.sql.types import (StructType, StructField, IntegerType
#                                ,FloatType, LongType, StringType)
from pyspark.sql.types import *

import pyspark.sql.functions as F
from pyspark.sql.functions import col, explode, lit, isnan, when, count
from pyspark.ml.recommendation import ALS, ALSModel
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder, TrainValidationSplit
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.sql import DataFrame

# Custom
import lib.data_fcns as dfc
import lib.keys  # Custom keys lib
import lib.comic_recs as cr

import time
import itertools
from functools import reduce
import numpy as np

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# spark config
spark = SparkSession \
    .builder \
    .appName("movie recommendation") \
    .config("spark.driver.maxResultSize", "1g") \
    .config("spark.driver.memory", "1g") \
    .config("spark.executor.memory", "16g") \
    .config("spark.master", "local[*]") \
    .getOrCreate()
# get spark context
#sc = spark.sparkContext

## Import Data

We've previously set aside the dataset into a `json` file.

In [4]:
# We have previously created a version of the transactions table and filtered it down.
sold = spark.read.json('raw_data/als_input_filtered.json')

In [5]:
# Persist the data
sold.persist()

DataFrame[account_id: bigint, bought: bigint, comic_id: bigint]

# New Users

Let's develop an input process that uses titles, rather than pulling the specific person's info from the modeling data.

In [5]:
comics_df = spark.read.json('support_data/comics.json')
comics_df.persist()

DataFrame[comic_id: bigint, comic_title: string]

In [6]:
sample_buys = ['Sweet Tooth', 'Paper Girls']

In [7]:
sample_buys = ['Spider-Man', 'Paper Girls']

### Get matching 'official titles'

In [8]:
def get_comic_ids_for_user(comics_df, read_comics_list):
    """
    Given spark DF of existing comics and list of comics to 'match'
    Return list of like comics from the DF
    """
    # Initialize
    similar_comics_list = []
    
    for comic in read_comics_list:
        # print(comic)
        # Search for comic in df
        matched_comics = (comics_df.filter(lower(comics_df['comic_title'])
                                 .contains(str.lower(comic)))
                                 .select('comic_id').rdd
                                 .flatMap(lambda x: x).collect()
                         )
        similar_comics_list.extend(matched_comics)
        
    return similar_comics_list


#### Testing

In [9]:
curr_comic_ids = get_comic_ids_for_user(comics_df, sample_buys)

NameError: name 'lower' is not defined

### Add user

In [210]:
def create_acct_id(model_data):
    """
    Given model data, create new account id that is just the max existing +1
    """
    # Get max account id
    max_acct_id = model_data.agg({'account_id':'max'}).collect()[0][0]

    # New Account id
    new_acct_id = max_acct_id + 1
    
    return new_acct_id

In [211]:
def add_new_user(model_data, new_comic_ids, new_acct_id):
    """
    Given existing model data and the comic ids for new user,
    add rows for the new user to model data
    """
#     # Get max account id
#     max_acct_id = model_data.agg({'account_id':'max'}).collect()[0][0]

#     # New Account id
#     new_acct_id = max_acct_id + 1
    
    # Create spark Df of new rows
    new_rows = spark.createDataFrame([
                (new_acct_id, 1, comic_id) for comic_id in new_comic_ids])

    # Append to existing model data
    model_data_new = model_data.union(new_rows)
    
    return model_data_new

#### Testing

In [297]:
new_id = create_acct_id(sold)

In [305]:
test_sold = add_new_user(sold, curr_comic_ids, new_id)

In [306]:
test_sold.count()

61945

In [307]:
sold.count()

61871

### Train On New Data

In [178]:
# Create dictionary of candidate parameters
current_params = {'maxIter': 20
                  ,'rank': 10
                  ,'regParam': 0.1
                  ,'alpha': 40
                  ,'seed': 41916
                 }

In [331]:
def train_als(model_data, current_params):
    """
    Given training data and set of parameters
    Returns trained ALS model
    """
    # Create ALS instance for cv with our chosen parametrs
    als_train = ALS(maxIter=current_params.get('maxIter'),
              rank=current_params.get('rank'),
              userCol='account_id',
              itemCol='comic_id',
              ratingCol='bought',
              implicitPrefs=True,
              regParam=current_params.get('regParam'),
              alpha=current_params.get('alpha'),
              coldStartStrategy='nan', # we want to drop so can get through CV
              seed=41916)

    model_train = als_train.fit(model_data)
    return model_train

In [330]:
als_model = train_als(test_sold, current_params)

### Create DF of new user's unbought comics

In [332]:
def get_comics_to_rate(comics_df, training_comic_ids):
    """
    Given list of comic ids, 
    returns list of ids from master list that don't match
    """
    new_comic_ids = (comics_df.select('comic_id').distinct()
                      .filter(~col('comic_id').isin(curr_comic_ids))
                      .select('comic_id').rdd.flatMap(lambda x: x).collect()
                     )
    return new_comic_ids

In [345]:
def recommend_n_comics(top_n, new_comics_ids, account_id, als_model, comics_df):
    """
    Given a list of new comics (to the user) and requested number N
    Return list of N comics, ordered descending by recommendation score
    """

    # Create spark Df of new rows
    comics_to_predict = (spark.createDataFrame([
                        (account_id, 1, comic_id) for comic_id in new_comics_ids])
                        .select(col('_1').alias('account_id')
                        ,col('_2').alias('bought')
                        ,col('_3').alias('comic_id'))
                        )

    # Get predictions
    test_preds = als_model.transform(comics_to_predict)
    test_preds.persist()

    # Alias
    cdf = comics_df.alias('cdf')
    tp = test_preds.alias('tp')

    # Query results
    results = (tp.join(cdf, tp.comic_id==cdf.comic_id)
                .filter(~isnan(col('prediction')))
                .orderBy('prediction', ascending=False)
                .select('comic_title')
                .limit(top_n)
              ).toPandas()

    return results

In [360]:
def make_comic_recommendations(read_comics_list, top_n, comics_df, train_data 
                               ,best_params):
    """
    Given a list of comic titles and request for N
    Return list of comics recommendations as a pandas dataframe
    """
    start_time = time.time()
    
    # Get best-matching comic IDs
    train_comic_ids = get_comic_ids_for_user(comics_df, read_comics_list)
        
    # Create new account number
    new_id = create_acct_id(train_data)
    
    # Add new account to training data
    train_data_new = add_new_user(train_data, train_comic_ids, new_id)
    train_data_new.persist()
    
    # Train new ALS model
    als_model = train_als(train_data_new, best_params)
    
    # Get list of comics to rate, exclude those already matched
    new_comics_ids = get_comics_to_rate(comics_df, train_comic_ids)

    # Get pandas df of top n recommended comics!
    top_n_comics_df = recommend_n_comics(top_n, new_comics_ids, new_id
                                        ,als_model
                                        ,comics_df
                                        )
    
    print ('Total Runtime: {:.2f} seconds'.format(time.time() - start_time))
    return top_n_comics_df

## BIG TEST!

In [364]:
my_list = ['Transformers', 'GI Joe', 'Y The Last Man', 'Saga', 'Avengers'
           ,'Paper Girls', 'Star Wars']

In [365]:
df = make_comic_recommendations(my_list
                               ,20
                               ,comics_df
                               ,sold
                               ,current_params)

Total Runtime: 9.53 seconds


In [366]:
df

Unnamed: 0,comic_title
0,Carnage (Marvel)
1,Deadpool Annual (Marvel)
2,Hulk (Marvel)
3,Thunderbolts (Marvel)
4,Spongebob Comics (Other)
5,Astonishing X-Men Annual (Marvel)
6,Nova (Marvel)
7,Origin Ii (Marvel)
8,Shield (Marvel)
9,Fear Itself Deadpool (Marvel)


# GRAVEYARD

In [16]:
comics_df.show(2)

+--------+--------------------+
|comic_id|         comic_title|
+--------+--------------------+
|       1|0Secret Wars (Mar...|
|       2|100 Bullets Broth...|
+--------+--------------------+
only showing top 2 rows



In [32]:
from pyspark.sql.functions import lower, upper

In [25]:
sample_comic = 'Sweet Tooth'

In [99]:
sample_comic = 'Paper Girls'

In [101]:
test = comics_df.filter(lower(comics_df['comic_title']).contains(str.lower(sample_comic))).select('comic_title')

In [102]:
test.show()

+--------------------+
|         comic_title|
+--------------------+
|Image Firsts Pape...|
| Paper Girls (Image)|
+--------------------+



In [109]:
test2 = test.withColumn('temp', col('comic_title'))

In [111]:
test2.show()

+--------------------+--------------------+
|         comic_title|                temp|
+--------------------+--------------------+
|Image Firsts Pape...|Image Firsts Pape...|
| Paper Girls (Image)| Paper Girls (Image)|
+--------------------+--------------------+



In [129]:
sold.show(2)

+----------+------+--------+
|account_id|bought|comic_id|
+----------+------+--------+
|      2247|     1|     995|
|       487|     1|    1102|
+----------+------+--------+
only showing top 2 rows



In [157]:
# Get max account id
max_acct_id = sold.agg({'account_id':'max'}).collect()[0][0]
max_acct_id

3074

In [158]:
new_acct_id = max_acct_id + 1

In [154]:
curr_comic_ids

[6105, 3235, 4701]

In [159]:
new_rows = [(new_acct_id, 1, comic_id) for comic_id in curr_comic_ids]

In [160]:
new_rows

[(3075, 1, 6105), (3075, 1, 3235), (3075, 1, 4701)]

In [161]:
sold_test = sold
sold_test.persist()


DataFrame[account_id: bigint, bought: bigint, comic_id: bigint]

In [164]:
sold_test.count()

61871

In [166]:
sold_new = sold_test.union(spark.createDataFrame(new_rows))

In [167]:
sold_new.count()

61874

In [None]:
newRow = spark.createDataFrame([(15,'Alk','Dhl')])
df = df.union(newRow)
df.show()

In [170]:
test_sold = sold

In [171]:
test_sold_2 = add_new_user(sold, curr_comic_ids)

In [172]:
print(test_sold.count())

61871


In [173]:
print(test_sold_2.count())

61874
