# Comics Rx
## [A comic book recommendation system](https://github.com/MangrobanGit/comics_rx)
<img src="https://images.unsplash.com/photo-1514329926535-7f6dbfbfb114?ixlib=rb-1.2.1&ixid=eyJhcHBfaWQiOjEyMDd9&auto=format&fit=crop&w=2850&q=80" width="400" align='left'>

---

# Libraries

In [47]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
# %autoreload 1 #would be where you need to specify the files
# %aimport comic_recs

import pandas as pd # dataframes
import os
import pickle

# Data storage
from sqlalchemy import create_engine # SQL helper
#import psycopg2 as psql #PostgreSQL DBs

# import necessary libraries
import pyspark
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.types import (StructType, StructField, IntegerType
                               ,FloatType, LongType, StringType)
# from pyspark.sql.types import *

import pyspark.sql.functions as F
from pyspark.sql.functions import col, explode, lit, isnan, when, count, lower
from pyspark.ml.recommendation import ALS, ALSModel
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder, TrainValidationSplit
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.sql import DataFrame

import time
import itertools
from functools import reduce
import numpy as np

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [11]:
import sys

In [12]:
sys.path.append('..')

In [13]:
# Custom
import data_fcns as dfc
import keys  # Custom keys lib
import comic_recs as cr

In [14]:
# spark config
spark = SparkSession \
    .builder \
    .appName("movie recommendation") \
    .config("spark.driver.maxResultSize", "1g") \
    .config("spark.driver.memory", "1g") \
    .config("spark.executor.memory", "4g") \
    .config("spark.master", "local[*]") \
    .getOrCreate()
# get spark context
#sc = spark.sparkContext

## Import Data

We've previously set aside the dataset into a `json` file.

In [15]:
# We have previously created a version of the transactions table and filtered it down.
sold = spark.read.json('raw_data/als_input_filtered.json')

In [16]:
# Persist the data
sold.persist()

DataFrame[account_id: bigint, bought: bigint, comic_id: bigint]

# New Users

Let's develop an input process that uses titles, rather than pulling the specific person's info from the modeling data.

In [48]:
comics_df = spark.read.json('support_data/comics.json')
comics_df.persist()

DataFrame[comic_id: bigint, comic_title: string, img_url: string]

Get lay of land of `comics_df`

In [49]:
comics_df.show()

+--------+--------------------+--------------------+
|comic_id|         comic_title|             img_url|
+--------+--------------------+--------------------+
|      15|1 For $1 Action P...|https://comrx.s3-...|
|      17|1 For $1 Axe Cop ...|https://comrx.s3-...|
|      20|1 For $1 Conan th...|https://comrx.s3-...|
|      22|1 For $1 Mass Eff...|https://comrx.s3-...|
|      24|1 For $1 Star War...|https://comrx.s3-...|
|      27|1 For $1 Usagi Yo...|https://comrx.s3-...|
|      18|1 For 1 Baltimore...|https://comrx.s3-...|
|       2|100 Bullets Broth...|https://comrx.s3-...|
|       4|100 Penny Press S...|https://comrx.s3-...|
|       6|100 Penny Press T...|https://comrx.s3-...|
|       8|12 Reasons To Die...|https://comrx.s3-...|
|       9|    13 Coins (Other)|https://comrx.s3-...|
|      11|1602 Witch Hunter...|https://comrx.s3-...|
|      29|2021 Lost Childre...|https://comrx.s3-...|
|      31|23 Skidoo One Sho...|https://comrx.s3-...|
|      36|3 Floyds Alpha Ki...|https://comrx.s

In [50]:
comics_pandas = comics_df.toPandas()

In [19]:
sample_buys = ['Spider-Man', 'Paper Girls']

In [28]:
sample_buys = ['Sweet Tooth', 'Paper Girls']

### Get matching 'official titles'

In [25]:
def get_comic_ids_for_user(comics_df, read_comics_list):
    """
    Given spark DF of existing comics and list of comics to 'match'
    Return list of like comics from the DF
    """
    # Initialize
    similar_comics_list = []
    
    for comic in read_comics_list:
        # print(comic)
        # Search for comic in df
        matched_comics = (comics_df.filter(lower(comics_df['comic_title'])
                                 .contains(str.lower(comic)))
                                 .select('comic_id').rdd
                                 .flatMap(lambda x: x).collect()
                         )
        similar_comics_list.extend(matched_comics)
        
    return similar_comics_list


#### Testing

In [29]:
curr_comic_ids = get_comic_ids_for_user(comics_df, sample_buys)

In [30]:
curr_comic_ids

[6105, 3235, 4701]

### Add user

In [31]:
def create_acct_id(model_data):
    """
    Given model data, create new account id that is just the max existing +1
    """
    # Get max account id
    max_acct_id = model_data.agg({'account_id':'max'}).collect()[0][0]

    # New Account id
    new_acct_id = max_acct_id + 1
    
    return new_acct_id

In [32]:
def add_new_user(model_data, new_comic_ids, new_acct_id):
    """
    Given existing model data and the comic ids for new user,
    add rows for the new user to model data
    """
#     # Get max account id
#     max_acct_id = model_data.agg({'account_id':'max'}).collect()[0][0]

#     # New Account id
#     new_acct_id = max_acct_id + 1
    
    # Create spark Df of new rows
    new_rows = spark.createDataFrame([
                (new_acct_id, 1, comic_id) for comic_id in new_comic_ids])

    # Append to existing model data
    model_data_new = model_data.union(new_rows)
    
    return model_data_new

#### Testing

In [33]:
new_id = create_acct_id(sold)

In [34]:
test_sold = add_new_user(sold, curr_comic_ids, new_id)

In [35]:
test_sold.count()

61874

In [36]:
sold.count()

61871

### Train On New Data

In [37]:
# Create dictionary of candidate parameters
current_params = {'maxIter': 10
                  ,'rank': 5
                  ,'regParam': 0.1
                  ,'alpha': 40
                  ,'seed': 41916
                 }

In [38]:
def train_als(model_data, current_params):
    """
    Given training data and set of parameters
    Returns trained ALS model
    """
    # Create ALS instance for cv with our chosen parametrs
    als_train = ALS(maxIter=current_params.get('maxIter'),
              rank=current_params.get('rank'),
              userCol='account_id',
              itemCol='comic_id',
              ratingCol='bought',
              implicitPrefs=True,
              regParam=current_params.get('regParam'),
              alpha=current_params.get('alpha'),
              coldStartStrategy='nan', # we want to drop so can get through CV
              seed=41916)

    model_train = als_train.fit(model_data)
    return model_train

In [39]:
als_model = train_als(test_sold, current_params)

### Create DF of new user's unbought comics

In [40]:
def get_comics_to_rate(comics_df, training_comic_ids):
    """
    Given list of comic ids, 
    returns list of ids from master list that don't match
    """
    new_comic_ids = (comics_df.select('comic_id').distinct()
                      .filter(~col('comic_id').isin(curr_comic_ids))
                      .select('comic_id').rdd.flatMap(lambda x: x).collect()
                     )
    return new_comic_ids

In [52]:
def recommend_n_comics(top_n, new_comics_ids, account_id, als_model, comics_df):
    """
    Given a list of new comics (to the user) and requested number N
    Return list of N comics, ordered descending by recommendation score
    """

    # Create spark Df of new rows
    comics_to_predict = (spark.createDataFrame([
                        (account_id, 1, comic_id) for comic_id in new_comics_ids])
                        .select(col('_1').alias('account_id')
                        ,col('_2').alias('bought')
                        ,col('_3').alias('comic_id'))
                        )

    # Get predictions
    test_preds = als_model.transform(comics_to_predict)
    test_preds.persist()

    # Alias
    cdf = comics_df.alias('cdf')
    tp = test_preds.alias('tp')

    # Query results
    results = (tp.join(cdf, tp.comic_id==cdf.comic_id)
                .filter(~isnan(col('prediction')))
                .orderBy('prediction', ascending=False)
                .select('comic_title', 'img_url')
                .limit(top_n)
              ).toPandas()

    return results

In [53]:
def make_comic_recommendations(read_comics_list, top_n, comics_df, train_data 
                               ,best_params):
    """
    Given a list of comic titles and request for N
    Return list of comics recommendations as a pandas dataframe
    """
    start_time = time.time()
    
    # Get best-matching comic IDs
    train_comic_ids = get_comic_ids_for_user(comics_df, read_comics_list)
        
    # Create new account number
    new_id = create_acct_id(train_data)
    
    # Add new account to training data
    train_data_new = add_new_user(train_data, train_comic_ids, new_id)
    train_data_new.persist()
    
    # Train new ALS model
    als_model = train_als(train_data_new, best_params)
    
    # Get list of comics to rate, exclude those already matched
    new_comics_ids = get_comics_to_rate(comics_df, train_comic_ids)

    # Get pandas df of top n recommended comics!
    top_n_comics_df = recommend_n_comics(top_n, new_comics_ids, new_id
                                        ,als_model
                                        ,comics_df
                                        )
    
    print ('Total Runtime: {:.2f} seconds'.format(time.time() - start_time))
    return top_n_comics_df

## BIG TEST!

In [43]:
my_list = ['Transformers', 'GI Joe', 'Y The Last Man', 'Saga', 'Avengers'
           ,'Paper Girls', 'Star Wars']

In [54]:
df = make_comic_recommendations(my_list
                               ,5
                               ,comics_df
                               ,sold
                               ,current_params)

Total Runtime: 9.06 seconds


In [55]:
df

Unnamed: 0,comic_title,img_url
0,Spider-Verse (Marvel),https://comrx.s3-us-west-2.amazonaws.com/cover...
1,Harley Quinn Valentines Day S (DC),https://comrx.s3-us-west-2.amazonaws.com/cover...
2,Convergence Harley Quinn (DC),https://comrx.s3-us-west-2.amazonaws.com/cover...
3,Harley Quinn & Power Girl (DC),https://comrx.s3-us-west-2.amazonaws.com/cover...
4,Spider-Island (Marvel),https://comrx.s3-us-west-2.amazonaws.com/cover...


# ALTERNATIVE 

Use Matrices, as inspired by John Naujoks, suggested by Miles Erickson

1. Get the item factors 

In [62]:
item_factors = als_model.itemFactors.toPandas()

In [63]:
item_factors.head()

Unnamed: 0,id,features
0,10,"[-0.04702767729759216, -0.13710705935955048, -..."
1,20,"[-0.06460945308208466, -0.07806924730539322, 0..."
2,30,"[-0.19240108132362366, -0.27372074127197266, 0..."
3,40,"[-0.12126117944717407, -0.01254610437899828, -..."
4,50,"[-0.012951415032148361, -0.1716042011976242, 0..."


In [64]:
item_factors.columns = ['item_id', 'features']

In [67]:
item_factors.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6416 entries, 0 to 6415
Data columns (total 2 columns):
item_id     6416 non-null int32
features    6416 non-null object
dtypes: int32(1), object(1)
memory usage: 75.3+ KB


In [85]:
item_factors.features[0]

[-0.04702767729759216,
 -0.13710705935955048,
 -0.18831919133663177,
 -0.23926596343517303,
 -0.12388940155506134]

2. Create a fake user 

In [76]:
# 2171 = Fables, 2637 = Gideon Falls
fake_user = [{'id': 2171, 'rating': 1}, {'id': 2637, 'rating': 1}] 
fake_user_df = pd.DataFrame(fake_user)
fake_user_df

Unnamed: 0,id,rating
0,2171,1
1,2637,1


3. Create item matrix

In [86]:
comic_ids = fake_user_df.id.tolist()
comics_mtx = np.zeros(shape=(len(comic_ids),5)) 

3. Create rating matrix

In [87]:
ratings = fake_user_df.rating.tolist()

In [88]:
ratings_mtx = np.array((ratings,)).T

In [89]:
ratings_mtx

array([[1],
       [1]])

In [90]:
for index, comic in enumerate(comic_ids):
    comics_mtx[index, :] = np.array(item_factors.loc[comic, 'features'])


In [91]:
comics_mtx

array([[ 0.3607977 ,  0.07123993,  0.23716298, -0.0791665 ,  0.50475931],
       [-0.09457219, -0.13997468, -0.14156191, -0.08010286, -0.07182016]])

In [92]:
ratings_mtx.shape, comics_mtx.shape

((2, 1), (2, 5))

In [93]:
# Least squares solution to get user features
fake_user_matrix = np.linalg.lstsq(comics_mtx, ratings_mtx, rcond=None)

# New users matrix!
fake_user_matrix = fake_user_matrix[0].reshape((5,))
fake_user_matrix.shape

(5,)

# GRAVEYARD

In [73]:
comics_df.show(2000)

+--------+--------------------+--------------------+
|comic_id|         comic_title|             img_url|
+--------+--------------------+--------------------+
|      15|1 For $1 Action P...|https://comrx.s3-...|
|      17|1 For $1 Axe Cop ...|https://comrx.s3-...|
|      20|1 For $1 Conan th...|https://comrx.s3-...|
|      22|1 For $1 Mass Eff...|https://comrx.s3-...|
|      24|1 For $1 Star War...|https://comrx.s3-...|
|      27|1 For $1 Usagi Yo...|https://comrx.s3-...|
|      18|1 For 1 Baltimore...|https://comrx.s3-...|
|       2|100 Bullets Broth...|https://comrx.s3-...|
|       4|100 Penny Press S...|https://comrx.s3-...|
|       6|100 Penny Press T...|https://comrx.s3-...|
|       8|12 Reasons To Die...|https://comrx.s3-...|
|       9|    13 Coins (Other)|https://comrx.s3-...|
|      11|1602 Witch Hunter...|https://comrx.s3-...|
|      29|2021 Lost Childre...|https://comrx.s3-...|
|      31|23 Skidoo One Sho...|https://comrx.s3-...|
|      36|3 Floyds Alpha Ki...|https://comrx.s

In [32]:
from pyspark.sql.functions import lower, upper

In [25]:
sample_comic = 'Sweet Tooth'

In [99]:
sample_comic = 'Paper Girls'

In [101]:
test = comics_df.filter(lower(comics_df['comic_title']).contains(str.lower(sample_comic))).select('comic_title')

In [102]:
test.show()

+--------------------+
|         comic_title|
+--------------------+
|Image Firsts Pape...|
| Paper Girls (Image)|
+--------------------+



In [109]:
test2 = test.withColumn('temp', col('comic_title'))

In [111]:
test2.show()

+--------------------+--------------------+
|         comic_title|                temp|
+--------------------+--------------------+
|Image Firsts Pape...|Image Firsts Pape...|
| Paper Girls (Image)| Paper Girls (Image)|
+--------------------+--------------------+



In [129]:
sold.show(2)

+----------+------+--------+
|account_id|bought|comic_id|
+----------+------+--------+
|      2247|     1|     995|
|       487|     1|    1102|
+----------+------+--------+
only showing top 2 rows



In [157]:
# Get max account id
max_acct_id = sold.agg({'account_id':'max'}).collect()[0][0]
max_acct_id

3074

In [158]:
new_acct_id = max_acct_id + 1

In [154]:
curr_comic_ids

[6105, 3235, 4701]

In [159]:
new_rows = [(new_acct_id, 1, comic_id) for comic_id in curr_comic_ids]

In [160]:
new_rows

[(3075, 1, 6105), (3075, 1, 3235), (3075, 1, 4701)]

In [161]:
sold_test = sold
sold_test.persist()


DataFrame[account_id: bigint, bought: bigint, comic_id: bigint]

In [164]:
sold_test.count()

61871

In [166]:
sold_new = sold_test.union(spark.createDataFrame(new_rows))

In [167]:
sold_new.count()

61874

In [None]:
newRow = spark.createDataFrame([(15,'Alk','Dhl')])
df = df.union(newRow)
df.show()

In [170]:
test_sold = sold

In [171]:
test_sold_2 = add_new_user(sold, curr_comic_ids)

In [172]:
print(test_sold.count())

61871


In [173]:
print(test_sold_2.count())

61874


In [56]:
supertest = [(3,4),(5,6)]

In [57]:
supertest

[(3, 4), (5, 6)]

In [58]:
superlist = dict(supertest)

In [59]:
superlist

{3: 4, 5: 6}

In [60]:
comm = [('Batman','www.batman'),('XMen','www.s3.commen')]

In [61]:
dict(comm)

{'Batman': 'www.batman', 'XMen': 'www.s3.commen'}