# Comics Rx
## [A comic book recommendation system](https://github.com/MangrobanGit/comics_rx)
<img src="https://images.unsplash.com/photo-1514329926535-7f6dbfbfb114?ixlib=rb-1.2.1&ixid=eyJhcHBfaWQiOjEyMDd9&auto=format&fit=crop&w=2850&q=80" width="400" align='left'>

---

# Libraries

In [64]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
# %autoreload 1 #would be where you need to specify the files
# %aimport comic_recs

import pandas as pd # dataframes
import os
import pickle

# Data storage
from sqlalchemy import create_engine # SQL helper
#import psycopg2 as psql #PostgreSQL DBs

# import necessary libraries
import pyspark
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator
# from pyspark.sql.types import (StructType, StructField, IntegerType
#                                ,FloatType, LongType, StringType)
from pyspark.sql.types import *

import pyspark.sql.functions as F
from pyspark.sql.functions import col, explode, lit, isnan, when, count, lower
from pyspark.ml.recommendation import ALS, ALSModel
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder, TrainValidationSplit
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.sql import DataFrame


import time
import itertools
from functools import reduce
import numpy as np

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [65]:
import sys

In [66]:
sys.path.append('..')

In [67]:
# Custom
import data_fcns as dfc
import keys  # Custom keys lib
import comic_recs as cr

In [68]:
spark = pyspark.sql.SparkSession.builder.master("local[*]").getOrCreate()

In [69]:
# spark config
spark = SparkSession \
    .builder \
    .appName("movie recommendation") \
    .config("spark.driver.maxResultSize", "1g") \
    .config("spark.driver.memory", "1g") \
    .config("spark.executor.memory", "4g") \
    .config("spark.master", "local[*]") \
    .getOrCreate()
# get spark context
#sc = spark.sparkContext

## Import Data

We've previously set aside the dataset into a `json` file.

In [70]:
# We have previously created a version of the transactions table and filtered it down.
sold = spark.read.json('raw_data/als_input_filtered.json')

In [71]:
# Persist the data
sold.persist()

DataFrame[account_id: bigint, bought: bigint, comic_id: bigint]

# New Users

Let's develop an input process that uses titles, rather than pulling the specific person's info from the modeling data.

In [72]:
comics_df = spark.read.json('support_data/comics.json')
comics_df.persist()

DataFrame[comic_id: bigint, comic_title: string, img_url: string]

In [7]:
sample_buys = ['Sweet Tooth', 'Paper Girls']

### Get matching 'official titles'

In [74]:
def get_comic_ids_for_user(comics_df, read_comics_list):
    """
    Given spark DF of existing comics and list of comics to 'match'
    Return list of like comics from the DF
    """
    # Initialize
    similar_comics_list = []
    
    for comic in read_comics_list:
        # print(comic)
        # Search for comic in df
        matched_comics = (comics_df.filter(lower(comics_df['comic_title'])
                                 .contains(str.lower(comic)))
                                 .select('comic_id').rdd
                                 .flatMap(lambda x: x).collect()
                         )
        similar_comics_list.extend(matched_comics)
        
    return similar_comics_list


#### Testing

In [75]:
curr_comic_ids = get_comic_ids_for_user(comics_df, sample_buys)

### Add user

In [76]:
def create_acct_id(model_data):
    """
    Given model data, create new account id that is just the max existing +1
    """
    # Get max account id
    max_acct_id = model_data.agg({'account_id':'max'}).collect()[0][0]

    # New Account id
    new_acct_id = max_acct_id + 1
    
    return new_acct_id

In [77]:
def add_new_user(model_data, new_comic_ids, new_acct_id):
    """
    Given existing model data and the comic ids for new user,
    add rows for the new user to model data
    """
#     # Get max account id
#     max_acct_id = model_data.agg({'account_id':'max'}).collect()[0][0]

#     # New Account id
#     new_acct_id = max_acct_id + 1
    
    # Create spark Df of new rows
    new_rows = spark.createDataFrame([
                (new_acct_id, 1, comic_id) for comic_id in new_comic_ids])

    # Append to existing model data
    model_data_new = model_data.union(new_rows)
    
    return model_data_new

#### Testing

In [78]:
new_id = create_acct_id(sold)

In [79]:
test_sold = add_new_user(sold, curr_comic_ids, new_id)

In [80]:
test_sold.count()

61874

In [81]:
sold.count()

61871

### Train On New Data

In [82]:
# Create dictionary of candidate parameters
current_params = {'maxIter': 10
                  ,'rank': 5
                  ,'regParam': 0.1
                  ,'alpha': 40
                  ,'seed': 41916
                 }

In [83]:
def train_als(model_data, current_params):
    """
    Given training data and set of parameters
    Returns trained ALS model
    """
    # Create ALS instance for cv with our chosen parametrs
    als_train = ALS(maxIter=current_params.get('maxIter'),
              rank=current_params.get('rank'),
              userCol='account_id',
              itemCol='comic_id',
              ratingCol='bought',
              implicitPrefs=True,
              regParam=current_params.get('regParam'),
              alpha=current_params.get('alpha'),
              coldStartStrategy='nan', # we want to drop so can get through CV
              seed=41916)

    model_train = als_train.fit(model_data)
    return model_train

In [84]:
als_model = train_als(test_sold, current_params)

### Create DF of new user's unbought comics

In [24]:
def get_comics_to_rate(comics_df, training_comic_ids):
    """
    Given list of comic ids, 
    returns list of ids from master list that don't match
    """
    new_comic_ids = (comics_df.select('comic_id').distinct()
                      .filter(~col('comic_id').isin(curr_comic_ids))
                      .select('comic_id').rdd.flatMap(lambda x: x).collect()
                     )
    return new_comic_ids

In [25]:
def recommend_n_comics(top_n, new_comics_ids, account_id, als_model, comics_df):
    """
    Given a list of new comics (to the user) and requested number N
    Return list of N comics, ordered descending by recommendation score
    """

    # Create spark Df of new rows
    comics_to_predict = (spark.createDataFrame([
                        (account_id, 1, comic_id) for comic_id in new_comics_ids])
                        .select(col('_1').alias('account_id')
                        ,col('_2').alias('bought')
                        ,col('_3').alias('comic_id'))
                        )

    # Get predictions
    test_preds = als_model.transform(comics_to_predict)
    test_preds.persist()

    # Alias
    cdf = comics_df.alias('cdf')
    tp = test_preds.alias('tp')

    # Query results
    results = (tp.join(cdf, tp.comic_id==cdf.comic_id)
                .filter(~isnan(col('prediction')))
                .orderBy('prediction', ascending=False)
                .select('comic_title')
                .limit(top_n)
              ).toPandas()

    return results

In [26]:
def make_comic_recommendations(read_comics_list, top_n, comics_df, train_data 
                               ,best_params):
    """
    Given a list of comic titles and request for N
    Return list of comics recommendations as a pandas dataframe
    """
    start_time = time.time()
    
    # Get best-matching comic IDs
    train_comic_ids = get_comic_ids_for_user(comics_df, read_comics_list)
        
    # Create new account number
    new_id = create_acct_id(train_data)
    
    # Add new account to training data
    train_data_new = add_new_user(train_data, train_comic_ids, new_id)
    train_data_new.persist()
    
    # Train new ALS model
    als_model = train_als(train_data_new, best_params)
    
    # Get list of comics to rate, exclude those already matched
    new_comics_ids = get_comics_to_rate(comics_df, train_comic_ids)

    # Get pandas df of top n recommended comics!
    top_n_comics_df = recommend_n_comics(top_n, new_comics_ids, new_id
                                        ,als_model
                                        ,comics_df
                                        )
    
    print ('Total Runtime: {:.2f} seconds'.format(time.time() - start_time))
    return top_n_comics_df

## BIG TEST!

In [27]:
my_list = ['Transformers', 'GI Joe', 'Y The Last Man', 'Saga', 'Avengers'
           ,'Paper Girls', 'Star Wars']

In [28]:
df = make_comic_recommendations(my_list
                               ,20
                               ,comics_df
                               ,sold
                               ,current_params)

Total Runtime: 11.18 seconds


In [29]:
df

Unnamed: 0,comic_title
0,Spider-Verse (Marvel)
1,Harley Quinn Valentines Day S (DC)
2,Convergence Harley Quinn (DC)
3,Harley Quinn & Power Girl (DC)
4,Spider-Island (Marvel)
5,Spider-Verse Team Up (Marvel)
6,Harley Quinn Futures End (DC)
7,Harley Quinn (DC)
8,Amazing Spider-Man Renew You (Marvel)
9,Amazing Spider-Man Special (Marvel)


# ALTERNATIVE 

Use Utility Matrix, as inspired by John Naujoks, suggested by Miles Erickson

1. Get the item factors 

In [366]:
# item_factors = als_model.itemFactors.toPandas()

item_factors = pd.read_pickle('support_data/item_factors.pkl')

In [367]:
item_factors.head()

Unnamed: 0,id,features
0,10,"[-0.7526867389678955, -0.21263617277145386, -1..."
1,20,"[-0.3515812158584595, 0.4757572114467621, -1.2..."
2,30,"[0.1806577742099762, -0.48153993487358093, -0...."
3,40,"[-0.06164746731519699, -0.23286470770835876, -..."
4,50,"[-0.4028661251068115, -0.3713889420032501, -1...."


In [368]:
item_factors.iloc[2171]

id                                                       4413
features    [0.3212127089500427, 0.11431856453418732, -1.1...
Name: 2171, dtype: object

In [369]:
item_factors.columns = ['item_id', 'features']

In [370]:
item_factors.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6028 entries, 0 to 6027
Data columns (total 2 columns):
item_id     6028 non-null int32
features    6028 non-null object
dtypes: int32(1), object(1)
memory usage: 70.7+ KB


In [371]:
item_factors.features[0]

[-0.7526867389678955,
 -0.21263617277145386,
 -1.1569229364395142,
 0.19937847554683685,
 0.16002951562404633]

2. Create a fake user 

In [372]:
# 2171 = Fables, 2637 = Gideon Falls
fake_user = [{'id': 2171, 'rating': 1}, {'id': 2637, 'rating': 1}] 
fake_user_df = pd.DataFrame(fake_user)
fake_user_df

Unnamed: 0,id,rating
0,2171,1
1,2637,1


3. Create item matrix

In [373]:
comic_ids = fake_user_df.id.tolist()
comics_mtx = np.zeros(shape=(len(comic_ids),current_params['rank'])) 

In [374]:
comic_ids

[2171, 2637]

In [375]:
comics_mtx.shape

(2, 5)

In [376]:
comics_mtx

array([[0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.]])

In [377]:
item_factors.columns

Index(['item_id', 'features'], dtype='object')

In [378]:
item_factors.loc[item_factors['item_id']==2171].index[0]

793

In [379]:
for index, comic in enumerate(comic_ids):
    idx = item_factors.loc[item_factors['item_id']==comic].index[0]
    comics_mtx[index, :] = np.array(item_factors.loc[idx, 'features'])


In [380]:
comics_mtx

array([[ 0.49142984,  0.22184217, -0.78728408, -0.99056023, -0.36222401],
       [ 0.08937276, -0.12716685, -0.83390325, -0.92093158, -0.57430983]])

# _Make a function_

In [431]:
comics_factors = pd.read_pickle('support_data/comics_factors.pkl')

In [408]:
def create_user_item_matrix(comics_ids, comic_factors):
    """
    Given list of user's comic preferences
    and a pandas df with item (comic) factors
    create a item matrix for the user
    """
    # Get rank
    num_latent_factors = len(comic_factors.features.iloc[0])
    
    # Initialize matrix
    comics_mtx = np.zeros(shape=(len(comics_ids), num_latent_factors)) 

    for index, comic in enumerate(comic_ids):
        #print(comic)
        #com_idx = comics_factors.loc[comics_factors['cxomic_id']==comic].index.values[0]
        comics_mtx[index, :] = np.array(comic_factors.loc[comic, 'features'])
    
    return comics_mtx

In [264]:
cm = create_user_item_matrix(comics_ids=comic_ids, comics_factors=comics_factors)

In [265]:
cm

array([[ 0.49142984,  0.22184217, -0.78728408, -0.99056023, -0.36222401],
       [ 0.08937276, -0.12716685, -0.83390325, -0.92093158, -0.57430983]])

3. Create rating matrix

--- wm ---

In [410]:
def create_user_impl_rate_matrix(comic_ids, ratings_list=None):
    """
    Given item matrix
    create implicit ratings matrix
    """
    if ratings_list is None:
        n = len(comic_ids)
        imp_rat_mtx = np.ones((n,1), 'int')
    else:
        imp_rat_mtx = np.array((ratings_list,)).T
    
    return imp_rat_mtx
    

In [317]:
rm = create_user_implicit_ratings_matrix(comic_ids)

rm

array([[1],
       [1]])

--- wm ---

In [381]:
ratings = fake_user_df.rating.tolist()

In [382]:
ratings_mtx = np.array((ratings,)).T

In [383]:
ratings_mtx

array([[1],
       [1]])

In [384]:
ratings_mtx.shape, comics_mtx.shape

((2, 1), (2, 5))

--- wm ---

In [333]:
def create_user_utility_matrix(comics_matrix, user_rating_matrix):
    
    util_mtx = np.linalg.lstsq(comics_matrix, user_rating_matrix, rcond=None)

    # We just want the factors
    util_mtx = util_mtx[0].reshape((5,))

    return util_mtx

In [334]:
u = create_user_utility_matrix(cm, rm)

In [335]:
u

array([ 0.11542453, -0.0096107 , -0.43939538, -0.50558331, -0.27239854])

--- wm ---

In [385]:
# Least squares solution to get user features
fake_user_matrix = np.linalg.lstsq(comics_mtx, ratings_mtx, rcond=None)

# New users matrix!
fake_user_matrix = fake_user_matrix[0].reshape((current_params['rank'],))
fake_user_matrix.shape

(5,)

In [386]:
fake_user_matrix

array([ 0.11542453, -0.0096107 , -0.43939538, -0.50558331, -0.27239854])

### Check with known user

In [337]:
import sys
import os
import pandas as pd
import boto3

# Data storage
from sqlalchemy import create_engine # SQL helper
import psycopg2 as psql #PostgreSQL DBs

sys.path.append("..")

In [338]:
# Define path to secret
secret_path_aws = os.path.join(os.environ['HOME'], '.secret', 
                           'aws_ps_flatiron.json')
secret_path_aws

aws_keys = keys.get_keys(secret_path_aws)
user = aws_keys['user']
ps = aws_keys['password']
host = aws_keys['host']
db = aws_keys['db_name']

aws_ps_engine = ('postgresql://' + user + ':' + ps + '@' + host + '/' + db)

# Setup PSQL connection
conn = psql.connect(
    database=db,
    user=user,
    password=ps,
    host=host,
    port='5432'
)

In [339]:
# Instantiate cursor
cur = conn.cursor()

In [340]:
query = """
    select 
        account_num
        ,c.comic_title
        ,c.comic_id
    from comic_trans ct inner join comics c on ct.comic_title = 
        c.comic_title
    group by
        1,2,3
"""

In [341]:
# Execute the query
cur.execute(query)

In [342]:
conn.commit()

In [343]:
# Check results
temp_df = pd.DataFrame(cur.fetchall())
temp_df.columns = [col.name for col in cur.description]

In [344]:
temp_df.head()

Unnamed: 0,account_num,comic_title,comic_id
0,2,All New X-Men (Marvel),198
1,2,Amazing Spider-Man Annual (Marvel),223
2,2,Amazing Spider-Man (Marvel),224
3,2,A Plus X (Marvel),312
4,2,Astounding Wolf-Man (Image),392


In [345]:
z = temp_df.groupby(by='account_num').comic_id.count().reset_index()

In [346]:
z.loc[z.comic_id==3]

Unnamed: 0,account_num,comic_id
112,00120,3
179,00195,3
191,00209,3
210,00229,3
227,00248,3
297,00335,3
314,00357,3
316,00359,3
336,00405,3
337,00406,3


Let's pick on account `00120` 

In [347]:
temp_df.loc[temp_df['account_num']=='00120']

Unnamed: 0,account_num,comic_title,comic_id
12642,120,Citizen Rex (Dark Horse),1241
12643,120,Donald Duck and Friends SC VO (Boom),1952
12644,120,Powers (Marvel),4828


In [348]:
powers = np.array(item_factors.loc[item_factors['item_id']==4828,'features'])
powers.shape

(1,)

In [387]:
powers[0]

[0.06849418580532074,
 0.0012715712655335665,
 -0.26954200863838196,
 -0.22823674976825714,
 0.532203733921051]

In [388]:
score = np.dot(fake_user_matrix,powers[0])

In [389]:
score

0.09675037650860804

--- wm ---

In [390]:
sc = np.dot(u,powers[0])

In [391]:
sc

0.09675037650860804

--- wm ---

### Test New Preds

In [90]:
item_factors['new_user_pred'] = item_factors['features'].apply(lambda x :np.dot(x, fake_user_matrix))

In [96]:
top_5 = item_factors.sort_values(by=['new_user_pred'], ascending=False).head(5)

In [97]:
top_5

Unnamed: 0,item_id,features,new_user_pred
4327,5196,"[-0.052176252007484436, -0.18112696707248688, ...",5.543218
1064,4701,"[-0.02034643664956093, -0.4128129482269287, -0...",5.255344
1234,6681,"[-0.19204556941986084, -0.5186308026313782, -0...",5.221921
1044,4441,"[0.1053212359547615, -0.45340803265571594, -0....",5.038157
1799,5882,"[-0.2683447599411011, -0.4314334988594055, -0....",5.003135


In [98]:
top_5_list = top_5.item_id.tolist()

In [100]:
flag = temp_df['comic_id'].isin(top_5_list)

In [103]:
temp_df[flag].groupby(by=['comic_title','comic_id']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,account_num
comic_title,comic_id,Unnamed: 2_level_1
Neil Gaiman American Gods Sha (Dark Horse),4441,72
Paper Girls (Image),4701,119
Saga (Image),5196,287
Star Wars (Marvel),5882,169
Unbeatable Squirrel Girl (Marvel),6681,90


Smells ok. 
##### Thanks John!

> --- wm --- start

In [411]:
def create_user_util_matrix(comics_matrix, user_rating_matrix):
    
    util_mtx = np.linalg.lstsq(comics_matrix, user_rating_matrix, rcond=None)

    # We just want the factors
    util_mtx = util_mtx[0].reshape((5,))

    return util_mtx

In [393]:
comics_factors.head(3)

Unnamed: 0_level_0,features,comic_title,img_url
comic_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10,"[-0.7526867389678955, -0.21263617277145386, -1...",13th Artifact One Sho (Topcow),https://comrx.s3-us-west-2.amazonaws.com/cover...
20,"[-0.3515812158584595, 0.4757572114467621, -1.2...",1 For $1 Conan the Barbarian (Dark Horse),https://comrx.s3-us-west-2.amazonaws.com/cover...
30,"[0.1806577742099762, -0.48153993487358093, -0....",21st Century Tank Girl (Other),https://comrx.s3-us-west-2.amazonaws.com/cover...


In [394]:
comics_factors['this_user_pred'] = comics_factors['features'].apply(lambda x: np.dot(x, u))

In [397]:
t5 = comics_factors.sort_values(by=['this_user_pred'], ascending=False).head(5)

In [398]:
t5

Unnamed: 0_level_0,features,comic_title,img_url,this_user_pred
comic_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1214,"[0.33106639981269836, 0.16478432714939117, -0....",Chew (Image),https://comrx.s3-us-west-2.amazonaws.com/cover...,1.178824
5196,"[0.34831228852272034, 0.11370803415775299, -0....",Saga (Image),https://comrx.s3-us-west-2.amazonaws.com/cover...,1.163962
552,"[0.3538980484008789, 0.16178269684314728, -0.8...",Batgirl (DC),https://comrx.s3-us-west-2.amazonaws.com/cover...,1.131813
610,"[0.35470110177993774, 0.15020547807216644, -0....",Batman (DC),https://comrx.s3-us-west-2.amazonaws.com/cover...,1.130366
2041,"[0.325575053691864, 0.12070709466934204, -0.81...",East of West (Image),https://comrx.s3-us-west-2.amazonaws.com/cover...,1.125175


> --- wm --- end

### Put it all together!

In [433]:
def make_n_comic_recommendations(comics, comic_factors, top_n):
    # Create item matrix
    comic_matrix = create_user_item_matrix(comics_ids=comics,
                                           comic_factors=comic_factors
                                          )
    
    # Create user matrix
    user_matrix = create_user_impl_rate_matrix(comic_ids=comics)
    
    # Create utility matrix
    utility_matrix = create_user_util_matrix(comic_matrix, user_matrix)
    
    # Update comic_factors dataframe for this user
    cf = comic_factors.copy()
    u = utility_matrix
    cf['pred'] = cf['features'].apply(lambda x: np.dot(x, u))

    top_n_df = cf.sort_values(by=['pred'], ascending=False).head(top_n)
    
    return top_n_df

In [441]:
new_comics = [20,144]

In [442]:
deef = make_n_comic_recommendations(new_comics, comics_factors, 10)

In [443]:
deef

Unnamed: 0_level_0,features,comic_title,img_url,pred
comic_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1214,"[0.33106639981269836, 0.16478432714939117, -0....",Chew (Image),https://comrx.s3-us-west-2.amazonaws.com/cover...,1.178824
5196,"[0.34831228852272034, 0.11370803415775299, -0....",Saga (Image),https://comrx.s3-us-west-2.amazonaws.com/cover...,1.163962
552,"[0.3538980484008789, 0.16178269684314728, -0.8...",Batgirl (DC),https://comrx.s3-us-west-2.amazonaws.com/cover...,1.131813
610,"[0.35470110177993774, 0.15020547807216644, -0....",Batman (DC),https://comrx.s3-us-west-2.amazonaws.com/cover...,1.130366
2041,"[0.325575053691864, 0.12070709466934204, -0.81...",East of West (Image),https://comrx.s3-us-west-2.amazonaws.com/cover...,1.125175
5882,"[0.3295718729496002, 0.028912074863910675, -0....",Star Wars (Marvel),https://comrx.s3-us-west-2.amazonaws.com/cover...,1.117087
6824,"[0.3278036415576935, 0.19572147727012634, -0.8...",Walking Dead (Image),https://comrx.s3-us-west-2.amazonaws.com/cover...,1.117041
1709,"[0.3662220537662506, 0.11855562776327133, -0.8...",Deadpool (Marvel),https://comrx.s3-us-west-2.amazonaws.com/cover...,1.116029
4248,"[0.3722243309020996, 0.07827568054199219, -0.8...",Mighty Thor (Marvel),https://comrx.s3-us-west-2.amazonaws.com/cover...,1.112784
1807,"[0.3776921331882477, 0.16676929593086243, -0.7...",Detective Comics (DC),https://comrx.s3-us-west-2.amazonaws.com/cover...,1.112185


### Create Official item factors matrix or dataframe

In [8]:
item_factors_df = pd.read_pickle('support_data/item_factors.pkl')

In [399]:
item_factors_df.head()

Unnamed: 0,comic_id,features
0,10,"[-0.7526867389678955, -0.21263617277145386, -1..."
1,20,"[-0.3515812158584595, 0.4757572114467621, -1.2..."
2,30,"[0.1806577742099762, -0.48153993487358093, -0...."
3,40,"[-0.06164746731519699, -0.23286470770835876, -..."
4,50,"[-0.4028661251068115, -0.3713889420032501, -1...."


In [10]:
item_factors_df.columns = ['comic_id', 'features']

In [11]:
comics_df.show()

+--------+--------------------+--------------------+
|comic_id|         comic_title|             img_url|
+--------+--------------------+--------------------+
|      17|1 For $1 Axe Cop ...|https://comrx.s3-...|
|      20|1 For $1 Conan th...|https://comrx.s3-...|
|      22|1 For $1 Mass Eff...|https://comrx.s3-...|
|      24|1 For $1 Star War...|https://comrx.s3-...|
|      27|1 For $1 Usagi Yo...|https://comrx.s3-...|
|      18|1 For 1 Baltimore...|https://comrx.s3-...|
|       2|100 Bullets Broth...|https://comrx.s3-...|
|       4|100 Penny Press S...|https://comrx.s3-...|
|       6|100 Penny Press T...|https://comrx.s3-...|
|       8|12 Reasons To Die...|https://comrx.s3-...|
|       9|    13 Coins (Other)|https://comrx.s3-...|
|      11|1602 Witch Hunter...|https://comrx.s3-...|
|      29|2021 Lost Childre...|https://comrx.s3-...|
|      31|23 Skidoo One Sho...|https://comrx.s3-...|
|      36|3 Floyds Alpha Ki...|https://comrx.s3-...|
|      33|30 Days of Night ...|https://comrx.s

#### Get comics info

In [12]:
comics_pdf = comics_df.toPandas()

In [13]:
comics_pdf.head()

Unnamed: 0,comic_id,comic_title,img_url
0,17,1 For $1 Axe Cop Bad Guy Eart (Dark Horse),https://comrx.s3-us-west-2.amazonaws.com/cover...
1,20,1 For $1 Conan the Barbarian (Dark Horse),https://comrx.s3-us-west-2.amazonaws.com/cover...
2,22,1 For $1 Mass Effect Foundati (Dark Horse),https://comrx.s3-us-west-2.amazonaws.com/cover...
3,24,1 For $1 Star Wars Legacy (Dark Horse),https://comrx.s3-us-west-2.amazonaws.com/cover...
4,27,1 For $1 Usagi Yojimb (Dark Horse),https://comrx.s3-us-west-2.amazonaws.com/cover...


In [14]:
item_factors_df.shape

(6028, 2)

In [15]:
combo = item_factors_df.merge(comics_pdf, left_on='comic_id', right_on='comic_id', how='inner', )

In [172]:
combo.head()

Unnamed: 0,comic_id,features,comic_title,img_url
0,10,"[-0.7526867389678955, -0.21263617277145386, -1...",13th Artifact One Sho (Topcow),https://comrx.s3-us-west-2.amazonaws.com/cover...
1,20,"[-0.3515812158584595, 0.4757572114467621, -1.2...",1 For $1 Conan the Barbarian (Dark Horse),https://comrx.s3-us-west-2.amazonaws.com/cover...
2,30,"[0.1806577742099762, -0.48153993487358093, -0....",21st Century Tank Girl (Other),https://comrx.s3-us-west-2.amazonaws.com/cover...
3,40,"[-0.06164746731519699, -0.23286470770835876, -...",4001 Ad (Other),https://comrx.s3-us-west-2.amazonaws.com/cover...
4,50,"[-0.4028661251068115, -0.3713889420032501, -1....",68 Homefront (Image),https://comrx.s3-us-west-2.amazonaws.com/cover...


In order to make slicing easier down the road, set index to comic_id

In [173]:
coms = combo.copy()

In [178]:
coms.set_index(['comic_id'], inplace=True)

In [180]:
coms.head()

Unnamed: 0_level_0,features,comic_title,img_url
comic_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10,"[-0.7526867389678955, -0.21263617277145386, -1...",13th Artifact One Sho (Topcow),https://comrx.s3-us-west-2.amazonaws.com/cover...
20,"[-0.3515812158584595, 0.4757572114467621, -1.2...",1 For $1 Conan the Barbarian (Dark Horse),https://comrx.s3-us-west-2.amazonaws.com/cover...
30,"[0.1806577742099762, -0.48153993487358093, -0....",21st Century Tank Girl (Other),https://comrx.s3-us-west-2.amazonaws.com/cover...
40,"[-0.06164746731519699, -0.23286470770835876, -...",4001 Ad (Other),https://comrx.s3-us-west-2.amazonaws.com/cover...
50,"[-0.4028661251068115, -0.3713889420032501, -1....",68 Homefront (Image),https://comrx.s3-us-west-2.amazonaws.com/cover...


In [181]:
coms.loc[50]

features       [-0.4028661251068115, -0.3713889420032501, -1....
comic_title                                 68 Homefront (Image)
img_url        https://comrx.s3-us-west-2.amazonaws.com/cover...
Name: 50, dtype: object

In [17]:
comics_pdf.loc[comics_pdf['comic_id']==20]

Unnamed: 0,comic_id,comic_title,img_url
1,20,1 For $1 Conan the Barbarian (Dark Horse),https://comrx.s3-us-west-2.amazonaws.com/cover...


In [182]:
combo.shape

(6028, 4)

In [183]:
coms.shape

(6028, 3)

In [20]:
combo.to_pickle('support_data/comics_factors.pkl')

In [184]:
coms.to_pickle('support_data/comics_factors.pkl')

In [21]:
my_list = ['Transformers', 'GI Joe', 'Y The Last Man', 'Saga', 'Avengers'
           ,'Paper Girls', 'Star Wars']

In [59]:
test = "Black Science \(Image\)"

In [60]:
comic_ids_list = comics_pdf[comics_pdf['comic_title'].str.contains(test, case=False)].comic_id.tolist()

In [61]:
comic_ids_list

[3189, 838]

# GRAVEYARD

In [16]:
comics_df.show(2)

+--------+--------------------+
|comic_id|         comic_title|
+--------+--------------------+
|       1|0Secret Wars (Mar...|
|       2|100 Bullets Broth...|
+--------+--------------------+
only showing top 2 rows



In [32]:
from pyspark.sql.functions import lower, upper

In [25]:
sample_comic = 'Sweet Tooth'

In [99]:
sample_comic = 'Paper Girls'

In [101]:
test = comics_df.filter(lower(comics_df['comic_title']).contains(str.lower(sample_comic))).select('comic_title')

In [102]:
test.show()

+--------------------+
|         comic_title|
+--------------------+
|Image Firsts Pape...|
| Paper Girls (Image)|
+--------------------+



In [109]:
test2 = test.withColumn('temp', col('comic_title'))

In [111]:
test2.show()

+--------------------+--------------------+
|         comic_title|                temp|
+--------------------+--------------------+
|Image Firsts Pape...|Image Firsts Pape...|
| Paper Girls (Image)| Paper Girls (Image)|
+--------------------+--------------------+



In [129]:
sold.show(2)

+----------+------+--------+
|account_id|bought|comic_id|
+----------+------+--------+
|      2247|     1|     995|
|       487|     1|    1102|
+----------+------+--------+
only showing top 2 rows



In [157]:
# Get max account id
max_acct_id = sold.agg({'account_id':'max'}).collect()[0][0]
max_acct_id

3074

In [158]:
new_acct_id = max_acct_id + 1

In [154]:
curr_comic_ids

[6105, 3235, 4701]

In [159]:
new_rows = [(new_acct_id, 1, comic_id) for comic_id in curr_comic_ids]

In [160]:
new_rows

[(3075, 1, 6105), (3075, 1, 3235), (3075, 1, 4701)]

In [161]:
sold_test = sold
sold_test.persist()


DataFrame[account_id: bigint, bought: bigint, comic_id: bigint]

In [164]:
sold_test.count()

61871

In [166]:
sold_new = sold_test.union(spark.createDataFrame(new_rows))

In [167]:
sold_new.count()

61874

In [None]:
newRow = spark.createDataFrame([(15,'Alk','Dhl')])
df = df.union(newRow)
df.show()

In [170]:
test_sold = sold

In [171]:
test_sold_2 = add_new_user(sold, curr_comic_ids)

In [172]:
print(test_sold.count())

61871


In [173]:
print(test_sold_2.count())

61874
