## Packages

In [18]:
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession

## Options

In [12]:
# Show full cell contents (e.g. the `features` lists and image URLs) instead of
# truncating them. `None` means "no limit"; the old `-1` sentinel was deprecated
# in pandas 1.0 and is no longer accepted by pandas 2.x.
pd.set_option('display.max_colwidth', None)

## Load the official item factors DataFrame

In [5]:
# Load the previously saved item factors from a local pickle.
# NOTE(review): pickle can execute arbitrary code on load — only read files you trust.
item_factors_df = pd.read_pickle('support_data/item_factors.pkl')

In [13]:
# Preview the first rows to check the `id` / `features` layout.
item_factors_df.head()

Unnamed: 0,id,features
0,10,"[-0.7526867389678955, -0.21263617277145386, -1.1569229364395142, 0.19937847554683685, 0.16002951562404633]"
1,20,"[-0.3515812158584595, 0.4757572114467621, -1.2307846546173096, 0.5941579341888428, -0.11247903108596802]"
2,30,"[0.1806577742099762, -0.48153993487358093, -0.9925048351287842, -0.04510089382529259, -0.6085895299911499]"
3,40,"[-0.06164746731519699, -0.23286470770835876, -0.20415398478507996, 0.3465690016746521, -1.2049529552459717]"
4,50,"[-0.4028661251068115, -0.3713889420032501, -1.1736090183258057, 0.15386144816875458, 0.5422862768173218]"


## For comics info, use the comics_df list we already built for spark

In [22]:
# Build (or reuse) the Spark session used to read the comics data.
# All settings are passed as plain config keys, including the local master.
spark = (
    SparkSession.builder
    .appName("comic recommendation")
    .config("spark.driver.maxResultSize", "1g")
    .config("spark.driver.memory", "1g")
    .config("spark.executor.memory", "4g")
    .config("spark.master", "local[*]")
    .getOrCreate()
)
# The SparkContext is not needed here; it is available as `spark.sparkContext`.

#### Read in json of comics data

In [24]:
# Read the comics JSON with Spark, then collect it into a pandas DataFrame.
comics_pdf = (
    spark.read
    .json('support_data/comics.json')
    .toPandas()
)

In [25]:
# Preview the comics metadata (comic_id, comic_title, img_url).
comics_pdf.head()

Unnamed: 0,comic_id,comic_title,img_url
0,17,1 For $1 Axe Cop Bad Guy Eart (Dark Horse),https://comrx.s3-us-west-2.amazonaws.com/covers/_no_cover_.jpg
1,20,1 For $1 Conan the Barbarian (Dark Horse),https://comrx.s3-us-west-2.amazonaws.com/covers/_no_cover_.jpg
2,22,1 For $1 Mass Effect Foundati (Dark Horse),https://comrx.s3-us-west-2.amazonaws.com/covers/_no_cover_.jpg
3,24,1 For $1 Star Wars Legacy (Dark Horse),https://comrx.s3-us-west-2.amazonaws.com/covers/_no_cover_.jpg
4,27,1 For $1 Usagi Yojimb (Dark Horse),https://comrx.s3-us-west-2.amazonaws.com/covers/_no_cover_.jpg


## Merge item factors and comics

In [28]:
# Rename the ALS item `id` column to `comic_id` so it matches the key in comics_pdf.
# Renaming by name (rather than overwriting `.columns` positionally) cannot
# mislabel columns if their order ever changes, and re-running the cell is a no-op.
item_factors_df = item_factors_df.rename(columns={'id': 'comic_id'})

In [29]:
# Inner-join factors with comic metadata on the shared comic_id key;
# comics without factors (and factors without metadata) are dropped.
combo = item_factors_df.merge(comics_pdf, on='comic_id', how='inner')

In [30]:
# Index the merged frame by comic_id. Reassigning (instead of inplace=True) plus
# the guard keeps this cell safe to re-run: a second set_index would raise
# KeyError because comic_id is no longer a column.
if combo.index.name != 'comic_id':
    combo = combo.set_index('comic_id')

In [32]:
# Preview the merged frame, now indexed by comic_id.
combo.head()

Unnamed: 0_level_0,features,comic_title,img_url
comic_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10,"[-0.7526867389678955, -0.21263617277145386, -1.1569229364395142, 0.19937847554683685, 0.16002951562404633]",13th Artifact One Sho (Topcow),https://comrx.s3-us-west-2.amazonaws.com/covers/_no_cover_.jpg
20,"[-0.3515812158584595, 0.4757572114467621, -1.2307846546173096, 0.5941579341888428, -0.11247903108596802]",1 For $1 Conan the Barbarian (Dark Horse),https://comrx.s3-us-west-2.amazonaws.com/covers/_no_cover_.jpg
30,"[0.1806577742099762, -0.48153993487358093, -0.9925048351287842, -0.04510089382529259, -0.6085895299911499]",21st Century Tank Girl (Other),https://comrx.s3-us-west-2.amazonaws.com/covers/21st_century_tank_girl.jpg
40,"[-0.06164746731519699, -0.23286470770835876, -0.20415398478507996, 0.3465690016746521, -1.2049529552459717]",4001 Ad (Other),https://comrx.s3-us-west-2.amazonaws.com/covers/4001_ad.jpg
50,"[-0.4028661251068115, -0.3713889420032501, -1.1736090183258057, 0.15386144816875458, 0.5422862768173218]",68 Homefront (Image),https://comrx.s3-us-west-2.amazonaws.com/covers/68_homefront.jpg


In [33]:
# Spot-check: look up comic_id 20 in the source metadata to compare with the merged row.
comics_pdf[comics_pdf['comic_id'].eq(20)]

Unnamed: 0,comic_id,comic_title,img_url
1,20,1 For $1 Conan the Barbarian (Dark Horse),https://comrx.s3-us-west-2.amazonaws.com/covers/_no_cover_.jpg


In [35]:
# (rows, columns) of the merged frame.
combo.shape

(6028, 3)

## Create pickle file

In [36]:
# Persist the merged factors + metadata frame as a pickle file.
combo.to_pickle('support_data/comics_factors_201908.pkl')

# Test 2019.08.12 fix

In [37]:
# Read new pickle

In [38]:
# Reload the pickle written above to verify it round-trips.
cf_new = pd.read_pickle('support_data/comics_factors_201908.pkl')

In [None]:
# Read old pickle 

In [39]:
# Load the earlier pickle for comparison against cf_new.
cf_old = pd.read_pickle('support_data/comics_factors.pkl')

In [40]:
# (rows, columns) of the new pickle; compare with cf_old.shape.
cf_new.shape

(6028, 3)

In [41]:
# (rows, columns) of the old pickle; compare with cf_new.shape.
cf_old.shape

(6028, 3)

In [43]:
# First rows of the new pickle, for a visual comparison with cf_old.
cf_new.head()

Unnamed: 0_level_0,features,comic_title,img_url
comic_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10,"[-0.7526867389678955, -0.21263617277145386, -1.1569229364395142, 0.19937847554683685, 0.16002951562404633]",13th Artifact One Sho (Topcow),https://comrx.s3-us-west-2.amazonaws.com/covers/_no_cover_.jpg
20,"[-0.3515812158584595, 0.4757572114467621, -1.2307846546173096, 0.5941579341888428, -0.11247903108596802]",1 For $1 Conan the Barbarian (Dark Horse),https://comrx.s3-us-west-2.amazonaws.com/covers/_no_cover_.jpg
30,"[0.1806577742099762, -0.48153993487358093, -0.9925048351287842, -0.04510089382529259, -0.6085895299911499]",21st Century Tank Girl (Other),https://comrx.s3-us-west-2.amazonaws.com/covers/21st_century_tank_girl.jpg
40,"[-0.06164746731519699, -0.23286470770835876, -0.20415398478507996, 0.3465690016746521, -1.2049529552459717]",4001 Ad (Other),https://comrx.s3-us-west-2.amazonaws.com/covers/4001_ad.jpg
50,"[-0.4028661251068115, -0.3713889420032501, -1.1736090183258057, 0.15386144816875458, 0.5422862768173218]",68 Homefront (Image),https://comrx.s3-us-west-2.amazonaws.com/covers/68_homefront.jpg


In [44]:
# First rows of the old pickle, for a visual comparison with cf_new.
cf_old.head()

Unnamed: 0_level_0,features,comic_title,img_url
comic_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10,"[-0.7526867389678955, -0.21263617277145386, -1.1569229364395142, 0.19937847554683685, 0.16002951562404633]",13th Artifact One Sho (Topcow),https://comrx.s3-us-west-2.amazonaws.com/covers/_no_cover_.jpg
20,"[-0.3515812158584595, 0.4757572114467621, -1.2307846546173096, 0.5941579341888428, -0.11247903108596802]",1 For $1 Conan the Barbarian (Dark Horse),https://comrx.s3-us-west-2.amazonaws.com/covers/_no_cover_.jpg
30,"[0.1806577742099762, -0.48153993487358093, -0.9925048351287842, -0.04510089382529259, -0.6085895299911499]",21st Century Tank Girl (Other),https://comrx.s3-us-west-2.amazonaws.com/covers/21st_century_tank_girl.jpg
40,"[-0.06164746731519699, -0.23286470770835876, -0.20415398478507996, 0.3465690016746521, -1.2049529552459717]",4001 Ad (Other),https://comrx.s3-us-west-2.amazonaws.com/covers/4001_ad.jpg
50,"[-0.4028661251068115, -0.3713889420032501, -1.1736090183258057, 0.15386144816875458, 0.5422862768173218]",68 Homefront (Image),https://comrx.s3-us-west-2.amazonaws.com/covers/68_homefront.jpg
