# Parallel

In [3]:
import pandas as pd
import numpy as np
import random

from multiprocessing import Pool
from multiprocessing import cpu_count
from functools import partial

## Generate Data

In [2]:
import pandas as pd
from random import randint

def generate_data(n_books = 3000, n_genres = 10, n_authors = 450, n_publishers = 50, n_readers = 30000, dataset_size = 100000):
    '''
    Build a synthetic book-review dataset made of random integers.

    Every column is drawn independently with `random.randint`, so both bounds
    listed below are inclusive. Exact duplicate rows are dropped before the
    frame is returned, which means the result can hold fewer than
    `dataset_size` rows and its index can contain gaps.

    columns (all Integer):
        - book_id : identifier of the book, in [1, n_books]
        - author_id : identifier of the author, in [1, n_authors]
        - book_genre : genre code of the book, in [1, n_genres]
        - reader_id : identifier of the reader, in [1, n_readers]
        - num_pages : page count, in [75, 700]
        - book_rating : rating, in [1, 10]
        - publisher_id : identifier of the publisher, in [1, n_publishers]
        - publish_year : year of publication, in [2000, 2021]
        - book_price : sale price, in [1, 200]
        - text_lang : language code, in [1, 7]

    params:
        n_books (Integer) : Upper bound for book_id
        n_genres (Integer) : Upper bound for book_genre
        n_authors (Integer) : Upper bound for author_id
        n_publishers (Integer) : Upper bound for publisher_id
        n_readers (Integer) : Upper bound for reader_id
        dataset_size (Integer) : Number of rows drawn before de-duplication

    returns:
        DataFrame : The generated rows with exact duplicates removed

    example:
        data = generate_data()
    '''

    def draw(low, high):
        # One column: `dataset_size` independent draws from [low, high]
        return [randint(low, high) for _ in range(dataset_size)]

    # (column name, lower bound, upper bound) - the order here fixes both the
    # column order of the frame and the order in which random numbers are drawn
    spec = (
        ('book_id', 1, n_books),
        ('author_id', 1, n_authors),
        ('book_genre', 1, n_genres),
        ('reader_id', 1, n_readers),
        ('num_pages', 75, 700),
        ('book_rating', 1, 10),
        ('publisher_id', 1, n_publishers),
        ('publish_year', 2000, 2021),
        ('book_price', 1, 200),
        ('text_lang', 1, 7),
    )

    columns = {}
    for name, low, high in spec:
        columns[name] = draw(low, high)

    frame = pd.DataFrame(columns)
    return frame.drop_duplicates()
  
# Build the working dataset from 10 million random draws; generate_data drops
# exact duplicate rows, so the resulting frame may be slightly smaller.
d = generate_data(dataset_size = 10000000)
# Optional: persist the generated data to disk so it can be reloaded later.
# d.to_csv('data.csv', index = False)

## Parallelize w/ Multiprocessing - Singular Parameter

In [3]:
# Preview the first rows of the dataset.
d.head()

Unnamed: 0,book_id,author_id,book_genre,reader_id,num_pages,book_rating,publisher_id,publish_year,book_price,text_lang
0,2394,249,4,3799,474,1,34,2013,122,1
1,1703,191,10,2355,447,2,7,2020,175,7
2,1600,78,2,13005,399,3,40,2013,129,5
3,347,380,7,1221,636,2,20,2014,45,3
4,1183,269,10,15059,610,2,16,2003,163,5


In [4]:
def avg_price_per_reader(data):
    '''
    Compute the mean book price for every reader found in the dataset.

    Each distinct reader_id is handled on its own: the rows of that reader are
    selected with a boolean mask and their book_price values are averaged.
    Readers appear in the result in order of first appearance in `data`.

    params:
        data (DataFrame) : Reader history; must contain the columns
                           'reader_id' and 'book_price'

    returns:
        DataFrame : One row per reader with the columns 'reader_id' and 'avg_price'

    example:
        r_df = avg_price_per_reader(d)
    '''
    rows = [
        {
            'reader_id' : rid,
            # Full scan of `data` per reader, then the mean of the matching prices
            'avg_price' : np.mean(data.loc[data['reader_id'] == rid, 'book_price'].values)
        }
        for rid in data['reader_id'].unique()
    ]
    return pd.DataFrame(rows)

In [5]:
# Baseline: time the serial, one-reader-at-a-time implementation on the full dataset.
%time r_df = avg_price_per_reader(d)

CPU times: user 7min 6s, sys: 5.87 s, total: 7min 12s
Wall time: 1min 44s


In [6]:
# Same per-reader mean price computed with pandas' built-in groupby, for comparison.
# Unlike the loop above, groupby returns the readers sorted by reader_id.
%time gb_df = d.groupby('reader_id')['book_price'].mean().reset_index()

CPU times: user 158 ms, sys: 67.8 ms, total: 226 ms
Wall time: 242 ms


In [7]:
def parallel_singular_param(d, fn, n_cores):
    '''
    Run a single-argument function in parallel over row-wise chunks of a dataset.

    `d` is cut into `n_cores` contiguous chunks of near-equal length, `fn` is
    applied to every chunk in a pool of `n_cores` worker processes, and the
    per-chunk results are concatenated in chunk order.

    Chunks are formed by row position only and the results are simply stacked.
    `fn` must therefore give a correct answer when it sees only a slice of the
    data: a per-group aggregate, for instance, yields one partial row per chunk
    for every group whose rows are spread over several chunks.

    params:
        d (DataFrame) : The argument for the function
        fn (Function) : The function you want to run in parallel; it must be
                        picklable and return a pandas object
        n_cores (Integer) : The number of cores you want to use

    returns:
        DataFrame / Series : The concatenation of fn(chunk) over all chunks

    raises:
        ValueError : If n_cores is smaller than 1 or larger than the number of
                     cores available on the computer

    example:
        parallel_singular_param(
            d,
            avg_price_per_reader,
            4
        )
    '''
    if n_cores < 1:
        raise ValueError("The number of cores must be at least 1")
    if cpu_count() < n_cores:
        raise ValueError("The number of CPU's specified exceed the amount available")

    if hasattr(d, 'iloc'):
        # Slice pandas objects positionally instead of handing them to
        # np.array_split, which goes through the deprecated DataFrame.swapaxes.
        # Chunk sizes follow np.array_split: the first `extra` chunks get one
        # additional row.
        base, extra = divmod(len(d), n_cores)
        sizes = [base + 1] * extra + [base] * (n_cores - extra)
        stops = np.cumsum(sizes)
        df_list = [d.iloc[stop - size:stop] for size, stop in zip(sizes, stops)]
    else:
        df_list = np.array_split(d, n_cores)

    # The context manager tears the pool down even when a worker raises;
    # map() has already collected every result by the time the block exits.
    with Pool(n_cores) as pool:
        res = pool.map(fn, df_list)
    return pd.concat(res)

In [None]:
# Time the same computation spread over 4 worker processes.
# NOTE(review): the data is split by row position, so a reader whose rows fall
# into several chunks shows up once per chunk with a chunk-local average;
# para_res is therefore not guaranteed to match r_df - verify before relying on it.
%time para_res = parallel_singular_param(d, avg_price_per_reader, 4)

## Parallelize w/ Multiprocessing - Multiple Parameter

In [1]:
def genre_features(d, readers, genre_col = 'book_genre', reader_col = 'reader_id'):
    '''
    Build the genre distribution of every requested reader.

    For each reader the rows of `d` belonging to that reader are selected and
    the relative frequency of every genre among them is computed. The result
    has one row per reader (indexed by the reader id) and one column per genre
    seen for at least one of the readers; genres a reader never read are 0.

    params:
        d (DataFrame) : The dataset holding the reader history
        readers (Iterable) : The reader ids to compute the features for
        genre_col (String) : Name of the column holding the genre
        reader_col (String) : Name of the column holding the reader id

    returns:
        DataFrame : Genre shares per reader; an empty DataFrame when `readers`
                    is empty

    example:
        g_df = genre_features(d, d.reader_id.unique())
    '''
    res = []
    for reader in readers:
        # Share of each genre among this reader's rows, e.g. {4: 0.5, 7: 0.5}
        g = d[d[reader_col] == reader][genre_col].value_counts(normalize = True).to_dict()
        df = pd.DataFrame([g], index = [reader])
        res.append(df)

    # pd.concat refuses an empty list; an empty `readers` (e.g. an empty chunk
    # produced when the work is split into more parts than there are readers)
    # should give an empty result instead of an error.
    if not res:
        return pd.DataFrame()
    return pd.concat(res).fillna(0)

def parallelize_section(fn, d, readers, n_cores):
    '''
    Run a two-argument function in parallel by splitting its second argument.

    `readers` is cut into `n_cores` chunks of near-equal length. Every worker
    process calls `fn(d, chunk)` with the complete dataset `d` and one chunk,
    and the per-chunk results are concatenated in chunk order.

    The results are only stacked: a column that is missing from one chunk's
    result but present in another's is filled with NaN by the concatenation.

    params:
        fn (Function) : The function you want to run in parallel; it must be
                        picklable, accept (d, readers) and return a pandas object
        d (DataFrame) : The dataset passed unchanged to every call of fn
        readers (Array) : The values to be split across the workers
        n_cores (Integer) : The number of cores you want to use

    returns:
        DataFrame / Series : The concatenation of fn(d, chunk) over all chunks

    raises:
        ValueError : If n_cores is smaller than 1 or larger than the number of
                     cores available on the computer

    example:
        parallelize_section(
            genre_features,
            d,
            d.reader_id.unique(),
            4
        )
    '''
    if n_cores < 1:
        raise ValueError("The number of cores must be at least 1")
    if cpu_count() < n_cores:
        raise ValueError("The number of CPU's specified exceed the amount available")

    reader_list = np.array_split(readers, n_cores)

    # The context manager tears the pool down even when a worker raises;
    # map() has already collected every result by the time the block exits.
    with Pool(n_cores) as pool:
        # partial pins `d` as the first argument so map only varies the chunk
        calc_df = pool.map(partial(fn, d), reader_list)
    return pd.concat(calc_df)

In [None]:
# Distinct reader ids, in order of first appearance in the dataset.
readers = d.reader_id.unique()

In [None]:
# Baseline: time the serial computation of the genre features for every reader.
%time g_df = genre_features(d, readers)

In [None]:
# Time the same computation with the reader ids split across 4 worker processes;
# every worker receives the full dataset and only its own share of the readers.
%time para_g_df = parallelize_section(genre_features, d, readers, 4)

## Parallelize scikit-learn

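Estimators that support parallel jobs (`n_jobs`):
- RFC (RandomForestClassifier)
- KNN (KNeighborsClassifier)

Estimators without parallel job support:
- GBC (GradientBoostingClassifier)
- Naive Bayes

Parallel jobs are also possible for hyperparameter tuning (e.g. `GridSearchCV` accepts `n_jobs`).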

In [None]:
# Model inputs: five integer book attributes as features, the book price as target.
ft = ['book_genre', 'num_pages', 'book_rating', 'publish_year', 'text_lang']
target = ['book_price']

X = d[ft].values
# d[target] is a one-column DataFrame, so .values alone has shape (n_rows, 1);
# flatten it because scikit-learn classifiers expect a 1-D target array.
y = d[target].values.ravel()

# Hold out 20% of the rows for testing.
# NOTE(review): train_test_split is not imported in the cells shown here -
# presumably sklearn.model_selection.train_test_split; confirm the import exists.
x_train, x_test, y_train, y_test = train_test_split(X ,y ,test_size = 0.2)

In [None]:
%%time
# Fit a shallow random forest; n_jobs = 4 asks scikit-learn to run the fit
# with 4 parallel jobs.
rfc = RandomForestClassifier(
    max_depth=2, random_state=0, n_jobs = 4
)
rfc.fit(x_train, y_train)

In [None]:
%%time
# Fit a gradient boosting model with the same depth and seed for comparison;
# no n_jobs argument is passed here, so no parallel jobs are requested.
gbc = GradientBoostingClassifier(
    max_depth=2, random_state=0
)
gbc.fit(x_train, y_train)

## Concluding Remarks

---