# Parallel

In [20]:
import pandas as pd
import numpy as np
import random
import uuid
import multiprocessing as mp

In [2]:
# constants

## Generate Data

In [23]:
import pandas as pd
from random import randint

def generate_data(
    n_books=3000,
    n_genres=10,
    n_authors=450,
    n_publishers=50,
    n_readers=30000,
    dataset_size=100000,
):
    '''
    Generate a synthetic book-ratings dataset as a pandas DataFrame.

    Every column is drawn independently at random for each row, so the same
    book_id may appear with different authors, genres, publishers, etc.
    The dataset has the following columns (all bounds inclusive):
        - book_id (Integer) : Identifier of the book, between 1 and n_books
        - author_id (Integer) : Identifier of the author, between 1 and
                                n_authors
        - book_genre (Integer) : An integer representing the genre of the
                                 book, between 1 and n_genres
        - reader_id (uuid.UUID) : Identifier of the reader; at most n_readers
                                  distinct values appear in the dataset
        - num_pages (Integer) : Random value between 75 and 700
        - book_rating (Integer) : A value between 1 and 10
        - publisher_id (Integer) : Identifier of the publisher, between 1 and
                                   n_publishers
        - publish_year (Integer) : The year of publishing, between 2000 and
                                   2021
        - book_price (Integer) : The sale price of the book, between 1 and 200
        - text_lang (Integer) : The language of the book, an integer between
                                1 and 7 which is mapped to some language

    params:
        n_books (Integer) : The number of books to draw book ids from
        n_genres (Integer) : Number of genres to be chosen from
        n_authors (Integer) : Number of authors to draw author ids from
        n_publishers (Integer) : Number of publishers to draw ids from
        n_readers (Integer) : Number of readers to draw reader ids from
        dataset_size (Integer) : The number of rows to be generated (before
                                 duplicate rows are removed)

    returns:
        pandas.DataFrame with the columns listed above. Exact duplicate rows
        are dropped, so the result can contain fewer than dataset_size rows
        and its index is not reset.

    raises:
        ValueError : if dataset_size > 0 and any of the n_* arguments is
                     smaller than 1 (raised by randint on an empty range)

    example:
        data = generate_data()
    '''

    # UUIDs are allocated lazily, one per reader number that is actually
    # drawn, so memory scales with the rows generated rather than n_readers.
    reader_uuids = {}

    def _random_reader_id():
        # Pick one of the n_readers readers and return its (stable) UUID.
        reader_number = randint(1, n_readers)
        if reader_number not in reader_uuids:
            reader_uuids[reader_number] = uuid.uuid4()
        return reader_uuids[reader_number]

    d = pd.DataFrame(
        {
            'book_id': [randint(1, n_books) for _ in range(dataset_size)],
            'author_id': [randint(1, n_authors) for _ in range(dataset_size)],
            'book_genre': [randint(1, n_genres) for _ in range(dataset_size)],
            'reader_id': [_random_reader_id() for _ in range(dataset_size)],
            'num_pages': [randint(75, 700) for _ in range(dataset_size)],
            'book_rating': [randint(1, 10) for _ in range(dataset_size)],
            'publisher_id': [randint(1, n_publishers) for _ in range(dataset_size)],
            'publish_year': [randint(2000, 2021) for _ in range(dataset_size)],
            'book_price': [randint(1, 200) for _ in range(dataset_size)],
            'text_lang': [randint(1, 7) for _ in range(dataset_size)],
        }
    ).drop_duplicates()
    return d
  
# Generate a 100,000-row dataset and write it to data.csv, leaving the
# DataFrame index out of the file.
d = generate_data(dataset_size=100_000)
d.to_csv('data.csv', index=False)

## Parallelize w/ Multiprocessing - Singular Parameter

In [25]:
# Display the generated DataFrame as the cell output.
d

Unnamed: 0,book_id,author_id,book_genre,reader_id,num_pages,book_rating,publisher_id,publish_year,book_price,text_lang
0,2022,180,9,c38dad04-dfb1-4f7f-bed5-a90947200e8a,577,9,24,2006,152,1
1,1151,171,7,a2661398-debf-483f-a09f-b5756b59413b,240,9,8,2012,165,6
2,2516,89,9,409ab769-8d43-4f80-9347-549e50ec6a53,598,10,41,2021,126,1
3,891,148,4,13e0a2c0-7314-4829-83cd-316177f14493,554,8,39,2007,128,3
4,2880,179,3,54ccd5e0-3c9b-426b-b25a-ea79fb9a016e,234,6,12,2000,181,2
...,...,...,...,...,...,...,...,...,...,...
99995,1446,142,6,41ee9fae-bf6a-44e2-911a-879b7be8e063,583,5,8,2015,159,3
99996,1202,52,1,9ac2fa23-2ea3-4680-b998-03f6a241148f,327,7,47,2001,121,3
99997,1078,211,7,1da1c30b-7ab4-4c6e-b6ac-61dbcc981834,673,6,34,2003,128,1
99998,176,129,8,4d425d33-556b-4be2-b826-341fa1e1eaf4,339,6,46,2018,39,5


## Parallelize w/ Multiprocessing - Multiple Parameter

## Parallelize w/ Swifter

## Parallelize scikit-learn w/ Ray

## Concluding Remarks

---