# Homework: Simulating Data with Faker

## Task 1: Create data for a genealogy website.
|Genealogy Records|
|:----------------|
|first_name       |
|middle_name      |
|last_name        |
|sex              |
|birth_date       |
|birth_city       |
|birth_country    |
|death_date       |
|death_city       |
|death_country    |


1.	Make a total of at least 800 people.
2.	For the dates, the date range should be between 1700-1900.
3.	Enforce a rule that the death_date must be after the birth_date, not before.  
4.	Practice anonymization by making the first name a non-human random name, e.g. “Book”, “Cat”, etc.
5.	Export the data as “genealogy_fake.csv”



In [None]:
# Install Faker package. The ! means run this command in the terminal
!pip install Faker
!pip install iteration_utilities

from faker import Faker
from datetime import date, timedelta
from dateutil.relativedelta import relativedelta
from iteration_utilities import unique_everseen
import pandas as pd
import datetime as dt
import random

# create a faker object; the record generator functions below all draw
# their fake values from this single shared instance
fake = Faker()

Collecting Faker
  Downloading Faker-35.0.0-py3-none-any.whl.metadata (15 kB)
Downloading Faker-35.0.0-py3-none-any.whl (1.9 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.9 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.9/1.9 MB[0m [31m58.9 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.9/1.9 MB[0m [31m58.9 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m24.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Faker
Successfully installed Faker-35.0.0
Collecting iteration_utilities
  Downloading iteration_utilities-0.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (15 kB)
Downloading iteration_utilities-0.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (472 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m 

In [None]:
# function for genealogy record generator
def create_genealogy_record():
  """Build one fake genealogy record.

  Returns:
    A dict with the keys first_name, middle_name, last_name, sex,
    birth_date, birth_city, birth_country, death_date, death_city and
    death_country. Both dates are 'MM/DD/YYYY' strings that fall between
    01/01/1700 and 12/31/1900, and death_date is always strictly after
    birth_date.
  """
  sex = random.choice(['F', 'M'])
  middle_name = fake.first_name_female() if sex == 'F' else fake.first_name_male()

  # set date ranges to use
  start_date = dt.date(1700, 1, 1)
  end_date = dt.date(1900, 12, 31)
  one_day = dt.timedelta(days=1)

  # Births stop one day before the end of the range so that a later death
  # date inside the range always exists.
  birth_date = fake.date_between(start_date, end_date - one_day)

  # limit life span to at most 120 years, and never past 12-31-1900.
  # Clamping the upper bound gives a valid date on the first draw, so no
  # retry loop is needed; starting at birth + 1 day keeps death after birth.
  latest_death = min(birth_date + relativedelta(years=120), end_date)
  death_date = fake.date_between(birth_date + one_day, latest_death)

  genealogy_record = {
      'first_name': fake.word().capitalize(),  # anonymized: a random non-human word
      'middle_name': middle_name,
      'last_name': fake.last_name(),
      'sex': sex,
      'birth_date': birth_date.strftime('%m/%d/%Y'),
      'birth_city': fake.city(),  # I would like cities and countries be a valid combination,
                                  # but since this is practice, I'm assuming it isn't necessary
      'birth_country': fake.country(),
      'death_date': death_date.strftime('%m/%d/%Y'),
      'death_city': fake.city(),
      'death_country': fake.country()
      }
  return genealogy_record

# Add 800 people to genealogy list
COUNT = 800

# Keep generating until COUNT distinct records exist. Every record has the
# same keys in the same order, so the tuple of its values identifies it; the
# set makes each duplicate check constant-time instead of re-scanning and
# rebuilding the whole list after every retry.
genealogy_list = []
seen_records = set()
while len(genealogy_list) < COUNT:
  record = create_genealogy_record()
  fingerprint = tuple(record.values())
  if fingerprint in seen_records:
    continue  # exact duplicate: try again with another genealogy record
  seen_records.add(fingerprint)
  genealogy_list.append(record)
# view list
# genealogy_list

# Add list to a pandas dataframe
genealogy_df = pd.DataFrame(genealogy_list)
# view dataframe
# genealogy_df

# Save dataframe as a .csv file
# index=False keeps the pandas row index out of the file, so the CSV holds
# only the genealogy record columns
genealogy_df.to_csv("genealogy_fake.csv", index=False)


## Task 2: Create data for simulated text messages
| Text Messages    |
|:-----------------|
|datetime_sent     |
|datetime_received |
|from_phone_number |
|to_phone_number   |
|os_platform_sender|
|text_message      |
|emoji             |

1.	Make a total of at least 10,000 text message records.  
2.	For datetime, use datetimes in the last month.
3.	The format of the phone number doesn’t matter.
4.	Limit the text message to approximately 160 characters, which is an industry standard for text message size.  These are simulated messages. The message does not need to make grammatical sense.
5.	Generate an emoji for approximately 50% of the text messages. The emoji can be a separate field and does not need to be included in the text message.
6.	The os_platform_sender describes the simulated operating system of the sender. Randomly select an os using two methods: ios_platform_token() and android_platform_token(). In other words, some senders will be on android and some ios at about a 50% rate of each.
7.	Export the data as “text_messages_fake.csv”



In [None]:
# function for text message generator
def create_text_message_record():
  """Build one fake text message record.

  Both timestamps fall inside the previous calendar month, and
  datetime_received is never earlier than datetime_sent.

  Returns:
    A dict with the keys datetime_sent, datetime_received,
    from_phone_number, to_phone_number, os_platform_sender, text_message
    and emoji. emoji is an empty string for roughly half of the records.
  """
  # set date ranges to use; read the clock once so every bound is derived
  # from the same "today"
  first_day_of_this_month = date.today().replace(day=1)
  last_day_of_prev_month = first_day_of_this_month - timedelta(days=1)
  start_day_of_prev_month = last_day_of_prev_month.replace(day=1)

  # Use full datetimes as bounds. A plain date carries no time of day, so
  # using it as the upper bound would cut the window off at the start of the
  # last day and no message could ever fall on that day.
  window_start = dt.datetime.combine(start_day_of_prev_month, dt.time.min)
  window_end = dt.datetime.combine(last_day_of_prev_month, dt.time.max)

  datetime_sent = fake.date_time_between(window_start, window_end)
  has_emoji = fake.pybool(truth_probability=50)

  text_message_record = {
      'datetime_sent': datetime_sent,
      'datetime_received': fake.date_time_between(datetime_sent, window_end), # should we assume a certain percentage of messages
                                                                              # are received almost instantaneously and a small
                                                                              # percentage experience a delay?
      'from_phone_number': fake.phone_number(),
      'to_phone_number': fake.phone_number(), # make different than from_phone_number? no, i text myself sometimes
      'os_platform_sender': fake.ios_platform_token() if random.choice([True, False]) else fake.android_platform_token(),
      'text_message': fake.text(160),  # at most 160 characters
      'emoji': fake.emoji() if has_emoji else ''
  }
  return text_message_record

# Add 10,000 messages to text message list
COUNT = 10000

# Keep generating until COUNT distinct records exist. Every record has the
# same keys in the same order, so the tuple of its values identifies it; the
# set makes each duplicate check constant-time, which matters at 10,000 rows
# compared with re-scanning and rebuilding the whole list after every retry.
text_message_list = []
seen_records = set()
while len(text_message_list) < COUNT:
  record = create_text_message_record()
  fingerprint = tuple(record.values())
  if fingerprint in seen_records:
    continue  # exact duplicate: try again with another text message record
  seen_records.add(fingerprint)
  text_message_list.append(record)

# view list
#text_message_list

# Add list to a pandas dataframe
text_message_df = pd.DataFrame(text_message_list)
# view dataframe
#text_message_df

# Save dataframe as a .csv file
# index=False keeps the pandas row index out of the file, so the CSV holds
# only the text message columns
text_message_df.to_csv("text_messages_fake.csv", index=False)

## Task 3: Create books for a bookstore website
| Books            |
|:-----------------|
|book_id           |
|isbn13            |
|book_title        |
|book_author_name  |
|book_genre        |
|brief_description |

1.	Make a total of at least 100 books.
2.	book_id should be a random positive integer at least five digits long.
3.	Generate random book titles of approximately one to seven words. The title does not need to make grammatical sense.
4.	Book_author_name should be a first and last name. Anonymize if desired, but not required since the book title is already nonsensical.
5.	book_genre should be randomly selected from this non-encompassing list: fiction, historical fiction, science fiction, nonfiction, biography, graphic novel, youth, children.
6.	brief_description should be approximately five sentences long. The text can be nonsensical.
7.	Export the data as “books_fake.csv”


In [None]:
# function for book generator
def create_book_record():
  """Build one fake book record.

  Returns:
    A dict with the keys book_id, isbn13, book_title, book_author_name,
    book_genre and brief_description. book_id and isbn13 are drawn through
    fake.unique, so they do not repeat across calls.
  """
  # title: one to seven capitalized random words joined by spaces
  title_length = random.randint(1, 7)
  title = " ".join(fake.word().capitalize() for _ in range(title_length))

  genre_options = (
      'fiction', 'historical fiction', 'science fiction', 'nonfiction',
      'biography', 'graphic novel', 'youth', 'children',
  )

  record = {}
  record['book_id'] = fake.unique.pyint(10000, 999999)  # unique so it can serve as a primary key
  record['isbn13'] = fake.unique.isbn13()  # unique as well
  record['book_title'] = title
  record['book_author_name'] = fake.name()
  record['book_genre'] = random.choice(genre_options)
  record['brief_description'] = fake.paragraph(5)
  return record

# Add 100 books to book list
COUNT = 100
book_list = [create_book_record() for _ in range(COUNT)]

# 100 unique book records is satisfied by using unique for book_id and isbn13
# it doesn't matter if the rest of the fields are not unique amongst them, the id and isbn13 are the most important

# view list
#book_list

# Add list to a pandas dataframe
book_df = pd.DataFrame(book_list)
# view dataframe
#book_df

# Save dataframe as a .csv file
# index=False keeps the pandas row index out of the file; book_id already
# identifies each row, so the CSV holds only the book columns
book_df.to_csv("books_fake.csv", index=False)

## Task 4: Create book reviews for the bookstore website
| Book Reviews     |
|:-----------------|
|book_id           |
|user_name         |
|stars             |
|comment           |

1.	Make a total of at least 2000 book reviews.
2.	The book_id should match the book_id records from the books class so that the records match as a foreign key in a database. [Book Reviews].[book_id] should only come from valid book_id in the [Books] class. Not every book is required to have a book review. A book may have many reviews.  Consider how to randomly sample book_id numbers from [Books].[book_id].
3.	user_name is the user_name of the reviewer.
4.	Stars is an integer value from 1-5 with 5 being the highest rating. Randomly select an integer.
5.	Comments should be text of approximately 3 to 5 sentences long.
6.	Export the data as “book_reviews_fake.csv”


In [None]:
# function for book review generator
def create_book_review_record():
  """Build one fake book review record.

  Reads the module-level book_df and picks the review's book_id from its
  "book_id" column, so every review points at an existing book.

  Returns:
    A dict with the keys book_id, user_name, stars (integer 1-5) and
    comment (a paragraph of 3 to 5 sentences).
  """
  # the task asks for comments of 3 to 5 sentences
  sentence_count = random.randint(3, 5)

  # Choose from a plain list of ids. Calling random.choice on the Series
  # itself looks values up by index label, which only works while book_df
  # still has its default 0..n-1 index.
  valid_book_ids = book_df["book_id"].tolist()

  book_review_record = {
      'book_id': random.choice(valid_book_ids), # needs to be a valid book_id from Exercise 3,
                                                # when book count is low and review count is high, all book_id's tend to be used
                                                # to see some book_id's not get used, increase book count & decrease review count
      'user_name': fake.user_name(), # could be filled in later to guarantee uniqueness
                                     # decided uniqueness was not necessary, as a user could write more than one review
      'stars': fake.pyint(1, 5),
      # variable_nb_sentences=False stops Faker from varying the sentence
      # count, so the comment has exactly sentence_count sentences
      'comment': fake.paragraph(sentence_count, variable_nb_sentences=False)
  }
  return book_review_record

# function for unique user names generator
#def create_unique_user_names(count):
#  unique_user_names = []
#  for _ in range(count):
#    user_name =  fake.simple_profile().get('username')
#    while any(d['user_name'] == user_name for d in unique_user_names): # a different way to verify uniqueness other than list(unique_everseen())
#      user_name = fake.simple_profile().get('username') # try again for a unique username
#    user_name_record = {
#        'user_name': user_name
#    }
#    unique_user_names.append(user_name_record)
#  return unique_user_names

# Add 2000 reviews to book review list
COUNT = 2000

# Keep generating until COUNT distinct records exist. Every record has the
# same keys in the same order, so the tuple of its values identifies it; the
# set makes each duplicate check constant-time instead of re-scanning and
# rebuilding the whole list after every retry.
book_review_list = []
seen_records = set()
while len(book_review_list) < COUNT:
  record = create_book_review_record()
  fingerprint = tuple(record.values())
  if fingerprint in seen_records:
    continue  # exact duplicate: try again with another book review record
  seen_records.add(fingerprint)
  book_review_list.append(record)

# view list
#book_review_list

# Add list to a pandas dataframe
book_review_df = pd.DataFrame(book_review_list)
# view dataframe
#book_review_df

#unique_user_names = []
#unique_user_names = create_unique_user_names(count=COUNT)
# view unique_user_names
#unique_user_names

# update all book reviews user_names with unique_user_names
#book_review_df.update(unique_user_names)
# view dataframe with update
#book_review_df

# Save dataframe as a .csv file
# index=False keeps the pandas row index out of the file, so the CSV holds
# only the book review columns
book_review_df.to_csv("book_reviews_fake.csv", index=False)