# Fake data

This is a notebook for generating data for use in the exercises.

This uses dataframes, so if it doesn't make sense to you, that's perfectly normal. So far you've only learned about Series objects.

In [None]:
# imports
import io
import os

import numpy as np
import pandas as pd
import requests
import scipy
import scipy.stats as stats

In [None]:
# Remote sources this notebook downloads from.
CENSUS_SURNAME_URL = 'https://www2.census.gov/topics/genealogy/2010surnames/Names_2010Census_Top1000.xlsx'
HADLEY_FORENAME_URL = 'https://github.com/hadley/data-baby-names/raw/master/baby-names.csv'

# Note: you probably haven't set up a FRED_API_KEY environment variable.
# The key is looked up with .get() and falls back to an empty string, so a
# missing key no longer raises KeyError here and the sections that do not
# use FRED can still run; only the FRED request itself is affected.
FRED_API_KEY = os.environ.get('FRED_API_KEY', '')
UNEMPLOYMENT_URL = 'https://api.stlouisfed.org/fred/series/observations?series_id=UNRATE&api_key={}&file_type=json'.format(FRED_API_KEY)

# Simple generation

In [None]:
# Five values that climb to a peak and come back down.
simple = pd.Series(data=[50, 100, 150, 100, 50], name='data')

# Written as a single 'data' column, without the index.
simple.to_csv('./data/simple.csv', index=False, header=True)

# Name generation

In [None]:
# read_csv accepts a URL, so the file is fetched and parsed in one step.
forenames = pd.read_csv(HADLEY_FORENAME_URL)
forenames.head(5)

In [None]:
# Same idea for an Excel workbook; header=1 takes the column labels from the second row.
census_sheet = pd.read_excel(CENSUS_SURNAME_URL, header=1)

# Keep the first 1,000 surnames (this drops the junk at the end) and title-case them.
surnames = census_sheet['SURNAME'].head(1000).str.title()

In [None]:
# Keep only names recorded after the year 2000.
is_modern = forenames['year'] > 2000
modern_forenames = forenames.loc[is_modern, 'name']

# Draw 1,000 of them at random.
forename_sample = modern_forenames.sample(n=1000)

# Overwrite the first tenth of the sample with 'Steve'.
steve_count = len(forename_sample) // 10
forename_sample.iloc[:steve_count] = 'Steve'

# Replace the sampled row labels with a clean 0..999 index.
forename_sample.index = pd.RangeIndex(start=0, stop=1000)

In [None]:
# Preview the first five sampled forenames.
forename_sample.head(5)

In [None]:
# Preview the first five surnames.
surnames.head(5)

In [None]:
# Pair each forename with the surname at the same index label, separated by a space.
forename_with_space = forename_sample + ' '
full_names = forename_with_space + surnames

In [None]:
# Look at five random full names.
full_names.sample(n=5)

In [None]:
# Label the series 'names' (this becomes the CSV column header) and save it without the index.
full_names = full_names.rename('names')
full_names.to_csv('data/subject_names.csv', index=False, header=True)

# Weight Generation

Using a normally distributed weight with a mean of 172 pounds and a standard deviation of 29 pounds.

In [None]:
# Draw 1,000 weights from a normal distribution (mean 172, standard deviation 29).
weight = pd.Series(
    data=stats.norm.rvs(loc=172, scale=29, size=1000)
)

# Round to two decimals.
rounded_w = weight.round(2)

# Blank out a random 20% of the values.
# np.nan is used instead of the np.NaN alias, which NumPy 2.0 removed.
nan_ix = rounded_w.sample(frac=.2).index.values
rounded_w.loc[nan_ix] = np.nan


# Write it to disk as a single 'pounds' column.
rounded_w.name = 'pounds'
rounded_w.to_csv('data/weight_in_pounds.csv', header=True, index=False)

# Display
rounded_w.head(5)

# Height Generation

Using a normally distributed height with a mean of 5.66 feet (about 68 inches) and a standard deviation of 0.33 feet (about 4 inches).

In [None]:
# Draw 1,000 heights from a normal distribution (mean 5.66, standard deviation .33).
height = pd.Series(
    data=stats.norm.rvs(loc=5.66, scale=.33, size=1000)
)

# Round to two decimals.
rounded_h = height.round(2)

# Blank out a random 20% of the values.
# np.nan is used instead of the np.NaN alias, which NumPy 2.0 removed.
nan_ix = rounded_h.sample(frac=.2).index.values
rounded_h.loc[nan_ix] = np.nan

# Write it to disk as a single 'feet' column.
rounded_h.name = 'feet'
rounded_h.to_csv('data/height_in_feet.csv', header=True, index=False)

# Display
rounded_h.head()

# Spy and Staff Generation

Create a list of spies and embassy staff.

In [None]:
# Name -> age for the known spies.
spies = pd.Series({
    'Mata'    : 40,
    'Casanova': 41,
    'Julius'  : 51,
    'Ethel'   : 48,
    'Klaus'   : 35,
    'Belle'   : 30,
    'Valery'  : 28,
})

# Name -> age for the embassy staff; some names also appear in `spies`.
embassy_staff = pd.Series({
    'Dave'    : 30,
    'Julius'  : 51,
    'Ethel'   : 48,
    'Jenna'   : 25,
    'Klaus'   : 35,
    'Aloysius': 84,
    'Carlos'  : 40,
    'Michael' : 28,
    'Tito'    : 32,
    'Jermaine': 30,
    'Janet'   : 28,
    'Marlon'  : 25,
    'Jackie'  : 22,
})

# No headers: header=False is passed explicitly because current pandas
# defaults Series.to_csv to header=True, which would add a header row.
spies.to_csv('data/spies.csv', header=False)
embassy_staff.to_csv('data/embassy_staff.csv', header=False)

# Generate some datetime csvs.

In [None]:
# Generate hourly timestamps from 2020-01-01 to 2020-01-02 and save them.
# NOTE(review): pandas >= 2.2 deprecates the uppercase 'H' alias in favour of 'h';
# switch once older pandas versions no longer need to be supported.
datetimes = pd.Series(pd.date_range(start='2020-01-01', end='2020-01-02', freq='H'), name='dts')
datetimes.to_csv('data/datetimes.csv', index=False, header=True)

# Generate the matching hourly periods and save them.
# The periods series itself is written here; previously `datetimes` was written
# to periods.csv by mistake, so the file never contained period data.
periods = pd.Series(pd.period_range(start='2020-01-01', end='2020-01-02', freq='H'), name='periods')
periods.to_csv('data/periods.csv', index=False, header=True)

# Generate morse code CSV

In [None]:
# Character -> Morse representation ('.' and '_' symbols separated by spaces).
morse_dict = {
    '"': '. _ . . _ .', "'": '. _ _ _ _ .', '(': '_ . _ _ .'  , ')': '_ . _ _ . _',
    'x': '_ . . _'    , '+': '. _ . _ .'  , ',': '_ _ . . _ _', '-': '_ . . . . _',
    '.': '. _ . _ . _', '/': '_ . . _ .'  , '0': '_ _ _ _ _'  , '1': '. _ _ _ _'  ,
    '2': '. . _ _ _'  , '3': '. . . _ _'  , '4': '. . . . _'  , '5': '. . . . .'  ,
    '6': '_ . . . .'  , '7': '_ _ . . .'  , '8': '_ _ _ . .'  , '9': '_ _ _ _ .'  ,
    ':': '_ _ _ . . .', ';': '_ . _ . _ .', '=': '_ . . . _'  , '?': '. . _ _ . .',
    '@': '. _ _ . _ .', 'A': '. _'        , 'B': '_ . . .'    , 'C': '_ . _ .'    ,
    'D': '_ . .'      , 'E': '.'          , 'F': '. . _ .'    , 'G': '_ _ .'      ,
    'H': '. . . .'    , 'I': '. .'        , 'J': '. _ _ _'    , 'K': '_ . _'      ,
    'L': '. _ . .'    , 'M': '_ _'        , 'N': '_ .'        , 'O': '_ _ _'      ,
    'P': '. _ _ .'    , 'Q': '_ _ . _'    , 'R': '. _ .'      , 'S': '. . .'      ,
    'T': '_'          , 'U': '. . _'      , 'V': '. . . _'    , 'W': '. _ _'      ,
    'X': '_ . . _'    , 'Y': '_ . _ _'    , 'Z': '_ _ . .'    , '_': '. . _ _ . _',
}

# Name the values and the index so both columns get a header in the CSV.
morse_series = (
    pd.Series(morse_dict, name='morse_representation')
    .rename_axis('character')
)

# Save sorted by character, with the index written as the first column.
morse_series.sort_index().to_csv('./data/morse.csv', header=True)

In [90]:
# Life of Brian characters, with repeats and literal 'NA' strings mixed in.
lob_cast = [
    'Brian',
    'NA',
    'Centurion of the Yard',
    'Gaoler',
    'NA',
    'Harry the Haggler',
    'Ex-Leper',
    'Gregory',
    'Judith Escariot',
    'Simon the Holy Man',
    'Pontius Pilate',
    'Matthias',
    'Gregory',
    'NA',
    'Gaoler',
    'Brian',
    'Simon the Holy Man',
    'NA',
    'NA',
    'Gregory',
    'Gregory',
    'Gregory',
    'Ex-Leper',
    'Gregory',
    'Simon the Holy Man',
    'Matthias',
    'NA',
]
lob_characters = pd.Series(data=lob_cast, name='characters')

# Save as a single 'characters' column, without the index.
lob_characters.to_csv('./data/lob_characters.csv', header=True, index=False)

# Unemployment Data from FRED

In [None]:
# Fetch data from FRED using API key (won't work for you without setup).
# The timeout stops the cell from hanging indefinitely on a stalled connection.
r = requests.get(UNEMPLOYMENT_URL, timeout=30)

# Stop here with the HTTP error (e.g. a rejected API key) rather than handing
# an error body to the JSON parsing in the next cell.
r.raise_for_status()
data = r.text

In [None]:
# Load data
# The response text is wrapped in StringIO because newer pandas deprecates
# passing a raw JSON string to read_json.
observations = pd.read_json(io.StringIO(data))['observations']

# Each observation is a dict; pull out its 'date' and 'value' entries.
dates = observations.map(lambda x: x['date'])
values = observations.map(lambda x: x['value'])
ix = pd.PeriodIndex(dates.values, freq='M')

# Create series indexed by month.
# Uses `values` from above; the previous reference to an undefined `percent`
# raised NameError.
unemployment = pd.Series(
    index=ix,
    data=values.values,
)

# Keep observations from 2000 onwards, label the columns, and save.
chopped = unemployment.loc['2000':]
chopped.index.name = 'month'
chopped.name = 'unemployment_rate'
chopped.to_csv('./data/unemployment.csv', header=True)

# Make dummy groupby data

In [None]:
# Number of rows to generate.
DATASET_LENGTH = 1000

# Per-profession parameters:
#   prob - probability of drawing this profession
#   mean - mean salary
#   std  - salary standard deviation as a fraction of the mean
salary_data = {
    'Tinker' : {'prob': .15, 'mean': 35000, 'std': .05},
    'Tailor' : {'prob': .10, 'mean': 45000, 'std': .20},
    'Soldier': {'prob': .70, 'mean': 40000, 'std': .00},
    'Spy'    : {'prob': .05, 'mean': 80000, 'std': .50},
}

# Create randomly generated professions, weighted by each profession's 'prob'.
# The size comes from DATASET_LENGTH rather than a second hard-coded 1000.
professions = pd.Series(
    np.random.choice(
        list(salary_data.keys()),
        p=[salary_data[name]['prob'] for name in salary_data.keys()],
        size=DATASET_LENGTH,
    )
)

# Output professions
professions.name = 'profession'
professions.to_csv('./data/professions.csv', index=False, header=True)


# Transform function
def gen_probs(df, data):
    """Draw one normally distributed value per element of a group.

    Parameters
    ----------
    df : the values of one group; its first element is used as the key
        into ``data``.
    data : mapping of key -> dict with a 'mean' entry and a 'std' entry,
        where 'std' is multiplied by the mean to get the scale.

    Returns
    -------
    Array of ``len(df)`` draws from ``stats.norm`` with that mean and scale.
    """
    params = data[df.iloc[0]]
    center = params['mean']
    spread = params['std'] * center
    return stats.norm.rvs(loc=center, scale=spread, size=len(df))


# Group the rows by profession so each group is drawn from its own
# distribution, then round the result to two decimals.
salaries = (
    professions
    .groupby(professions)
    .transform(gen_probs, salary_data)
    .round(2)
)

# Save as a single 'salary' column, without the index.
salaries.name = 'salary'
salaries.to_csv('./data/salaries.csv', header=True, index=False)

# Make mixed type data

In [95]:
# Deliberately mixed values: date strings in two formats, an int, floats,
# free text and the literal string 'NA'.
mixed_bag = pd.Series(
    ['2018-01-01', 4, 2.0, '2019-01-01', '12/31/2015', 2.8, 'hola!', 2.9, 'NA'],
    name='hot_mess',
)
mixed_bag.to_csv('./data/mixed_bag.csv', header=True, index=False)