## Exercise 04 - Enrichment and transformations


In [2]:
import pandas as pd
import numpy as np


## Load auto data and configure float display


In [3]:
# Render every float with two decimal places in displayed frames.
two_decimals = '{:.2f}'.format
pd.set_option('display.float_format', two_decimals)

# Read the auto dataset from its relative path and preview the first rows.
df = pd.read_json('../data/auto.json')
df.head()


Unnamed: 0,CarNumber,Refund,Fines,Make,Model
0,Y163O8161RUS,2,3200.0,Ford,Focus
1,E432XX77RUS,1,6500.0,Toyota,Camry
2,7184TT36RUS,1,2100.0,Ford,Focus
3,X582HE161RUS,2,2000.0,Ford,Focus
4,92918M178RUS,1,5700.0,Ford,Focus


## Create 200-row sample and concatenate (concat_rows)


In [4]:
# One seeded generator is shared by all three draws below. Giving every
# .sample() call the same integer seed would select the same row positions
# each time (the inputs all have the same length), so Refund and Fines would
# simply be copied from the rows already picked for base_sample rather than
# being drawn independently.
sample_rng = np.random.RandomState(21)

# Base sample of existing (CarNumber, Make, Model) combinations, so no new
# car/make/model combinations are introduced. pandas turns an integer seed
# into a RandomState, so this first draw matches random_state=21.
base_sample = df[['CarNumber', 'Make', 'Model']].sample(
    n=200, replace=True, random_state=sample_rng
)

# Independent random samples for Refund and Fines: each call advances
# sample_rng, so the selected rows differ from base_sample and from each other.
refund_sample = df['Refund'].sample(n=200, replace=True, random_state=sample_rng).reset_index(drop=True)
fines_sample = df['Fines'].sample(n=200, replace=True, random_state=sample_rng).reset_index(drop=True)

# Re-index to 0..199 so the column assignments below align row by row.
sample = base_sample.reset_index(drop=True).copy()
sample['Refund'] = refund_sample
sample['Fines'] = fines_sample

# Stack the 200 sampled rows under the original data with a fresh index.
concat_rows = pd.concat([df, sample], ignore_index=True)
concat_rows.head()


Unnamed: 0,CarNumber,Refund,Fines,Make,Model
0,Y163O8161RUS,2,3200.0,Ford,Focus
1,E432XX77RUS,1,6500.0,Toyota,Camry
2,7184TT36RUS,1,2100.0,Ford,Focus
3,X582HE161RUS,2,2000.0,Ford,Focus
4,92918M178RUS,1,5700.0,Ford,Focus


## Add Year column and build fines dataframe


In [5]:
# Seed NumPy's global generator so the generated years are reproducible.
np.random.seed(21)

# One random year per row; randint's upper bound is exclusive, so 1980..2019.
n_rows = len(concat_rows)
years = pd.Series(np.random.randint(1980, 2020, size=n_rows), name='Year')

# Attach Year as the last column by matching on the 0..n-1 index.
fines = concat_rows.reset_index(drop=True).join(years)
fines.head()


Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year
0,Y163O8161RUS,2,3200.0,Ford,Focus,1989
1,E432XX77RUS,1,6500.0,Toyota,Camry,1995
2,7184TT36RUS,1,2100.0,Ford,Focus,1984
3,X582HE161RUS,2,2000.0,Ford,Focus,2015
4,92918M178RUS,1,5700.0,Ford,Focus,2014


## Load and clean surnames from surname.json


In [6]:
surnames_raw = pd.read_json('../data/surname.json')

# The first record holds the column names: promote it to the header,
# then keep the remaining records with a fresh 0-based index.
header_row = surnames_raw.iloc[0]
surnames_raw.columns = header_row
surnames = surnames_raw.iloc[1:].reset_index(drop=True)

# Cast to text and strip every character that is not an ASCII letter.
names_as_text = surnames['NAME'].astype(str)
surname_series_all = names_as_text.str.replace('[^A-Za-z]', '', regex=True)

surname_series_all.head()


0       ADAMS
1       ALLEN
2     ALVAREZ
3    ANDERSON
4      BAILEY
Name: NAME, dtype: object

## Create SURNAME series and owners dataframe


In [7]:
# Distinct car numbers present in the 200-row sample, in order of first appearance
car_numbers_deduped = sample['CarNumber'].drop_duplicates()
unique_cars_sample = car_numbers_deduped.reset_index(drop=True)
n_unique_cars = unique_cars_sample.shape[0]

# Draw one surname per distinct car number (with replacement, fixed seed)
surnames_drawn = surname_series_all.sample(n=n_unique_cars, replace=True, random_state=21)
surnames_sample = surnames_drawn.reset_index(drop=True)

# Pair each car number with a surname; both series share the 0..n-1 index
owners = unique_cars_sample.to_frame(name='CarNumber').assign(SURNAME=surnames_sample)

owners.head()


Unnamed: 0,CarNumber,SURNAME
0,Y351O8197RUS,RICHARDSON
1,H917TC36RUS,ROSS
2,C589EY154RUS,MORGAN
3,K846YE77RUS,BAILEY
4,X4108H125RUS,LOPEZ


## Manual additions to fines and owners, joins, pivot table, and CSV export


In [8]:
# Five hand-written fine records, one tuple per row, appended to fines
extra_fines = pd.DataFrame(
    [
        ('EXTRA001RUS', 1.0, 5000.0, 'Ford', 'Focus', 1990),
        ('EXTRA002RUS', 2.0, 12000.0, 'Toyota', 'Camry', 1995),
        ('EXTRA003RUS', 1.0, 8000.0, 'Skoda', 'Octavia', 2000),
        ('EXTRA004RUS', 2.0, 3000.0, 'Volkswagen', 'Golf', 2010),
        ('EXTRA005RUS', 1.5, 9500.0, 'Honda', 'CRV', 2015),
    ],
    columns=['CarNumber', 'Refund', 'Fines', 'Make', 'Model', 'Year'],
)
fines = pd.concat([fines, extra_fines], ignore_index=True)

# Drop the last 20 owners; skipped when owners has 20 rows or fewer
if len(owners) > 20:
    owners = owners.head(-20).reset_index(drop=True)

# Three owner-only records whose car numbers do not occur in extra_fines
extra_owners = pd.DataFrame(
    [
        ('OWNONLY1RUS', 'SMITH'),
        ('OWNONLY2RUS', 'JOHNSON'),
        ('OWNONLY3RUS', 'BROWN'),
    ],
    columns=['CarNumber', 'SURNAME'],
)
owners = pd.concat([owners, extra_owners], ignore_index=True)

# The four join flavours of fines with owners, all keyed on CarNumber
join_inner, join_outer, join_left, join_right = (
    fines.merge(owners, on='CarNumber', how=join_kind)
    for join_kind in ('inner', 'outer', 'left', 'right')
)

# Total fines per (Make, Model) row, with one column per Year
pivot = fines.pivot_table(
    index=['Make', 'Model'],
    columns='Year',
    values='Fines',
    aggfunc='sum',
)

# Persist both frames as CSV, leaving out the index column
fines.to_csv('../data/fines.csv', index=False)
owners.to_csv('../data/owners.csv', index=False)

# Preview the first rows of every join and of the pivot table
join_inner.head(), join_outer.head(), join_left.head(), join_right.head(), pivot.head()


(      CarNumber  Refund   Fines    Make  Model  Year  SURNAME
 0  Y163O8161RUS    2.00 3200.00    Ford  Focus  1989    GREEN
 1   E432XX77RUS    1.00 6500.00  Toyota  Camry  1995  JIMENEZ
 2  92918M178RUS    1.00 5700.00    Ford  Focus  2014     GRAY
 3  H234YH197RUS    2.00 6000.00    Ford  Focus  1990    PATEL
 4  E40577152RUS    1.00 8594.59    Ford  Focus  1988   TURNER,
       CarNumber  Refund   Fines  Make  Model    Year SURNAME
 0  704687163RUS    2.00 1400.00  Ford  Focus 2004.00     NaN
 1  704787163RUS    2.00 2800.00  Ford  Focus 1992.00     NaN
 2  704987163RUS    2.00 8594.59  Ford  Focus 1985.00     NaN
 3  705287163RUS    2.00 2000.00  Ford  Focus 1980.00     NaN
 4  705387163RUS    2.00  700.00  Ford  Focus 1987.00     NaN,
       CarNumber  Refund   Fines    Make  Model  Year  SURNAME
 0  Y163O8161RUS    2.00 3200.00    Ford  Focus  1989    GREEN
 1   E432XX77RUS    1.00 6500.00  Toyota  Camry  1995  JIMENEZ
 2   7184TT36RUS    1.00 2100.00    Ford  Focus  1984      