In [25]:
# Render inline figures as SVG.
%config InlineBackend.figure_format = 'svg'
import pandas as pd
# Widen pandas' display limits: up to 500 rows and 500 columns, with a
# display width of 70 characters.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 70)

## 5.1 Simulating consumer segment data

In [26]:
import pandas as pd
# Read the CSV behind the bit.ly/PMR-ch5 short link into a DataFrame
# (requires network access). Note that `segment_data` is rebound later by
# the simulation in section 5.1.2.
segment_data = pd.read_csv('http://bit.ly/PMR-ch5')
# Preview the first rows.
segment_data.head()

Unnamed: 0,Segment,age,gender,income,kids,own_home,subscribe
0,travelers,60.794945,male,57014.537526,0,True,False
1,travelers,61.764535,female,43796.941252,0,False,False
2,travelers,47.493356,male,51095.344683,0,True,False
3,travelers,60.963694,male,56457.722237,0,True,True
4,travelers,60.594199,female,103020.070798,0,True,False


In [27]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
segment_data.describe()

Unnamed: 0,age,income,kids
count,300.0,300.0,300.0
mean,40.92335,50669.454237,1.273333
std,12.827494,19336.497748,1.413725
min,18.38873,11297.309231,0.0
25%,32.870035,41075.804389,0.0
50%,38.896711,51560.344807,1.0
75%,47.987569,62172.668698,2.0
max,79.650722,108830.388732,7.0


### 5.1.1 Segment data definition

In [28]:
# Sampling distribution used to simulate each variable. The insertion order
# of this mapping is also the positional order of the per-segment parameter
# lists, so the variable list is derived from it.
segment_variables_distribution = {
    'age': 'normal',
    'gender': 'binomial',
    'income': 'normal',
    'kids': 'poisson',
    'own_home': 'binomial',
    'subscribe': 'binomial',
}
segment_variables = list(segment_variables_distribution)

# Look up the distribution of a single variable.
segment_variables_distribution['age']

'normal'

In [29]:
# Display the full variable -> distribution mapping.
segment_variables_distribution

{'age': 'normal',
 'gender': 'binomial',
 'income': 'normal',
 'kids': 'poisson',
 'own_home': 'binomial',
 'subscribe': 'binomial'}

In [31]:
# Per-segment parameter for each variable, listed in segment_variables order
# (age, gender, income, kids, own_home, subscribe). Downstream each value is
# used as the normal mean, the Poisson rate, or the binomial success
# probability, depending on the variable's distribution.
segment_means = {'suburb_mix': [40, 0.5, 55000, 2, 0.5, 0.1],
                 'urban_hip':  [24, 0.7, 21000, 1, 0.2, 0.2],
                 'travelers':  [58, 0.5, 64000, 0, 0.7, 0.05],
                 'moving_up':  [36, 0.3, 52000, 2, 0.3, 0.2]}

In [32]:
# Standard deviations for each segment, listed in segment_variables order
# (None = not applicable for the variable). Only the normally distributed
# variables, age and income, carry a value.
segment_stddev = {'suburb_mix': [5, None, 12000, None, None, None],
                  'urban_hip':  [2, None, 5000, None, None, None],
                  'travelers':  [8, None, 21000, None, None, None],
                  'moving_up':  [4, None, 10000, None, None, None]}

In [33]:
# Number of simulated respondents per segment; the insertion order of this
# mapping fixes the order in which segments are processed.
segment_sizes = {
    'suburb_mix': 100,
    'urban_hip': 50,
    'travelers': 80,
    'moving_up': 70,
}
segment_names = list(segment_sizes)

# Build one lookup per segment: its size first, then a mean/stddev pair for
# every variable, read positionally from segment_means / segment_stddev.
segment_statistics = {}
for name in segment_names:
  per_variable = {
      variable: {
          'mean': segment_means[name][position],
          'stddev': segment_stddev[name][position],
      }
      for position, variable in enumerate(segment_variables)
  }
  segment_statistics[name] = {'size': segment_sizes[name], **per_variable}

In [34]:
# Inspect the assembled statistics for a single segment.
segment_statistics['moving_up']

{'age': {'mean': 36, 'stddev': 4},
 'gender': {'mean': 0.3, 'stddev': None},
 'income': {'mean': 52000, 'stddev': 10000},
 'kids': {'mean': 2, 'stddev': None},
 'own_home': {'mean': 0.3, 'stddev': None},
 'size': 70,
 'subscribe': {'mean': 0.2, 'stddev': None}}



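Each entry of `segment_statistics` holds the segment's `size` plus a `mean`/`stddev` pair for every variable; the data generation in the next section reads its parameters from this lookup.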

### 5.1.2 Final segment data generation

In [36]:
import numpy as np
import pandas as pd

# Seed NumPy's global generator so the simulated data is reproducible.
np.random.seed(seed=2554)
segment_constructor = {}

# Iterate over segments to create data for each
for name in segment_names:
  segment_size = segment_statistics[name]['size']
  segment_data_subset = {}
  print('segment: {0}'.format(name))
  # Within each segment, iterate over the variables and generate data.
  # The draw order (segment, then variable) determines the random stream,
  # so it must not be reordered if the seeded output is to stay the same.
  for variable in segment_variables:
    print('\tvariable: {0}'.format(variable))
    distribution = segment_variables_distribution[variable]
    variable_stats = segment_statistics[name][variable]
    if distribution == 'normal':
      # Draw random normals
      segment_data_subset[variable] = np.random.normal(
          loc=variable_stats['mean'],
          scale=variable_stats['stddev'],
          size=segment_size
      )
    elif distribution == 'poisson':
      # Draw counts
      segment_data_subset[variable] = np.random.poisson(
          lam=variable_stats['mean'],
          size=segment_size
      )
    elif distribution == 'binomial':
      # Draw binomials (n=1, i.e. a single 0/1 trial per row)
      segment_data_subset[variable] = np.random.binomial(
          n=1,
          p=variable_stats['mean'],
          size=segment_size
      )
    else:
      # Data type unknown: fail loudly and name the offending entry.
      # (The earlier version referenced an undefined name here and raised
      # StopIteration, which is meant for iterator exhaustion, not for
      # reporting invalid configuration.)
      raise ValueError(
          'Bad segment data type: {0} (variable: {1})'.format(
              distribution, variable)
      )
  # Label every row of this segment with the segment name.
  segment_data_subset['Segment'] = np.repeat(
      name,
      repeats=segment_size
  )
  segment_constructor[name] = pd.DataFrame(segment_data_subset)
# Stack the per-segment frames. Each frame keeps its own 0-based index, so
# index labels repeat across segments in the combined frame.
segment_data = pd.concat(segment_constructor.values())

segment: suburb_mix
	variable: age
	variable: gender
	variable: income
	variable: kids
	variable: own_home
	variable: subscribe
segment: urban_hip
	variable: age
	variable: gender
	variable: income
	variable: kids
	variable: own_home
	variable: subscribe
segment: travelers
	variable: age
	variable: gender
	variable: income
	variable: kids
	variable: own_home
	variable: subscribe
segment: moving_up
	variable: age
	variable: gender
	variable: income
	variable: kids
	variable: own_home
	variable: subscribe


In [None]:
# Preview the first rows of the simulated data.
segment_data.head()

In [None]:
# Worked example: draw ten normally distributed ages for one segment.
name = 'suburb_mix'
variable = 'age'
age_stats = segment_statistics[name][variable]
print(age_stats['mean'])
print(age_stats['stddev'])
np.random.normal(loc=age_stats['mean'], scale=age_stats['stddev'], size=10)

In [None]:
# Worked example: draw ten Poisson counts for the segment selected in `name`.
variable = 'kids'
kids_stats = segment_statistics[name][variable]
print(kids_stats['mean'])
print(kids_stats['stddev'])
np.random.poisson(lam=kids_stats['mean'], size=10)

In [None]:
# Worked example: draw ten single-trial binomials (0/1) for the segment
# selected in `name`.
variable = 'gender'
gender_stats = segment_statistics[name][variable]
print(gender_stats['mean'])
print(gender_stats['stddev'])
np.random.binomial(n=1, p=gender_stats['mean'], size=10)

In [None]:
# Repeat the segment name ten times, the same call used to build the Segment column.
np.repeat(name, repeats=10)

In [None]:
# Convert the 0/1 draws into readable labels / booleans.
#
# The mapping lists the labels themselves as well as 0/1 so that re-running
# this cell is a no-op. A plain truthiness test is not safe to re-run: both
# 'male' and 'female' are truthy strings, so a second pass would relabel
# every row as 'male'. Values outside the mapping become NaN rather than
# being silently relabelled.
gender_labels = {1: 'male', 0: 'female', 'male': 'male', 'female': 'female'}
segment_data['gender'] = segment_data['gender'].map(gender_labels)
# astype(bool) turns 0/1 into False/True and leaves existing booleans as-is.
segment_data['own_home'] = segment_data['own_home'].astype(bool)
segment_data['subscribe'] = segment_data['subscribe'].astype(bool)

In [None]:
# Summarise every column, including the non-numeric ones.
segment_data.describe(include='all')

In [None]:
# Preview the first rows of the data frame again.
segment_data.head()

In [None]:
from google.colab import files

# Write the data frame to a CSV file, then hand that file to Colab's
# download helper. This cell only works inside Google Colab.
export_filename = 'segment_dataframe_Python_intro_Ch5.csv'
with open(export_filename, 'w') as csv_file:
  segment_data.to_csv(csv_file)

files.download(export_filename)