In [272]:
import pandas
import csv
import re
import statistics

# Data Preparation

### MSA and Major MSA population totals

In [197]:
msa = {}
with open('assignment/MSA.csv') as csvfile:
    content = csv.reader(csvfile, delimiter=',')
    firstline = True
    for row in content:
        if firstline:
            firstline = False
            continue
        # make sure zip code is appropriate length before adding to dict
        else:
            z = add_leading_zeros(row[0])
            msa[z] = {'state': row[1], 'name': row[2]}

In [118]:
# creating a dict with the MSA name as key and population as value
msa_population_totals = {}

with open('assignment/major_msa_population_totals.csv') as csvfile:
    content = csv.reader(csvfile, delimiter=',')
    for row in content:
        msa_population_totals[row[0]] = int(row[1].replace(',',''))

### Zip Code Data

The python zipcode library does not contain a complete list of zipcodes, so I downloaded a complete list from https://www.aggdata.com/node/86

In [156]:
zipcode_data = {}
with open('us_postal_codes.csv') as csvfile:
    content = csv.reader(csvfile, delimiter=',')
    firstline = True
    for row in content:
        if firstline:
            firstline = False
            continue
        else:
            zipcode = row[0]
            city = row[1]
            state = row[2]
            state_short = row[3]
            county = row[4]
            lat = row[5]
            long = row[6]
            
            zipcode_data[zipcode] = {'city': city, 'state': state, 'state_short': state_short, 
                                     'county': county, 'latitude': lat, 'longitude': long}

### Guest Days

There are cases of missing data and cases in which certain data elements are not in the correct columns in the supplied CSV. Below are a few functions I wrote to extract and sanity check data from Guest Days.csv. Ultimately I create two lists containing 

All other entries (international customers, US rows that either don't have a zip code at all or the zip code isn't valid) is saved to a separate list for further cleaning.

In [233]:
def clean_guestdays(row):
### takes in a single row from the guest_days csv and returns zipcode, ski_days, and booking_window

    # check if the zip code is a valid US zip code
    valid_zip = False
    z = get_zipcode(row)
    
    if z in zipcode_data:
        valid_zip = True
    
    # ski days and booking window
    booking_window = None
    ski_days = row[-1]
    if ski_days:
        booking_window = row[-2] # if ski_days is present, booking_window is the second to last value in the row
    else:
        ski_days = row[-2] # otherwise, assume booking_window is missing
    
    if valid_zip:
        return True, [z, ski_days, booking_window]
    else:
        return False, row

In [234]:
def get_zipcode(row):
### Parses a row from guest_days to extract the best candidate for a zipcode
    match = re.search('(\d{5})([- ])?(\d{4})?', ' '.join(row))
    
    if match:
        # convert to string
        match = match.group(1)
    else:
        # do more work
        possible_zipcode = None
        minimum_zipcode = 210 # lowest zipcode in the US
        
        for i in row:
            # the regex expression should've caught any 5-digit or 5-plus-4 digit zip codes
            # a non-match is either not present, not a US zip, or is 3 or 4 digits
            digits = len(i)
            if 2 < digits < 5:
                try:
                    possible_zipcode = int(i)
                except ValueError:
                    pass

            if possible_zipcode:
                if possible_zipcode >= minimum_zipcode and possible_zipcode != 1000:
                    # there's a single false positive w/ zipcode 1000
                    match = str(possible_zipcode)
                    match = add_leading_zeros(match)
    
    return match

In [235]:
def add_leading_zeros(z):
### Adds appropriate amount of leading zeros to zipcode
    zeroes = {1: 4, 2: 3, 3: 2, 4: 1, 5: 0}
    return zeroes[len(z)] * '0' + z

In [236]:
guest_days_clean = []
guest_days_dirty = []

with open('assignment/guest_days.csv', newline='') as gd:
    reader = csv.reader(gd, delimiter=',')
    for row in reader:
        result = clean_guestdays(row)
        if result[0]:
            guest_days_clean.append(result[1])
        else:
            guest_days_dirty.append(result[1])

## Check data loss

I filtered the data, so let's see how much was excluded and browse the excluded data

In [237]:
loss = 1 - len(guest_days_clean) / float(len(guest_days_clean) + len(guest_days_dirty))

In [238]:
loss * 100 # percentage of total rows dropped

6.825587377808262

In [239]:
guest_days_dirty = guest_days_dirty[1:] # dropping the header row
guest_days_dirty

[['He', 'UK', '0', '15+ Days', '36'],
 ['Unknown', 'SYDNEY', '0', '08-14 Days', '6'],
 ['Alberta', 'MEXICO CITY,  MEXICO', '1000', '15+ Days', '15'],
 ['Unknown', 'MEXICO', '3100', '04-7 Days', '1'],
 ['Unknown', 'MEXICO', '3100', '08-14 Days', '2'],
 ['Other', 'FLOWERY GULLY', '7270', '15+ Days', '3'],
 ['Alberta', 'MEXICO', '11000', '08-14 Days', '15'],
 ['Alberta', 'MEXICO', '11700', '02 Days', '6'],
 ['Alberta', 'MEXICO', '11700', '03 Days', '6'],
 ['Alberta', 'MEXICO ', '11910', '03 Days', '4'],
 ['Alberta', 'MEXICO ', '11910', '04-7 Days', '10'],
 ['Unknown', 'SYDNEY', '20760', '02 Days', '3'],
 ['Virginia', 'FARMVILLE', '23900', '08-14 Days', '2'],
 ['Unknown', 'CAYMAN ISLANDS', '33210', '15+ Days', '6'],
 ['Es', 'NAUCALPAN EDO MEX', '53310', '04-7 Days', '6'],
 ['Unknown', 'EDO. MEXICO', '57310', '02 Days', '6'],
 ['Montana', 'BOZEMAN', '59178', '08-14 Days', '2'],
 ['Missouri', 'KANSAS CITY', '64416', '04-7 Days', '2'],
 ['Nu', 'SANTA CATARINA, NL', '66181', '04-7 Days', '3'],

Browsing the "dirty" guest days, we can see that most of them are international, though some are in the United States but were dropped because a valid zip code was not provided.

Now, let's see the total percentage of guest days dropped

In [240]:
total_clean_guestdays = 0
total_dirty_guestdays = 0

for row in guest_days_clean:
    total_clean_guestdays += int(row[1])

for row in guest_days_dirty:
    # guest_days is either the last or second to last entry in each row
    try:
        guest_days = int(row[-1])
    except ValueError:
        guest_days = int(row[-2])
    
    total_dirty_guestdays += guest_days

In [241]:
total_clean_guestdays

27422

In [242]:
total_dirty_guestdays

2075

In [243]:
loss_guestdays = 1 - total_clean_guestdays / float(total_clean_guestdays + total_dirty_guestdays)
loss_guestdays * 100

7.034613689527747

In [244]:
guest_days_clean

[['01033', '6', '15+ Days'],
 ['01057', '1', '15+ Days'],
 ['01060', '3', '08-14 Days'],
 ['01082', '1', '15+ Days'],
 ['01085', '1', '15+ Days'],
 ['01106', '3', '04-7 Days'],
 ['01229', '4', '15+ Days'],
 ['01230', '1', '02 Days'],
 ['01420', '4', '15+ Days'],
 ['01453', '3', '03 Days'],
 ['01463', '9', '02 Days'],
 ['01510', '1', '04-7 Days'],
 ['01523', '3', '04-7 Days'],
 ['01532', '2', '04-7 Days'],
 ['01545', '6', '08-14 Days'],
 ['01545', '3', '15+ Days'],
 ['01564', '2', '03 Days'],
 ['01583', '1', '02 Days'],
 ['01583', '3', '04-7 Days'],
 ['01701', '2', '04-7 Days'],
 ['01720', '1', '08-14 Days'],
 ['01742', '4', '04-7 Days'],
 ['01742', '6', '15+ Days'],
 ['01748', '2', '04-7 Days'],
 ['01752', '3', '08-14 Days'],
 ['01756', '6', '08-14 Days'],
 ['01756', '6', '15+ Days'],
 ['01773', '3', '03 Days'],
 ['01773', '9', '04-7 Days'],
 ['01775', '6', '08-14 Days'],
 ['01775', '6', '15+ Days'],
 ['01776', '5', '15+ Days'],
 ['01778', '6', '02 Days'],
 ['01830', '1', '15+ Days'],


### Create DataFrames

Since there are often multiple rows per zip code, and differing values of "booking_window" across rows with the same zip code, we'll need to summarize booking window. Complicating this is the "15+ Days" value that can occur in booking_window. Since the other possible values are either discrete ("02 Days") or a range, I'll aggregate the data and find an approximate median for booking_window. The values that are ranges run from `x` to `2x - 1`, so I'm taking the liberty of assuming that "15+ Days" is actually a range of "15-29 Days"

In [280]:
booking_window_map = {'00 Same Day': 0,
                      '01 Day': 1, 
                      '02 Days': 2, 
                      '03 Days': 3, 
                      '04-7 Days': statistics.median([4,5,6,7]), 
                      '08-14 Days': statistics.median([8,9,10,11,12,13,14]),
                      '15+ Days': statistics.median([15,16,17,18,19,20,21,22,23,24,25,26,27,28,29])}

In [281]:
booking_window_map

{'00 Same Day': 0,
 '01 Day': 1,
 '02 Days': 2,
 '03 Days': 3,
 '04-7 Days': 5.5,
 '08-14 Days': 11,
 '15+ Days': 22}

Now, I'll rebuid guest_days_clean to only contain a single entry per zip code before importing it in to pandas

In [300]:
guest_days_clean_unique = {}
for row in guest_days_clean:
    # If booking window is present, use it, otherwise don't
    if row[2]:
        booking_window = booking_window_map[row[2]]
    else:
        booking_window = 0
        
    if row[0] not in guest_days_clean_unique:
        guest_days_clean_unique[row[0]] = {'guest_days': float(row[1]), 'avg_booking_window': booking_window * float(row[1])}
    else:
        guest_days_clean_unique[row[0]]['guest_days'] += float(row[1])
        guest_days_clean_unique[row[0]]['avg_booking_window'] += booking_window * float(row[1])

In [301]:
guest_days_clean_unique

{'01033': {'avg_booking_window': 132.0, 'guest_days': 6.0},
 '01057': {'avg_booking_window': 22.0, 'guest_days': 1.0},
 '01060': {'avg_booking_window': 33.0, 'guest_days': 3.0},
 '01082': {'avg_booking_window': 22.0, 'guest_days': 1.0},
 '01085': {'avg_booking_window': 22.0, 'guest_days': 1.0},
 '01106': {'avg_booking_window': 16.5, 'guest_days': 3.0},
 '01229': {'avg_booking_window': 88.0, 'guest_days': 4.0},
 '01230': {'avg_booking_window': 2.0, 'guest_days': 1.0},
 '01420': {'avg_booking_window': 88.0, 'guest_days': 4.0},
 '01453': {'avg_booking_window': 9.0, 'guest_days': 3.0},
 '01463': {'avg_booking_window': 18.0, 'guest_days': 9.0},
 '01510': {'avg_booking_window': 5.5, 'guest_days': 1.0},
 '01523': {'avg_booking_window': 16.5, 'guest_days': 3.0},
 '01532': {'avg_booking_window': 11.0, 'guest_days': 2.0},
 '01545': {'avg_booking_window': 132.0, 'guest_days': 9.0},
 '01564': {'avg_booking_window': 6.0, 'guest_days': 2.0},
 '01583': {'avg_booking_window': 18.5, 'guest_days': 4.0},

Now I'll adjust the booking_window values to be averaged by the number of guest_days per zip

In [302]:
for z in guest_days_clean_unique:
    guest_days_clean_unique[z]['avg_booking_window'] = guest_days_clean_unique[z]['avg_booking_window'] / guest_days_clean_unique[z]['guest_days']

In [303]:
guest_days_clean_unique

{'01033': {'avg_booking_window': 22.0, 'guest_days': 6.0},
 '01057': {'avg_booking_window': 22.0, 'guest_days': 1.0},
 '01060': {'avg_booking_window': 11.0, 'guest_days': 3.0},
 '01082': {'avg_booking_window': 22.0, 'guest_days': 1.0},
 '01085': {'avg_booking_window': 22.0, 'guest_days': 1.0},
 '01106': {'avg_booking_window': 5.5, 'guest_days': 3.0},
 '01229': {'avg_booking_window': 22.0, 'guest_days': 4.0},
 '01230': {'avg_booking_window': 2.0, 'guest_days': 1.0},
 '01420': {'avg_booking_window': 22.0, 'guest_days': 4.0},
 '01453': {'avg_booking_window': 3.0, 'guest_days': 3.0},
 '01463': {'avg_booking_window': 2.0, 'guest_days': 9.0},
 '01510': {'avg_booking_window': 5.5, 'guest_days': 1.0},
 '01523': {'avg_booking_window': 5.5, 'guest_days': 3.0},
 '01532': {'avg_booking_window': 5.5, 'guest_days': 2.0},
 '01545': {'avg_booking_window': 14.666666666666666, 'guest_days': 9.0},
 '01564': {'avg_booking_window': 3.0, 'guest_days': 2.0},
 '01583': {'avg_booking_window': 4.625, 'guest_day

Flatten guest_days_clean_uniqe to import into pandas

In [315]:
guest_days = []
for z in guest_days_clean_unique:
    guest_days.append([z,  guest_days_clean_unique[z]['guest_days'], guest_days_clean_unique[z]['avg_booking_window']])

In [316]:
guest_days

[['01033', 6.0, 22.0],
 ['01057', 1.0, 22.0],
 ['01060', 3.0, 11.0],
 ['01082', 1.0, 22.0],
 ['01085', 1.0, 22.0],
 ['01106', 3.0, 5.5],
 ['01229', 4.0, 22.0],
 ['01230', 1.0, 2.0],
 ['01420', 4.0, 22.0],
 ['01453', 3.0, 3.0],
 ['01463', 9.0, 2.0],
 ['01510', 1.0, 5.5],
 ['01523', 3.0, 5.5],
 ['01532', 2.0, 5.5],
 ['01545', 9.0, 14.666666666666666],
 ['01564', 2.0, 3.0],
 ['01583', 4.0, 4.625],
 ['01701', 2.0, 5.5],
 ['01720', 1.0, 11.0],
 ['01742', 10.0, 15.4],
 ['01748', 2.0, 5.5],
 ['01752', 3.0, 11.0],
 ['01756', 12.0, 16.5],
 ['01773', 12.0, 4.875],
 ['01775', 12.0, 16.5],
 ['01776', 5.0, 22.0],
 ['01778', 6.0, 2.0],
 ['01830', 1.0, 22.0],
 ['01833', 15.0, 11.0],
 ['01844', 3.0, 2.0],
 ['01845', 2.0, 22.0],
 ['01879', 6.0, 22.0],
 ['01880', 2.0, 2.0],
 ['01886', 2.0, 5.5],
 ['01907', 11.0, 3.909090909090909],
 ['01915', 9.0, 16.5],
 ['01930', 4.0, 13.125],
 ['01944', 8.0, 8.25],
 ['01960', 3.0, 22.0],
 ['01970', 6.0, 15.5],
 ['01984', 4.0, 5.5],
 ['01985', 4.0, 22.0],
 ['02025', 9

In [317]:
guest_days = pandas.DataFrame(data=guest_days, columns=['zip_code', 'guest_days', 'avg_booking_window'])

In [318]:
guest_days

Unnamed: 0,zip_code,guest_days,avg_booking_window
0,01033,6.0,22.000000
1,01057,1.0,22.000000
2,01060,3.0,11.000000
3,01082,1.0,22.000000
4,01085,1.0,22.000000
5,01106,3.0,5.500000
6,01229,4.0,22.000000
7,01230,1.0,2.000000
8,01420,4.0,22.000000
9,01453,3.0,3.000000


Now I'll incorporate the data for each zip code

In [319]:
def get_zipcode_data(z):
    zd = zipcode_data[z]
    return zd

def get_city(zd):
    return zd['city']

def get_state(zd):
    return zd['state']

def get_state_short(zd):
    return zd['state_short']

def get_county(zd):
    return zd['county']

def get_latitude(zd):
    return zd['latitude']

def get_longitude(zd):
    return zd['longitude']

In [320]:
guest_days['city'] = guest_days['zip_code'].apply(get_zipcode_data).apply(get_city)
guest_days['state'] = guest_days['zip_code'].apply(get_zipcode_data).apply(get_state)
guest_days['state_short'] = guest_days['zip_code'].apply(get_zipcode_data).apply(get_state_short)
guest_days['county'] = guest_days['zip_code'].apply(get_zipcode_data).apply(get_county)
guest_days['latitude'] = guest_days['zip_code'].apply(get_zipcode_data).apply(get_latitude)
guest_days['longitude'] = guest_days['zip_code'].apply(get_zipcode_data).apply(get_longitude)

In [321]:
guest_days

Unnamed: 0,zip_code,guest_days,avg_booking_window,city,state,state_short,county,latitude,longitude
0,01033,6.0,22.000000,Granby,Massachusetts,MA,Hampshire,42.2557,-72.52
1,01057,1.0,22.000000,Monson,Massachusetts,MA,Hampden,42.101,-72.3196
2,01060,3.0,11.000000,Northampton,Massachusetts,MA,Hampshire,42.3223,-72.6313
3,01082,1.0,22.000000,Ware,Massachusetts,MA,Hampshire,42.2618,-72.2583
4,01085,1.0,22.000000,Westfield,Massachusetts,MA,Hampden,42.1251,-72.7495
5,01106,3.0,5.500000,Longmeadow,Massachusetts,MA,Hampden,42.0507,-72.5676
6,01229,4.0,22.000000,Glendale,Massachusetts,MA,Berkshire,42.2793,-73.3435
7,01230,1.0,2.000000,Great Barrington,Massachusetts,MA,Berkshire,42.1959,-73.3607
8,01420,4.0,22.000000,Fitchburg,Massachusetts,MA,Worcester,42.5796,-71.8031
9,01453,3.0,3.000000,Leominster,Massachusetts,MA,Worcester,42.5274,-71.7563


# Analysis

Incorporating MSA statistics

In [335]:
def get_msa(z):
    return msa[z]['name']

guest_days['msa'] = guest_days['zip_code'].apply(get_msa)

In [354]:
guest_days

Unnamed: 0,zip_code,guest_days,avg_booking_window,city,state,state_short,county,latitude,longitude,msa,population
0,01033,6.0,22.000000,Granby,Massachusetts,MA,Hampshire,42.2557,-72.52,"Springfield, MA MSA",6227
1,01057,1.0,22.000000,Monson,Massachusetts,MA,Hampden,42.101,-72.3196,"Springfield, MA MSA",8534
2,01060,3.0,11.000000,Northampton,Massachusetts,MA,Hampshire,42.3223,-72.6313,"Springfield, MA MSA",15284
3,01082,1.0,22.000000,Ware,Massachusetts,MA,Hampshire,42.2618,-72.2583,"Springfield, MA MSA",10322
4,01085,1.0,22.000000,Westfield,Massachusetts,MA,Hampden,42.1251,-72.7495,"Springfield, MA MSA",41117
5,01106,3.0,5.500000,Longmeadow,Massachusetts,MA,Hampden,42.0507,-72.5676,"Springfield, MA MSA",16021
6,01229,4.0,22.000000,Glendale,Massachusetts,MA,Berkshire,42.2793,-73.3435,"Pittsfield, MA MSA",95
7,01230,1.0,2.000000,Great Barrington,Massachusetts,MA,Berkshire,42.1959,-73.3607,"Pittsfield, MA MSA",8430
8,01420,4.0,22.000000,Fitchburg,Massachusetts,MA,Worcester,42.5796,-71.8031,"Worcester, MA MSA",40337
9,01453,3.0,3.000000,Leominster,Massachusetts,MA,Worcester,42.5274,-71.7563,"Worcester, MA MSA",40883


Unfortunately, the major msa population totals provided don't cover all of the MSA's in the guest days dataset. That's okay, though, because I have the internet.

In [339]:
pop = {}
with open('2010_census_population.csv') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    first_row = True
    for row in reader:
        if first_row:
            first_row = False
            continue
        else:
            pop[row[0]] = row[1]
            

Add population data to dataframe, and then weight guest_days by population

In [350]:
def get_population(z):
    if z in pop:
        return pop[z]
    else:
        return None
    
guest_days['population'] = guest_days['zip_code'].apply(get_population)

#### What zipcodes purchase the furthest in advance?

In [346]:
matches = 0
for p in guest_days['population']:
    if p:
        matches += 1

Export data to CSV to generate a heatmap

In [353]:
len(guest_days) - matches

79

79 zip codes did not match the ZCTA codes provided by Census, dropping them

In [359]:
gd = guest_days.dropna(subset=['population'])

What zip codes have the best market penetration (guest days / population)?

In [363]:
gd['population'] = pandas.to_numeric(gd['population'], errors='coerce')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [368]:
gd['market_penetration'] = gd['guest_days'] / gd['population']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [383]:
gd

Unnamed: 0,zip_code,guest_days,avg_booking_window,city,state,state_short,county,latitude,longitude,msa,population,market_penetration
0,01033,6.0,22.000000,Granby,Massachusetts,MA,Hampshire,42.2557,-72.52,"Springfield, MA MSA",6227,0.000964
1,01057,1.0,22.000000,Monson,Massachusetts,MA,Hampden,42.101,-72.3196,"Springfield, MA MSA",8534,0.000117
2,01060,3.0,11.000000,Northampton,Massachusetts,MA,Hampshire,42.3223,-72.6313,"Springfield, MA MSA",15284,0.000196
3,01082,1.0,22.000000,Ware,Massachusetts,MA,Hampshire,42.2618,-72.2583,"Springfield, MA MSA",10322,0.000097
4,01085,1.0,22.000000,Westfield,Massachusetts,MA,Hampden,42.1251,-72.7495,"Springfield, MA MSA",41117,0.000024
5,01106,3.0,5.500000,Longmeadow,Massachusetts,MA,Hampden,42.0507,-72.5676,"Springfield, MA MSA",16021,0.000187
6,01229,4.0,22.000000,Glendale,Massachusetts,MA,Berkshire,42.2793,-73.3435,"Pittsfield, MA MSA",95,0.042105
7,01230,1.0,2.000000,Great Barrington,Massachusetts,MA,Berkshire,42.1959,-73.3607,"Pittsfield, MA MSA",8430,0.000119
8,01420,4.0,22.000000,Fitchburg,Massachusetts,MA,Worcester,42.5796,-71.8031,"Worcester, MA MSA",40337,0.000099
9,01453,3.0,3.000000,Leominster,Massachusetts,MA,Worcester,42.5274,-71.7563,"Worcester, MA MSA",40883,0.000073


Export to csv to create a heatmap

In [387]:
gd_trunc = gd
drop_these = ['zip_code', 'guest_days', 'avg_booking_window', 'city', 'state', 'state_short', 'county', 'msa', 'population']
for d in drop_these:
    gd_trunc = gd_trunc.drop(d, 1)

In [388]:
gd_trunc.columns = ['lat', 'lon', 'market_penetration']

In [382]:
gd_trunc['market_penetration'] = gd_trunc['market_penetration'].apply(lambda x: x*100)

0       0.096355
1       0.011718
2       0.019628
3       0.009688
4       0.002432
5       0.018725
6       4.210526
7       0.011862
8       0.009916
9       0.007338
10      0.078281
11      0.007350
12      0.039567
13      0.014129
14      0.025496
15      0.025615
16      0.052694
17      0.006407
18      0.004681
19      0.056414
20      0.013400
21      0.007792
22      0.205515
23      0.234742
24      0.182094
25      0.028314
26      0.045770
27      0.003978
28      0.183307
29      0.006349
          ...   
3320    0.016259
3322    0.004854
3323    0.008293
3324    0.004182
3325    0.003144
3326    0.024536
3327    0.015759
3328    0.018373
3329    0.008314
3330    0.022886
3331    0.011504
3332    0.009434
3333    0.018246
3334    0.005409
3335    0.113379
3336    0.015131
3337    0.006768
3338    0.024353
3339    0.004932
3340    0.011620
3341    0.003982
3342    0.012258
3343    0.002879
3345    0.004316
3346    0.024888
3347    0.022409
3348    0.016626
3349    0.0148

In [386]:
# normalize the market penetration data for easier viewing
from sklearn import preprocessing

In [None]:
x = gd_trunc['market_penetration']
min_max_scaler = preprocessing.MinMaxScaler()
x_s

In [380]:
with open('gd.csv', 'w') as f:
    gd_trunc.to_csv(f)