In [1]:
import pandas as pd
import json
import datetime
import requests
import numpy as np
import math
import base64
from fuzzywuzzy import process

# Table of Contents
1 TicketMaster Face Value Data
2 TicketMaster Resale Data
3 SeatGeek Data
4 StubHub Data
5 Join DataFrames

## 1 TicketMaster Data
### List of cities and Ticketmaster codes
City_List = 
    'Denver' : 264,
    'San Francisco Bay': 382,
    'Portland': 362,
    'Los Angeles': 27,
    'Las Vegas' : 14,
    'Phoenix': 36, 
    'Seattle': 42,
    'Austin' : 40, 
    'Houston': 22,
    'Dallas' : 5,
    'Chicago': 3,
    'Nashville': 31,
    'Atlanta': 10,
    'Boston' : 11,
    'New York': 35,
    'Washington DC': 47,
    'Miami' : 15
### Ticketmaster API info    
base_url = 'https://app.ticketmaster.com/discovery/v2/events.json?countryCode=US&apikey={apikey}'
api_key = 'OhKdHqBZOOuGCrWIcjlhzoxmnjUoaGWL'
dmaId = [382,362,264]
marketId =  [42,27,14,36,40,22,5,3,31,10,11,35,47,15]



### 1.1 Define Functions

Define functions for pulling event data from Ticketmaster API and formatting dataframe

In [2]:
#Create list of all US Ticketmaster market numbers -- Look at TM API docs for market number info
mkt_list = list(range(0,56)) + list(range(121,126))

In [3]:
#DMA and Market numbers
#dma_list = [382,362,264]
#mkt_list =  [42,27,14,36,40,22,5,3,31,10,11,35,47,15]

# Function to get the number of pages of ticketmaster data
def get_number_of_TM_pages(market,source='ticketmaster,frontgate'):
    url = 'https://app.ticketmaster.com/discovery/v2/events.json?countryCode=US'
    payload = {'source': source,
               'classificationName': 'music',
               'size': '200',
               'marketId': market,
               'apikey':'rxkTyetQBXzi6YcrZDeoaaGfa0U32PPQ'}
    r = requests.get(url,params=payload,verify=True)
    json_obj = json.loads(r.text)
    return json_obj['page']['totalPages']

#Get TicketMaster data, return a dataframe
def getTicketMasterData(pageNumber,market,source='ticketmaster,frontgate'):
    url = 'https://app.ticketmaster.com/discovery/v2/events.json?countryCode=US'
    payload = {'source': source,
               'marketId': market,
               'classificationName' : 'music',
               'size': '200',
               'page': pageNumber,
               'apikey':'rxkTyetQBXzi6YcrZDeoaaGfa0U32PPQ'}
    r = requests.get(url,params=payload,verify=True)
    json_response = json.loads(r.text)
    event_info = []
    for event in json_response.get('_embedded',{}).get('events',{}):
         event_info.append({
            'TM_id': str(event.get('id',{})),
            'TM_name' : str(event.get('name',{})),
            'TM_artist': list(str(attraction.get('name')) for attraction in event['_embedded'].get('attractions',{})),
            'TM_venue' : list(str(venue.get('name')) for venue in event['_embedded'].get('venues',{}))[0],
            'TM_venue_city' : list(str(venue.get('city',{}).get('name')) for venue in event['_embedded'].get('venues',{}))[0],
            'TM_venue_state' : list(str(venue.get('state',{}).get('stateCode')) for venue in event['_embedded'].get('venues',{}))[0],
            'TM_description' : str(event.get('description',{})),
            'TM_more_info' : str(event.get('additionalInfo',{})),
            'TM_start_date' : str(event.get('dates',{}).get('start',{}).get('dateTime',{})),
            'TM_timezone' : str(event.get('dates',{}).get('timezone')),
            'TM_span_multiple_days' : str(event.get('dates',{}).get('spanMultipleDays')),
            'TM_presale_date_start' : list(str(presale.get('startDateTime',{})) for presale in event.get('sales').get('presales',{})),
            'TM_presale_date_end' : list(str(presale.get('endDateTime',{})) for presale in event.get('sales').get('presales',{})),
            'TM_sale_date_start' : str(event.get('sales',{}).get('public',{}).get('startDateTime')),
            'TM_FV_prices': event.get('priceRanges'),
            'TM_promoter': str(event.get('promoter',{}).get('name')),
            'TM_genre' : event.get('classifications'),
            'TM_place' : str(event.get('place'))
         })
    tmDF = pd.DataFrame(event_info)
    return tmDF

#Convert timedate information from UTC to local time
def convert_times(df,times_list,tz_col):
    #Loop through each timezone
    df_list = []
    for tz in df[tz_col].unique():
        #Filter rows by timezone
        mask = (df[tz_col] == tz)
        df_local = df.loc[mask]
        #Loop through each datetime row
        for col in times_list:
            #Convert each column to datetime series, localize to UTC and then convert to proper timezone
            df_local[col] = pd.to_datetime(df_local[col],errors='coerce').dt.tz_localize('UTC').dt.tz_convert(tz)
            #Convert each column to datetime series, localize to UTC and then convert to proper timezone
        df_list.append(df_local)
    df = pd.concat(df_list, axis=0)
    return df

#Explode out columns with nested information
def explode(df,col,index_col):
    df1 = df[col].apply(pd.Series)
    df1 = df1[0].apply(pd.Series)
    df1.index = df[index_col]
    df = df.merge(df1, how='left', left_on=index_col, right_index=True)
    df = df.drop(col,axis=1)
    return df

### 1.2 Query Ticketmaster face value data
### Pull the data and create the data frame
For each city/market, get the number of pages of data, pull all data, and then concatanate the dataframes

In [4]:
fv_df_list = []

#Create dictionaries of dma/mkt numbers, and number of pages of data
dma_dict = dict()
mkt_dict = dict()
#Get number of pages for face value dma items
# for dma in range(1,126):
#     dma_dict[dma] = get_number_of_TM_pages(dma=dma)
    
# #Get DMA data
# for dma in dma_dict.keys():
#     for page in range(1,dma_dict[dma]+1):
#         fv_df_list.append(getTicketMasterData(dma=dma,page=page))
    
#Get number of pages for face value market items
for mkt in mkt_list:
    mkt_dict[mkt] = get_number_of_TM_pages(market=mkt)
    
#Get market data
for mkt in mkt_dict.keys():
    for page in range(0,mkt_dict[mkt]):
        fv_df_list.append(getTicketMasterData(market=mkt,pageNumber=page))
        
#Combine all dataframes for full data on events and face_value prices
fv_df = pd.concat(fv_df_list,axis=0)

fv_df.head()

Unnamed: 0,TM_FV_prices,TM_artist,TM_description,TM_genre,TM_id,TM_more_info,TM_name,TM_place,TM_presale_date_end,TM_presale_date_start,TM_promoter,TM_sale_date_start,TM_span_multiple_days,TM_start_date,TM_timezone,TM_venue,TM_venue_city,TM_venue_state
0,"[{'type': 'standard', 'currency': 'USD', 'min'...","[House of Blues Gospel Brunch, Cedric Nunley, ...",{},"[{'primary': True, 'segment': {'id': 'KZFzniwn...",vv1A7ZA7-GkeXdenP,{},World Famous Gospel Brunch at House of Blues (...,,[],[],HOUSE OF BLUES CONCERTS (HOB),2017-07-01T15:00:00Z,False,2017-12-10T18:30:00Z,America/Chicago,House of Blues Chicago,Chicago,IL
1,"[{'type': 'standard', 'currency': 'USD', 'min'...","[House of Blues Gospel Brunch, LaVarnga Hubbar...",{},"[{'primary': True, 'segment': {'id': 'KZFzniwn...",vv1A7ZA7-GkeWve0K,{},World Famous Gospel Brunch at House of Blues (...,,[],[],HOUSE OF BLUES CONCERTS (HOB),2017-07-01T15:00:00Z,False,2017-12-17T16:00:00Z,America/Chicago,House of Blues Chicago,Chicago,IL
2,"[{'type': 'standard', 'currency': 'USD', 'min'...",[Kick - the Inxs Experience],{},"[{'primary': True, 'segment': {'id': 'KZFzniwn...",vv178ZfgGklNLpPJ,{},KICK - The INXS Experience,,[],[],HOUSE OF BLUES CONCERTS (HOB),2017-10-27T15:00:00Z,False,2018-01-13T02:00:00Z,America/Chicago,House of Blues Chicago,Chicago,IL
3,"[{'type': 'standard', 'currency': 'USD', 'min'...",[Gaelic Storm],{},"[{'primary': True, 'segment': {'id': 'KZFzniwn...",vv178ZfgGkBg8e46,{},Gaelic Storm,,[],[],HOUSE OF BLUES CONCERTS (HOB),2017-11-01T15:00:00Z,False,2018-03-18T00:30:00Z,America/Chicago,House of Blues Chicago,Chicago,IL
4,"[{'type': 'standard', 'currency': 'USD', 'min'...",[],{},"[{'primary': True, 'segment': {'id': 'KZFzniwn...",vv178ZfgGkByvwLB,{},3rd Annual Timbuck2 Forever Tribute,,[],[],HOUSE OF BLUES CONCERTS (HOB),2017-11-03T15:00:00Z,False,2017-12-20T03:00:00Z,America/Chicago,House of Blues Chicago,Chicago,IL


### 1.3 Convert datetime columns to datetime objects in  proper timezones
Currently the four columns with datetime information are a mess. Some are of type dict, some are nested in lists, and all are in the UTC timezone. We can look and see that in timedate columns with lists of dates, the timedates are identitical, so we can unnest datetimes by simply taking the first element of the list.

The 3 things we need to accomplish:
    1. Convert all datetime columns to dtype string, and unnest 'TM_presale_date_end' and 'TM_presale_date_start' columns
    2. Convert all columns to datetime series localized to UTC (Done in the convert_times function)
    3. Filter rows by timezone, and loop over each datetime column to convert objects to their proper timezone (Done in the convert_times function)

In [5]:
# 1. Convert all datetime columns to dtype string, and unnest objects
fv_df['TM_presale_date_end'] = fv_df['TM_presale_date_end'].str[0]
fv_df['TM_presale_date_start'] = fv_df['TM_presale_date_start'].str[0]

In [6]:
# 2. Convert all columns to datetime series localized to UTC
# 3. Filter rows by timezone and create a new dataframe per timezone, loop over each datetime column to convert objects to their proper timezone 
col_list = ['TM_presale_date_end','TM_presale_date_start','TM_start_date','TM_sale_date_start']
fv_df = convert_times(df=fv_df,times_list=col_list,tz_col='TM_timezone')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


### 1.4 Explode nested rows
Unnest information in prices and genre column

In [7]:
#Explode price column
fv_df = explode(fv_df,col='TM_FV_prices',index_col='TM_id')

  union = _union_indexes(indexes)
  result = result.union(other)


In [8]:
#Unnest genre info and clean up data in exploded columns
fv_df = explode(fv_df,'TM_genre','TM_id')
for col in ['genre','segment','subGenre']:
    fv_df[col] = fv_df[col].map(lambda x: dict(x).get('name',{}))
fv_df.head()

Unnamed: 0,TM_artist,TM_description,TM_id,TM_more_info,TM_name,TM_place,TM_presale_date_end,TM_presale_date_start,TM_promoter,TM_sale_date_start,...,max,min,type_x,0,genre,primary,segment,subGenre,subType,type_y
0,"[House of Blues Gospel Brunch, Cedric Nunley, ...",{},vv1A7ZA7-GkeXdenP,{},World Famous Gospel Brunch at House of Blues (...,,NaT,NaT,HOUSE OF BLUES CONCERTS (HOB),2017-07-01 10:00:00-05:00,...,42.5,25.0,standard,,Religious,True,Music,Gospel,"{'id': 'KZFzBErXgnZfZ7v7lJ', 'name': 'Undefined'}","{'id': 'KZAyXgnZfZ7v7nI', 'name': 'Undefined'}"
1,"[House of Blues Gospel Brunch, LaVarnga Hubbar...",{},vv1A7ZA7-GkeWve0K,{},World Famous Gospel Brunch at House of Blues (...,,NaT,NaT,HOUSE OF BLUES CONCERTS (HOB),2017-07-01 10:00:00-05:00,...,42.5,25.0,standard,,Religious,True,Music,Gospel,"{'id': 'KZFzBErXgnZfZ7v7lJ', 'name': 'Undefined'}","{'id': 'KZAyXgnZfZ7v7nI', 'name': 'Undefined'}"
2,[Kick - the Inxs Experience],{},vv178ZfgGklNLpPJ,{},KICK - The INXS Experience,,NaT,NaT,HOUSE OF BLUES CONCERTS (HOB),2017-10-27 10:00:00-05:00,...,15.0,15.0,standard,,Undefined,True,Music,Undefined,"{'id': 'KZFzBErXgnZfZ7vAAI', 'name': 'Tribute ...","{'id': 'KZAyXgnZfZ7v7l1', 'name': 'Group'}"
3,[Gaelic Storm],{},vv178ZfgGkBg8e46,{},Gaelic Storm,,NaT,NaT,HOUSE OF BLUES CONCERTS (HOB),2017-11-01 10:00:00-05:00,...,36.0,36.0,standard,,World,True,Music,World,"{'id': 'KZFzBErXgnZfZ7v7lJ', 'name': 'Undefined'}","{'id': 'KZAyXgnZfZ7v7nI', 'name': 'Undefined'}"
4,[],{},vv178ZfgGkByvwLB,{},3rd Annual Timbuck2 Forever Tribute,,NaT,NaT,HOUSE OF BLUES CONCERTS (HOB),2017-11-03 10:00:00-05:00,...,30.0,30.0,standard,,Undefined,True,Music,Undefined,"{'id': 'KZFzBErXgnZfZ7vAAI', 'name': 'Tribute ...","{'id': 'KZAyXgnZfZ7v7l1', 'name': 'Group'}"


## 2 Query TicketMaster resale data and merge with face value data
Next we will query TicketMaster's resale ticket data and then merge it with the face value dataframe

In [9]:
#Query resale data from the Ticketmaster API

rv_df_list = []

#Create dictionaries of dma/mkt numbers, and number of pages of data
dma_rv_dict = dict()
mkt_rv_dict = dict()

#Get number of pages for resale value dma items
# for dma in dma_list:
#     dma_rv_dict[dma] = get_number_of_TM_pages(dma=dma,source='tmr')
    
# #Get DMA data
# for dma in dma_rv_dict.keys():
#     for page in range(1,dma_dict[dma]+1):
#         fv_df_list.append(getTicketMasterData(dma=dma,page=page,source='tmr'))
    
#Get number of pages for resale value market items
for mkt in mkt_list:
    mkt_rv_dict[mkt] = get_number_of_TM_pages(market=mkt,source='tmr')
    
#Get market data
for mkt in mkt_rv_dict.keys():
    for current_page in range(0,mkt_dict[mkt]):
        fv_df_list.append(getTicketMasterData(market=mkt,pageNumber=current_page,source='tmr'))
        
#Combine all dataframes for full data on events and resale value prices
rv_df = pd.concat(fv_df_list,axis=0)
rv_df.head()

Unnamed: 0,TM_FV_prices,TM_artist,TM_description,TM_genre,TM_id,TM_more_info,TM_name,TM_place,TM_presale_date_end,TM_presale_date_start,TM_promoter,TM_sale_date_start,TM_span_multiple_days,TM_start_date,TM_timezone,TM_venue,TM_venue_city,TM_venue_state
0,"[{'type': 'standard', 'currency': 'USD', 'min'...","[House of Blues Gospel Brunch, Cedric Nunley, ...",{},"[{'primary': True, 'segment': {'id': 'KZFzniwn...",vv1A7ZA7-GkeXdenP,{},World Famous Gospel Brunch at House of Blues (...,,[],[],HOUSE OF BLUES CONCERTS (HOB),2017-07-01T15:00:00Z,False,2017-12-10T18:30:00Z,America/Chicago,House of Blues Chicago,Chicago,IL
1,"[{'type': 'standard', 'currency': 'USD', 'min'...","[House of Blues Gospel Brunch, LaVarnga Hubbar...",{},"[{'primary': True, 'segment': {'id': 'KZFzniwn...",vv1A7ZA7-GkeWve0K,{},World Famous Gospel Brunch at House of Blues (...,,[],[],HOUSE OF BLUES CONCERTS (HOB),2017-07-01T15:00:00Z,False,2017-12-17T16:00:00Z,America/Chicago,House of Blues Chicago,Chicago,IL
2,"[{'type': 'standard', 'currency': 'USD', 'min'...",[Kick - the Inxs Experience],{},"[{'primary': True, 'segment': {'id': 'KZFzniwn...",vv178ZfgGklNLpPJ,{},KICK - The INXS Experience,,[],[],HOUSE OF BLUES CONCERTS (HOB),2017-10-27T15:00:00Z,False,2018-01-13T02:00:00Z,America/Chicago,House of Blues Chicago,Chicago,IL
3,"[{'type': 'standard', 'currency': 'USD', 'min'...",[Gaelic Storm],{},"[{'primary': True, 'segment': {'id': 'KZFzniwn...",vv178ZfgGkBg8e46,{},Gaelic Storm,,[],[],HOUSE OF BLUES CONCERTS (HOB),2017-11-01T15:00:00Z,False,2018-03-18T00:30:00Z,America/Chicago,House of Blues Chicago,Chicago,IL
4,"[{'type': 'standard', 'currency': 'USD', 'min'...",[],{},"[{'primary': True, 'segment': {'id': 'KZFzniwn...",vv178ZfgGkByvwLB,{},3rd Annual Timbuck2 Forever Tribute,,[],[],HOUSE OF BLUES CONCERTS (HOB),2017-11-03T15:00:00Z,False,2017-12-20T03:00:00Z,America/Chicago,House of Blues Chicago,Chicago,IL


### 2.1 Extract price information and merge it with face value data in a new dataframe

In [10]:
# Explode out resale price information
rv_df = explode(rv_df,col='TM_FV_prices',index_col='TM_id')
rv_df.rename(columns={'min': 'tmr_min', 'max': 'tmr_max'}, inplace=True)

# # Create a new dataframe combining face value and resale value information, joined on ticketmaster event ID
joined_tm_df = fv_df.join(rv_df[['tmr_min','tmr_max']],on='TM_id',how='outer')
joined_tm_df.head()

  union = _union_indexes(indexes)
  result = result.union(other)


## 3 Download Resale Data from SeatGeek API
1. Connect to SeatGeek API and download data
2. Merge data with TicketMaster DataFrame
### 3.1 Define functions

In [12]:
#Connect to SeatGeek API and determine number of pages of information
def get_SeatGeek_Pages():
    url = 'https://api.seatgeek.com/2/events?format=json'
    payload = {'per_page' : 1000,
               'taxonomies.name':'concert',
               'client_id': 'OTU5MDE5MXwxNTEwMzcxNjgyLjIx',
              }
    r = requests.get(url, params=payload,verify=True)
    json_obj = json.loads(r.text)
    
    #Return the total number of JSON items divided by the number of page to get page count
    return math.ceil(json_obj['meta']['total']/json_obj['meta']['per_page'])

#Connect to SeatGeek API and download JSON data, format it into pandas dataframe
def get_SeatGeek_data(page=1):
    url = 'https://api.seatgeek.com/2/events?format=json'
    payload = {'per_page' : 1000,
               'page' : page,
               'taxonomies.name':'concert',
               'venue.country' : 'US',
               'client_id': 'OTU5MDE5MXwxNTEwMzcxNjgyLjIx',
              }
    r = requests.get(url,params=payload,verify=True)
    json_obj = json.loads(r.text)
    info_list = []
    for event in json_obj['events']:
        info_list.append(
         {'SG_event_id' : str(event.get('id',{})),
         'SG_listing_count' : str(event.get('stats',{}).get('listing_count',{})),
         'SG_average_price' : str(event.get('stats',{}).get('average_price',{})),
         'SG_lowest_price' : str(event.get('stats',{}).get('lowest_price',{})),
         'SG_highest_price' : str(event.get('stats',{}).get('highest_price',{})),
         'SG_title' : str(event.get('title',{})),
         'SG_datetime_local' : str(event.get('datetime_local',{})),
         'SG_artists' : list(str(performer.get('name',{})) for performer in event.get('performers',{})),
         'SG_artists_score' : list(str(performer.get('score',{})) for performer in event.get('performers',{})),
         'SG_artists_type' : list(str(performer.get('type',{})) for performer in event.get('performers',{})),
         'SG_artists_id' : list(str(performer.get('id',{})) for performer in event.get('performers',{})),
         'SG_venue' : str(event.get('venue',{}).get('name',{})),
         'SG_venue_city' : str(event.get('venue',{}).get('city',{})),
         'SG_venue_state' : str(event.get('venue',{}).get('state',{})),
         'SG_venue_score' : str(event.get('venue',{}).get('score',{}))
                               })
    sg_df = pd.DataFrame(info_list)
    return sg_df

### 3.2 Download SeatGeek data into DataFrame

In [13]:
sgdf_list = []

#Find the total number of pages in 
total_pages = get_SeatGeek_Pages()

#Loop through the number pages of data and combine data into single dataframe
for pageNum in range(1,total_pages+1):
    sgdf_list.append(get_SeatGeek_data(page=pageNum))
sg_df = pd.concat(sgdf_list,axis=0)
sg_df.head()

Unnamed: 0,SG_artists,SG_artists_id,SG_artists_score,SG_artists_type,SG_average_price,SG_datetime_local,SG_event_id,SG_highest_price,SG_listing_count,SG_lowest_price,SG_title,SG_venue,SG_venue_city,SG_venue_score,SG_venue_state
0,"[Taylor Eigsti, Peter Bernstein, Billy Hart]","[4954, 26691, 73492]","[0.4499999881, 0.400000006, 0.0]","[band, band, band]",,2017-11-29T03:30:00,4126440,,,,Billy Hart with Taylor Eigsti and Peter Bernstein,"Dizzy's Club Coca-Cola, Lincoln Center",New York,0.0,NY
1,[John Craigie],[97580],[0.3899999857],[band],,2017-11-29T03:30:00,4026960,,,,John Craigie,Club Passim,Cambridge,0.0,MA
2,[Unsane],[247113],[0.3700000048],[band],,2017-11-29T03:30:00,4060635,,,,Unsane,Beauty Bar,Las Vegas,0.4066849947,NV
3,[Eiffel Tower Experience],[605657],[0.0],[band],,2017-11-29T03:30:00,3963302,,,,Eiffel Tower Experience,Paris Las Vegas,Las Vegas,0.4538879991,NV
4,[MUS 200 Recital Hour],[475285],[0.0],[band],,2017-11-29T12:00:00,4146915,,,,MUS 200 Recital Hour,FD Hall Music Center,Jackson,0.0,MS


## 4 StubHub Data
Connect to StubHub API and download data

### 4.1 Define Functions

In [14]:
access_token = '57476f0a-f69e-334d-ba1b-02d394883b2a'
user_GUID = '84175DB4D85A6777E04400144FB7AE36'

#Get number of pages of data
def get_SH_pages():
    url = 'https://api.stubhub.com/search/catalog/events/v3/'
    payload = {'minAvailableTickets':1, 
              'categoryName':'Concert',
              'country' : 'US'}
    headers = {'Authorization': 'Bearer ' + access_token} # Insert StubHub API Key here
    r = requests.get(url, params=payload, headers=headers, verify=True)
    json_response = json.loads(r.text)
    return math.ceil(json_response.get('numFound',{})/500)

#Connect to StubHub API and put data into DataFrame
def get_SH_data(start):
    url = 'https://api.stubhub.com/search/catalog/events/v3/'
    payload = {'minAvailableTickets':1, 
              'categoryName':'Concert',
               'rows' : 500,
               'start' : start,
               'country' : 'US'}
    headers = {'Authorization': 'Bearer ' + access_token} # Insert StubHub API Key here
    r = requests.get(url, params=payload, headers=headers, verify=True)
    json_response = json.loads(r.text)
    event_info = []
    for event in json_response.get('events',{}):
            event_info.append({
                    'SH_artist':event.get('ancestors',{}).get('performers',{}),
                    'SH_date':str(event.get('eventDateLocal',{})),
                    'SH_event_id':str(event.get('id',{})),
                    'SH_min_price':str(event.get('ticketInfo',{}).get('minPrice',{})),
                    'SH_max_price':str(event.get('ticketInfo',{}).get('maxPrice')),
                    'SH_total_postings':str(event.get('ticketInfo',{}).get('totalPostings')),
                    'SH_total_tickets':str(event.get('ticketInfo',{}).get('totalTickets')),
                    'SH_venue':str(event['venue'].get('name')),
                    'SH_city':str(event['venue'].get('city')),
                    'SH_state':str(event['venue'].get('state'))
                })
    TicketData = pd.DataFrame(event_info)
    return TicketData

### 4.2 Gather data and concat into a dataframe

In [15]:
pages = get_SH_pages()
sh_df_list = []
for page in range(pages):
     start_place = page * 500
     sh_df_list.append(get_SH_data(start=start_place))
sh_df = pd.concat(sh_df_list,axis=0)

sh_df.head()

Unnamed: 0,SH_artist,SH_city,SH_date,SH_event_id,SH_max_price,SH_min_price,SH_state,SH_total_postings,SH_total_tickets,SH_venue
0,"[{'id': 47796, 'name': 'Judy Collins', 'url': ...",Annapolis,2018-04-29T19:30:00-0400,103196984,277.3,277.3,MD,1,2,Rams Head Tavern On Stage
1,"[{'id': 725293, 'name': 'ZZ Ward', 'url': 'zz-...",Boston,2018-02-17T19:00:00-0500,103268652,88.05,45.64,MA,13,72,Paradise Rock Club
2,"[{'id': 725293, 'name': 'ZZ Ward', 'url': 'zz-...",Salt Lake City,2018-01-30T20:00:00-0700,103268576,68.5,64.88,UT,3,28,The Depot
3,"[{'id': 493452, 'name': 'Tennis', 'url': 'tenn...",Ithaca,2018-01-19T20:00:00-0500,103268588,59.09,59.09,NY,1,10,The Haunt
4,"[{'id': 11327, 'name': 'Pixies', 'url': 'pixie...",Portland,2017-11-29T20:00:00-0800,103098144,82.61,38.5,OR,13,33,Roseland Theater


### 4.3 Convert to datetimes, Explode out artist names

In [18]:
#From string to datetime
sh_df['SH_date'] = pd.to_datetime(sh_df['SH_date'])

#Explode out artists column
sh_df = explode(sh_df,'SH_artist','SH_event_id')

  union = _union_indexes(indexes)
  result = result.union(other)


## 5 Join dataframes on venue name fuzzy match and datetime
### 5.1 Define Functions

In [20]:
joined_tm_df = fv_df.join(rv_df[['tmr_min','tmr_max']],on='TM_id',how='outer')

In [75]:
#Create list of unique TM venue names to compare against
tm_venues = joined_tm_df['TM_venue'].unique()
sh_venues = sh_df['SH_venue'].unique()
sg_venues = sg_df['SG_venue'].unique()


#Find a fuzzy match venue with Ticketmaster venue
def venue_match(venue):
    if venue in tm_venues:  # might want to make this a dict for O(1) lookups
        return venue, 100
    else:
        new_name, score = process.extractOne(venue, tm_venues)
        if score < 90:
            return venue + ' NO MATCH', score
        else:
            return new_name, score

In [91]:
#Create new SG DF columns with equivelant TM venue name, and match score
#sg_df['corrected_venue'], sg_df['fuzzy_match_score'] = zip(*sg_df['SG_venue'].apply(venue_match))

In [89]:
x = pd.Series(sg_venues).apply(venue_match)

In [93]:
x

0       (Dizzy's Club Coca-Cola, Lincoln Center NO MAT...
1                              (Club Passim NO MATCH, 86)
2                               (Beauty Bar NO MATCH, 86)
3                          (Paris Las Vegas NO MATCH, 86)
4                     (FD Hall Music Center NO MATCH, 86)
5                                      (City Theatre, 90)
6         (Andy Williams Moon River Theatre NO MATCH, 86)
7                          (Le Musique Room NO MATCH, 86)
8                       (Rams Head On Stage NO MATCH, 86)
9                 (Bethany Lutheran College NO MATCH, 60)
10                                   (Cielo NO MATCH, 54)
11                              (Paradise Rock Club, 100)
12                          (The Canal Club NO MATCH, 86)
13                                       (BACKBOOTH, 100)
14                                   (EXPRESS LIVE!, 100)
15                                  (Chameleon Club, 100)
16                             (The Orpheum NO MATCH, 86)
17            