In [105]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

df = pd.read_csv('../ums_viz.csv')
df_perf = pd.read_pickle('descriptions.pkl')
df_perf.columns.values[0] = 'perf_name'

df['perf_name'] = df['perf_name'].apply(lambda x: x.strip())
df_perf['perf_name'] = df_perf['perf_name'].apply(lambda x: x.strip())

df['per_seat'] = df['tck_amt']/df['num_seats']
df_group = df.groupby('perf_name')

# Add a whole bunch of new performance-level features

# mean ticket amount, number of seats, price per seat
df_tmp = df_group['tck_amt', 'num_seats', 'per_seat'].mean().reset_index()
df_tmp = df_tmp.rename(columns = {'tck_amt': 'mean_tck_amt', 'num_seats': 'mean_num_seats', 'per_seat': 'mean_per_seat'})

df_perf = pd.merge(df_perf, df_tmp, on='perf_name')

# max ticket amount, number of seats, price per seat
df_tmp = df_group['tck_amt', 'num_seats', 'per_seat'].max().reset_index()
df_tmp = df_tmp.rename(columns = {'tck_amt': 'max_tck_amt', 'num_seats': 'max_num_seats', 'per_seat': 'max_per_seat'})
df_perf = pd.merge(df_perf, df_tmp, on='perf_name')

# total number of seats
df_tmp = df_group['num_seats'].count().reset_index()
df_tmp = df_tmp.rename(columns = {'num_seats': 'count_tck_amt'})
df_perf = pd.merge(df_perf, df_tmp, on='perf_name')

print df_perf

                perf_name                                        description  \
0     The Infernal Comedy  John Malkovich makes his UMS debut portraying ...   
1        Yuja Wang, piano  Chinese pianist Yuja Wang combines the spontan...   
2     London Philharmonic  The London Philharmonic returns for its first ...   
3        Hamburg Symphony  In 1971, French composer Olivier Messiaen was ...   
4    Chicago Symphony Orc  Wind players of the Chicago Symphony come toge...   
5    San Francisco Sympho  Michael Tilson Thomas and the San Francisco Sy...   
6      Pavel Haas Quartet  “The world’s most exciting string quartet? Wel...   
7        Ballet Preljocaj  Few audience members in attendance at Ballet P...   
8             Joshua Bell  Often referred to as a “poet of the violin,” J...   
9          Murray Perahia  In the more than 35 years he has been performi...   
10    Mariinsky Orchestra  The Mariinsky Orchestra and Valery Gergiev ret...   
11   Detroit Symphony Orc  A concert cre

In [124]:
# Extract bigram matrix
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS

vect = CountVectorizer(ngram_range=(1, 1), stop_words = ENGLISH_STOP_WORDS, min_df = 5)
X = vect.fit_transform(df_perf.description.values)

print X.shape

(143, 446)


In [125]:
# Latent Dirichlet Allocation

from sklearn.decomposition import LatentDirichletAllocation

lda = LatentDirichletAllocation(n_topics=5)
y = lda.fit_transform(X)

def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print

print_top_words(lda, vect.get_feature_names(), 20)

df_perf['topic'] = np.argmax(y, 1)

Topic #0:
ballet dance company program century ums new performance set dancers movement william chicago contemporary director art years works world thing
Topic #1:
music new york jazz times voice musical best year heart album contemporary ums international known ensemble legendary singer debut traditions
Topic #2:
award winning director tour london theatre national high broadcasts presented theater definition michigan new partnership people history house play things
Topic #3:
music ums theatre national michigan london theater arbor ann american high new composers play partnership concert hall orchestra available broadcasts
Topic #4:
music quartet ums program performance debut work group world audiences concert string classical appearance years great returns new opera playing



In [126]:
df_perf.groupby('topic').describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,count_tck_amt,max_num_seats,max_per_seat,max_tck_amt,mean_num_seats,mean_per_seat,mean_tck_amt
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,count,8.0,8.0,8.0,8.0,8.0,8.0,8.0
0,mean,770.875,17.875,33.25,365.5,1.898167,19.689914,37.638334
0,std,974.780479,14.681743,24.609812,405.904633,0.147214,14.384974,27.816925
0,min,17.0,3.0,0.0,0.0,1.590909,0.0,0.0
0,25%,121.75,7.0,9.0,54.0,1.863387,6.193548,10.625806
0,50%,527.0,13.0,48.0,292.0,1.907226,27.895396,53.327766
0,75%,905.0,27.0,51.0,474.0,1.98773,29.23694,57.388637
0,max,2977.0,42.0,54.0,1188.0,2.058824,33.781414,63.442792
1,count,42.0,42.0,42.0,42.0,42.0,42.0,42.0
1,mean,688.095238,31.190476,74.809524,1044.266667,2.056273,31.658722,64.767773
