In [14]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
# Relative location of the pickled DataFrame analysed in this notebook.
path = 'Pickles/output_df.pkl'
# NOTE(review): unpickling can execute arbitrary code -- only load pickle
# files that come from a trusted source.
df = pd.read_pickle(path)

Regression Analysis:
- Spotify Info,
- SG venue score, 
- Ticket Listings, 
- Time Info
- Artist Count

ANOVA
- Genre & Subgenre
- Day of Week
- Promoter
- Ticket Source

Correlation Matrix between features

In [3]:
# Numeric feature columns (correlated against 'min_markup%' further down).
continuous = [
    'avg_ticket_listings',
    'spotify_avg_followers',
    'spotify_avg_popularity',
    'presale_length',
    'days_on_sale',
    'days_until_show',
    'artist_count',
]

# Categorical feature columns (grouping variables for the ANOVA section).
categorical = [
    'genre',
    'subGenre',
    'day_of_week',
    'promoter',
    'min_source',
]

## 1. Correlations between Continuous Features & Min Markup
I will calculate Pearson R and Spearman R values because, as previously observed, the correlations may not necessarily be linear. <br><br>
$H_0 : r = 0$ There is no statistically significant correlation between each continuous variable and ticket minimum markup<br>
$H_a : r \neq 0$ There is a statistically significant correlation between each continuous variable and ticket minimum markup

In [4]:
# Series that every continuous feature is correlated against.
markup = df['min_markup%']

# Pearson correlation (coefficient, p-value) of each continuous feature with min markup.
pearson_results = [stats.pearsonr(df[feature], markup) for feature in continuous]
pearson_r = [coef for coef, _ in pearson_results]
pearson_p = [pval for _, pval in pearson_results]

# Spearman rank correlation (coefficient, p-value) of each continuous feature with min markup.
spearman_results = [stats.spearmanr(df[feature], markup) for feature in continuous]
spearman_r = [coef for coef, _ in spearman_results]
spearman_p = [pval for _, pval in spearman_results]

# Collect everything into one summary table, one row per feature.
# NOTE(review): the 'SpearmanP' column holds the Spearman correlation
# coefficient (not a p-value); the label is kept unchanged so that any code
# referring to it keeps working.
r_df = pd.DataFrame({
    'Column': continuous,
    'PearsonR': pearson_r,
    'PearsonR_pvalue': pearson_p,
    'SpearmanP': spearman_r,
    'SpearmanP_pvalue': spearman_p,
})
r_df

Unnamed: 0,Column,PearsonR,PearsonR_pvalue,SpearmanP,SpearmanP_pvalue
0,avg_ticket_listings,-0.094553,6.372505e-09,-0.199221,6.176388e-35
1,spotify_avg_followers,0.049878,0.00222745,-0.092131,1.537915e-08
2,spotify_avg_popularity,0.049662,0.002327623,-0.091791,1.737195e-08
3,presale_length,-0.077672,1.874547e-06,-0.180006,9.953438e-29
4,days_on_sale,-0.045595,0.005186303,-0.167394,5.163824e-25
5,days_until_show,-0.073715,6.098756e-06,0.013097,0.4222535
6,artist_count,0.010766,0.5094595,0.010364,0.525406


## Conclusions

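Not all R values are significant. At both the $\alpha = .01$ and $\alpha = .05$ significance levels, the Pearson correlations are significant for every continuous feature except Artist Count ($p \approx 0.51$), and the Spearman correlations are significant for every continuous feature except Days Until Show ($p \approx 0.42$) and Artist Count ($p \approx 0.53$). For the significant cases the null hypothesis is rejected in favour of the alternate hypothesis; for Artist Count (both tests) and Days Until Show (Spearman) the null hypothesis cannot be rejected.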

Excluding Days Until Show and Artist Count, all Spearman R values are larger in magnitude than the Pearson values (for the two Spotify features the sign also flips from positive to negative). This indicates that there are in fact associations in the movements between features, but that the movements aren't linear. This corroborates the ECDF visualisations, which suggested a logarithmic association between markup and the features.

## 2. Correlations between Categorical Features & Min Markup
For each categorical feature, an ANOVA test will be used to determine the statistical significance of each category on ticket minimum markups

$H_0 : \mu_1 = \mu_2 = \dots = \mu_k$ The mean ticket minimum markup is the same for every level of the categorical variable<br>
$H_a : \mu_i \neq \mu_j$ for at least one pair $i, j$. The mean ticket minimum markup differs between at least two levels of the categorical variable

## 2.1 Genre

In [5]:
# Wide layout: one column per genre holding that genre's min-markup values
# (rows belonging to other genres are NaN in a given column).
genre_df = (
    df.loc[:, ['min_markup%', 'genre']]
      .pivot(columns='genre', values='min_markup%')
)
# Per-genre summary statistics.
genre_df.describe()

genre,Blues,Country,Dance/Electronic,Folk,Hip-Hop/Rap,Jazz,Metal,Other,Pop,R&B,Religious,Rock,Undefined,World
count,76.0,296.0,133.0,68.0,172.0,97.0,111.0,122.0,258.0,266.0,84.0,1511.0,361.0,202.0
mean,185.209334,179.823589,120.306847,203.602444,148.520164,130.704343,150.435346,168.526471,76.977162,145.138177,121.274377,207.432196,116.82925,101.245218
std,85.960636,476.529504,109.885481,279.84442,151.323897,76.338896,218.027912,149.22736,73.596679,128.726938,67.698992,441.677863,133.306952,71.83423
min,58.8,-72.537313,-32.2,13.333333,-6.177215,-9.963636,-5.276995,-8.108108,-43.396226,9.539326,-18.0,-70.446927,-2.857143,-70.0
25%,114.009286,74.034351,69.929577,86.022727,83.357092,88.888889,87.706522,93.679487,18.42296,81.818182,101.666667,79.868058,73.996176,62.935897
50%,162.360447,98.010473,93.7,105.263158,114.642857,100.6,110.75,123.45805,61.902216,117.00624,104.87234,106.372881,85.719225,89.337121
75%,286.666667,143.977273,121.892857,154.160792,167.118333,142.6,137.297895,200.9375,99.577465,154.330339,136.090909,159.400833,123.220339,109.659994
max,356.54321,6579.0,840.0,1676.923077,1440.0,373.287671,2210.0,1218.872727,425.0,1109.102041,583.04,4703.333333,1488.0,442.857143


Variances are not equal and sample sizes are unbalanced, which strains the equal-variance assumption of one-way ANOVA — __can we still use ANOVA?__

In [6]:
# One-way ANOVA of min markup across genres.
# The groups are taken from genre_df's own columns instead of a hand-typed
# list of genre names, so the test always covers exactly the genres present
# in the data and cannot drift out of sync with it.
# dropna() removes the NaN padding the pivot puts in each genre column.
genre_samples = [genre_df[genre].dropna() for genre in genre_df.columns]
stats.f_oneway(*genre_samples)

F_onewayResult(statistic=5.315216039384361, pvalue=1.4712118660348582e-09)

In [7]:
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from statsmodels.stats.multicomp import MultiComparison

In [8]:
# Tukey HSD post-hoc test: pairwise comparison of mean min markup between genres.
mc = MultiComparison(data=df['min_markup%'], groups=df['genre'])
# alpha=0.05 is the library default, spelled out to match the FWER shown in the output.
result = mc.tukeyhsd(alpha=0.05)

print(result)

          Multiple Comparison of Means - Tukey HSD,FWER=0.05         
     group1           group2       meandiff   lower    upper   reject
---------------------------------------------------------------------
     Blues           Country       -5.3857  -145.5703 134.7988 False 
     Blues       Dance/Electronic  -64.9025 -221.6576 91.8526  False 
     Blues             Folk        18.3931  -163.5775 200.3637 False 
     Blues         Hip-Hop/Rap     -36.6892 -186.8429 113.4645 False 
     Blues             Jazz        -54.505  -221.5031 112.4931 False 
     Blues            Metal        -34.774  -197.0797 127.5317 False 
     Blues            Other        -16.6829  -175.987 142.6213 False 
     Blues             Pop        -108.2322 -250.5102 34.0458  False 
     Blues             R&B         -40.0712 -181.8615 101.7192 False 
     Blues          Religious      -63.935  -236.5167 108.6467 False 
     Blues             Rock        22.2229  -105.9307 150.3764 False 
     Blues          

Pop & Country, Pop & Rock, Rock & Undefined, and Rock & World have significantly different means; the remaining pairs of genres do not differ significantly at $\alpha = .05$.

## 2.2 SubGenre

In [10]:
# Wide layout: one column per sub-genre holding that sub-genre's min-markup
# values (rows belonging to other sub-genres are NaN in a given column).
subgenre_df = (
    df.loc[:, ['min_markup%', 'subGenre']]
      .pivot(columns='subGenre', values='min_markup%')
)
# Per-sub-genre summary statistics.
subgenre_df.describe()

subGenre,Adult Contemporary,Alternative Rock,Blues,Club Dance,Country,Folk,Gospel,Heavy Metal,Jazz,Latin,Other,Pop,R&B,Soul,Undefined,Urban,World
count,81.0,426.0,76.0,117.0,278.0,65.0,75.0,107.0,97.0,125.0,278.0,1198.0,172.0,80.0,361.0,150.0,71.0
mean,117.300024,160.195889,185.209334,124.123362,180.968324,207.438043,118.388102,151.29349,130.704343,94.263078,150.90374,206.121614,137.830055,161.681964,116.82925,151.460394,102.218419
std,50.638642,167.388309,85.960636,116.105926,490.63628,285.684126,69.18274,222.030101,76.338896,69.488109,131.772998,488.147304,146.321178,77.32532,133.306952,157.563337,55.400666
min,17.168142,-68.0,58.8,-32.2,-72.537313,13.333333,-18.0,-5.276995,-9.963636,-22.984597,-8.108108,-70.446927,9.539326,22.773109,-2.857143,-6.177215,-70.0
25%,99.577465,81.219737,114.009286,69.929577,72.096963,86.363636,101.333333,86.556452,88.888889,54.94382,88.964646,67.307191,75.416667,125.0,73.996176,84.068554,78.888889
50%,99.577465,110.084746,162.360447,93.7,97.851478,105.263158,104.553191,109.090909,100.6,80.831933,109.090909,98.72381,90.151515,135.0,85.719225,115.333333,96.939891
75%,126.728972,166.761966,286.666667,123.08,143.293004,156.643167,119.162905,137.297895,142.6,104.216867,163.566667,144.835,134.070513,187.05,123.220339,173.343333,111.551224
max,391.891892,1333.333333,356.54321,840.0,6579.0,1676.923077,583.04,2210.0,373.287671,414.762516,1218.872727,4703.333333,1109.102041,600.0,1488.0,1440.0,395.533333


In [11]:
# One-way ANOVA of min markup across sub-genres.
# The groups are taken from subgenre_df's own columns instead of a hand-typed
# list of sub-genre names, so the test always covers exactly the sub-genres
# present in the data and cannot drift out of sync with it.
# dropna() removes the NaN padding the pivot puts in each sub-genre column.
subgenre_samples = [subgenre_df[subgenre].dropna() for subgenre in subgenre_df.columns]
stats.f_oneway(*subgenre_samples)

F_onewayResult(statistic=2.8727032334771003, pvalue=0.00010571892638891329)

In [12]:
# Tukey HSD post-hoc test: pairwise comparison of mean min markup between sub-genres.
mc = MultiComparison(data=df['min_markup%'], groups=df['subGenre'])
# alpha=0.05 is the library default, spelled out to match the FWER shown in the output.
result = mc.tukeyhsd(alpha=0.05)

print(result)

           Multiple Comparison of Means - Tukey HSD,FWER=0.05          
      group1            group2       meandiff   lower    upper   reject
-----------------------------------------------------------------------
Adult Contemporary Alternative Rock  42.8959   -93.8504 179.6421 False 
Adult Contemporary      Blues        67.9093  -112.2511 248.0697 False 
Adult Contemporary    Club Dance      6.8233  -156.2398 169.8865 False 
Adult Contemporary     Country       63.6683   -78.7747 206.1113 False 
Adult Contemporary       Folk         90.138   -97.7227 277.9987 False 
Adult Contemporary      Gospel        1.0881  -179.6909 181.867  False 
Adult Contemporary   Heavy Metal     33.9935  -132.1576 200.1446 False 
Adult Contemporary       Jazz        13.4043  -156.3967 183.2054 False 
Adult Contemporary      Latin        -23.0369 -183.9513 137.8774 False 
Adult Contemporary      Other        33.6037  -108.8393 176.0467 False 
Adult Contemporary       Pop         88.8216   -40.6943 218.3375

Latin & Pop, Pop & Soul have significantly different means