In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from imblearn.under_sampling import RandomUnderSampler

In [2]:
data = pd.read_csv('outputs/edm_songs.csv')

In [3]:
data['genre'].value_counts()

bounce              1048
bigroom             1033
hardstyle           1029
techhouse           1021
futurehouse         1019
tropicalhouse       1017
futurebass          1011
trance              1001
progressivehouse     995
basshouse            976
Name: genre, dtype: int64

In [4]:
data = data[data['energy'] > .3]

In [5]:
def full_time(df):
    tempo = df['tempo']
    tempo[tempo < 90] = tempo * 2
    return tempo

In [6]:
def drop_tempo(df, new_df, genre, high, low):
    tempo = df.loc[df.genre == genre, 'tempo']
    too_fast = tempo[tempo>high].index.tolist()
    df.drop(too_fast, inplace=True)
    
    too_slow = tempo[tempo<low].index.tolist()
    df.drop(too_slow, inplace=True)
    return new_df

In [7]:
filtered_df = pd.DataFrame()
data['tempo'] = full_time(data)
filtered_df = drop_tempo(data, filtered_df, 'progressivehouse', 135, 120)
filtered_df = drop_tempo(data, filtered_df, 'trance', 140, 125)
filtered_df = drop_tempo(data, filtered_df, 'basshouse', 140, 115)
filtered_df = drop_tempo(data, filtered_df, 'techhouse', 130, 120)
filtered_df = drop_tempo(data, filtered_df, 'futurebass', 155, 125)
filtered_df = drop_tempo(data, filtered_df, 'tropicalhouse', 120, 90)
filtered_df = drop_tempo(data, filtered_df, 'hardstyle', 160, 130)
filtered_df = drop_tempo(data, filtered_df, 'bigroom', 135, 125)
filtered_df = drop_tempo(data, filtered_df, 'bounce', 135, 125)
filtered_df = drop_tempo(data, filtered_df, 'futurehouse', 132, 124)

In [8]:
def undersampler(df, majority):
    minority_count = df[majority].value_counts()[-1]
    undersampled_df = pd.DataFrame()
    for item in df[majority].value_counts().index:
        item_df = df[df[majority] == item]
        item_df = item_df.sample(minority_count, random_state=1)
        undersampled_df = pd.concat([undersampled_df, item_df])
    return undersampled_df

In [9]:
data = undersampler(data, 'genre')

In [10]:
data.drop(['Unnamed: 0','analysis_url', 'id','track_href',  'type', 'uri', 'title'], axis=1, inplace=True)

In [11]:
data['genre'].value_counts()

progressivehouse    558
techhouse           558
basshouse           558
hardstyle           558
bounce              558
bigroom             558
trance              558
futurehouse         558
futurebass          558
tropicalhouse       558
Name: genre, dtype: int64

In [12]:
foo = data.groupby('genre').describe()

In [13]:
foo['danceability']

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
basshouse,558.0,0.766849,0.084848,0.373,0.71325,0.7805,0.82075,0.959
bigroom,558.0,0.631563,0.103946,0.267,0.571,0.636,0.69275,0.946
bounce,558.0,0.762627,0.097394,0.407,0.7,0.764,0.82975,0.979
futurebass,558.0,0.486796,0.110142,0.15,0.416,0.4935,0.55975,0.855
futurehouse,558.0,0.690222,0.096345,0.344,0.627,0.695,0.76,0.937
hardstyle,558.0,0.469731,0.10403,0.114,0.418,0.48,0.537,0.797
progressivehouse,558.0,0.551654,0.085541,0.225,0.5,0.554,0.604,0.826
techhouse,558.0,0.77622,0.073887,0.427,0.7405,0.7925,0.81575,0.971
trance,558.0,0.523719,0.101016,0.178,0.45525,0.5065,0.583,0.893
tropicalhouse,558.0,0.685591,0.082091,0.397,0.64025,0.689,0.73975,0.934


In [14]:
foo['energy']

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
basshouse,558.0,0.857763,0.104737,0.436,0.79075,0.887,0.94075,0.998
bigroom,558.0,0.895643,0.093304,0.41,0.862,0.921,0.95775,0.998
bounce,558.0,0.857817,0.113024,0.37,0.80025,0.8875,0.945,0.997
futurebass,558.0,0.729195,0.124378,0.323,0.65325,0.743,0.822,0.977
futurehouse,558.0,0.865896,0.092554,0.476,0.81125,0.8865,0.936,0.999
hardstyle,558.0,0.883296,0.093478,0.436,0.84325,0.9115,0.951,1.0
progressivehouse,558.0,0.812572,0.099022,0.444,0.75425,0.829,0.886,0.997
techhouse,558.0,0.846579,0.118984,0.384,0.78725,0.8795,0.937,0.999
trance,558.0,0.905468,0.106919,0.474,0.86525,0.946,0.98375,1.0
tropicalhouse,558.0,0.709444,0.119036,0.304,0.629,0.72,0.8,0.973


In [15]:
foo['key']

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
basshouse,558.0,5.681004,3.905807,0.0,2.0,6.0,9.0,11.0
bigroom,558.0,5.215054,3.241947,0.0,2.0,5.0,7.0,11.0
bounce,558.0,5.207885,3.72094,0.0,1.0,6.0,8.0,11.0
futurebass,558.0,5.392473,3.625921,0.0,2.0,5.0,8.0,11.0
futurehouse,558.0,5.695341,3.674896,0.0,2.0,6.0,9.0,11.0
hardstyle,558.0,5.657706,3.721841,0.0,2.0,6.0,9.0,11.0
progressivehouse,558.0,5.428315,3.464266,0.0,2.0,6.0,8.0,11.0
techhouse,558.0,5.879928,3.650571,0.0,2.0,6.0,9.0,11.0
trance,558.0,5.405018,3.403047,0.0,2.0,6.0,8.0,11.0
tropicalhouse,558.0,5.12724,3.686544,0.0,2.0,5.0,8.0,11.0


In [16]:
foo['loudness']

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
basshouse,558.0,-4.960909,1.838978,-11.875,-6.0285,-4.835,-3.6105,-0.156
bigroom,558.0,-4.102014,1.750441,-16.277,-4.98475,-3.927,-3.035,-0.342
bounce,558.0,-4.727776,1.960587,-16.111,-5.796,-4.588,-3.435,0.176
futurebass,558.0,-5.221134,1.811625,-13.038,-6.239,-5.1175,-4.0195,-0.484
futurehouse,558.0,-5.013186,1.723806,-10.494,-6.1015,-4.8945,-3.864,-0.944
hardstyle,558.0,-4.137907,1.703727,-13.697,-5.00425,-4.0505,-2.997,-0.1
progressivehouse,558.0,-4.600776,1.402943,-9.345,-5.38125,-4.473,-3.65825,-1.582
techhouse,558.0,-6.864952,1.958761,-17.9,-7.8995,-6.7265,-5.4955,-2.133
trance,558.0,-6.130808,1.878626,-19.645,-7.07975,-5.9075,-4.92325,-1.634
tropicalhouse,558.0,-6.447869,2.063135,-17.094,-7.64125,-6.365,-4.881,-1.576


In [17]:
foo['mode']

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
basshouse,558.0,0.61828,0.486244,0.0,0.0,1.0,1.0,1.0
bigroom,558.0,0.462366,0.499029,0.0,0.0,0.0,1.0,1.0
bounce,558.0,0.632616,0.482525,0.0,0.0,1.0,1.0,1.0
futurebass,558.0,0.507168,0.500397,0.0,0.0,1.0,1.0,1.0
futurehouse,558.0,0.437276,0.496495,0.0,0.0,0.0,1.0,1.0
hardstyle,558.0,0.335125,0.472458,0.0,0.0,0.0,1.0,1.0
progressivehouse,558.0,0.469534,0.499519,0.0,0.0,0.0,1.0,1.0
techhouse,558.0,0.528674,0.499625,0.0,0.0,1.0,1.0,1.0
trance,558.0,0.442652,0.497146,0.0,0.0,0.0,1.0,1.0
tropicalhouse,558.0,0.594982,0.491336,0.0,0.0,1.0,1.0,1.0


In [18]:
foo['speechiness']

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
basshouse,558.0,0.107106,0.079287,0.0311,0.055525,0.07485,0.13575,0.472
bigroom,558.0,0.089364,0.065156,0.0283,0.048625,0.06805,0.10375,0.474
bounce,558.0,0.141097,0.097995,0.033,0.0665,0.106,0.1975,0.585
futurebass,558.0,0.064097,0.05048,0.0257,0.0382,0.0472,0.068475,0.443
futurehouse,558.0,0.073509,0.052285,0.0269,0.044,0.055,0.08,0.459
hardstyle,558.0,0.107454,0.097015,0.0282,0.0467,0.06755,0.1325,0.561
progressivehouse,558.0,0.062853,0.038923,0.0275,0.041175,0.05135,0.0681,0.4
techhouse,558.0,0.078295,0.048707,0.0265,0.0514,0.0634,0.0858,0.454
trance,558.0,0.078713,0.055799,0.0285,0.044825,0.0613,0.0907,0.463
tropicalhouse,558.0,0.060912,0.046439,0.0239,0.0356,0.04375,0.06295,0.366


In [19]:
foo['acousticness']

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
basshouse,558.0,0.028022,0.058332,2.6e-05,0.002383,0.0078,0.027975,0.586
bigroom,558.0,0.021072,0.061631,1.3e-05,0.001203,0.00424,0.013875,0.707
bounce,558.0,0.027611,0.059917,2.6e-05,0.001905,0.00685,0.026075,0.601
futurebass,558.0,0.143145,0.163585,0.000208,0.0192,0.0746,0.211,0.762
futurehouse,558.0,0.041271,0.07647,2.1e-05,0.003632,0.0129,0.042675,0.523
hardstyle,558.0,0.048697,0.080518,0.00016,0.00465,0.01975,0.04975,0.593
progressivehouse,558.0,0.053235,0.086014,8.1e-05,0.004865,0.0208,0.06525,0.681
techhouse,558.0,0.022439,0.047726,1.2e-05,0.00137,0.005885,0.01865,0.376
trance,558.0,0.016223,0.05284,1e-05,0.000433,0.001805,0.008837,0.641
tropicalhouse,558.0,0.188966,0.191326,0.000319,0.034025,0.129,0.27775,0.839


In [20]:
foo['instrumentalness']

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
basshouse,558.0,0.358129,0.325712,0.0,0.0311,0.275,0.67775,0.945
bigroom,558.0,0.342845,0.341472,0.0,0.0054,0.223,0.7065,0.946
bounce,558.0,0.314938,0.328962,0.0,0.005165,0.1955,0.6415,0.945
futurebass,558.0,0.036767,0.145518,0.0,0.0,1.5e-05,0.000833,0.953
futurehouse,558.0,0.273065,0.330812,0.0,0.000722,0.088,0.5555,0.952
hardstyle,558.0,0.086568,0.216587,0.0,1e-06,0.00015,0.016725,0.991
progressivehouse,558.0,0.0799,0.216268,0.0,0.0,1.5e-05,0.004977,0.972
techhouse,558.0,0.486765,0.347662,0.0,0.12225,0.557,0.83175,0.947
trance,558.0,0.40044,0.352682,0.0,0.028,0.3435,0.76575,0.972
tropicalhouse,558.0,0.121049,0.268685,0.0,0.0,0.000113,0.025325,0.962


In [21]:
foo['liveness']

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
basshouse,558.0,0.204569,0.171241,0.0181,0.080925,0.139,0.29525,0.972
bigroom,558.0,0.233683,0.186833,0.0197,0.090575,0.163,0.339,0.955
bounce,558.0,0.224512,0.204252,0.0215,0.0798,0.139,0.31575,0.968
futurebass,558.0,0.188763,0.155758,0.027,0.099225,0.121,0.22075,0.951
futurehouse,558.0,0.206513,0.178369,0.0137,0.07285,0.126,0.31575,0.955
hardstyle,558.0,0.269075,0.194794,0.0189,0.113,0.224,0.352,0.951
progressivehouse,558.0,0.248102,0.197698,0.022,0.102,0.167,0.34175,0.911
techhouse,558.0,0.163757,0.143836,0.0169,0.070425,0.105,0.21925,0.959
trance,558.0,0.297406,0.211556,0.0338,0.11725,0.269,0.399,0.975
tropicalhouse,558.0,0.175628,0.121752,0.0327,0.093425,0.1225,0.23,0.741


In [22]:
foo['valence']

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
basshouse,558.0,0.470778,0.217086,0.0362,0.31125,0.4775,0.62875,0.97
bigroom,558.0,0.269852,0.179134,0.0339,0.133,0.234,0.36975,0.925
bounce,558.0,0.42059,0.202145,0.0307,0.258,0.401,0.55175,0.959
futurebass,558.0,0.272646,0.153528,0.0366,0.16225,0.242,0.36075,0.898
futurehouse,558.0,0.467747,0.212174,0.0358,0.3,0.465,0.622,0.967
hardstyle,558.0,0.256438,0.156568,0.0374,0.14,0.23,0.337,0.887
progressivehouse,558.0,0.303327,0.152547,0.0349,0.18525,0.2925,0.40075,0.875
techhouse,558.0,0.560242,0.230326,0.0336,0.40725,0.589,0.72075,0.969
trance,558.0,0.241867,0.171825,0.0322,0.10825,0.204,0.32875,0.868
tropicalhouse,558.0,0.415847,0.186324,0.036,0.2815,0.4055,0.535,0.904


In [23]:
foo['tempo']

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
basshouse,558.0,126.553912,2.310003,120.002,125.008,126.011,127.999,135.132
bigroom,558.0,128.351023,1.343701,125.004,127.982,128.011,128.07,134.959
bounce,558.0,128.269095,1.175387,125.061,127.989,128.0055,128.027,134.999
futurebass,558.0,143.062665,7.612869,125.016,139.5995,144.8875,149.963,155.0
futurehouse,558.0,126.007862,1.384947,124.0,125.001,125.994,127.00125,131.96
hardstyle,558.0,151.1946,3.317633,130.037,149.95525,150.0305,154.806,159.989
progressivehouse,558.0,127.793977,1.229802,122.011,127.9265,127.995,128.034,134.934
techhouse,558.0,125.251254,1.825284,120.011,124.002,125.009,126.026,129.997
trance,558.0,135.248964,4.396337,125.002,131.988,137.972,138.007,140.0
tropicalhouse,558.0,108.733923,7.883101,90.011,102.43225,108.0415,115.98475,120.0


In [24]:
foo['duration_ms']

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
basshouse,558.0,203739.290323,43312.577692,111231.0,172723.75,198498.0,228508.25,430080.0
bigroom,558.0,220745.849462,55538.397684,88308.0,179010.75,206250.0,256465.25,402331.0
bounce,558.0,235385.673835,58638.207395,121880.0,190861.5,232647.5,273899.5,428539.0
futurebass,558.0,230597.270609,41368.682488,131765.0,201657.5,224000.0,251001.0,406179.0
futurehouse,558.0,211917.116487,52796.488531,126720.0,177125.75,198138.5,230038.5,489500.0
hardstyle,558.0,217696.901434,44195.276791,135200.0,188400.0,211204.0,236875.75,546581.0
progressivehouse,558.0,227326.403226,51572.854805,138750.0,193331.75,212306.5,243632.75,412510.0
techhouse,558.0,255150.012545,96837.918828,102861.0,184294.25,215500.0,339110.25,637341.0
trance,558.0,291169.044803,104567.848744,113267.0,211088.75,236426.5,391098.75,691501.0
tropicalhouse,558.0,203300.594982,44364.537328,60742.0,176626.25,196389.0,219548.75,416434.0


In [25]:
foo['time_signature']

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
basshouse,558.0,3.976703,0.21057,1.0,4.0,4.0,4.0,4.0
bigroom,558.0,3.996416,0.059815,3.0,4.0,4.0,4.0,4.0
bounce,558.0,3.987455,0.152255,1.0,4.0,4.0,4.0,4.0
futurebass,558.0,3.987455,0.163623,1.0,4.0,4.0,4.0,5.0
futurehouse,558.0,3.994624,0.073192,3.0,4.0,4.0,4.0,4.0
hardstyle,558.0,3.962366,0.225046,1.0,4.0,4.0,4.0,5.0
progressivehouse,558.0,3.991039,0.111745,3.0,4.0,4.0,4.0,5.0
techhouse,558.0,3.987455,0.111399,3.0,4.0,4.0,4.0,4.0
trance,558.0,3.991039,0.09432,3.0,4.0,4.0,4.0,4.0
tropicalhouse,558.0,4.0,0.103788,3.0,4.0,4.0,4.0,5.0


In [26]:
data.drop(['key', 'duration_ms', 'mode', 'time_signature','acousticness'
          ], axis=1, inplace=True)

In [27]:
data.to_csv('outputs/edm_trimmed.csv', index=False)

In [28]:
validation_data = pd.read_csv('outputs/validation_songs.csv')

In [29]:
validation_data['genre'].value_counts()

progressivehouse    30
futurebass          30
futurehouse         30
tropicalhouse       30
trance              30
hardstyle           30
bigroom             30
bounce              30
techhouse           30
basshouse           30
Name: genre, dtype: int64

In [30]:
validation_data = validation_data[validation_data['energy'] > .3]

In [31]:
validation_filtered_df = pd.DataFrame()
validation_data['tempo'] = full_time(validation_data)
filtered_df = drop_tempo(validation_data, validation_filtered_df, 'progressivehouse', 135, 120)
filtered_df = drop_tempo(validation_data, validation_filtered_df, 'trance', 140, 125)
filtered_df = drop_tempo(validation_data, validation_filtered_df, 'basshouse', 140, 115)
filtered_df = drop_tempo(validation_data, validation_filtered_df, 'techhouse', 130, 120)
filtered_df = drop_tempo(validation_data, validation_filtered_df, 'futurebass', 155, 125)
filtered_df = drop_tempo(validation_data, validation_filtered_df, 'tropicalhouse', 120, 90)
filtered_df = drop_tempo(validation_data, validation_filtered_df, 'hardstyle', 160, 130)
filtered_df = drop_tempo(validation_data, validation_filtered_df, 'bigroom', 135, 125)
filtered_df = drop_tempo(validation_data, validation_filtered_df, 'bounce', 135, 125)
filtered_df = drop_tempo(validation_data, validation_filtered_df, 'futurehouse', 132, 124)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tempo[tempo < 90] = tempo * 2


In [32]:
validation_data = undersampler(validation_data, 'genre')

In [33]:
validation_data['genre'].value_counts()

progressivehouse    15
futurehouse         15
bounce              15
basshouse           15
bigroom             15
techhouse           15
trance              15
hardstyle           15
tropicalhouse       15
futurebass          15
Name: genre, dtype: int64

In [34]:
validation_data.drop(['Unnamed: 0','analysis_url', 'id','track_href',  'type', 'uri', 'title', 'key', 'duration_ms', 'mode', 'time_signature','acousticness'
          ], axis=1, inplace=True)

In [35]:
validation_data.to_csv('outputs/val_trimmed.csv', index=False)