Imports to featurize and load data

In [1]:
import pandas as pd
import numpy as np
import sys
sys.path.append("../../")
from DanceProj1.DanceObj import Dance
from DanceProj1.data_proc import get_data, data_to_features


Imports for Automl

In [2]:
import autosklearn.classification
import sklearn.model_selection
from sklearn.model_selection import train_test_split
import sklearn.datasets
import sklearn.metrics

In [3]:
#make sure autosklearn is here
print('autosklearn: %s' % autosklearn.__version__)

autosklearn: 0.15.0


Data loaded as two dataframes of dances by features, one for basic and one for advanced movement sequences.

In [4]:
aistpath = '../../aist_keypoints'      #path to 3d keypoints data
dataBM, dataFM = get_data(aistpath)    #get data
dfBasic, dfAdvanced = data_to_features(dataBM, dataFM)  #get features as dataframes


In [5]:
#make new indexes for Advanced, starting after last index in Basic, for unique index per id
new_index_advanced = range(len(dfBasic.index), len(dfBasic.index)+len(dfAdvanced.index))
dfAdvanced.index = dfAdvanced.index + new_index_advanced
dfAdvanced.head()

Unnamed: 0,Expandedness,Expandedness_std,Expandednessvel,Expandednessvel_std,Expandednessacc,Expandednessacc_std,Asym_RL_vel,Asym_RL_acc,Asym_RL_jer,Asym_RL_vel_std,...,Contracorr_Relb_Lknee_prominencej2,Contracorr_Relb_Lknee_prominencej3,Contracorr_Relb_Lknee_prominencej_std,Contracorr_Lelb_Rknee_heightjer,Contracorr_Lelb_Rknee_prominencej1,Contracorr_Lelb_Rknee_prominencej2,Contracorr_Lelb_Rknee_prominencej3,Contracorr_Lelb_Rknee_prominencej_std,id,Genre
1199,1056.865643,90.082477,1538.228504,958.219385,7666.414735,3900.555376,2839.095566,3202.027147,-8145.973932,6.328799,...,1664854000.0,1434043000.0,144375500.0,74022090.0,1229382000.0,1192352000.0,1179422000.0,21172520.0,gBR_sFM_cAll_d05_mBR2_ch09,Break
1201,1031.250205,115.045741,2035.464041,966.684795,10537.243423,4651.21315,-498.891513,248.747443,687.314989,7.118658,...,1652572000.0,1376125000.0,166189700.0,639480000.0,1357771000.0,1261664000.0,889105500.0,202123200.0,gBR_sFM_cAll_d06_mBR5_ch19,Break
1203,1264.589529,654.019537,3468.162657,2929.231708,17493.828054,14958.90514,2277.125602,1744.592252,-1649.130332,30.364511,...,3830212000.0,3618890000.0,1640630000.0,1340948000.0,4431559000.0,4308251000.0,2924700000.0,683133500.0,gBR_sFM_cAll_d05_mBR5_ch14,Break
1205,1100.526992,108.640735,1619.546822,761.437478,8351.133624,3748.230439,1330.346396,-3083.023419,-94451.348976,15.941952,...,1614114000.0,1565013000.0,105578500.0,331792900.0,1771329000.0,1737268000.0,1393618000.0,170594000.0,gBR_sFM_cAll_d04_mBR5_ch06,Break
1207,1012.354348,101.102753,1380.73498,483.503378,6921.930683,2232.699927,2425.553893,-3654.077082,2469.86696,19.227707,...,1208936000.0,1112264000.0,115928100.0,377874900.0,1274994000.0,925589100.0,905884200.0,169546400.0,gBR_sFM_cAll_d06_mBR2_ch16,Break


In [6]:
#alphabetize dfs by genre (to avoid reordering by classifiers later)
dfBasic = dfBasic.sort_values(by='Genre')
dfAdvanced = dfAdvanced.sort_values(by='Genre')
dfAdvanced.head()

Unnamed: 0,Expandedness,Expandedness_std,Expandednessvel,Expandednessvel_std,Expandednessacc,Expandednessacc_std,Asym_RL_vel,Asym_RL_acc,Asym_RL_jer,Asym_RL_vel_std,...,Contracorr_Relb_Lknee_prominencej2,Contracorr_Relb_Lknee_prominencej3,Contracorr_Relb_Lknee_prominencej_std,Contracorr_Lelb_Rknee_heightjer,Contracorr_Lelb_Rknee_prominencej1,Contracorr_Lelb_Rknee_prominencej2,Contracorr_Lelb_Rknee_prominencej3,Contracorr_Lelb_Rknee_prominencej_std,id,Genre
1615,1065.821287,100.792451,1735.6531,750.213614,8434.501762,3851.117291,2450.308351,669.453048,447.942426,10.535955,...,2437719000.0,1671086000.0,375740300.0,870010700.0,1874843000.0,1795367000.0,1583052000.0,123169300.0,gJB_sFM_cAll_d07_mJB2_ch03,Ballet Jazz
1577,943.558702,94.981145,1440.802249,833.4894,7682.011394,5091.582272,-2047.497225,34163.471185,-241.612173,12.147221,...,1778878000.0,1684116000.0,240644200.0,8035391000.0,14800050000.0,9934780000.0,6696022000.0,3330592000.0,gJB_sFM_cAll_d09_mJB5_ch20,Ballet Jazz
1579,1016.357632,99.647915,1285.371478,587.623914,5784.954412,2807.437497,2195.687077,1578.601247,1954.705135,6.754464,...,903539700.0,796486200.0,84722180.0,1002533000.0,1734618000.0,1686987000.0,1495129000.0,103512400.0,gJB_sFM_cAll_d08_mJB3_ch11,Ballet Jazz
1581,952.4538,100.915748,1349.334022,759.992458,6434.637824,4151.00095,-33921.557899,1804.89838,-12073.446919,147.646347,...,2006960000.0,1939330000.0,456166200.0,884043500.0,3678480000.0,3672786000.0,2399304000.0,601671800.0,gJB_sFM_cAll_d09_mJB1_ch16,Ballet Jazz
1583,986.565303,106.354552,1250.544711,716.203115,6256.392835,3838.150549,979.998271,-8376.755982,7664.762826,5.98797,...,1379766000.0,920697000.0,315957000.0,602152100.0,1457539000.0,1285451000.0,1222143000.0,99461940.0,gJB_sFM_cAll_d07_mJB4_ch05,Ballet Jazz


Test set is to be composed only of Advanced dances. Most of the data is Basic dances, with each choreography performed with slight variations. Below is a custom splitting function (see data_proc) for getting an all-Advanced Test, and a mix of Advanced and Basic for validation.  
   
Here we use 10 dances from each genre (100 total) for Test, 2 from each genre in Validation, and the remaining ~9 Advanced from each in Train. This leaves 14-Basic-per-genre for validation, and ~105-Basic-per-genre for Train.    


In [7]:
from DanceProj1.data_proc import traintestval_split
train, valid, testset = traintestval_split(dfBasic, dfAdvanced, testfrac_adv=.5, testfrac_bas=0, valfrac_adv_nonT=.2, valfrac_bas=.12)

In [8]:
testset.loc[testset.Genre=='House'].head()

Unnamed: 0,Expandedness,Expandedness_std,Expandednessvel,Expandednessvel_std,Expandednessacc,Expandednessacc_std,Asym_RL_vel,Asym_RL_acc,Asym_RL_jer,Asym_RL_vel_std,...,Contracorr_Relb_Lknee_prominencej2,Contracorr_Relb_Lknee_prominencej3,Contracorr_Relb_Lknee_prominencej_std,Contracorr_Lelb_Rknee_heightjer,Contracorr_Lelb_Rknee_prominencej1,Contracorr_Lelb_Rknee_prominencej2,Contracorr_Lelb_Rknee_prominencej3,Contracorr_Lelb_Rknee_prominencej_std,id,Genre
1445,899.662162,55.879398,1227.2878,413.422803,6457.155711,2228.122518,-1129.478986,1023.242998,738.392074,4.498651,...,594635800.0,564326000.0,28534460.0,898254300.0,1678000000.0,1371023000.0,1306400000.0,162103500.0,gHO_sFM_cAll_d19_mHO5_ch06,House
1433,1030.034324,67.294539,1694.467046,610.983554,8257.222844,2983.687749,-3343.373931,6927.332775,2720.499263,14.589017,...,1363984000.0,1058428000.0,146230700.0,114752900.0,1398055000.0,1377998000.0,1135556000.0,119296900.0,gHO_sFM_cAll_d21_mHO3_ch18,House
1411,911.598974,69.306864,1467.226192,478.671268,7707.74365,2583.242409,1628.978593,627.919242,-3038.095561,3.106333,...,1092473000.0,1036731000.0,239128300.0,656558800.0,1214404000.0,1134712000.0,1109461000.0,44722970.0,gHO_sFM_cAll_d19_mHO2_ch07,House
1431,1026.761618,74.919079,1904.470267,643.834194,10026.898632,3370.920958,-239.876826,2343.001385,-1244.847342,4.162597,...,1975533000.0,1559996000.0,269717500.0,1577067000.0,2621326000.0,1729645000.0,1622309000.0,447791100.0,gHO_sFM_cAll_d20_mHO1_ch09,House
1409,1022.3947,77.089159,1520.848294,527.779648,7753.914176,2774.999882,2006.037578,-222.037879,-4700.990933,11.292699,...,1341113000.0,1156458000.0,205349400.0,399933700.0,2504699000.0,2030513000.0,1475123000.0,420758300.0,gHO_sFM_cAll_d21_mHO4_ch19,House


The dfs above include ID, for later evaluation. This column should be removed from input.  

traintestval_split also includes genre, which is our target label in this classification task and should accordingly be removed from x and used for y.

In [19]:
test_ids = testset['id']
index_lookup = pd.DataFrame(
               testset.index, columns=['original_index']) #lookup table for original index

X_test = testset.drop(['Genre'], axis=1).drop(['id'], axis=1)
y_test = testset['Genre']

X_valid = valid.drop(['Genre'], axis=1).drop(['id'], axis=1)
y_valid = valid['Genre']

X_train = train.drop(['Genre'], axis=1).drop(['id'], axis=1)
y_train = train['Genre']

In [10]:
# check the shape 
for item in [X_train, X_test, X_valid, y_valid, y_train, y_test]:
  print(item.shape)

(1148, 104)
(100, 104)
(160, 104)
(160,)
(1148,)
(100,)


Let's see how a Perceptron does (skipping validation)

In [11]:
from sklearn.linear_model import Perceptron
preg = Perceptron(tol=.05, max_iter=2000, alpha=.00001).fit(X_train, y_train)
print('Perceptron is at chance: Score =', preg.score(X_test, y_test))

Perceptron is at chance: Score = 0.1


Let's see how a Ridge Classifier does (skipping validation)

In [12]:
#Let's see how a ridge classifier does
from sklearn.linear_model import RidgeClassifier
rreg = RidgeClassifier().fit(X_train, y_train)
print('RidgeClassifier is better than chance, but not great. Score =',rreg.score(X_test, y_test))

RidgeClassifier is better than chance, but not great. Score = 0.5


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


Let's see how XGBoost does (skipping validation)

In [13]:
print([i for i in y_test.unique()])

['Ballet Jazz', 'Break', 'House', 'Krump', 'LAhop', 'Lock', 'Midhop', 'Pop', 'Street Jazz', 'Waack']


In [15]:
#change genre labels to integers for xgboosty_trainxgb = y_train
y_trainxgb = y_train
y_testxgb = y_test
y_validxgb = y_valid
for i, G in enumerate(y_test.unique()):
    y_trainxgb.replace({G:i}, inplace=True)
    y_testxgb.replace({G:i}, inplace=True)
    y_validxgb.replace({G:i}, inplace=True)
    

In [21]:
import xgboost as xgb
xgb_mod = xgb.XGBClassifier(objective='multi:softprob', random_state=42, n_jobs=16)
xgb_mod.fit(X_train, y_trainxgb)
print('XGBoost is not bad. Score =',xgb_mod.score(X_test, y_testxgb))

XGBoost is not bad. Score = 0.59


In [22]:
y_hatp1 = xgb_mod.predict_proba(X_test)
print(y_hatp1[:5])

[[2.21340165e-01 1.11120835e-01 3.71732912e-03 5.97326597e-03
  8.95645190e-03 1.00800209e-02 6.17810786e-01 7.49257533e-03
  6.84311800e-03 6.66548405e-03]
 [6.59394801e-01 3.29738110e-02 7.10075197e-04 2.78233826e-01
  2.72338837e-03 3.89358238e-03 1.31406849e-02 3.40115582e-03
  2.34655128e-03 3.18207848e-03]
 [9.86151397e-01 2.32502096e-03 4.39459080e-04 5.25681302e-04
  5.63064939e-04 6.82962965e-03 4.54590132e-04 9.21996078e-04
  1.09772547e-03 6.91500027e-04]
 [1.55798763e-01 2.22534284e-01 2.95149274e-02 9.73950326e-03
  9.55407396e-02 1.79429762e-02 4.39018101e-01 8.42774846e-03
  8.84874165e-03 1.26342047e-02]
 [9.00347158e-03 9.86758340e-03 1.24720100e-03 8.71815905e-03
  1.32849729e-02 9.40010309e-01 1.80289289e-03 9.50922165e-03
  1.99192623e-03 4.56422148e-03]]


In [23]:
from sklearn.calibration import CalibratedClassifierCV
calibrxgb = CalibratedClassifierCV(xgb_mod, cv='prefit', method='sigmoid')

calibrxgb.fit(X_valid, y_validxgb)
y_hatpc = calibr.predict_proba(X_test)
print(y_hatpc.shape)
print('Calibrated XGBoost is...worse? Score =',calibrxgb.score(X_test, y_test))

(100, 10)
Calibrated XGBoost is...worse? Score = 0.57


##### Now for auto-sklearn. The two main parameters for auto-sklearn are:   
- time_left_for_this_task, optional (default=3600): Time limit in seconds for the search of appropriate models. By increasing this value, auto-sklearn has a higher chance of finding better models.  
- per_run_time_limit, optional (default=1/10 of time_left_for_this_task): Time limit for a single call to the machine learning model. Model fitting will be terminated if the machine learning algorithm runs over the time limit. Set this value high enough so that typical machine learning algorithms can be fit on the training data.


In [None]:
#todo, edit below based on jupyter notebook from lab box

In [None]:
# Instantiate an automl classifier model object with time limits
automl = autosklearn.classification.AutoSklearnClassifier(
        time_left_for_this_task=400,
        per_run_time_limit=50
)

# Fit the classifier to training data
automl.fit(X_train, y_train)
print('Score =', automl.score(X_test, y_test))

In [None]:
# Model search statistics
print(automl.sprint_statistics())

In [None]:
# Model Description
print(automl.show_models())

In [None]:
from sklearn.calibration import CalibratedClassifierCV
calibr = CalibratedClassifierCV(automl, cv='prefit', method='isotonic')

calibr.fit(X_valid, y_valid)
y_hatp = calibr.predict_proba(X_test)