In [43]:
from sklearn import datasets
import pandas as pd
pd.set_option("display.max_columns", 100)
from model_comparison import ModelComparison, TaskNames, ModelNames


In [2]:
# Registry of benchmark datasets, keyed by a short dataset name. Each value is a
# dict with the keys "task", "features", "target" and "cv", which
# get_comparison() later forwards to ModelComparison.
comparison_datasets = {}

# California housing

In [3]:
# Load the California housing data; as_frame=True makes .data / .target pandas objects.
california_housing = datasets.fetch_california_housing(as_frame=True)
california_housing.data.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [4]:
california_housing.data.dtypes

MedInc        float64
HouseAge      float64
AveRooms      float64
AveBedrms     float64
Population    float64
AveOccup      float64
Latitude      float64
Longitude     float64
dtype: object

In [5]:
california_housing.target.head()

0    4.526
1    3.585
2    3.521
3    3.413
4    3.422
Name: MedHouseVal, dtype: float64

In [6]:
# Register California housing as a regression task; "cv" is the number of
# cross-validation folds handed to ModelComparison.
comparison_datasets["california"] = dict(
    task=TaskNames.regression,
    features=california_housing.data,
    target=california_housing.target,
    cv=4,
)

# Covtype

In [7]:
# return_X_y=True yields the (features, labels) pair directly; both are NumPy
# arrays here and get wrapped in pandas containers when the dataset is registered.
covtype_features, covtype_target = datasets.fetch_covtype(return_X_y=True)

In [8]:
covtype_features.shape

(581012, 54)

In [9]:
covtype_target

array([5, 5, 2, ..., 3, 3, 3], dtype=int32)

In [10]:
# Register Covtype as a classification task with 2 cross-validation folds.
# The raw NumPy arrays are wrapped in a DataFrame / Series like the other datasets.
comparison_datasets["covtype"] = dict(
    task=TaskNames.classification,
    features=pd.DataFrame(covtype_features),
    target=pd.Series(covtype_target),
    cv=2,
)

# Adult

In [11]:
# OpenML hosts several active datasets named "adult". Pin the version so the
# notebook always loads the same data and the "Multiple active versions" warning
# goes away. Version 1 is what sklearn falls back to when no version is given
# -- NOTE(review): confirm against the full text of the warning.
adult = datasets.fetch_openml('adult', version=1, as_frame=True)

  warn("Multiple active versions of the dataset matching the name"


In [12]:
adult.data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native-country
0,2,State-gov,77516.0,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,1,0,2,United-States
1,3,Self-emp-not-inc,83311.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,0,United-States
2,2,Private,215646.0,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,2,United-States
3,3,Private,234721.0,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,2,United-States
4,1,Private,338409.0,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,2,Cuba


In [13]:
adult.data.shape

(48842, 14)

In [14]:
adult.target.head()

0    <=50K
1    <=50K
2    <=50K
3    <=50K
4    <=50K
Name: class, dtype: category
Categories (2, object): ['>50K', '<=50K']

In [15]:
# Register Adult as a classification task with 4 cross-validation folds.
comparison_datasets["adult"] = dict(
    task=TaskNames.classification,
    features=adult.data,
    target=adult.target,
    cv=4,
)


# UKAir

In [16]:
# Hourly particulate-matter measurements from OpenML, loaded as pandas objects
# (as_frame=True); the target column is the hourly PM10 reading.
ukair = datasets.fetch_openml('particulate-matter-ukair-2017', as_frame=True)

In [17]:
ukair.data.head()

Unnamed: 0,datetime,Hour,Month,DayofWeek,Site.Name,Environment.Type,Zone,Altitude..m.,PM.sub.2.5..sub..particulate.matter..Hourly.measured.
0,2017-01-01 01:00:00,1.0,1,1,Aberdeen,Background Urban,North East Scotland,20.0,9.9
1,2017-01-01 02:00:00,2.0,1,1,Aberdeen,Background Urban,North East Scotland,20.0,4.6
2,2017-01-01 03:00:00,3.0,1,1,Aberdeen,Background Urban,North East Scotland,20.0,0.8
3,2017-01-01 04:00:00,4.0,1,1,Aberdeen,Background Urban,North East Scotland,20.0,2.1
4,2017-01-01 05:00:00,5.0,1,1,Aberdeen,Background Urban,North East Scotland,20.0,5.4


In [18]:
ukair.data.shape

(394299, 9)

In [19]:
ukair.target.head()

0    12.8
1    11.0
2     4.5
3     7.3
4     8.2
Name: PM.sub.10..sub..particulate.matter..Hourly.measured., dtype: float64

In [20]:
# Register UKAir as a regression task with 2 cross-validation folds.
comparison_datasets["ukair"] = dict(
    task=TaskNames.regression,
    features=ukair.data,
    target=ukair.target,
    cv=2,
)

# Diabetes

In [21]:
# Several active OpenML datasets are named "diabetes". Pin the version so the
# same data is loaded on every run and the "Multiple active versions" warning
# goes away. Version 1 is what sklearn falls back to when no version is given
# -- NOTE(review): confirm against the full text of the warning.
diabetes = datasets.fetch_openml('diabetes', version=1, as_frame=True)

  warn("Multiple active versions of the dataset matching the name"


In [22]:
diabetes.data.head()

Unnamed: 0,preg,plas,pres,skin,insu,mass,pedi,age
0,6.0,148.0,72.0,35.0,0.0,33.6,0.627,50.0
1,1.0,85.0,66.0,29.0,0.0,26.6,0.351,31.0
2,8.0,183.0,64.0,0.0,0.0,23.3,0.672,32.0
3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21.0
4,0.0,137.0,40.0,35.0,168.0,43.1,2.288,33.0


In [23]:
diabetes.data.shape

(768, 8)

In [24]:
diabetes.target.head()

0    tested_positive
1    tested_negative
2    tested_positive
3    tested_negative
4    tested_positive
Name: class, dtype: category
Categories (2, object): ['tested_negative', 'tested_positive']

In [25]:
# Register Diabetes as a classification task with 10 cross-validation folds.
comparison_datasets["diabetes"] = dict(
    task=TaskNames.classification,
    features=diabetes.data,
    target=diabetes.target,
    cv=10,
)

# Bank marketing

In [26]:
# Several active OpenML datasets are named "bank-marketing". Pin the version so
# the same data is loaded on every run and the "Multiple active versions" warning
# goes away. Version 1 is what sklearn falls back to when no version is given
# -- NOTE(review): confirm against the full text of the warning.
bank_marketing = datasets.fetch_openml("bank-marketing", version=1, as_frame=True)

  warn("Multiple active versions of the dataset matching the name"


In [27]:
bank_marketing.data.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16
0,58.0,management,married,tertiary,no,2143.0,yes,no,unknown,5.0,may,261.0,1.0,-1.0,0.0,unknown
1,44.0,technician,single,secondary,no,29.0,yes,no,unknown,5.0,may,151.0,1.0,-1.0,0.0,unknown
2,33.0,entrepreneur,married,secondary,no,2.0,yes,yes,unknown,5.0,may,76.0,1.0,-1.0,0.0,unknown
3,47.0,blue-collar,married,unknown,no,1506.0,yes,no,unknown,5.0,may,92.0,1.0,-1.0,0.0,unknown
4,33.0,unknown,single,unknown,no,1.0,no,no,unknown,5.0,may,198.0,1.0,-1.0,0.0,unknown


In [28]:
bank_marketing.data.shape

(45211, 16)

In [29]:
bank_marketing.target.head()

0    1
1    1
2    1
3    1
4    1
Name: Class, dtype: category
Categories (2, object): ['1', '2']

In [30]:
# Register Bank marketing as a classification task with 4 cross-validation folds.
comparison_datasets["bank"] = dict(
    task=TaskNames.classification,
    features=bank_marketing.data,
    target=bank_marketing.target,
    cv=4,
)

# Speed dating

In [31]:
# Speed-dating dataset from OpenML, loaded as pandas objects (as_frame=True).
speed_dating = datasets.fetch_openml("SpeedDating", as_frame=True)

In [32]:
speed_dating.data.head()

Unnamed: 0,has_null,wave,gender,age,age_o,d_age,d_d_age,race,race_o,samerace,...,expected_num_interested_in_me,expected_num_matches,d_expected_happy_with_sd_people,d_expected_num_interested_in_me,d_expected_num_matches,like,guess_prob_liked,d_like,d_guess_prob_liked,met
0,0,1.0,female,21.0,27.0,6.0,[4-6],Asian/Pacific Islander/Asian-American,European/Caucasian-American,0,...,2.0,4.0,[0-4],[0-3],[3-5],7.0,6.0,[6-8],[5-6],0.0
1,0,1.0,female,21.0,22.0,1.0,[0-1],Asian/Pacific Islander/Asian-American,European/Caucasian-American,0,...,2.0,4.0,[0-4],[0-3],[3-5],7.0,5.0,[6-8],[5-6],1.0
2,1,1.0,female,21.0,22.0,1.0,[0-1],Asian/Pacific Islander/Asian-American,Asian/Pacific Islander/Asian-American,1,...,2.0,4.0,[0-4],[0-3],[3-5],7.0,,[6-8],[0-4],1.0
3,0,1.0,female,21.0,23.0,2.0,[2-3],Asian/Pacific Islander/Asian-American,European/Caucasian-American,0,...,2.0,4.0,[0-4],[0-3],[3-5],7.0,6.0,[6-8],[5-6],0.0
4,0,1.0,female,21.0,24.0,3.0,[2-3],Asian/Pacific Islander/Asian-American,Latino/Hispanic American,0,...,2.0,4.0,[0-4],[0-3],[3-5],6.0,6.0,[6-8],[5-6],0.0


In [33]:
speed_dating.data.shape

(8378, 120)

In [34]:
speed_dating.target.head()

0    0
1    0
2    1
3    1
4    1
Name: match, dtype: category
Categories (2, object): ['0', '1']

In [35]:
# Register Speed dating as a classification task with 6 cross-validation folds.
comparison_datasets["dating"] = dict(
    task=TaskNames.classification,
    features=speed_dating.data,
    target=speed_dating.target,
    cv=6,
)

# Hill valley

In [36]:
# Several active OpenML datasets are named "hill-valley". Pin the version so the
# same data is loaded on every run and the "Multiple active versions" warning
# goes away. Version 1 is what sklearn falls back to when no version is given
# -- NOTE(review): confirm against the full text of the warning.
hill_valley = datasets.fetch_openml("hill-valley", version=1, as_frame=True)

  warn("Multiple active versions of the dataset matching the name"


In [37]:
hill_valley.data.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V91,V92,V93,V94,V95,V96,V97,V98,V99,V100
0,39.02,36.49,38.2,38.85,39.38,39.74,37.02,39.53,38.81,38.79,...,37.57,36.62,36.92,38.8,38.52,38.07,36.73,39.46,37.5,39.1
1,1.83,1.71,1.77,1.77,1.68,1.78,1.8,1.7,1.75,1.78,...,1.71,1.8,1.79,1.77,1.74,1.74,1.8,1.78,1.75,1.69
2,68177.69,66138.42,72981.88,74304.33,67549.66,69367.34,69169.41,73268.61,74465.84,72503.37,...,69384.71,73438.88,71053.35,71112.62,74916.48,72571.58,66348.97,71063.72,67404.27,74920.24
3,44889.06,39191.86,40728.46,38576.36,45876.06,47034.0,46611.43,37668.32,40980.89,38466.15,...,47653.6,42625.67,40684.2,46960.73,44546.8,45410.53,47139.44,43095.68,40888.34,39615.19
4,5.7,5.4,5.28,5.38,5.27,5.61,6.0,5.38,5.34,5.87,...,5.52,5.17,5.67,5.6,5.94,5.73,5.22,5.3,5.73,5.91


In [38]:
hill_valley.data.shape

(1212, 100)

In [39]:
hill_valley.target.head()

0    0
1    1
2    1
3    0
4    0
Name: Class, dtype: category
Categories (2, object): ['0', '1']

In [54]:
# Register Hill valley as a classification task with 8 cross-validation folds.
comparison_datasets["valley"] = dict(
    task=TaskNames.classification,
    features=hill_valley.data,
    target=hill_valley.target,
    cv=8,
)

# Cars

In [41]:
# Read from a local CSV; the relative path means the file must sit in the
# notebook's working directory.
cars = pd.read_csv("cars.csv")

In [44]:
cars.head()

Unnamed: 0,manufacturer_name,model_name,transmission,color,odometer_value,year_produced,engine_fuel,engine_has_gas,engine_type,engine_capacity,body_type,has_warranty,state,drivetrain,price_usd,is_exchangeable,location_region,number_of_photos,up_counter,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,duration_listed
0,Subaru,Outback,automatic,silver,190000,2010,gasoline,False,gasoline,2.5,universal,False,owned,all,10900.0,False,Минская обл.,9,13,False,True,True,True,False,True,False,True,True,True,16
1,Subaru,Outback,automatic,blue,290000,2002,gasoline,False,gasoline,3.0,universal,False,owned,all,5000.0,True,Минская обл.,12,54,False,True,False,False,True,True,False,False,False,True,83
2,Subaru,Forester,automatic,red,402000,2001,gasoline,False,gasoline,2.5,suv,False,owned,all,2800.0,True,Минская обл.,4,72,False,True,False,False,False,False,False,False,True,True,151
3,Subaru,Impreza,mechanical,blue,10000,1999,gasoline,False,gasoline,3.0,sedan,False,owned,all,9999.0,True,Минская обл.,9,42,True,False,False,False,False,False,False,False,False,False,86
4,Subaru,Legacy,automatic,black,280000,2001,gasoline,False,gasoline,2.5,universal,False,owned,all,2134.11,True,Гомельская обл.,14,7,False,True,False,True,True,False,False,False,False,True,7


In [45]:
cars.shape

(38531, 30)

In [53]:
# Register the cars listings as a regression task with 4 cross-validation folds:
# "duration_listed" is the target, every other column is a feature.
cars_target = "duration_listed"
comparison_datasets["cars"] = dict(
    task=TaskNames.regression,
    features=cars.drop(columns=cars_target),
    target=cars[cars_target],
    cv=4,
)

# Comparison

In [47]:
def get_comparison(dataset_infos):
    """Benchmark the default models on a single registered dataset.

    Parameters
    ----------
    dataset_infos : dict
        One entry of ``comparison_datasets``. It must provide the keys
        ``"task"``, ``"cv"``, ``"features"`` and ``"target"``.

    Returns
    -------
    Whatever ``ModelComparison.get_default_models_scores_and_training_time()``
    returns. Judging by how the result is used further down, presumably a
    mapping from model name to a ``(score, training_time)`` tuple -- TODO confirm.
    """
    model_comparison = ModelComparison(
        task_name=dataset_infos["task"],
        cross_validation_n_folds=dataset_infos["cv"],
        features=dataset_infos["features"],
        target=dataset_infos["target"],
    )
    scores_and_times = model_comparison.get_default_models_scores_and_training_time()
    return scores_and_times

In [55]:
# Run the benchmark on every registered dataset; results are keyed by dataset name.
perf_comparisons = {
    name: get_comparison(infos)
    for name, infos in comparison_datasets.items()
}
perf_comparisons

{'california': {<ModelNames.catboost: 'catboost'>: (0.6680916221949819,
   23.259527921676636),
  <ModelNames.lightgbm: 'lightgbm'>: (0.6737943493533225, 0.9568119049072266),
  <ModelNames.lightgbm_with_cat_encoder: 'lightgbm_with_cat_encoder'>: (0.6737943493533225,
   1.0438101291656494),
  <ModelNames.xgboost: 'xgboost'>: (0.6264496146911739, 2.5029420852661133),
  <ModelNames.xgboost_with_cat_encoder: 'xgboost_with_cat_encoder'>: (0.6264496146911739,
   2.8358547687530518)},
 'covtype': {<ModelNames.catboost: 'catboost'>: (0.5393038353768941,
   136.3773431777954),
  <ModelNames.lightgbm: 'lightgbm'>: (0.5435085678092707, 26.974705934524536),
  <ModelNames.lightgbm_with_cat_encoder: 'lightgbm_with_cat_encoder'>: (0.541035297033452,
   27.85984992980957),
  <ModelNames.xgboost: 'xgboost'>: (0.5617078476864505, 149.85198998451233),
  <ModelNames.xgboost_with_cat_encoder: 'xgboost_with_cat_encoder'>: (0.5617078476864505,
   154.48683381080627)},
 'adult': {<ModelNames.catboost: 'catboo

In [56]:
# For each model, collect element 0 of its result tuple (the score) across all
# datasets. Every list follows the iteration order of perf_comparisons.
lightgbm_performance = [
    scores[ModelNames.lightgbm][0] for scores in perf_comparisons.values()
]
catboost_performance = [
    scores[ModelNames.catboost][0] for scores in perf_comparisons.values()
]
xgboost_performance = [
    scores[ModelNames.xgboost][0] for scores in perf_comparisons.values()
]
xgboost_cat_performance = [
    scores[ModelNames.xgboost_with_cat_encoder][0] for scores in perf_comparisons.values()
]
lightgbm_cat_performance = [
    scores[ModelNames.lightgbm_with_cat_encoder][0] for scores in perf_comparisons.values()
]

In [57]:
# Express every model's score as a ratio to plain xgboost on the same dataset
# (1.0 means "same score as xgboost") and attach the number of rows per dataset.
#
# All columns are derived from perf_comparisons, so the row count is looked up
# by dataset name. Iterating comparison_datasets.values() instead would
# misalign rows, or make the constructor fail on unequal lengths, as soon as the
# registry is modified after the benchmark has been computed.
perfs_per_dataset = pd.DataFrame({
    "dataset": list(perf_comparisons),
    "lightgbm": [l / x for l, x in zip(lightgbm_performance, xgboost_performance)],
    "lightgbm_cat": [lc / x for lc, x in zip(lightgbm_cat_performance, xgboost_performance)],
    "catboost": [c / x for c, x in zip(catboost_performance, xgboost_performance)],
    "xgboost_cat": [xc / x for xc, x in zip(xgboost_cat_performance, xgboost_performance)],
    "dataset_length": [len(comparison_datasets[name]["features"])
                       for name in perf_comparisons],
})
perfs_per_dataset.sort_values("dataset_length")

Unnamed: 0,dataset,lightgbm,lightgbm_cat,catboost,xgboost_cat,dataset_length
4,diabetes,1.007107,1.007107,1.035305,1.0,768
7,valley,1.062157,1.062157,0.934894,1.0,1212
6,dating,1.003689,1.003689,0.999005,1.003124,8378
0,california,1.075576,1.075576,1.066473,1.0,20640
8,cars,1.582081,1.582081,1.063057,0.721757,38531
5,bank,1.004783,1.004783,0.887671,0.971537,45211
2,adult,1.003587,1.003587,1.004496,0.997585,48842
3,ukair,1.00516,1.00516,0.985146,0.99528,394299
1,covtype,0.9676,0.963197,0.960114,1.0,581012


In [77]:
import plotly.express as px

# The "lightgbm" column holds lightgbm's score divided by xgboost's score on the
# same dataset, so the axes are labelled accordingly instead of showing the raw
# column names. hover_name identifies each point by its dataset.
px.scatter(
    perfs_per_dataset,
    x="dataset_length",
    y="lightgbm",
    log_x=True,
    hover_name="dataset",
    title="lightgbm score relative to xgboost vs. dataset size",
    labels={
        "dataset_length": "number of rows (log scale)",
        "lightgbm": "lightgbm score / xgboost score",
    },
)



`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [59]:
import plotly.graph_objects as go

# One trace per model, with the dataset names on the x axis. The order of the
# pairs below fixes the order of the bars and legend entries.
fig = go.Figure(data=[
    go.Bar(name=model_label, x=list(perf_comparisons.keys()), y=model_scores)
    for model_label, model_scores in (
        ("catboost", catboost_performance),
        ("lightgbm", lightgbm_performance),
        ("xgboost", xgboost_performance),
        ("xgboost_cat", xgboost_cat_performance),
        ("lightgbm_cat", lightgbm_cat_performance),
    )
])
# Show the models side by side for each dataset.
fig.update_layout(barmode='group')
fig.show()