# Aprenentatge Supervisat - Regressions

**Descripció**

Anem a practicar i a familiaritzar-nos amb regressions

In [4]:

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import time

from sklearn.preprocessing import StandardScaler, RobustScaler 
from sklearn.preprocessing import PowerTransformer, OneHotEncoder
from sklearn.model_selection import train_test_split

from tqdm.notebook import tqdm

from utils import SinCosTransformer

import warnings
warnings.simplefilter('ignore')

SEED = 42  # random seed reused by the sampling step, the train/test split and the seeded models
TEST_SIZE = 0.2  # fraction of rows held out as the test split
TESTING = True  # when True, the notebook works on a 5% sample of the data to run faster

### Carreguem les dades

In [5]:
dfdelays = pd.read_pickle('../data/S11-CleanDelayedFlights.pickle')

In [6]:
# Fem un sample de les dades per fer el procés més ràpid 

if TESTING:
    dfdelays = dfdelays.sample(frac=0.05, random_state=SEED)
    
dfdelays.shape

(96419, 87)

### Separem en train test

In [7]:
# Column to predict; kept as a list so that `dfdelays[target]` stays a DataFrame.
target = ['ArrDelay']

# Every column that is not the target is a candidate feature.
cols = [name for name in dfdelays.columns if name not in target]

X_train, X_test, y_train, y_test = train_test_split(
    dfdelays[cols],
    dfdelays[target],
    test_size=TEST_SIZE,
    random_state=SEED,
)

In [8]:
# Sanity check: (rows, columns) of the four pieces produced by the split.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((77135, 86), (19284, 86), (77135, 1), (19284, 1))

### Preprocés

Primer, seleccionem les columnes per als diferents experiments

In [9]:
# Original columns (feature set of the 'base' experiment)

base_cols = ['Year', 'Month', 'DayofMonth', 'DayOfWeek', 
             'DepTime', 'CRSDepTime', 'ArrTime', 'CRSArrTime', 
             'UniqueCarrier', 'AirTime', 'DepDelay', 
             'Origin', 'Dest', 'Distance', 'TaxiIn', 'TaxiOut', 
             'CRSElapsedTime_c', 'ActualElapsedTime_c']


In [10]:
# Feature set used by the hyper-parameter search pipeline further down
# (`make_pipeline(X_train, cols_exp4, model)`).
# NOTE(review): the *_sin / *_cos names look like precomputed cyclical encodings
# of the datetime columns — confirm against the data-cleaning notebook.
cols_exp4 = ['ActualElapsedTime', 'CRSElapsedTime', 'AirTime', 
            'DepDelay', 'Origin', 'Dest', 'UniqueCarrier', 'Distance', 
            'carrier_name', 'Name_Origin', 'Name_Dest', 
            'CRSElapsedTime_c', 'ActualElapsedTime_c', 
            'DepartureTime', 'CRSDepartureTime', 
            'ArrivalTime', 'CRSArrivalTime', 
            'Dep_hourlyf', 'Dep_dailyf', 'Arr_dailyf', 'Arr_hourlyf',    
            'Origin_lat', 'Origin_long', 'Dest_lat', 'Dest_long', 
            'velocity', 'CRSvelocity', 
            'DepartureTime_hour_sin', 'DepartureTime_hour_cos', 'DepartureTime_day_sin', 'DepartureTime_day_cos', 
            'DepartureTime_dayofweek_sin', 'DepartureTime_dayofweek_cos', 'DepartureTime_month_sin', 'DepartureTime_month_cos', 
            'CRSDepartureTime_hour_sin', 'CRSDepartureTime_hour_cos', 'CRSDepartureTime_day_sin', 'CRSDepartureTime_day_cos', 
            'CRSDepartureTime_dayofweek_sin', 'CRSDepartureTime_dayofweek_cos', 'CRSDepartureTime_month_sin', 'CRSDepartureTime_month_cos', 
            'ArrivalTime_hour_sin', 'ArrivalTime_hour_cos', 'ArrivalTime_day_sin', 'ArrivalTime_day_cos', 
            'ArrivalTime_dayofweek_sin', 'ArrivalTime_dayofweek_cos', 'ArrivalTime_month_sin', 'ArrivalTime_month_cos', 
            'CRSArrivalTime_hour_sin', 'CRSArrivalTime_hour_cos', 'CRSArrivalTime_day_sin', 'CRSArrivalTime_day_cos', 
            'CRSArrivalTime_dayofweek_sin', 'CRSArrivalTime_dayofweek_cos', 'CRSArrivalTime_month_sin', 'CRSArrivalTime_month_cos']


In [11]:
# Experiment 5: feature set that includes the newly engineered columns.
cols_exp5 = [
    'DepDelay',
    'Distance',
    'UniqueCarrier',
    'CRSElapsedTime_c', 'ActualElapsedTime_c',
    'DepartureTime', 'CRSDepartureTime',
    'ArrivalTime', 'CRSArrivalTime',
    'Dep_hourlyf', 'Dep_dailyf', 'Arr_dailyf', 'Arr_hourlyf',
    'Origin_lat', 'Origin_long', 'Dest_lat', 'Dest_long',
    'velocity', 'CRSvelocity',
]

# Experiment 6: same features as experiment 5, with DepDelay removed.
cols_exp6 = [feature for feature in cols_exp5 if feature != 'DepDelay']

# Experiment 7: DepDelay removed, together with the data about the actual
# flight (even DepartureTime). A good result is not expected.
cols_exp7 = [
    'Distance',
    'UniqueCarrier', 'Origin', 'Dest',
    'CRSElapsedTime_c',
    'CRSDepartureTime',
    'CRSArrivalTime',
    'Dep_hourlyf', 'Dep_dailyf', 'Arr_dailyf', 'Arr_hourlyf',
    'Origin_lat', 'Origin_long', 'Dest_lat', 'Dest_long',
    'CRSvelocity',
]

## Nivell 1

### Exercici 1
Crea almenys tres models de regressió diferents per intentar predir el millor possible l’endarreriment dels vols (ArrDelay) de DelayedFlights.csv.

Utilitzarem pipelines i una funció per a executar els experiments amb diferents models.

In [15]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, HuberRegressor, GammaRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor

from xgboost import XGBRegressor

# metriques
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

Crearem aquests models:
- LinearRegression
- Ridge
- HuberRegressor (outlier-robust)
- DecisionTreeRegressor
- RandomForestRegressor
- GradientBoostingRegressor
- MLPRegressor
- XGBRegressor (XGBoost)


##### Preparem un parell de funcions on creem un pipeline i executem la regressió

In [13]:
def make_pipeline(data, columns, model):
    """Build an unfitted preprocessing + regression pipeline for a feature subset.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame used only to look up the dtype of each requested column.
    columns : list of str
        Names of the features the pipeline should use.
    model : estimator
        Regressor placed as the final ``'regr'`` step.

    Returns
    -------
    sklearn.pipeline.Pipeline
        ``'transformer'`` (a ColumnTransformer) followed by ``'regr'``.

    Notes
    -----
    Columns are routed by dtype: int/float -> RobustScaler, object ->
    OneHotEncoder, datetime -> SinCosTransformer. A requested column with
    any other dtype is not listed in any transformer.
    """
    selected = data[columns]

    def names_of(dtypes):
        # Names of the selected columns whose dtype matches `dtypes`.
        return selected.select_dtypes(include=dtypes).columns.to_list()

    numeric_names = names_of(['int', 'float'])
    categorical_names = names_of(['object'])
    datetime_names = names_of(['datetime'])

    preprocessing = ColumnTransformer(transformers=[
        ('num', RobustScaler(), numeric_names),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_names),
        ('date', SinCosTransformer(['hour', 'day', 'dayofweek', 'month']), datetime_names),
    ])

    return Pipeline(steps=[('transformer', preprocessing), ('regr', model)])

# base_pipe = make_pipeline(dfdelays, base_cols, LinearRegression())

# base_pipe.fit(dfdelays[base_cols], dfdelays[target])
# y_pred_base = base_pipe.predict(dfdelays[base_cols])
# print(f'LR r2: {r2_score(dfdelays[target], y_pred_base)}')
# print(f'LR MSE: {mean_squared_error(dfdelays[target], y_pred_base)}')

def run_experiment(train, test=None, columns=None, model=None):
    """Fit a pipeline on the training data and score it on the test data.

    Parameters
    ----------
    train : tuple
        ``(X, y)`` pair. Used as the training set when `test` is given,
        otherwise split into train/test with ``TEST_SIZE`` and ``SEED``.
    test : tuple, optional
        ``(X_test, y_test)`` pair used for evaluation.
    columns : list of str, optional
        Feature columns handed to ``make_pipeline``. Defaults to an empty list.
    model : estimator, optional
        Regressor to fit. Defaults to a fresh ``LinearRegression()``.

    Returns
    -------
    tuple of float
        ``(mean_squared_error, r2_score)`` computed on the test data.
    """
    # Build the defaults per call: a default estimator created in the
    # signature would be a single instance shared (and refitted) by every call.
    if columns is None:
        columns = []
    if model is None:
        model = LinearRegression()

    if test is None:
        # Unpack (X, y) so that train_test_split returns four pieces; passing
        # the tuple itself yields only two and the unpacking below would fail.
        X, y = train
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=TEST_SIZE, random_state=SEED)
    else:
        X_train, y_train = train
        X_test, y_test = test

    pipe = make_pipeline(X_train, columns, model)

    # The full frames are passed; the pipeline's first step picks `columns`.
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)

    return mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)
    
    

In [18]:
# Candidate regressors, keyed by the short name used in the result tables.
# Notes on scikit-learn >= 1.0 compatibility and reproducibility:
#  - `LinearRegression(normalize=True)` no longer exists (removed in 1.2);
#    'lr2' scales the features in a nested pipeline instead.
#  - the 'mse' criterion was renamed to 'squared_error'.
#  - the neural networks are seeded so repeated runs give the same scores.
models = {'lr': LinearRegression(),
          'lr2': Pipeline(steps=[('scale', StandardScaler(with_mean=False)),
                                 ('lr', LinearRegression())]),
          'ridge': Ridge(),
          'hub': HuberRegressor(),
          'tree': DecisionTreeRegressor(max_depth=7, random_state=SEED),
          'tree2': DecisionTreeRegressor(max_depth=10, random_state=SEED),
          'rf': RandomForestRegressor(max_depth=7, n_jobs=-1, random_state=SEED),
          'gb': GradientBoostingRegressor(random_state=SEED, criterion='squared_error'),
          'gb2': GradientBoostingRegressor(n_estimators=500, learning_rate=0.01,
                                           random_state=SEED, criterion='squared_error'),
          'nn': MLPRegressor(hidden_layer_sizes=(100,50,25), random_state=SEED),
          'nn2': MLPRegressor(hidden_layer_sizes=(64,32), random_state=SEED),
          'xgb': XGBRegressor(n_jobs=-1)}


# params = {  'lr': {'fit_intercept':False, 'normalize': True},
#             'ridge': {'alpha':np.linspace(), 'solver':{'auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'}},
#             'hub': {},
#             'tree': {},
#             'rf': {} 
#             'gb': {} 
#             'nn': {} 
#             'xgb': {}
#          }


#### Experiments - Features escollits

- **Exp. min**: Només amb DepDelay
- **base**: Amb les dades originals netejades
- **exp. 3**: Amb només 'DepDelay', 'Distance','Origin', 'Dest', 'UniqueCarrier'
- **exercici 5**: Noves columnes. Velocitat, latitud i longitud dels aeroports, nombre de vols per hora i per dia a l'aeroport de sortida i d'arribada
- **ND exp. 6**: com el 5 però sense el DepDelay
- **ND exp. 7**: Eliminem el DepDelay, així com les dades sobre el vol real, fins i tot el DepartureTime. No esperem un bon resultat. (Els experiments «ND» són els experiments sense DepDelay.)

In [19]:
# Experiment name -> list of feature columns used to build the pipeline for that experiment.
experiments = { 'Exp. min': ['DepDelay'],
              'base': base_cols, 
              'exp. 3': ['DepDelay', 'Distance','Origin', 'Dest', 'UniqueCarrier'],
              'exercici 5': cols_exp5,
               'ND exp. 6': cols_exp6,
               'ND exp. 7': cols_exp7 }

In [21]:
# Train/test evaluation of every model on every feature set.
results = []

# The loop variable is `exp_cols` (not `cols`) so it does not overwrite the
# module-level feature list that was used to build the train/test split.
for exp, exp_cols in tqdm(experiments.items()):
    for name, model in tqdm(models.items()):
        # perf_counter is monotonic, so the elapsed time cannot be skewed by clock adjustments
        t0 = time.perf_counter()
        mse, r2 = run_experiment((X_train, y_train), (X_test, y_test), exp_cols, model)
        results.append([name, mse, r2, len(exp_cols), time.perf_counter() - t0, exp])

dfresults = pd.DataFrame(results, columns=['model','mse', 'r2', 'num_feat','time', 'experiment'])
# dfresults

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

In [22]:
# Wide comparison table: one row per model, one column per (metric, experiment).
# Best MSE/time (minimum) and best R2 (maximum) per column are highlighted.
(
    dfresults
    .pivot(index=['model'], values=['mse','r2','num_feat','time'], columns=['experiment'])
    .style
    .format("{:.5f}")
    .set_properties(**{'background-color':'lightyellow'}, subset=['mse'])
    .set_properties(**{'background-color':'lightgreen'}, subset=['r2'])
    .set_properties(**{'background-color':'cyan'}, subset=['time'])
    .highlight_min(axis=0, subset=['mse','time'])
    .highlight_max(axis=0, subset=['r2'])
)


Unnamed: 0_level_0,mse,mse,mse,mse,mse,mse,r2,r2,r2,r2,r2,r2,num_feat,num_feat,num_feat,num_feat,num_feat,num_feat,time,time,time,time,time,time
experiment,Exp. min,ND exp. 6,ND exp. 7,base,exercici 5,exp. 3,Exp. min,ND exp. 6,ND exp. 7,base,exercici 5,exp. 3,Exp. min,ND exp. 6,ND exp. 7,base,exercici 5,exp. 3,Exp. min,ND exp. 6,ND exp. 7,base,exercici 5,exp. 3
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2
gb,308.95194,605.94365,2931.81217,77.69414,40.75552,300.33437,0.90486,0.81339,0.09712,0.97607,0.98745,0.90751,86.0,86.0,86.0,86.0,86.0,86.0,0.9921,29.91473,20.29439,13.47702,30.90397,3.36496
gb2,309.08169,1122.4582,2971.79566,134.60066,118.19865,304.12308,0.90482,0.65433,0.08481,0.95855,0.9636,0.90634,86.0,86.0,86.0,86.0,86.0,86.0,4.99973,146.89895,102.53302,66.51752,152.36239,17.23506
hub,313.67305,1891.45255,3076.40194,9.15496,9.22454,305.51735,0.9034,0.41751,0.05259,0.99718,0.99716,0.90591,86.0,86.0,86.0,86.0,86.0,86.0,0.05682,2.04849,1.48917,1.09659,2.055,0.66768
lr,307.8174,1788.64109,2957.74576,9.45986,8.94284,298.73536,0.9052,0.44917,0.08913,0.99709,0.99725,0.908,86.0,86.0,86.0,86.0,86.0,86.0,0.0126,0.52092,0.68334,0.447,0.56138,0.10073
nn,307.76807,289.63788,4797.0086,15.14828,9.19174,366.44695,0.90522,0.9108,-0.47729,0.99533,0.99717,0.88715,86.0,86.0,86.0,86.0,86.0,86.0,5.02693,86.51463,198.36697,172.66875,30.3066,110.2562
nn2,308.94458,238.78119,4141.9531,20.5053,8.83187,318.75768,0.90486,0.92646,-0.27556,0.99369,0.99728,0.90184,86.0,86.0,86.0,86.0,86.0,86.0,7.7359,69.59077,107.06849,249.04835,18.25958,70.20141
rf,308.62856,1674.02712,2965.58696,148.40314,127.34001,305.67032,0.90495,0.48447,0.08672,0.9543,0.96078,0.90587,86.0,86.0,86.0,86.0,86.0,86.0,0.27049,8.8206,8.15912,6.09434,8.85861,1.67316
ridge,307.81774,1787.77343,2945.414,9.18924,8.94421,298.58634,0.9052,0.44944,0.09293,0.99717,0.99725,0.90805,86.0,86.0,86.0,86.0,86.0,86.0,0.01162,0.43316,0.70981,0.24777,0.43496,0.08368
tree,309.77058,1928.94979,3014.18825,169.83027,170.28759,313.94944,0.9046,0.40596,0.07175,0.9477,0.94756,0.90332,86.0,86.0,86.0,86.0,86.0,86.0,0.02304,1.03603,0.90085,0.58324,1.0671,0.17638
tree2,310.6578,1541.39738,3166.32469,149.82437,95.91998,325.3531,0.90433,0.52531,0.0249,0.95386,0.97046,0.8998,86.0,86.0,86.0,86.0,86.0,86.0,0.02274,1.27297,1.33379,0.9654,1.32924,0.27539


### Exercici 2
Compara’ls en base al MSE i al R2.

<div class="alert-info">
Fet durant l'exercici anterior
</div>


<!-- <span style="color:red">Mirar l'exercici anterior</span> -->

### Exercici 3
Entrena’ls utilitzant els diferents paràmetres que admeten.

In [23]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import KFold

model = XGBRegressor()

grid = {
        'regr__objective': ['reg:squarederror'],
        'regr__colsample_bytree': [0.2, 0.5, 1],
#         'regr__subsample': [0.7, 1],
#         'regr__learning_rate': [0.05, 0.1, 0.3],
        'regr__max_depth': [3, 6, 8],
        'regr__min_child_weight': [0, 1, 10],
        'regr__n_estimators' : [700, 1000, 5000]
    }

fold = KFold(n_splits=3)

pipe = make_pipeline(X_train, cols_exp4, model)

In [24]:
grid = RandomizedSearchCV(pipe, grid, n_iter=10,
                scoring=['neg_mean_squared_error','r2'], cv=fold,
                verbose=2, refit='r2', n_jobs=-1)



In [25]:
%%time 
# Run the randomized search on the training split only; the test split is used afterwards for scoring.
grid.fit(X_train,y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
CPU times: user 2min 17s, sys: 22.8 s, total: 2min 39s
Wall time: 55min 17s


RandomizedSearchCV(cv=KFold(n_splits=3, random_state=None, shuffle=False),
                   estimator=Pipeline(steps=[('transformer',
                                              ColumnTransformer(transformers=[('num',
                                                                               RobustScaler(),
                                                                               ['ActualElapsedTime',
                                                                                'CRSElapsedTime',
                                                                                'AirTime',
                                                                                'DepDelay',
                                                                                'Distance',
                                                                                'CRSElapsedTime_c',
                                                                                'ActualElapsedTime_c',
      

In [26]:
# Predict once on the held-out split and reuse the result for both metrics.
y_pred_grid = grid.predict(X_test)
print(f'GRID: \nr2:{r2_score(y_test, y_pred_grid)} \
        mse: {mean_squared_error(y_test, y_pred_grid)}')

GRID: 
r2:0.9984217947762206         mse: 5.124706790026388


In [27]:
# Parameter combination selected by the search (the search refits on the 'r2' score).
grid.best_params_

{'regr__objective': 'reg:squarederror',
 'regr__n_estimators': 700,
 'regr__min_child_weight': 0,
 'regr__max_depth': 3,
 'regr__colsample_bytree': 0.5}

### Exercici 4
Compara el seu rendiment utilitzant l’aproximació train/test o utilitzant totes les dades (validació interna).

#### CV

In [None]:
# from sklearn.metrics import SCORERS
# SCORERS.keys()

In [29]:
# results = []

# for exp, cols in tqdm(experiments.items()):
#     for name, model in tqdm(models.items()):
# #         print(f'{exp}: {model.__class__.__name__}')
#         t0 = time.time()
#         mse, r2 = run_experiment((X_train, y_train), (X_test, y_test), cols, model)
#         results.append([name, mse, r2, len(cols) ,time.time()-t0, exp])
# #         print(f'{exp} {name}: {model.__class__.__name__} \n\tMSE: {mse}\n\tR2 : {r2}')

# dfresults =  pd.DataFrame(results, columns=['model','mse', 'r2', 'num_feat','time', 'experiment'])
# # dfresults

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

In [30]:
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
from sklearn.model_selection import cross_val_score, cross_validate

# Cross-validated evaluation of every model on every feature set, using all
# the rows of `dfdelays` (no separate hold-out split).
cv = 5
X = dfdelays
y = dfdelays[target]

results = []
# The loop variable is `exp_cols` (not `cols`) so it does not overwrite the
# module-level feature list that was used to build the train/test split.
for exp, exp_cols in tqdm(experiments.items()):
    for name, model in tqdm(models.items()):
        pipe = make_pipeline(X, exp_cols, model)
        score = cross_validate(pipe, X[exp_cols], y, cv=cv,
                               scoring=['neg_mean_squared_error','r2'],
                               n_jobs=2, verbose=0)
        # Timing comes from cross_validate itself: total fit time over the folds.
        results.append([name,
                        score['test_neg_mean_squared_error'].mean(),
                        score['test_r2'].mean(),
                        len(exp_cols),
                        score['fit_time'].sum(),
                        exp])
    

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

In [33]:
# One row per (model, experiment). `cv_neg_mse` is the mean of the
# 'neg_mean_squared_error' scores, i.e. the MSE with its sign flipped.
dfcvresults = pd.DataFrame(results, 
                           columns=['model','cv_neg_mse','cv_r2', 'cv_num_feat','cv_time','experiment'])


In [34]:
# Show the cross-validation summary table.
dfcvresults

Unnamed: 0,model,cv_neg_mse,cv_r2,cv_num_feat,cv_time,experiment
0,lr,-300.458287,0.906466,1,0.042635,Exp. min
1,ridge,-300.458287,0.906466,1,0.034206,Exp. min
2,hub,-305.439850,0.904916,1,0.268205,Exp. min
3,tree,-301.532024,0.906141,1,0.096125,Exp. min
4,tree2,-303.899279,0.905407,1,0.100010,Exp. min
...,...,...,...,...,...,...
61,gb,-2910.587304,0.095371,16,104.997828,ND exp. 7
62,gb2,-2943.257913,0.085273,16,531.241763,ND exp. 7
63,nn,-4524.759042,-0.406861,16,876.029156,ND exp. 7
64,nn2,-3940.976217,-0.226464,16,598.775664,ND exp. 7


In [35]:
# Train/test results in wide form, shown here for comparison with the
# cross-validation summary: one row per model, one column per (metric, experiment).
(
    dfresults
    .pivot(index=['model'], values=['mse','r2','num_feat','time'], columns=['experiment'])
    .style
    .format("{:.5f}")
    .set_properties(**{'background-color':'lightyellow'}, subset=['mse'])
    .set_properties(**{'background-color':'lightgreen'}, subset=['r2'])
    .set_properties(**{'background-color':'cyan'}, subset=['time'])
    .highlight_min(axis=0, subset=['mse','time'])
    .highlight_max(axis=0, subset=['r2'])
)


Unnamed: 0_level_0,mse,mse,mse,mse,mse,mse,r2,r2,r2,r2,r2,r2,num_feat,num_feat,num_feat,num_feat,num_feat,num_feat,time,time,time,time,time,time
experiment,Exp. min,ND exp. 6,ND exp. 7,base,exercici 5,exp. 3,Exp. min,ND exp. 6,ND exp. 7,base,exercici 5,exp. 3,Exp. min,ND exp. 6,ND exp. 7,base,exercici 5,exp. 3,Exp. min,ND exp. 6,ND exp. 7,base,exercici 5,exp. 3
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2
gb,308.95194,605.94365,2931.81217,77.69414,40.75552,300.33437,0.90486,0.81339,0.09712,0.97607,0.98745,0.90751,1.0,18.0,16.0,18.0,19.0,5.0,1.00886,30.58403,20.46032,13.32375,30.95175,3.34514
gb2,309.08169,1122.4582,2971.79566,134.60066,118.19865,304.12308,0.90482,0.65433,0.08481,0.95855,0.9636,0.90634,1.0,18.0,16.0,18.0,19.0,5.0,5.08762,162.0724,108.68916,66.98678,153.80009,17.24756
hub,313.67305,1891.45255,3076.40194,9.15496,9.22454,305.51735,0.9034,0.41751,0.05259,0.99718,0.99716,0.90591,1.0,18.0,16.0,18.0,19.0,5.0,0.05357,2.00432,1.43797,1.00156,2.05669,0.64767
lr,307.8174,1788.64109,2957.74576,9.45986,8.94284,298.73536,0.9052,0.44917,0.08913,0.99709,0.99725,0.908,1.0,18.0,16.0,18.0,19.0,5.0,0.01028,0.51684,0.69087,0.42859,0.52783,0.10217
nn,307.61722,244.73395,4158.78718,15.48172,8.76766,366.18447,0.90527,0.92463,-0.28074,0.99523,0.9973,0.88723,1.0,18.0,16.0,18.0,19.0,5.0,11.24414,128.36915,235.37953,200.19396,17.7892,99.00458
nn2,307.41945,250.58739,3648.65245,19.78993,8.94363,323.17471,0.90533,0.92283,-0.12364,0.99391,0.99725,0.90048,1.0,18.0,16.0,18.0,19.0,5.0,3.0528,60.00981,82.77966,199.20467,61.91574,60.95915
rf,308.62856,1674.02712,2965.58696,148.40314,127.34001,305.67032,0.90495,0.48447,0.08672,0.9543,0.96078,0.90587,1.0,18.0,16.0,18.0,19.0,5.0,0.26696,8.99054,8.37748,5.49297,8.94487,1.65268
ridge,307.81774,1787.77343,2945.414,9.18924,8.94421,298.58634,0.9052,0.44944,0.09293,0.99717,0.99725,0.90805,1.0,18.0,16.0,18.0,19.0,5.0,0.00771,0.4288,0.70998,0.22741,0.42999,0.08283
tree,309.77058,1928.94979,3014.18825,169.83027,170.28759,313.94944,0.9046,0.40596,0.07175,0.9477,0.94756,0.90332,1.0,18.0,16.0,18.0,19.0,5.0,0.0219,1.03621,0.90051,0.56984,1.06043,0.17496
tree2,310.6578,1541.39738,3166.32469,149.82437,95.91998,325.3531,0.90433,0.52531,0.0249,0.95386,0.97046,0.8998,1.0,18.0,16.0,18.0,19.0,5.0,0.02296,1.27892,1.34143,0.9267,1.31781,0.27471


## Nivell 2
### Exercici 5
Realitza algun procés d’enginyeria de variables per millorar-ne la predicció

<div class="alert-info">
    <strong>Info: </strong>Resultat Experiment 5 </div>


## Nivell 3
### Exercici 6
No utilitzis la variable DepDelay a l’hora de fer prediccions

<div class="alert-info">
    <strong>Info: </strong>Resultat Experiments 6 i 7</div>

