# Aprenentatge Supervisat - Regressions

**Descripció**

Anem a practicar i a familiaritzar-nos amb regressions

In [1]:

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import time

from sklearn.preprocessing import StandardScaler, RobustScaler 
from sklearn.preprocessing import PowerTransformer, OneHotEncoder
from sklearn.model_selection import train_test_split

from tqdm.notebook import tqdm

from utils import SinCosTransformer

import warnings
warnings.simplefilter('ignore')

# Seed passed to the data sampling, the train/test split and the seeded estimators.
SEED = 42
# Fraction of the rows held out as the test set.
TEST_SIZE = 0.2
# When True the notebook works on a 5% sample of the data so it runs faster.
TESTING = True

### Carreguem les dades

In [2]:
# Load the flights dataset from a local pickle.
# NOTE(review): unpickling can execute arbitrary code; only load this file from a trusted source.
dfdelays = pd.read_pickle('../data/S11-CleanDelayedFlights.pickle')

In [3]:
# Work on a sample of the data to make the process faster.
# NOTE(review): this rebinds `dfdelays`, so re-running the cell without reloading
# the data samples the already-sampled frame again.

if TESTING:
    dfdelays = dfdelays.sample(frac=0.05, random_state=SEED)
    
dfdelays.shape

(96419, 87)

### Separem en train test

In [4]:
# Column to predict; kept as a list so the y frames stay one-column DataFrames.
target = ['ArrDelay']

# Every other column is a candidate feature.
cols = [name for name in dfdelays.columns if name not in target]

X_train, X_test, y_train, y_test = train_test_split(
    dfdelays[cols],
    dfdelays[target],
    test_size=TEST_SIZE,
    random_state=SEED,
)

In [5]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((77135, 86), (19284, 86), (77135, 1), (19284, 1))

### Preprocés

In [6]:
# Original columns of the dataset (feature set of the 'base' experiment).

base_cols = ['Year', 'Month', 'DayofMonth', 'DayOfWeek', 
             'DepTime', 'CRSDepTime', 'ArrTime', 'CRSArrTime', 
             'UniqueCarrier', 'AirTime', 'DepDelay', 
             'Origin', 'Dest', 'Distance', 'TaxiIn', 'TaxiOut', 
             'CRSElapsedTime_c', 'ActualElapsedTime_c']


In [7]:
# Feature set passed to the pipeline that is tuned in the hyper-parameter search.
# NOTE(review): the *_sin / *_cos names are presumably precomputed cyclical encodings
# of the datetime columns listed above them -- confirm against the cleaning notebook.
cols_exp4 = ['ActualElapsedTime', 'CRSElapsedTime', 'AirTime', 
            'DepDelay', 'Origin', 'Dest', 'UniqueCarrier', 'Distance', 
            'carrier_name', 'Name_Origin', 'Name_Dest', 
            'CRSElapsedTime_c', 'ActualElapsedTime_c', 
            'DepartureTime', 'CRSDepartureTime', 
            'ArrivalTime', 'CRSArrivalTime', 
            'Dep_hourlyf', 'Dep_dailyf', 'Arr_dailyf', 'Arr_hourlyf',    
            'Origin_lat', 'Origin_long', 'Dest_lat', 'Dest_long', 
            'velocity', 'CRSvelocity', 
            'DepartureTime_hour_sin', 'DepartureTime_hour_cos', 'DepartureTime_day_sin', 'DepartureTime_day_cos', 
            'DepartureTime_dayofweek_sin', 'DepartureTime_dayofweek_cos', 'DepartureTime_month_sin', 'DepartureTime_month_cos', 
            'CRSDepartureTime_hour_sin', 'CRSDepartureTime_hour_cos', 'CRSDepartureTime_day_sin', 'CRSDepartureTime_day_cos', 
            'CRSDepartureTime_dayofweek_sin', 'CRSDepartureTime_dayofweek_cos', 'CRSDepartureTime_month_sin', 'CRSDepartureTime_month_cos', 
            'ArrivalTime_hour_sin', 'ArrivalTime_hour_cos', 'ArrivalTime_day_sin', 'ArrivalTime_day_cos', 
            'ArrivalTime_dayofweek_sin', 'ArrivalTime_dayofweek_cos', 'ArrivalTime_month_sin', 'ArrivalTime_month_cos', 
            'CRSArrivalTime_hour_sin', 'CRSArrivalTime_hour_cos', 'CRSArrivalTime_day_sin', 'CRSArrivalTime_day_cos', 
            'CRSArrivalTime_dayofweek_sin', 'CRSArrivalTime_dayofweek_cos', 'CRSArrivalTime_month_sin', 'CRSArrivalTime_month_cos']


In [8]:
# Experiment 5: feature set that includes the new (engineered) columns.
cols_exp5 = [
    'DepDelay',
    'Distance',
    'UniqueCarrier',
    'CRSElapsedTime_c', 'ActualElapsedTime_c',
    'DepartureTime', 'CRSDepartureTime',
    'ArrivalTime', 'CRSArrivalTime',
    'Dep_hourlyf', 'Dep_dailyf', 'Arr_dailyf', 'Arr_hourlyf',
    'Origin_lat', 'Origin_long', 'Dest_lat', 'Dest_long',
    'velocity', 'CRSvelocity',
]

# Experiment 6: experiment 5 with DepDelay removed (order of the rest preserved).
cols_exp6 = [name for name in cols_exp5 if name != 'DepDelay']

# Experiment 7: DepDelay removed, as well as the data about the actual flight,
# even DepartureTime. A good result is not expected.
cols_exp7 = [
    'Distance',
    'UniqueCarrier', 'Origin', 'Dest',
    'CRSElapsedTime_c',
    'CRSDepartureTime',
    'CRSArrivalTime',
    'Dep_hourlyf', 'Dep_dailyf', 'Arr_dailyf', 'Arr_hourlyf',
    'Origin_lat', 'Origin_long', 'Dest_lat', 'Dest_long',
    'CRSvelocity',
]

## Nivell 1

### Exercici 1
Crea almenys tres models de regressió diferents per intentar predir el millor possible l’endarreriment dels vols (ArrDelay) de DelayedFlights.csv.

In [9]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, HuberRegressor, GammaRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor

from xgboost import XGBRegressor

# metriques
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

Crearem aquests models:
- LinearRegression
- Ridge
- HuberRegressor (outlier-robust)
- DecisionTreeRegressor
- RandomForestRegressor
- GradientBoostingRegressor
- MLPRegressor
- XGBoostRegressor


##### Preparem un parell de funcions on creem un pipeline i executem la regressió

In [10]:
def make_pipeline(data, columns, model):
    """Build an unfitted preprocessing + regression pipeline for ``columns``.

    The requested columns are grouped by their dtype in ``data`` and each
    group is routed to its own transformer:

    * ``'int'`` / ``'float'`` columns -> ``RobustScaler``
    * ``'object'`` columns -> ``OneHotEncoder(handle_unknown='ignore')``
    * ``'datetime'`` columns -> ``SinCosTransformer`` over hour, day,
      dayofweek and month

    Columns of any other dtype are not listed in any transformer.

    Parameters
    ----------
    data : DataFrame
        Only used to look up the dtypes of ``columns``.
    columns : list of str
        Names of the feature columns.
    model : estimator
        Placed as the final ``'regr'`` step.

    Returns
    -------
    Pipeline
        Steps ``'transformer'`` (the ColumnTransformer) and ``'regr'``.
    """
    selected = data[columns]

    def names_of(*dtypes):
        # Names of the selected columns whose dtype matches one of ``dtypes``.
        return selected.select_dtypes(include=list(dtypes)).columns.to_list()

    numeric_names = names_of('int', 'float')
    categorical_names = names_of('object')
    datetime_names = names_of('datetime')

    preprocessing = ColumnTransformer(transformers=[
        ('num', RobustScaler(), numeric_names),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_names),
        ('date', SinCosTransformer(['hour', 'day', 'dayofweek', 'month']), datetime_names),
    ])

    return Pipeline(steps=[('transformer', preprocessing), ('regr', model)])

# base_pipe = make_pipeline(dfdelays, base_cols, LinearRegression())

# base_pipe.fit(dfdelays[base_cols], dfdelays[target])
# y_pred_base = base_pipe.predict(dfdelays[base_cols])
# print(f'LR r2: {r2_score(dfdelays[target], y_pred_base)}')
# print(f'LR MSE: {mean_squared_error(dfdelays[target], y_pred_base)}')

def run_experiment(train, test=None, columns=None, model=None):
    """Fit a pipeline on the training data and score it on the test data.

    Parameters
    ----------
    train : tuple ``(X, y)``
        Features and target. Used as the training set when ``test`` is given,
        otherwise split into train and test parts.
    test : tuple ``(X_test, y_test)``, optional
        Held-out data. When None, ``train`` is split using TEST_SIZE and SEED.
    columns : list of str, optional
        Feature columns handed to ``make_pipeline``. Defaults to every column
        of the training features.
    model : estimator, optional
        Regressor used as the last pipeline step. Defaults to a new
        ``LinearRegression()`` created on each call.

    Returns
    -------
    tuple ``(mse, r2)``
        Mean squared error and R2 of the predictions on the test data.
    """
    if model is None:
        # Create the default per call instead of sharing one instance
        # evaluated once at definition time.
        model = LinearRegression()

    if test is None:
        # train_test_split only yields the four pieces when features and
        # target are passed as two separate arrays.
        features, labels = train
        X_train, X_test, y_train, y_test = train_test_split(
            features, labels, test_size=TEST_SIZE, random_state=SEED)
    else:
        X_train, y_train = train
        X_test, y_test = test

    if columns is None:
        columns = list(X_train.columns)

    # The pipeline's column transformer picks `columns` by name, so the full
    # frames can be passed to fit/predict.
    pipe = make_pipeline(X_train, columns, model)
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)

    return mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)
    
    

In [11]:
# Candidate regressors, keyed by the short name shown in the results tables.
# Every estimator that accepts a seed receives SEED so a re-run reproduces the scores.
# NOTE(review): criterion='mse' is deprecated in newer scikit-learn releases in favour of
# 'squared_error'; left unchanged because the installed version is not visible here.
models = {'lr': LinearRegression(),
         'ridge': Ridge(),
         'hub': HuberRegressor(),
         'tree': DecisionTreeRegressor(max_depth=7, random_state=SEED),
         'rf': RandomForestRegressor(max_depth=7, n_jobs=-1, random_state=SEED),
         'gb': GradientBoostingRegressor( random_state=SEED, criterion='mse'),
         'nn': MLPRegressor(hidden_layer_sizes=(100,50,25), random_state=SEED),
         'xgb': XGBRegressor(n_jobs=-1, random_state=SEED)}


# params = {  'lr': {'fit_intercept':False, 'normalize': True},
#             'ridge': {'alpha':np.linspace(), 'solver':{'auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'}},
#             'hub': {},
#             'tree': {},
#             'rf': {} 
#             'gb': {} 
#             'nn': {} 
#             'xgb': {}
#          }


#### Experiments - Features escollits

- **Exp. min**: Només amb DepDelay
- **base**: Amb les dades originals netejades
- **exp. 3**: Amb només 'DepDelay', 'Distance','Origin', 'Dest', 'UniqueCarrier'
- **exercici 5**: Noves columnes: velocitat, latitud i longitud dels aeroports, nombre de vols per hora i dia a l'aeroport de sortida i d'arribada
- **ND exp. 6**: com el 5 però sense el DepDelay
- **ND exp. 7**: Eliminem el DepDelay, així com dades sobre el vol real, fins i tot el DepartureTime. No esperem un bon resultat. (Els experiments «ND» són experiments sense DepDelay.)

In [12]:
# Experiment name -> list of feature columns used for that experiment.
experiments = { 'Exp. min': ['DepDelay'],
              'base': base_cols, 
              'exp. 3': ['DepDelay', 'Distance','Origin', 'Dest', 'UniqueCarrier'],
              'exercici 5': cols_exp5,
               'ND exp. 6': cols_exp6,
               'ND exp. 7': cols_exp7 }

In [13]:
# Fit and score every model on every feature set using the train/test split.
results = []

# The loop variable is named exp_cols so it does not overwrite the notebook-level
# `cols` feature list.
for exp, exp_cols in tqdm(experiments.items(), desc='experiments'):
    for name, model in tqdm(models.items(), desc=exp):
        t0 = time.time()
        mse, r2 = run_experiment((X_train, y_train), (X_test, y_test), exp_cols, model)
        # num_feat is the number of input columns of the experiment.
        results.append([name, mse, r2, len(exp_cols), time.time() - t0, exp])

dfresults = pd.DataFrame(results, columns=['model', 'mse', 'r2', 'num_feat', 'time', 'experiment'])
# dfresults

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

In [14]:
# One row per model and one column per (metric, experiment) pair.
# Each metric block gets its own background colour; the lowest mse/time and the
# highest r2 of every column are highlighted.
dfresults.pivot(index=['model'],values=['mse','r2','num_feat','time'],columns=['experiment'])\
            .style.format("{:.5f}")\
            .set_properties(**{'background-color':'lightyellow'}, subset=['mse'])\
            .set_properties(**{'background-color':'lightgreen'}, subset=['r2'])\
            .set_properties(**{'background-color':'cyan'}, subset=['time'])\
            .highlight_min(axis=0, subset=['mse','time'])\
            .highlight_max(axis=0, subset=['r2'])


Unnamed: 0_level_0,mse,mse,mse,mse,mse,mse,r2,r2,r2,r2,r2,r2,num_feat,num_feat,num_feat,num_feat,num_feat,num_feat,time,time,time,time,time,time
experiment,Exp. min,ND exp. 6,ND exp. 7,base,exercici 5,exp. 3,Exp. min,ND exp. 6,ND exp. 7,base,exercici 5,exp. 3,Exp. min,ND exp. 6,ND exp. 7,base,exercici 5,exp. 3,Exp. min,ND exp. 6,ND exp. 7,base,exercici 5,exp. 3
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2
gb,308.95194,605.94365,2931.81217,77.69414,40.75552,300.33437,0.90486,0.81339,0.09712,0.97607,0.98745,0.90751,1.0,18.0,16.0,18.0,19.0,5.0,1.0182,31.89541,20.35487,13.42548,32.64439,3.40789
hub,313.67305,1891.45255,3076.40194,9.15496,9.22454,305.51735,0.9034,0.41751,0.05259,0.99718,0.99716,0.90591,1.0,18.0,16.0,18.0,19.0,5.0,0.06624,2.16027,1.45415,1.04509,2.29813,0.66447
lr,307.8174,1788.59158,2957.69304,9.45941,8.94258,298.7353,0.9052,0.44919,0.08915,0.99709,0.99725,0.908,1.0,18.0,16.0,18.0,19.0,5.0,0.02361,0.58776,2.29345,3.60088,0.64946,0.42058
nn,307.62209,277.71489,4488.50746,16.42906,8.7308,355.76755,0.90526,0.91447,-0.38228,0.99494,0.99731,0.89044,1.0,18.0,16.0,18.0,19.0,5.0,49.30249,1351.62443,343.17271,770.38053,534.70054,730.18726
rf,308.62856,1674.02712,2965.58696,148.40314,127.34001,305.67032,0.90495,0.48447,0.08672,0.9543,0.96078,0.90587,1.0,18.0,16.0,18.0,19.0,5.0,0.35986,9.4985,8.9236,5.73457,9.90225,1.8147
ridge,307.81774,1787.77343,2945.414,9.18924,8.94421,298.58634,0.9052,0.44944,0.09293,0.99717,0.99725,0.90805,1.0,18.0,16.0,18.0,19.0,5.0,0.01186,0.44348,0.7105,0.23589,0.45591,0.08351
tree,309.77058,1928.94979,3014.18825,169.83027,170.28759,313.94944,0.9046,0.40596,0.07175,0.9477,0.94756,0.90332,1.0,18.0,16.0,18.0,19.0,5.0,0.02369,1.09129,0.90108,0.56938,1.09821,0.18988
xgb,309.79072,327.01005,2863.13859,19.11803,19.9864,301.08739,0.9046,0.89929,0.11827,0.99411,0.99384,0.90728,1.0,18.0,16.0,18.0,19.0,5.0,3.82419,6.07717,4.60606,7.29725,6.22462,2.46436


### Exercici 2
Compara’ls en base al MSE i al R2.

<div class="alert-info">
Fet durant l'exercici anterior
</div>


<!-- <span style="color:red">Mirar l'exercici anterior</span> -->

### Exercici 3
Entrena’ls utilitzant els diferents paràmetres que admeten.

In [15]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import KFold

# Estimator to tune.
model = XGBRegressor()

# Candidate hyper-parameter values. The 'regr__' prefix addresses the pipeline
# step named 'regr', i.e. the model itself.
grid = {
        'regr__objective': ['reg:squarederror'],
        'regr__colsample_bytree': [0.2, 0.5, 1],
#         'regr__subsample': [0.7, 1],
#         'regr__learning_rate': [0.05, 0.1, 0.3],
        'regr__max_depth': [3, 6, 8],
        'regr__min_child_weight': [0, 1, 10],
        'regr__n_estimators' : [700, 1000, 5000]
    }

# 3-fold cross-validation splitter used by the search.
fold = KFold(n_splits=3)

# Pipeline (preprocessing + model) built for the cols_exp4 feature set.
pipe = make_pipeline(X_train, cols_exp4, model)

In [16]:
# Randomized search: 10 candidates drawn from the `grid` dict, scored with negative MSE
# and R2, refitting the best-R2 candidate. random_state makes the draw reproducible.
# NOTE: this rebinds `grid` from the parameter dict to the search object, so the cell
# that defines the dict has to be run again before re-running this one.
grid = RandomizedSearchCV(pipe, grid, n_iter=10,
                scoring=['neg_mean_squared_error','r2'], cv=fold,
                verbose=2, refit='r2', n_jobs=-1, random_state=SEED)



In [17]:
%%time 
grid.fit(X_train,y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


KeyboardInterrupt: 

In [18]:
# Score the search's refitted estimator on the held-out test set.
# Predict once and reuse the result for both metrics.
y_pred_grid = grid.predict(X_test)
print(f'GRID: \nr2:{r2_score(y_test, y_pred_grid)} mse: {mean_squared_error(y_test, y_pred_grid)}')

NotFittedError: This RandomizedSearchCV instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [None]:
grid.best_params_

### Exercici 4
Compara el seu rendiment utilitzant l’aproximació train/test o utilitzant totes les dades (validació interna)

#### CV

In [None]:
# List the scorer names accepted by the `scoring=` argument.
# Newer scikit-learn releases expose them through get_scorer_names(); the SCORERS
# dict is only used as a fallback for older releases.
try:
    from sklearn.metrics import get_scorer_names
    scorer_names = get_scorer_names()
except ImportError:
    from sklearn.metrics import SCORERS
    scorer_names = sorted(SCORERS.keys())
scorer_names

In [None]:
# Train/test evaluation of every model on every feature set, used as the
# reference for the cross-validation comparison.
results = []

# The loop variable is named exp_cols so it does not overwrite the notebook-level
# `cols` feature list.
for exp, exp_cols in tqdm(experiments.items(), desc='experiments'):
    for name, model in tqdm(models.items(), desc=exp):
        t0 = time.time()
        mse, r2 = run_experiment((X_train, y_train), (X_test, y_test), exp_cols, model)
        results.append([name, mse, r2, len(exp_cols), time.time() - t0, exp])

dfresults = pd.DataFrame(results, columns=['model', 'mse', 'r2', 'num_feat', 'time', 'experiment'])

In [None]:
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
from sklearn.model_selection import cross_val_score, cross_validate

# Number of cross-validation folds.
cv = 5
# Cross-validation runs on the whole (sampled) frame, not on the train split.
X = dfdelays
y = dfdelays[target]

# One row per (model, experiment): name, mean negative MSE, mean R2,
# number of input columns, total fit time over the folds, experiment name.
results = []
for exp, cols in tqdm(experiments.items()):
    for name, model in tqdm(models.items()):
        # NOTE(review): t0 is never read; the reported time comes from score['fit_time'].
        t0 = time.time()
        pipe = make_pipeline(X, cols, model )
        score = cross_validate(pipe, X[cols], y, cv=cv, 
                               scoring=['neg_mean_squared_error','r2'],
                               n_jobs=2, verbose=0,  )
        # The MSE scorer is negated ('neg_mean_squared_error'), so higher is better.
        results.append([name, 
                        score['test_neg_mean_squared_error'].mean(),
                        score['test_r2'].mean(),
                        len(cols), 
                        score['fit_time'].sum(),
                        exp])
    

In [None]:
# Each row of `results` holds six values (model, negative MSE, R2, number of
# features, fit time, experiment), so six column names are required.
dfcvresults = pd.DataFrame(results,
                           columns=['model', 'cv_neg_mse', 'cv_r2', 'num_feat', 'cv_time', 'experiment'])


In [None]:
dfcvresults

In [None]:
# Train/test results again, for comparison with the cross-validation table:
# one row per model and one column per (metric, experiment) pair, with the lowest
# mse/time and the highest r2 of every column highlighted.
dfresults.pivot(index=['model'],values=['mse','r2','num_feat','time'],columns=['experiment'])\
            .style.format("{:.5f}")\
            .set_properties(**{'background-color':'lightyellow'}, subset=['mse'])\
            .set_properties(**{'background-color':'lightgreen'}, subset=['r2'])\
            .set_properties(**{'background-color':'cyan'}, subset=['time'])\
            .highlight_min(axis=0, subset=['mse','time'])\
            .highlight_max(axis=0, subset=['r2'])


## Nivell 2
### Exercici 5
Realitza algun procés d’enginyeria de variables per millorar-ne la predicció

<div class="alert-info">
    <strong>Info: </strong>Resultat Experiment 5 </div>


## Nivell 3
### Exercici 6
No utilitzis la variable DepDelay a l’hora de fer prediccions

<div class="alert-info">
    <strong>Info: </strong>Resultat Experiments 6 i 7</div>



In [None]:
sns.histplot(dfdelays.DepDelay)

In [None]:
dfdelays.DepDelay.describe()