## Practicant amb training i test sets

In [1]:
# importem llibreries

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from datetime import datetime
import time

import os

import warnings

# Suppress every warning category for the rest of the kernel session so that
# cell outputs stay clean (this also hides deprecation notices).
warnings.simplefilter(action='ignore')

In [2]:
# Notebook-wide constants.

# NOTE(review): presumably the delay threshold in minutes; it is not used in
# the visible cells, so confirm against the cells that consume it.
DELAYED_MIN = 10

# Conversion factor from miles to kilometres.
ML_TO_KM = 1.609344

In [3]:
def _format_float(value):
    """Render a float with exactly four decimal places."""
    return '%.4f' % value


# Show floats in DataFrame/Series output with four decimals.
pd.set_option('display.float_format', _format_float)

### Carreguem les dades

In [4]:
# Path of the flight-delay dataset, relative to the notebook.
file = '../data/DelayedFlights.csv'

# Read the four departure/arrival time columns as strings instead of numbers.
# Each key must match the CSV header exactly: a key that matches no column is
# silently ignored and that column falls back to its inferred dtype.
types = {'DepTime': str, 'CRSDepTime': str, 'ArrTime': str, 'CRSArrTime': str}

# Column groups that could be combined into departure/arrival timestamps.
# NOTE(review): this list is defined but not passed to read_csv below; confirm
# whether date parsing is still wanted before wiring it in.
parse_dates = [
    ['Year', 'Month', 'DayofMonth', 'DepTime'],
    ['Year', 'Month', 'DayofMonth', 'ArrTime'],
]

dfdelays = pd.read_csv(file, dtype=types)

# Rename the first column to 'ind'; every other header is kept as loaded.
dfdelays.columns = ['ind', *dfdelays.columns[1:]]

# Carrier table, kept as a dict taken from its 'Description' column and keyed
# by the first column of the file.
carrier_file = '../data/Unique_carriers.csv'
carriers = pd.read_csv(carrier_file, index_col=0).to_dict()['Description']

# Airport table, downloaded directly from the website linked in the Kaggle
# dataset (this cell therefore needs network access). Kept as a dict taken
# from its 'airport' column and keyed by the first column of the file.
airports = pd.read_csv('http://stat-computing.org/dataexpo/2009/airports.csv', index_col=0).to_dict()['airport']

### Exercici 1
Parteix el conjunt de dades DelayedFlights.csv en train i test. Estudia els dos conjunts per separat, a nivell descriptiu.

In [5]:
# Print a summary of the loaded frame: index range, column names and dtypes.
dfdelays.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1936758 entries, 0 to 1936757
Data columns (total 30 columns):
 #   Column             Dtype  
---  ------             -----  
 0   ind                int64  
 1   Year               int64  
 2   Month              int64  
 3   DayofMonth         int64  
 4   DayOfWeek          int64  
 5   DepTime            object 
 6   CRSDepTime         object 
 7   ArrTime            object 
 8   CRSArrTime         int64  
 9   UniqueCarrier      object 
 10  FlightNum          int64  
 11  TailNum            object 
 12  ActualElapsedTime  float64
 13  CRSElapsedTime     float64
 14  AirTime            float64
 15  ArrDelay           float64
 16  DepDelay           float64
 17  Origin             object 
 18  Dest               object 
 19  Distance           int64  
 20  TaxiIn             float64
 21  TaxiOut            float64
 22  Cancelled          int64  
 23  CancellationCode   object 
 24  Diverted           int64  
 25  CarrierDelay      

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
# Hold out 20% of the rows as the test set. The seed is fixed so that a
# Restart & Run All produces the same partition, and therefore the same
# descriptive outputs, every time.
train_delays, test_delays = train_test_split(dfdelays, test_size=0.2, random_state=42)

In [12]:
# Preview the ten training rows with the lowest 'ind' values.
train_delays.sort_values(by='ind', ascending=True).head(n=10)

Unnamed: 0,ind,Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,...,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
0,0,2008,1,3,4,2003.0,1955,2211.0,2225,WN,...,4.0,8.0,0,N,0,,,,,
2,2,2008,1,3,4,628.0,620,804.0,750,WN,...,3.0,17.0,0,N,0,,,,,
3,4,2008,1,3,4,1829.0,1755,1959.0,1925,WN,...,3.0,10.0,0,N,0,2.0,0.0,0.0,0.0,32.0
4,5,2008,1,3,4,1940.0,1915,2121.0,2110,WN,...,4.0,10.0,0,N,0,,,,,
5,6,2008,1,3,4,1937.0,1830,2037.0,1940,WN,...,3.0,7.0,0,N,0,10.0,0.0,0.0,0.0,47.0
6,10,2008,1,3,4,706.0,700,916.0,915,WN,...,5.0,19.0,0,N,0,,,,,
7,11,2008,1,3,4,1644.0,1510,1845.0,1725,WN,...,6.0,8.0,0,N,0,8.0,0.0,0.0,0.0,72.0
8,15,2008,1,3,4,1029.0,1020,1021.0,1010,WN,...,6.0,9.0,0,N,0,,,,,
9,16,2008,1,3,4,1452.0,1425,1640.0,1625,WN,...,7.0,8.0,0,N,0,3.0,0.0,0.0,0.0,12.0
10,17,2008,1,3,4,754.0,745,940.0,955,WN,...,5.0,16.0,0,N,0,,,,,


In [10]:
# Preview the ten test rows with the lowest 'ind' values.
test_delays.sort_values(by='ind', ascending=True).head(n=10)

Unnamed: 0,ind,Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,...,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
1,1,2008,1,3,4,754.0,735,1002.0,1000,WN,...,5.0,10.0,0,N,0,,,,,
23,37,2008,1,3,4,1812.0,1650,1927.0,1815,WN,...,6.0,11.0,0,N,0,3.0,0.0,0.0,0.0,69.0
27,41,2008,1,3,4,1749.0,1725,2019.0,2030,WN,...,4.0,8.0,0,N,0,,,,,
31,45,2008,1,3,4,1528.0,1510,1802.0,1810,WN,...,4.0,6.0,0,N,0,,,,,
37,56,2008,1,3,4,948.0,925,959.0,940,WN,...,3.0,9.0,0,N,0,0.0,0.0,0.0,0.0,19.0
48,78,2008,1,3,4,1859.0,1850,1950.0,1945,WN,...,4.0,9.0,0,N,0,,,,,
52,84,2008,1,3,4,1614.0,1600,1833.0,1825,WN,...,4.0,15.0,0,N,0,,,,,
61,96,2008,1,3,4,2039.0,1930,155.0,55,WN,...,5.0,14.0,0,N,0,0.0,0.0,22.0,0.0,38.0
62,98,2008,1,3,4,1611.0,1535,1849.0,1825,WN,...,3.0,22.0,0,N,0,9.0,0.0,0.0,0.0,15.0
64,101,2008,1,3,4,2118.0,2015,2224.0,2115,WN,...,3.0,17.0,0,N,0,17.0,0.0,6.0,0.0,46.0


### Exercici 2
Aplica algun procés de transformació (estandarditzar les dades numèriques, crear columnes dummies, polinomis...).

### Exercici 3
Resumeix les noves columnes generades de manera estadística i gràfica.