In [None]:
import tuplex

In [None]:
import pandas as pd
import os

In [None]:
null_based_file = "/Users/leonhards/Downloads/flights/flights_on_time_performance_2013_01.csv"

In [None]:
df = pd.read_csv(null_based_file)

In [None]:
df['WEATHER_DELAY'].value_counts(dropna=False)

In [None]:
421309 / len(df) * 100.0

In [None]:
# so 82% of all values are null and need value imputation

In [None]:
# is weather delay related to overall delay?

In [None]:
# weather delay should be ONLY imputed if flight is not on time.
# cf. https://www.bts.gov/topics/airlines-and-airports/understanding-reporting-causes-flight-delays-and-cancellations#q8

# i.e. before 2003/06 the data is missing.

In [None]:
list(df.columns)

In [None]:
df['ARR_DELAY_NEW']

In [None]:
delay_cols = sorted([c for c in df.columns if 'DELAY' in c])
df[df['ARR_DELAY'] < 0.].iloc[:2][delay_cols]

In [None]:
df[~df['DIV_ARR_DELAY'].isna()][delay_cols]

In [None]:
# ok, so here's the logic:

def fill_in_delays(row):
    # want to fill in data for missing carrier_delay, weather delay etc.
    # only need to do that prior to 2003/06
    
    year = row['YEAR']
    month = row['MONTH']
    arr_delay = row['ARR_DELAY']
    
    if year == 2003 and month < 6 or year < 2003:
        # fill in delay breakdown using model and complex logic
        if arr_delay < 0.:
            # stays None, because flight arrived early
            # if diverted though, need to add everything to div_arr_delay
            return {'year' : year, 'month' : month,
                    'day' : row['DAY_OF_MONTH'],
                    'carrier': row['OP_UNIQUE_CARRIER'],
                    'flightno' : row['OP_CARRIER_FL_NUM'],
                    'origin': row['ORIGIN_AIRPORT_ID'],
                    'dest': row['DEST_AIRPORT_ID'],
                    'distance' : row['DISTANCE'],
                    'dep_delay' : row['DEP_DELAY'],
                    'arr_delay': row['ARR_DELAY'],
                    'carrier_delay' : None,
                    'weather_delay': None,
                    'nas_delay' : None,
                    'security_delay': None,
                    'late_aircraft_delay' : None}
        elif arr_delay < 5.:
            # it's an ontime flight, just attribute any delay to the carrier
            carrier_delay = arr_delay
            # set the rest to 0
            # ....
            return {'year' : year, 'month' : month,
                    'day' : row['DAY_OF_MONTH'],
                    'carrier': row['OP_UNIQUE_CARRIER'],
                    'flightno' : row['OP_CARRIER_FL_NUM'],
                    'origin': row['ORIGIN_AIRPORT_ID'],
                    'dest': row['DEST_AIRPORT_ID'],
                    'distance' : row['DISTANCE'],
                    'dep_delay' : row['DEP_DELAY'],
                    'arr_delay': row['ARR_DELAY'],
                    'carrier_delay' : carrier_delay,
                    'weather_delay': None,
                    'nas_delay' : None,
                    'security_delay': None,
                    'late_aircraft_delay' : None}
        else:
            # use model to determine everything and set into (join with weather data?)
            # i.e., extract here a couple additional columns & use them for features etc.!
            crs_dep_time = row['CRS_DEP_TIME']
            crs_elapsed_time = row['CRS_ELAPSED_TIME']
            carrier_delay = 1024
            weather_delay = 2000
            nas_delay = 3600
            security_delay = 7200
            late_aircraft_delay = 20
            return {'year' : year, 'month' : month,
                    'day' : row['DAY_OF_MONTH'],
                    'carrier': row['OP_UNIQUE_CARRIER'],
                    'flightno' : row['OP_CARRIER_FL_NUM'],
                    'origin': row['ORIGIN_AIRPORT_ID'],
                    'dest': row['DEST_AIRPORT_ID'],
                    'distance' : row['DISTANCE'],
                    'dep_delay' : row['DEP_DELAY'],
                    'arr_delay': row['ARR_DELAY'],
                    'carrier_delay' : carrier_delay,
                    'weather_delay': weather_delay,
                    'nas_delay' : nas_delay,
                    'security_delay': security_delay,
                    'late_aircraft_delay' : late_aircraft_delay}
    else:
        # just return it as is
        return {'year' : year, 'month' : month,
                'day' : row['DAY_OF_MONTH'],
                'carrier': row['OP_UNIQUE_CARRIER'],
                'flightno' : row['OP_CARRIER_FL_NUM'],
                'origin': row['ORIGIN_AIRPORT_ID'],
                'dest': row['DEST_AIRPORT_ID'],
                'distance' : row['DISTANCE'],
                'dep_delay' : row['DEP_DELAY'],
                'arr_delay': row['ARR_DELAY'],
                'carrier_delay' : row['CARRIER_DELAY'],
                'weather_delay':row['WEATHER_DELAY'],
                'nas_delay' : row['NAS_DELAY'],
                'security_delay': row['SECURITY_DELAY'],
                'late_aircraft_delay' : row['LATE_AIRCRAFT_DELAY']}
   
    # then aggregate them with replacing NULLs with 0s?
# exclude diverted/cancelled flights (!!!)
filter(!diverted and !cancelled)
filter(arr_delay > 0)

In [None]:
# Train quickly a linear model to fill in data!

# fetch traindata!


In [3]:
import pandas as pd
import os
import glob
import sklearn
import time

In [None]:
df = pd.DataFrame()

for month in range(1, 13):
    for year in range(2003, 2021):
        path = '/hot/data/flights_all/flights_on_time_performance_{:04d}_{:02d}.csv'.format(year, month)
        
        if os.path.isfile(path):
            tstart = time.time()
            print('loading {}'.format(os.path.basename(path)))
            subdf = pd.read_csv(path, low_memory=False)
            subdf = subdf[~subdf['WEATHER_DELAY'].isna()]
            df = pd.concat((df, subdf))
            print('loaded {} rows in {:.2f}s'.format(len(subdf), time.time() - tstart))
print('loaded {} rows in total.'.format(len(df)))

loading flights_on_time_performance_2003_01.csv
loaded 0 rows in 19.59s
loading flights_on_time_performance_2004_01.csv
loaded 128236 rows in 17.93s
loading flights_on_time_performance_2005_01.csv
loaded 143316 rows in 18.85s
loading flights_on_time_performance_2006_01.csv
loaded 112299 rows in 19.34s
loading flights_on_time_performance_2007_01.csv
loaded 150144 rows in 21.19s
loading flights_on_time_performance_2008_01.csv
loaded 148807 rows in 21.32s
loading flights_on_time_performance_2009_01.csv
loaded 108656 rows in 19.80s
loading flights_on_time_performance_2010_01.csv
loaded 96928 rows in 6.86s
loading flights_on_time_performance_2011_01.csv
loaded 97078 rows in 19.78s
loading flights_on_time_performance_2012_01.csv
loaded 70908 rows in 19.63s
loading flights_on_time_performance_2013_01.csv
loaded 88210 rows in 20.95s
loading flights_on_time_performance_2014_01.csv
loaded 119994 rows in 20.79s
loading flights_on_time_performance_2015_01.csv
loaded 95951 rows in 21.68s
loading fl

loaded 14949 rows in 63.94s
loading flights_on_time_performance_2003_07.csv
loaded 104579 rows in 73.76s
loading flights_on_time_performance_2004_07.csv
loaded 135999 rows in 75.91s
loading flights_on_time_performance_2005_07.csv
loaded 166767 rows in 77.81s
loading flights_on_time_performance_2006_07.csv
loaded 150771 rows in 78.38s
loading flights_on_time_performance_2007_07.csv
loaded 180319 rows in 80.32s
loading flights_on_time_performance_2008_07.csv
loaded 140050 rows in 80.46s
loading flights_on_time_performance_2009_07.csv
loaded 121429 rows in 80.17s
loading flights_on_time_performance_2010_07.csv
loaded 122900 rows in 81.01s
loading flights_on_time_performance_2011_07.csv
loaded 110478 rows in 80.55s
loading flights_on_time_performance_2012_07.csv
loaded 121396 rows in 81.33s
loading flights_on_time_performance_2013_07.csv
loaded 142559 rows in 83.04s
loading flights_on_time_performance_2014_07.csv
loaded 117151 rows in 82.53s
loading flights_on_time_performance_2015_07.csv


In [6]:
# Need to create a super basic model...
# use spark for quick preprocessing??? Or Tuplex???

In [7]:
len(df)

21744768

In [9]:
!mkdir /hot/scratch/flights_lm

In [11]:
#df.to_csv('/hot/scratch/flights_lm/rawdata.csv', index=None)

In [12]:
df.columns

Index(['YEAR', 'QUARTER', 'MONTH', 'DAY_OF_MONTH', 'DAY_OF_WEEK', 'FL_DATE',
       'OP_UNIQUE_CARRIER', 'OP_CARRIER_AIRLINE_ID', 'OP_CARRIER', 'TAIL_NUM',
       ...
       'DIV5_AIRPORT', 'DIV5_AIRPORT_ID', 'DIV5_AIRPORT_SEQ_ID',
       'DIV5_WHEELS_ON', 'DIV5_TOTAL_GTIME', 'DIV5_LONGEST_GTIME',
       'DIV5_WHEELS_OFF', 'DIV5_TAIL_NUM', 'Unnamed: 109', 'Unnamed: 23'],
      dtype='object', length=111)

In [15]:
delay_cols = [name for name in df.columns if 'DELAY' in name]

In [16]:
delay_cols

['DEP_DELAY',
 'DEP_DELAY_NEW',
 'DEP_DELAY_GROUP',
 'ARR_DELAY',
 'ARR_DELAY_NEW',
 'ARR_DELAY_GROUP',
 'CARRIER_DELAY',
 'WEATHER_DELAY',
 'NAS_DELAY',
 'SECURITY_DELAY',
 'LATE_AIRCRAFT_DELAY',
 'DIV_ARR_DELAY']

In [17]:
# predict these!
target_vars = ['CARRIER_DELAY',
'WEATHER_DELAY',
'NAS_DELAY',
'SECURITY_DELAY',
'LATE_AIRCRAFT_DELAY']

In [18]:
target_vars

['CARRIER_DELAY',
 'WEATHER_DELAY',
 'NAS_DELAY',
 'SECURITY_DELAY',
 'LATE_AIRCRAFT_DELAY']

In [19]:
y = df[target_vars]

In [22]:
y.values[:10]

array([[  0.,   0.,  22.,   0., 195.],
       [  0.,  62.,   2.,   0.,  46.],
       [  0.,   0.,   2.,   0., 108.],
       [  0.,   4.,  18.,   0.,  16.],
       [  0.,   0.,  40.,   0.,   0.],
       [ 33.,   0.,   0.,   0.,   0.],
       [ 17.,   0.,   8.,   0.,   0.],
       [  0.,   3.,  19.,   0.,  16.],
       [ 27.,   0.,   0.,   0.,   0.],
       [  0.,   0.,  37.,   0.,   0.]])

In [24]:
# features:

# which airline, which year, month, day, ...
# which time
# what distance

In [26]:
airline_codes = df['OP_UNIQUE_CARRIER'].unique()

In [29]:
airline_codes = list(airline_codes)

In [30]:
airline_codes

['AA',
 'DH',
 'DL',
 'EV',
 'HP',
 'MQ',
 'NW',
 'OO',
 'XE',
 'UA',
 'US',
 'WN',
 'CO',
 'FL',
 'OH',
 'HA',
 'TZ',
 'AS',
 'B6',
 'F9',
 'YV',
 '9E',
 'KH',
 'VX',
 'NK',
 'G4',
 'YX']

In [57]:
unique_states = df['ORIGIN_STATE_ABR'].unique()

In [58]:
list(unique_states)

['MO',
 'NY',
 'TX',
 'MA',
 'IL',
 'FL',
 'UT',
 'GA',
 'NC',
 'NJ',
 'CA',
 'TN',
 'NE',
 'PA',
 'MI',
 'RI',
 'OK',
 'VA',
 'PR',
 'IA',
 'NV',
 'IN',
 'WA',
 'LA',
 'AZ',
 'NM',
 'CO',
 'MN',
 'VI',
 'KS',
 'OR',
 'CT',
 'OH',
 'MD',
 'AR',
 'KY',
 'SC',
 'SD',
 'VT',
 'AL',
 'ME',
 'WI',
 'ND',
 'WV',
 'NH',
 'MT',
 'ID',
 'WY',
 'AK',
 'MS',
 'HI',
 'DE',
 nan,
 'TT']

In [59]:
unique_states = ['MO',
 'NY',
 'TX',
 'MA',
 'IL',
 'FL',
 'UT',
 'GA',
 'NC',
 'NJ',
 'CA',
 'TN',
 'NE',
 'PA',
 'MI',
 'RI',
 'OK',
 'VA',
 'PR',
 'IA',
 'NV',
 'IN',
 'WA',
 'LA',
 'AZ',
 'NM',
 'CO',
 'MN',
 'VI',
 'KS',
 'OR',
 'CT',
 'OH',
 'MD',
 'AR',
 'KY',
 'SC',
 'SD',
 'VT',
 'AL',
 'ME',
 'WI',
 'ND',
 'WV',
 'NH',
 'MT',
 'ID',
 'WY',
 'AK',
 'MS',
 'HI',
 'DE',
 'TT']

In [61]:
len(df)

21744768

In [64]:
airport_codes = set(df['ORIGIN'].unique()) | set(df['DEST'].unique())

In [66]:
airport_codes = list(airport_codes)

In [80]:
'[' + ','.join(map(lambda x: "'{}'".format(x), airport_codes)) + ']'

"['RIC','SMX','BFF','STT','ALS','BIL','CRP','ORH','SHV','BMI','MVY','DRO','ESC','GCK','LNK','UST','YUM','MDT','CLD','PIE','DSM','BHM','HHH','PSG','BIS','LIH','ROW','DLH','HGR','DCA','HNL','DTW','LMT','AVL','LAW','ITH','HRL','LAX','PHL','SWO','CWA','BOS','GJT','SRQ','VIS','ACY','CPR','MOB','SJC','YNG','SNA','CSG','BRO','BFM','SWF','HYA','OGG','LBF','ANC','ONT','TRI','SCE','RKS','BTV','BGM','LGB','MCN','ROA','PGV','MSY','MQT','ITO','VLD','OMA','SUN','EVV','FOE','GUM','OGS','YKM','SAT','JMS','EWN','ASE','CIC','STL','CDC','LWB','SCC','SBP','DAY','CEC','SGF','OTH','DFW','INL','ILE','SPS','PSE','SFB','JAC','CHO','TXK','BET','BUR','PSC','DDC','TPA','HVN','BRD','PIB','ABQ','EWR','ORD','OWB','LCH','AKN','PSP','ISO','IAG','RDD','FSM','PFN','DRT','FMN','AGS','EAR','CMI','STC','LBB','SHD','IAD','MIA','EFD','RDU','HOU','GNV','LFT','FAT','MOT','PLN','SPI','IPT','RFD','OME','MFE','TYS','RHI','DEN','MKC','CDB','JAN','SYR','SIT','MBS','ABY','SCK','IPL','DLG','WYS','UIN','GRK','MRY','PIA','LAN','CRW','S

In [86]:
'[' + ','.join(map(lambda x: "'{}'".format(x), unique_states)) + ']'

"['MO','NY','TX','MA','IL','FL','UT','GA','NC','NJ','CA','TN','NE','PA','MI','RI','OK','VA','PR','IA','NV','IN','WA','LA','AZ','NM','CO','MN','VI','KS','OR','CT','OH','MD','AR','KY','SC','SD','VT','AL','ME','WI','ND','WV','NH','MT','ID','WY','AK','MS','HI','DE','TT']"

In [87]:
# for each row, compute feature vector (lambda func!)
def feature_vector(row):
    fv = []
    fv += [float(row['DAY_OF_MONTH']), float(row['DAY_OF_WEEK']), float(row['MONTH']),
           float(row['QUARTER']), float(row['CRS_ELAPSED_TIME']), float(row['DISTANCE'])]
    
    # one-hot encode arrival and departure time
    crs_arr_minute = int(row['CRS_ARR_TIME']) % 100
    crs_arr_hour = int(row['CRS_ARR_TIME'] / 100)
    crs_arr_15min = int(crs_arr_minute / 15)

    crs_dep_minute = int(row['CRS_DEP_TIME']) % 100 % 100
    crs_dep_hour = int(row['CRS_DEP_TIME'] / 100)
    crs_dep_15min = int(crs_dep_minute / 15)
    
    fv += [float(crs_arr_hour), float(crs_arr_15min), float(crs_dep_hour), float(crs_dep_15min)]
    
    # one-hot encode dest airport and origin airport
    airport_codes = ['RIC','SMX','BFF','STT','ALS','BIL','CRP','ORH','SHV','BMI','MVY',
                     'DRO','ESC','GCK','LNK','UST','YUM','MDT','CLD','PIE','DSM','BHM',
                     'HHH','PSG','BIS','LIH','ROW','DLH','HGR','DCA','HNL','DTW','LMT',
                     'AVL','LAW','ITH','HRL','LAX','PHL','SWO','CWA','BOS','GJT','SRQ',
                     'VIS','ACY','CPR','MOB','SJC','YNG','SNA','CSG','BRO','BFM','SWF',
                     'HYA','OGG','LBF','ANC','ONT','TRI','SCE','RKS','BTV','BGM','LGB',
                     'MCN','ROA','PGV','MSY','MQT','ITO','VLD','OMA','SUN','EVV','FOE',
                     'GUM','OGS','YKM','SAT','JMS','EWN','ASE','CIC','STL','CDC','LWB',
                     'SCC','SBP','DAY','CEC','SGF','OTH','DFW','INL','ILE','SPS','PSE',
                     'SFB','JAC','CHO','TXK','BET','BUR','PSC','DDC','TPA','HVN','BRD',
                     'PIB','ABQ','EWR','ORD','OWB','LCH','AKN','PSP','ISO','IAG','RDD',
                     'FSM','PFN','DRT','FMN','AGS','EAR','CMI','STC','LBB','SHD','IAD',
                     'MIA','EFD','RDU','HOU','GNV','LFT','FAT','MOT','PLN','SPI','IPT',
                     'RFD','OME','MFE','TYS','RHI','DEN','MKC','CDB','JAN','SYR','SIT',
                     'MBS','ABY','SCK','IPL','DLG','WYS','UIN','GRK','MRY','PIA','LAN',
                     'CRW','SBA','LEX','PVU','XNA','ROC','FCA','PIT','SAN','PHF','PSM',
                     'PHX','PIH','CVG','RSW','TUS','ABI','DEC','COS','LSE','FSD','ORF',
                     'BWI','BTM','CYS','UTM','ERI','EYW','HIB','LGA','BTR','MMH','YAK',
                     'JNU','ILM','BNA','GPT','FNT','PWM','BRW','AZA','MKK','ABR','DAB',
                     'MHK','MLB','GTR','SEA','SJU','CMX','ACT','BDL','KOA','ACV','RNO',
                     'OTZ','HLN','JST','STS','PAE','BQK','GCC','BLV','SLC','FLL','LYH',
                     'CGI','CIU','GST','TTN','AMA','AVP','JLN','EGE','WRG','MGM','TWF',
                     'BQN','CLE','MLU','GSP','APF','PGD','LBE','SOP','EUG','MLI','ADQ',
                     'CAE','FLG','MDW','CAK','SJT','SLE','MYR','GEG','ATL','PUB','USA',
                     'VEL','MCO','DIK','MWH','EKO','MFR','PBI','TUL','MOD','GRB','ALO',
                     'ECP','AZO','GUC','TUP','HDN','BLI','CNY','RDM','VPS','AEX','LRD',
                     'APN','BKG','VCT','PDX','PNS','GTF','PRC','SPN','SLN','LCK','OAK',
                     'SUX','ADK','TEX','BPT','SBN','OKC','FAR','DVL','ELM','TYR','MEM',
                     'SFO','PPG','DUT','STX','MTH','PVD','SAV','IMT','SDF','ART','HKY',
                     'HYS','DBQ','CMH','TLH','MTJ','CHA','GSO','CLL','OAJ','CHS','LNY',
                     'CID','ISN','ACK','IAH','IND','CLT','DAL','LAS','HPN','AUS','FAY',
                     'DHN','SGU','ATY','GGG','HOB','KTN','ILG','MEI','EAU','MSO','ATW',
                     'TOL','MSN','LBL','JFK','MKG','TVC','RIW','MSP','IFP','OXR','GRR',
                     'ICT','RST','PAH','IDA','BFL','OGD','SHR','BGR','LIT','MKE','HSV',
                     'BJI','ELP','PIR','ABE','COD','MCI','BZN','FLO','MHT','PMD','FAI',
                     'IYK','FWA','SMF','COU','XWA','BOI','HTS','RAP','PBG','JAX','ALB',
                     'ISP','CDV','GRI','MAF','GFK','BUF','LAR','SAF','LWS','CKB']
    
    origin_one_hot = [float(code == row['ORIGIN']) for code in airport_codes]
    dest_one_hot = [float(code == row['DEST']) for code in airport_codes]
    fv += origin_one_hot
    fv += dest_one_hot
    
    # state one hot encoding (can be None!)
    origin_state_is_none = row['ORIGIN_STATE_ABR'] is None
    fv.append(float(origin_state_is_none))
    dest_state_is_none = row['DEST_STATE_ABR'] is None
    fv.append(float(dest_state_is_none))
    
    # now perform state one hot encoding, make a bit faster with if logic!
    states_abbr = ['MO','NY','TX','MA','IL','FL','UT','GA','NC','NJ','CA','TN','NE','PA',
                   'MI','RI','OK','VA','PR','IA','NV','IN','WA','LA','AZ','NM','CO','MN',
                   'VI','KS','OR','CT','OH','MD','AR','KY','SC','SD','VT','AL','ME','WI',
                   'ND','WV','NH','MT','ID','WY','AK','MS','HI','DE','TT']
    
    state_origin_one_hot = [float(code == row['ORIGIN_STATE_ABR']) for code in states_abbr]
    state_dest_one_hot = [float(code == row['DEST_STATE_ABR']) for code in states_abbr]
    fv += state_origin_one_hot
    fv += state_dest_one_hot
    
    return fv

In [3]:
import numpy as np

In [4]:
import pandas as pd

In [5]:
nrows = 10000

In [None]:
minidf = pd.read_csv('/hot/scratch/flights_lm/rawdata.csv', nrows=nrows)

In [118]:
# Now encode quickly minidf into feature vector & fit sklearn model...


x = np.array(feature_vector(minidf.iloc[0]))
X = x

for idx, row in tqdm(minidf.reset_index(drop=True).iterrows()):
    if idx == 0:
        continue
    x = np.array(feature_vector(row))
    X = np.vstack((X,x))

1000it [00:08, 122.06it/s]


In [119]:
X.shape

(1000, 930)

In [120]:
y = minidf[target_vars].values

In [121]:
y.shape

(1000, 5)

In [159]:
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.preprocessing import StandardScaler

In [2]:
scaler = StandardScaler().fit(X)

NameError: name 'StandardScaler' is not defined

In [161]:
# scaler.mean_

In [147]:
sX = scaler.transform(X)

In [1]:
!pip3 install --upgrade scikit-learn

Collecting scikit-learn
  Using cached https://files.pythonhosted.org/packages/f5/ef/bcd79e8d59250d6e8478eb1290dc6e05be42b3be8a86e3954146adbc171a/scikit_learn-0.24.2-cp36-cp36m-manylinux1_x86_64.whl
Collecting threadpoolctl>=2.0.0 (from scikit-learn)
  Using cached https://files.pythonhosted.org/packages/61/cf/6e354304bcb9c6413c4e02a747b600061c21d38ba51e7e544ac7bc66aecc/threadpoolctl-3.1.0-py3-none-any.whl
Collecting scipy>=0.19.1 (from scikit-learn)
  Using cached https://files.pythonhosted.org/packages/c8/89/63171228d5ced148f5ced50305c89e8576ffc695a90b58fe5bb602b910c2/scipy-1.5.4-cp36-cp36m-manylinux1_x86_64.whl
Collecting joblib>=0.11 (from scikit-learn)
  Using cached https://files.pythonhosted.org/packages/3e/d5/0163eb0cfa0b673aa4fe1cd3ea9d8a81ea0f32e50807b0c295871e4aab2e/joblib-1.1.0-py2.py3-none-any.whl
Collecting numpy>=1.13.3 (from scikit-learn)
  Using cached https://files.pythonhosted.org/packages/45/b2/6c7545bb7a38754d63048c7696804a0d947328125d81bf12beaa692c3ae3/numpy-1.19.

In [163]:
help(Ridge)

Help on class Ridge in module sklearn.linear_model._ridge:

class Ridge(sklearn.base.MultiOutputMixin, sklearn.base.RegressorMixin, _BaseRidge)
 |  Linear least squares with l2 regularization.
 |  
 |  Minimizes the objective function::
 |  
 |  ||y - Xw||^2_2 + alpha * ||w||^2_2
 |  
 |  This model solves a regression model where the loss function is
 |  the linear least squares function and regularization is given by
 |  the l2-norm. Also known as Ridge Regression or Tikhonov regularization.
 |  This estimator has built-in support for multi-variate regression
 |  (i.e., when y is a 2d-array of shape (n_samples, n_targets)).
 |  
 |  Read more in the :ref:`User Guide <ridge_regression>`.
 |  
 |  Parameters
 |  ----------
 |  alpha : {float, ndarray of shape (n_targets,)}, default=1.0
 |      Regularization strength; must be a positive float. Regularization
 |      improves the conditioning of the problem and reduces the variance of
 |      the estimates. Larger values specify stronge

In [162]:
mdl = Ridge(positive=True)

TypeError: __init__() got an unexpected keyword argument 'positive'

In [155]:
reg = mdl.fit(sX, y)

In [156]:
reg.score(sX, y)

0.189922899853625

In [157]:
reg.coef_

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.69532202, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.70215925, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [158]:
reg.intercept_

array([1.61497473e+01, 5.16221997e+00, 1.70202572e+01, 3.00000000e-03,
       1.75490669e+01])

In [153]:
x = np.array(feature_vector(df.iloc[200000])).reshape(1, -1)
sx = scaler.transform(x)
reg.predict(sx)

array([[-2.72557524e+15,  8.88778350e+14, -3.30176617e+14,
        -2.89717579e+12, -1.25487036e+15]])

In [None]:
sklearn.loggi

In [60]:
# encode, also note there are potentially null values!

In [31]:
from tqdm import tqdm

In [32]:
minidf = df.iloc[:1000]

In [47]:
# holiday? ==> could use that as feature as well!
numeric_vars = ['DAY_OF_MONTH', 'DAY_OF_WEEK', 'MONTH', 'QUARTER', 'CRS_ELAPSED_TIME', 'DISTANCE']

In [48]:
crs_arr_minute = minidf['CRS_ARR_TIME'].astype(int) % 100
crs_arr_15min = (crs_arr_minute / 15).astype(int)

crs_dep_minute = minidf['CRS_DEP_TIME'].astype(int) % 100
crs_dep_15min = (crs_dep_minute / 15).astype(int)

In [50]:
# state

In [55]:
minidf['ORIGIN_STATE_ABR'].unique()

array(['MO', 'NY', 'TX', 'MA', 'IL', 'FL', 'UT', 'GA', 'NC', 'NJ', 'CA',
       'TN', 'NE', 'PA', 'MI', 'RI', 'OK', 'VA', 'PR', 'IA', 'NV', 'IN',
       'WA', 'LA', 'AZ', 'NM', 'CO', 'MN', 'VI', 'KS'], dtype=object)

In [13]:
delay_cols

NameError: name 'delay_cols' is not defined

In [2]:
!ls /hot/data/flights_all

flights_on_time_performance_1987_10.csv
flights_on_time_performance_1987_11.csv
flights_on_time_performance_1987_12.csv
flights_on_time_performance_1988_01.csv
flights_on_time_performance_1988_02.csv
flights_on_time_performance_1988_03.csv
flights_on_time_performance_1988_04.csv
flights_on_time_performance_1988_05.csv
flights_on_time_performance_1988_06.csv
flights_on_time_performance_1988_07.csv
flights_on_time_performance_1988_08.csv
flights_on_time_performance_1988_09.csv
flights_on_time_performance_1988_10.csv
flights_on_time_performance_1988_11.csv
flights_on_time_performance_1988_12.csv
flights_on_time_performance_1989_01.csv
flights_on_time_performance_1989_02.csv
flights_on_time_performance_1989_03.csv
flights_on_time_performance_1989_04.csv
flights_on_time_performance_1989_05.csv
flights_on_time_performance_1989_06.csv
flights_on_time_performance_1989_07.csv
flights_on_time_performance_1989_08.csv
flights_on_time_performance_1989_09.csv
flights_on_time_

In [None]:
delay_cols = sorted([c for c in df.columns if 'DELAY' in c])

In [None]:
# use the following logic:

# if flight arrives on time => check if arrival delay is larger than 0