In [2]:
import pandas as pd
import joblib
import glob
import os

In [3]:
root_path = '/hot/data/flights_all/'

In [4]:
paths = glob.glob(os.path.join(root_path, 'flights*.csv'))

In [6]:
path = paths[0]

In [7]:
# on which files is weather delay always NULL?
# on which files not?

In [8]:
df = pd.read_csv(path,usecols=['WEATHER_DELAY'])

In [10]:
# use sampling with the first 10 and last 10 rows for detection

In [70]:
def count_values(df, column='WEATHER_DELAY'):
    """Count missing and present entries in one column of a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains ``column``.
    column : str, optional
        Name of the column to inspect. Defaults to ``'WEATHER_DELAY'``.

    Returns
    -------
    dict
        ``{'n_na': ..., 'n_nona': ..., 'n': ...}`` with plain ``int`` values:
        the number of rows where the column is NA, the number of rows where
        it is not NA, and the total number of rows.
    """
    num_values = len(df)
    # Summing the boolean mask counts the NA rows directly, without building
    # filtered copies of the frame just to measure their length.
    num_na_values = int(df[column].isna().sum())
    # Every row is either NA or not, so the complement is the non-NA count.
    num_non_navalues = num_values - num_na_values

    return {'n_na' : num_na_values, 'n_nona' : num_non_navalues, 'n' : num_values}

In [71]:
len(df), len(df.dropna())

(606293, 120126)

In [72]:
# overlap? tiny files? => ignore.
count_values(pd.concat((df.head(10), df.tail(10))))

{'n_na': 15, 'n_nona': 5, 'n': 20}

In [73]:
def get_stats(path):
    """Compare full-file and sampled NA statistics of WEATHER_DELAY for one CSV.

    Reads only the ``WEATHER_DELAY`` column of the CSV at ``path`` and counts
    NA / non-NA rows twice: once over the whole file and once over a sample
    made of the first 10 and last 10 rows (files with 20 rows or fewer are
    used whole).

    Returns a dict with the keys ``'actual'`` and ``'sample'`` (the two count
    dicts from ``count_values``), ``'path'``, ``'name'`` (the file's base
    name), and the flags ``'actual_constant'`` / ``'sample_constant'``, which
    are True when every counted row is NA.
    """
    full = pd.read_csv(path, usecols=['WEATHER_DELAY'])

    # Only files longer than 20 rows are cut down to their head and tail.
    sample = pd.concat((full.head(10), full.tail(10))) if len(full) > 20 else full

    actual_counts = count_values(full)
    sample_counts = count_values(sample)

    return {
        'actual': actual_counts,
        'sample': sample_counts,
        'path': path,
        'name': os.path.basename(path),
        # "Constant" means every counted row is NA.
        'actual_constant': actual_counts['n_na'] == actual_counts['n'],
        'sample_constant': sample_counts['n_na'] == sample_counts['n'],
    }

In [74]:
get_stats(path)

{'actual': {'n_na': 486167, 'n_nona': 120126, 'n': 606293},
 'sample': {'n_na': 15, 'n_nona': 5, 'n': 20},
 'path': '/hot/data/flights_all/flights_on_time_performance_2008_05.csv',
 'name': 'flights_on_time_performance_2008_05.csv',
 'actual_constant': False,
 'sample_constant': False}

In [75]:
# Imports needed for the parallel scan and its timing.
import time
from joblib import Memory
from joblib import Parallel, delayed

# Run get_stats once per CSV path on 32 parallel workers, measuring the
# wall-clock time of the whole scan.
start = time.time()
results = Parallel(n_jobs=32, verbose=10)(
    delayed(get_stats)(csv_path) for csv_path in paths
)
stop = time.time()

print('Elapsed time for the entire processing: {:.2f} s'.format(stop - start))

[Parallel(n_jobs=32)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=32)]: Done   8 tasks      | elapsed:    3.3s
[Parallel(n_jobs=32)]: Done  21 tasks      | elapsed:    4.1s
[Parallel(n_jobs=32)]: Done  34 tasks      | elapsed:    5.4s
[Parallel(n_jobs=32)]: Done  49 tasks      | elapsed:    6.6s
[Parallel(n_jobs=32)]: Done  64 tasks      | elapsed:    7.6s
[Parallel(n_jobs=32)]: Done  81 tasks      | elapsed:    9.2s
[Parallel(n_jobs=32)]: Done  98 tasks      | elapsed:   10.4s
[Parallel(n_jobs=32)]: Done 117 tasks      | elapsed:   12.1s
[Parallel(n_jobs=32)]: Done 136 tasks      | elapsed:   13.3s
[Parallel(n_jobs=32)]: Done 157 tasks      | elapsed:   15.0s
[Parallel(n_jobs=32)]: Done 178 tasks      | elapsed:   17.1s
[Parallel(n_jobs=32)]: Done 201 tasks      | elapsed:   18.9s
[Parallel(n_jobs=32)]: Done 224 tasks      | elapsed:   20.8s
[Parallel(n_jobs=32)]: Done 249 tasks      | elapsed:   22.8s
[Parallel(n_jobs=32)]: Done 274 tasks      | elapsed:  

Elapsed time for the entire processing: 36.14 s


[Parallel(n_jobs=32)]: Done 410 out of 410 | elapsed:   36.1s finished


In [76]:
df_r = pd.DataFrame(results)

In [77]:
# sampling accurate?
# ==> yep. sufficient to answer everything basically...

In [87]:
# Count the files where the sampled verdict disagrees with the full-file
# verdict by summing the element-wise mismatch mask.
num_wrongly_sampled = int(
    (df_r['actual_constant'].values != df_r['sample_constant'].values).sum()
)

print('{} out of {} wrongly detected'.format(num_wrongly_sampled, len(df_r)))

21 out of 410 wrongly detected


For the paper, we could report precision/recall/accuracy of this sample-based detection.

In [83]:
# how large is the sampling error?

df_r[df_r['actual_constant'].values != df_r['sample_constant'].values]

Unnamed: 0,actual,sample,path,name,actual_constant,sample_constant
8,"{'n_na': 502649, 'n_nona': 142650, 'n': 645299}","{'n_na': 20, 'n_nona': 0, 'n': 20}",/hot/data/flights_all/flights_on_time_performa...,flights_on_time_performance_2018_07.csv,False,True
11,"{'n_na': 398726, 'n_nona': 92285, 'n': 491011}","{'n_na': 20, 'n_nona': 0, 'n': 20}",/hot/data/flights_all/flights_on_time_performa...,flights_on_time_performance_2014_10.csv,False,True
80,"{'n_na': 521183, 'n_nona': 93619, 'n': 614802}","{'n_na': 20, 'n_nona': 0, 'n': 20}",/hot/data/flights_all/flights_on_time_performa...,flights_on_time_performance_2005_05.csv,False,True
120,"{'n_na': 432249, 'n_nona': 94684, 'n': 526933}","{'n_na': 20, 'n_nona': 0, 'n': 20}",/hot/data/flights_all/flights_on_time_performa...,flights_on_time_performance_2012_06.csv,False,True
171,"{'n_na': 421275, 'n_nona': 72613, 'n': 493888}","{'n_na': 20, 'n_nona': 0, 'n': 20}",/hot/data/flights_all/flights_on_time_performa...,flights_on_time_performance_2011_12.csv,False,True
175,"{'n_na': 523399, 'n_nona': 135630, 'n': 659029}","{'n_na': 20, 'n_nona': 0, 'n': 20}",/hot/data/flights_all/flights_on_time_performa...,flights_on_time_performance_2019_07.csv,False,True
191,"{'n_na': 458705, 'n_nona': 121429, 'n': 580134}","{'n_na': 20, 'n_nona': 0, 'n': 20}",/hot/data/flights_all/flights_on_time_performa...,flights_on_time_performance_2009_07.csv,False,True
205,"{'n_na': 469453, 'n_nona': 139742, 'n': 609195}","{'n_na': 20, 'n_nona': 0, 'n': 20}",/hot/data/flights_all/flights_on_time_performa...,flights_on_time_performance_2005_06.csv,False,True
217,"{'n_na': 454800, 'n_nona': 54740, 'n': 509540}","{'n_na': 20, 'n_nona': 0, 'n': 20}",/hot/data/flights_all/flights_on_time_performa...,flights_on_time_performance_2009_11.csv,False,True
247,"{'n_na': 415225, 'n_nona': 70908, 'n': 486133}","{'n_na': 20, 'n_nona': 0, 'n': 20}",/hot/data/flights_all/flights_on_time_performa...,flights_on_time_performance_2012_01.csv,False,True


In [79]:
# how many are constant? how many not?

In [80]:
df_r['actual_constant'].value_counts()

False    222
True     188
Name: actual_constant, dtype: int64

In [93]:
df_r[df_r['actual_constant']]['name'].sort_values()

348    flights_on_time_performance_1987_10.csv
184    flights_on_time_performance_1987_11.csv
357    flights_on_time_performance_1987_12.csv
158    flights_on_time_performance_1988_01.csv
340    flights_on_time_performance_1988_02.csv
                        ...                   
284    flights_on_time_performance_2003_01.csv
187    flights_on_time_performance_2003_02.csv
344    flights_on_time_performance_2003_03.csv
280    flights_on_time_performance_2003_04.csv
131    flights_on_time_performance_2003_05.csv
Name: name, Length: 188, dtype: object

In [94]:
# Up to and including 2003/05, WEATHER_DELAY is constant (always NULL).
# Thus, we do unnecessary work for all of these files, which in total amount to

# XX GB of data.

In [43]:
df_r

Unnamed: 0,actual,sample,path,name,actual_constant,sample_constant
0,"{'n_na': 486167, 'n_nona': 120126, 'n': 486167}","{'n_na': 15, 'n_nona': 5, 'n': 15}",/hot/data/flights_all/flights_on_time_performa...,flights_on_time_performance_2008_05.csv,True,True
1,"{'n_na': 383950, 'n_nona': 0, 'n': 383950}","{'n_na': 20, 'n_nona': 0, 'n': 20}",/hot/data/flights_all/flights_on_time_performa...,flights_on_time_performance_1993_02.csv,True,True
2,"{'n_na': 427091, 'n_nona': 0, 'n': 427091}","{'n_na': 20, 'n_nona': 0, 'n': 20}",/hot/data/flights_all/flights_on_time_performa...,flights_on_time_performance_1995_11.csv,True,True
3,"{'n_na': 446427, 'n_nona': 0, 'n': 446427}","{'n_na': 20, 'n_nona': 0, 'n': 20}",/hot/data/flights_all/flights_on_time_performa...,flights_on_time_performance_1998_06.csv,True,True
4,"{'n_na': 492088, 'n_nona': 134105, 'n': 492088}","{'n_na': 16, 'n_nona': 4, 'n': 16}",/hot/data/flights_all/flights_on_time_performa...,flights_on_time_performance_2018_06.csv,True,True
...,...,...,...,...,...,...
405,"{'n_na': 438454, 'n_nona': 0, 'n': 438454}","{'n_na': 20, 'n_nona': 0, 'n': 20}",/hot/data/flights_all/flights_on_time_performa...,flights_on_time_performance_1988_12.csv,True,True
406,"{'n_na': 428781, 'n_nona': 0, 'n': 428781}","{'n_na': 20, 'n_nona': 0, 'n': 20}",/hot/data/flights_all/flights_on_time_performa...,flights_on_time_performance_1990_11.csv,True,True
407,"{'n_na': 529940, 'n_nona': 0, 'n': 529940}","{'n_na': 20, 'n_nona': 0, 'n': 20}",/hot/data/flights_all/flights_on_time_performa...,flights_on_time_performance_2001_01.csv,True,True
408,"{'n_na': 432725, 'n_nona': 0, 'n': 432725}","{'n_na': 20, 'n_nona': 0, 'n': 20}",/hot/data/flights_all/flights_on_time_performa...,flights_on_time_performance_1997_11.csv,True,True
