In [401]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
## Show which TensorFlow release the notebook is running against
print(tf.__version__)

1.4.0


## Load data from ASRS database

In [402]:
file_name = 'ASRS_DBOnline.csv'
data = pd.read_csv(file_name)
## columns that are not used as model inputs
data = data.drop(columns = ['ACN', 'Date', 'Local Time Of Day', 'Ceiling', 'Callback', 'Callback.1'])

## everything except the 'Result' column is a candidate feature
X = data.drop(columns = 'Result')

## NOTE(review): the real target is still disabled, so Y holds PLACEHOLDER labels
## drawn uniformly from {0, 1, 2}. They are independent of X, so any model trained
## on them can only reach chance-level accuracy.
## TODO: switch to the line below once the encoding of 'Result' is settled.
#Y = data['Result']

## A dedicated, seeded generator keeps the placeholder labels (and every number
## derived from them) identical across "Restart & Run All".
label_rng = np.random.RandomState(101)
Y = pd.DataFrame(label_rng.randint(0, 3, size = X.shape[0]))
Y.head()

Unnamed: 0,0
0,1
1,2
2,2
3,2
4,2


In [403]:
## Normalise the column names: remove every '/ ' and turn spaces into underscores
new_col_name = [col.replace('/ ', '').replace(' ', '_') for col in X.columns]
X.columns = new_col_name

## show the renamed headers
X.keys()

Index(['Locale_Reference', 'State_Reference', 'Relative_Position.Angle.Radial',
       'Relative_Position.Distance.Nautical_Miles',
       'Altitude.AGL.Single_Value', 'Altitude.MSL.Single_Value',
       'Flight_Conditions', 'Weather_Elements_Visibility',
       'Work_Environment_Factor', 'Light', 'RVR.Single_Value', 'ATC_Advisory',
       'Aircraft_Operator', 'Make_Model_Name', 'Aircraft_Zone', 'Crew_Size',
       'Operating_Under_FAR_Part', 'Flight_Plan', 'Mission', 'Nav_In_Use',
       'Flight_Phase1', 'Route_In_Use', 'Airspace',
       'Maintenance_Status.Maintenance_Deferred',
       'Maintenance_Status.Records_Complete',
       'Maintenance_Status.Released_For_Service',
       'Maintenance_Status.Required_Correct_Doc_On_Board',
       'Maintenance_Status.Maintenance_Type',
       'Maintenance_Status.Maintenance_Items_Involved', 'Cabin_Lighting',
       'Number_Of_Seats.Number', 'Passengers_On_Board.Number',
       'Crew_Size_Flight_Attendant.Number_Of_Crew', 'Aircraft_Component',

## Output the data types of all the items

In [404]:
## For every column: remember the Python type of its first non-missing value and
## print the column name when more than 80% of its rows are NaN.
data_type = []
for item_name in X.keys():
    first_valid_index = X[item_name].first_valid_index()
    ## first_valid_index() returns None when the whole column is missing
    if first_valid_index is not None:
        data_type.append(type(X[item_name][first_valid_index]))

    ## summing the boolean mask counts the missing entries
    no_NaNs = X[item_name].isna().sum()
    if no_NaNs > 0.8 * X.shape[0]:
        print(item_name)

separator = '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~'
print('\n')
print(separator)
print('The unique data types across all the items are:', set(data_type))
print(separator)

Relative_Position.Angle.Radial
Relative_Position.Distance.Nautical_Miles
Work_Environment_Factor
RVR.Single_Value
Aircraft_Zone
Nav_In_Use
Maintenance_Status.Maintenance_Deferred
Maintenance_Status.Records_Complete
Maintenance_Status.Released_For_Service
Maintenance_Status.Required_Correct_Doc_On_Board
Maintenance_Status.Maintenance_Type
Maintenance_Status.Maintenance_Items_Involved
Cabin_Lighting
Number_Of_Seats.Number
Passengers_On_Board.Number
Crew_Size_Flight_Attendant.Number_Of_Crew
Manufacturer
ATC_Advisory.1
Aircraft_Operator.1
Make_Model_Name.1
Aircraft_Zone.1
Crew_Size.1
Operating_Under_FAR_Part.1
Flight_Plan.1
Mission.1
Nav_In_Use.1
Flight_Phase2
Route_In_Use.1
Airspace.1
Maintenance_Status.Maintenance_Deferred.1
Maintenance_Status.Records_Complete.1
Maintenance_Status.Released_For_Service.1
Maintenance_Status.Required_Correct_Doc_On_Board.1
Maintenance_Status.Maintenance_Type.1
Maintenance_Status.Maintenance_Items_Involved.1
Cabin_Lighting.1
Number_Of_Seats.Number.1
Passenge

## Count the number of missing values in a single item

In [405]:
item_name = 'Relative_Position.Angle.Radial'
## tally the missing (NaN) entries of this one column
missing_mask = X[item_name].isna()
no = missing_mask.sum()
print('The number of items with value equal to NaN is {}'.format(no))

The number of items with value equal to NaN is 2779


## Replace missing values with a placeholder chosen according to each item's data type

In [406]:
## Placeholder written into missing cells, keyed by the exact type of the
## column's first non-missing value. Columns whose first value has any other
## type are left untouched.
## NOTE(review): the np.int64 entry presumably never changes anything, because an
## integer-typed column should not contain NaN in the first place - confirm.
fill_value_by_type = {np.float64: -1, str: 'unknown', np.int64: -11}

for item_name in X.keys():
    first_valid_index = X[item_name].first_valid_index()
    ## first_valid_index() returns None when the whole column is missing;
    ## there is no value to infer a type from, so skip the column
    if first_valid_index is None:
        continue

    fill_value = fill_value_by_type.get(type(X[item_name][first_valid_index]))
    if fill_value is not None:
        ## Assign the filled column back instead of calling
        ## X[item_name].fillna(..., inplace = True): the chained in-place form
        ## acts on an intermediate Series and does not update X when pandas
        ## Copy-on-Write is active.
        X[item_name] = X[item_name].fillna(fill_value)

## item_name is still bound to the last column visited by the loop above
print('The number of unique records in this item is {}'.format(len(set(X[item_name]))))
X.head()

The number of unique records in this item is 2818


Unnamed: 0,Locale_Reference,State_Reference,Relative_Position.Angle.Radial,Relative_Position.Distance.Nautical_Miles,Altitude.AGL.Single_Value,Altitude.MSL.Single_Value,Flight_Conditions,Weather_Elements_Visibility,Work_Environment_Factor,Light,...,Anomaly,Miss_Distance,Were_Passengers_Involved_In_Event,Detector,When_Detected,Contributing_Factors_Situations,Primary_Problem,Narrative1,Narrative2,Synopsis
0,SFO.Tower,CA,-1.0,-1.0,0.0,-1.0,unknown,unknown,unknown,Night,...,Deviation - Procedural Clearance; Ground Incur...,unknown,unknown,Person Flight Crew,Taxi,Airport; Human Factors; Procedure,Airport,Aircraft X was assigned Runway 1R via A-A1. I ...,unknown,SFO Controller reported of a taxiing issue. Pi...
1,LAS.Tower,NV,-1.0,-1.0,100.0,-1.0,VMC,unknown,unknown,Daylight,...,ATC Issue All Types; Conflict Ground Conflict;...,unknown,unknown,Person Air Traffic Control,In-flight,ATC Equipment / Nav Facility / Buildings; Airp...,ATC Equipment / Nav Facility / Buildings,I was working Local Control. VFR weather. An E...,unknown,Local Controller reported a false ASDE-X warni...
2,ZMP.ARTCC,MN,-1.0,-1.0,-1.0,36000.0,unknown,unknown,unknown,Daylight,...,ATC Issue All Types; Airspace Violation All Ty...,unknown,unknown,Person Air Traffic Control,In-flight,Aircraft; Human Factors,Aircraft,Aircraft X was in conflict with another aircra...,unknown,ZMP ARTCC Controller asked an aircraft if they...
3,OGG.Tower,HI,-1.0,-1.0,-1.0,3000.0,VMC,unknown,unknown,Night,...,ATC Issue All Types; Conflict Airborne Conflic...,unknown,unknown,Person Air Traffic Control,In-flight,Airspace Structure; Procedure,Procedure,I was working Local Control with a [small airc...,unknown,TRACON switched an aircraft on a visual approa...
4,RDG.Tower,PA,-1.0,-1.0,0.0,-1.0,VMC,unknown,unknown,Daylight,...,ATC Issue All Types; Conflict Ground Conflict;...,unknown,unknown,Person Air Traffic Control,Taxi,ATC Equipment / Nav Facility / Buildings; Airport,ATC Equipment / Nav Facility / Buildings,Aircraft X was put into position on the runway...,unknown,RDG Local Controller reported a false ASDE-X w...


In [407]:
## Peek at 20 randomly chosen rows of the column that item_name currently refers to
X[item_name].sample(n = 20)

1125    The pilot of a C-172 reported a near-mid-air-c...
1997    ZNY Controller reported of a conflict with two...
1581    BUR Local Controller reported of a loss of sep...
2464    A Helicopter Pilot at 1;500 feet reported sigh...
24      ZID ARTCC Controller reported a loss of separa...
55      G-IV Captain was informed by ATC that he had l...
1071    During pushback brakes were not set as ground ...
1323    TYS Controller reported of a loss of separatio...
937     MD11 Dispatcher was informed by a crew that th...
1579    A departing BE99 aircraft assigned 3;000 feet ...
132     LR45 Captain experienced a failure of the land...
2278    C177B pilot reported executing a forced landin...
552     A320 Captain reported MEL procedures were not ...
2245    B737 Next Generation flight crew reported the ...
202     The B737 crew had different indications on the...
2328    A pilot reported skydiving operations continue...
2114    Flight crew reported ATC was confused about wh...
2431    B737NG

## Construct feature columns

In [408]:
def _hashed(key, buckets):
    """Return a hash-bucketed categorical feature column for the input named `key`."""
    return tf.feature_column.categorical_column_with_hash_bucket(key, hash_bucket_size = buckets)


## Place
Locale_Reference = _hashed('Locale_Reference', 500)
State_Reference = _hashed('State_Reference', 80)


## Environment
Flight_Conditions = _hashed('Flight_Conditions', 6)
Weather_Elements_Visibility = _hashed('Weather_Elements_Visibility', 20)
Work_Environment_Factor = _hashed('Work_Environment_Factor', 7)
Light = _hashed('Light', 7)


## Aircraft
ATC_Advisory = _hashed('ATC_Advisory', 500)
Aircraft_Operator = _hashed('Aircraft_Operator', 10)
Make_Model_Name = _hashed('Make_Model_Name', 300)
## Crew_Size is the only input fed to the model as a plain number
Crew_Size = tf.feature_column.numeric_column('Crew_Size', shape = [1])
Flight_Plan = _hashed('Flight_Plan', 10)
Mission = _hashed('Mission', 20)
Flight_Phase1 = _hashed('Flight_Phase1', 10)
Route_In_Use = _hashed('Route_In_Use', 10)
Airspace = _hashed('Airspace', 10)

## Component
Aircraft_Component = _hashed('Aircraft_Component', 240)
Manufacturer = _hashed('Manufacturer', 5)

## Person
Location_Of_Person = _hashed('Location_Of_Person', 200)
Location_In_Aircraft = _hashed('Location_In_Aircraft', 20)
Reporter_Organization = _hashed('Reporter_Organization', 20)
Function = _hashed('Function', 20)
Qualification = _hashed('Qualification', 200)
Human_Factors = _hashed('Human_Factors', 600)

## Events
Anomaly = _hashed('Anomaly', 1000)
Detector = _hashed('Detector', 80)
When_Detected = _hashed('When_Detected', 10)
Were_Passengers_Involved_In_Event = _hashed('Were_Passengers_Involved_In_Event', 5)

## Assessments
Contributing_Factors_Situations = _hashed('Contributing_Factors_Situations', 20)
Primary_Problem = _hashed('Primary_Problem', 20)

In [409]:
## Wrap every categorical column in a dense embedding of the given dimension.
## Each name is rebound from the categorical column to its embedding, so this
## cell is not re-runnable on its own: re-create the categorical columns first.
def _embedded(categorical, dim):
    """Return an embedding column of width `dim` built on `categorical`."""
    return tf.feature_column.embedding_column(categorical, dimension = dim)


## Place
Locale_Reference = _embedded(Locale_Reference, 500)
State_Reference = _embedded(State_Reference, 80)


## Environment
Flight_Conditions = _embedded(Flight_Conditions, 6)
Weather_Elements_Visibility = _embedded(Weather_Elements_Visibility, 20)
Work_Environment_Factor = _embedded(Work_Environment_Factor, 7)
Light = _embedded(Light, 7)


## Aircraft
ATC_Advisory = _embedded(ATC_Advisory, 500)
Aircraft_Operator = _embedded(Aircraft_Operator, 10)
Make_Model_Name = _embedded(Make_Model_Name, 300)
Flight_Plan = _embedded(Flight_Plan, 10)
Mission = _embedded(Mission, 20)
Flight_Phase1 = _embedded(Flight_Phase1, 10)
Route_In_Use = _embedded(Route_In_Use, 10)
Airspace = _embedded(Airspace, 10)

## Component
Aircraft_Component = _embedded(Aircraft_Component, 240)
Manufacturer = _embedded(Manufacturer, 5)

## Person
Location_Of_Person = _embedded(Location_Of_Person, 200)
Location_In_Aircraft = _embedded(Location_In_Aircraft, 20)
Reporter_Organization = _embedded(Reporter_Organization, 20)
Function = _embedded(Function, 20)
Qualification = _embedded(Qualification, 200)
Human_Factors = _embedded(Human_Factors, 600)

## Events
Anomaly = _embedded(Anomaly, 1000)
Detector = _embedded(Detector, 80)
When_Detected = _embedded(When_Detected, 10)
Were_Passengers_Involved_In_Event = _embedded(Were_Passengers_Involved_In_Event, 5)

## Assessments
Contributing_Factors_Situations = _embedded(Contributing_Factors_Situations, 20)
Primary_Problem = _embedded(Primary_Problem, 20)

## Build a neural network-based learning model

In [410]:
## train_test_split is already imported in the first cell of the notebook, so it
## is not imported a second time here.

## The 29 columns that have a feature column defined for them
feature_names = ['Locale_Reference', 'State_Reference', 'Flight_Conditions', 'Weather_Elements_Visibility',
                 'Work_Environment_Factor', 'Light', 'ATC_Advisory', 'Aircraft_Operator', 'Make_Model_Name',
                 'Crew_Size', 'Flight_Plan', 'Mission', 'Flight_Phase1',
                 'Route_In_Use', 'Airspace', 'Aircraft_Component', 'Manufacturer', 'Location_Of_Person',
                 'Location_In_Aircraft', 'Reporter_Organization', 'Function', 'Qualification', 'Human_Factors',
                 'Anomaly', 'Detector', 'When_Detected', 'Were_Passengers_Involved_In_Event',
                 'Contributing_Factors_Situations', 'Primary_Problem']
X_copy = X[feature_names]

## hold out 20% of the rows for evaluation; the fixed random_state keeps the split repeatable
X_train, X_test, Y_train, Y_test = train_test_split(X_copy, Y, test_size = 0.2, random_state = 101)
X_train.shape

(2254, 29)

In [411]:
## input function: shuffled mini-batches of 50 training rows, for up to 1000 passes over the data
input_func = tf.estimator.inputs.pandas_input_fn(
    x = X_train,
    y = Y_train,
    batch_size = 50,
    num_epochs = 1000,
    shuffle = True)

## feature columns handed to the network, grouped the same way as their definitions
feat_cols = [
    Locale_Reference, State_Reference,
    Flight_Conditions, Weather_Elements_Visibility, Work_Environment_Factor, Light,
    ATC_Advisory, Aircraft_Operator, Make_Model_Name, Crew_Size, Flight_Plan, Mission,
    Flight_Phase1, Route_In_Use, Airspace,
    Aircraft_Component, Manufacturer,
    Location_Of_Person, Location_In_Aircraft, Reporter_Organization, Function, Qualification, Human_Factors,
    Anomaly, Detector, When_Detected, Were_Passengers_Involved_In_Event,
    Contributing_Factors_Situations, Primary_Problem,
]

## three-class classifier with five hidden layers of 40 units each
model = tf.estimator.DNNClassifier(
    hidden_units = [40] * 5,
    feature_columns = feat_cols,
    n_classes = 3)

## training stops after 2000 steps
model.train(input_fn = input_func, steps = 2000)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'C:\\Users\\ZHANGX~1\\AppData\\Local\\Temp\\tmpza0srefs', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x0000019274267CC0>, '_task_type': 'worker', '_task_id': 0, '_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into C:\Users\ZHANGX~1\AppData\Local\Temp\tmpza0srefs\model.ckpt.
INFO:tensorflow:loss = 55.27509, step = 1
INFO:tensorflow:global_step/sec: 72.3976
INFO:tensorflow:loss = 55.074177, step = 101 (1.381 sec)
INFO:tensorflow:global_step/sec: 154.737
INFO:tensorflow:loss = 54.969353, step = 201 (0

<tensorflow.python.estimator.canned.dnn.DNNClassifier at 0x1925bbf0fd0>

## Test the performance of the trained model

In [412]:
## Predict on the held-out rows; shuffle is off so the predictions stay in the
## same order as Y_test.
eval_input = tf.estimator.inputs.pandas_input_fn(x = X_test, shuffle = False)
prediction = list(model.predict(eval_input))

## Take element 0 of 'class_ids' explicitly rather than calling int() on the
## array itself: implicit conversion of a non-0-d array to a scalar is
## deprecated in recent NumPy releases.
pred_label = [int(pred['class_ids'][0]) for pred in prediction]

## Pin the label order so each display name is always matched to the same class
## id, even if some class happens to be absent from the predictions.
## NOTE(review): the display names are 1-based while the class ids are 0-based
## (name '1' is class id 0) - confirm this is the intended presentation.
class_ids = [0, 1, 2]
target_names = ['1', '2', '3']
print(classification_report(Y_test, pred_label, labels = class_ids, target_names = target_names))

INFO:tensorflow:Restoring parameters from C:\Users\ZHANGX~1\AppData\Local\Temp\tmpza0srefs\model.ckpt-2000
             precision    recall  f1-score   support

          1       0.34      0.34      0.34       175
          2       0.36      0.40      0.38       203
          3       0.33      0.28      0.31       186

avg / total       0.34      0.34      0.34       564

