In [62]:
import pandas as pd
import numpy as np
import tensorflow as tf
from os import listdir
from sklearn.model_selection import train_test_split
# Record the TensorFlow version next to the outputs so the run environment is visible.
print ('The version of TensorFlow is %s' % (tf.__version__,))

The version of TensorFlow is 1.4.0


In [63]:
root_path = './data'


def first_outcome(outcome):
    """Reduce one raw 'Result' cell to a single label.

    Missing cells arrive as float NaN and are labelled 'unknown'. Otherwise
    the text before the first ';' is kept (the whole value when there is
    no ';').
    """
    # isinstance(..., float) replaces `type(outcome) == np.float`: np.float was
    # only an alias of the builtin float and no longer exists in NumPy >= 1.24.
    if isinstance(outcome, float):
        return 'unknown'
    return str(outcome).split(';')[0]


# Read every file in the data directory (the first line of each file is
# skipped) and stack them. listdir() returns names in arbitrary order, so the
# names are sorted to keep the row order -- and therefore the seeded resampling
# and train/test split further down -- reproducible between runs and machines.
appended_data = []
for file_name in sorted(listdir(root_path)):
    file_path = root_path + '/' + file_name
    appended_data.append(pd.read_csv(file_path, skiprows=1))

data = pd.concat(appended_data, axis=0)
data = data.drop(columns = ['ACN', 'Date', 'Local Time Of Day', 'Ceiling', 'Callback', 'Callback.1', 'Unnamed: 96'])
# index=str turns the row labels into strings; "Flight Phase" is renamed to "Flight Phase1".
data = data.rename(index=str, columns={"Flight Phase": "Flight Phase1"})

## drop the rows with empty synopsis description
data = data[pd.notnull(data['Synopsis'])]

X = data.drop(columns = 'Result')
Y_raw = pd.DataFrame(data['Result'])

# One label per report, in row order. Y deliberately gets a fresh 0..n-1 index.
processed_Y = [first_outcome(outcome) for outcome in Y_raw['Result']]

Y = pd.DataFrame(processed_Y, columns = ['Result'])

In [64]:
## compress the number of labels to be predicted --> map result to risk level
# The list names keep their original "rate_*" spelling; the level actually
# assigned to each list is 5 (rate_nine) down to 1 (rate_one).
# NOTE(review): 'Flight Crew Landed as Precaution' (rate_seven, level 4) and
# 'Flight Crew Landed As Precaution' (rate_three, level 2) differ only in
# capitalisation yet receive different levels -- confirm this is intended.
rate_nine = ['General Declared Emergency', 'General Physical Injury / Incapacitation', 'Flight Crew Inflight Shutdown',
             'Air Traffic Control Separated Traffic', 'Aircraft Aircraft Damaged']

rate_seven = ['General Evacuated', 'Flight Crew Landed as Precaution', 'Flight Crew Regained Aircraft Control',
              'Air Traffic Control Issued Advisory / Alert', 'Flight Crew Landed in Emergency Condition',
              'Flight Crew Landed In Emergency Condition']

rate_five = ['General Work Refused', 'Flight Crew Became Reoriented', 'Flight Crew Diverted',
             'Flight Crew Executed Go Around / Missed Approach',
             'Flight Crew Overcame Equipment Problem', 'Flight Crew Rejected Takeoff', 'Flight Crew Took Evasive Action',
             'Air Traffic Control Issued New Clearance']

rate_three = ['General Maintenance Action', 'General Flight Cancelled / Delayed', 'General Release Refused / Aircraft Not Accepted',
              'Flight Crew Overrode Automation', 'Flight Crew FLC Overrode Automation',
              'Flight Crew Exited Penetrated Airspace',
              'Flight Crew Requested ATC Assistance / Clarification', 'Flight Crew Landed As Precaution',
              'Flight Crew Returned To Clearance', 'Flight Crew Returned To Departure Airport',
              'Aircraft Automation Overrode Flight Crew']

rate_one = ['General Police / Security Involved', 'Flight Crew Returned To Gate', 'Aircraft Equipment Problem Dissipated',
            'unknown', 'Air Traffic Control Provided Assistance',
            'General None Reported / Taken', 'Flight Crew FLC complied w / Automation / Advisory']

# Flatten the five lists into one lookup table. Lists are visited from the
# highest level down and setdefault keeps the first hit, which reproduces the
# precedence of an if/elif chain should a label ever appear in two lists.
risk_level_by_outcome = {}
for level, labels in ((5, rate_nine), (4, rate_seven), (3, rate_five), (2, rate_three), (1, rate_one)):
    for label in labels:
        risk_level_by_outcome.setdefault(label, level)

Y_ = []
unmapped_labels = set()
for outcome in Y['Result']:
    level = risk_level_by_outcome.get(outcome)
    if level is None:
        unmapped_labels.add(outcome)
    else:
        Y_.append(level)

# A label without a level would leave Y_ shorter than X.index and shift every
# later label by one row; fail here with the offending labels instead.
if unmapped_labels:
    raise ValueError('Result labels with no risk level assigned: {}'.format(
        sorted(unmapped_labels, key=str)))

outcomes = np.asarray(Y_)
# Re-use X's row labels so the levels line up with the feature rows.
Y_pred = pd.DataFrame(Y_, index = X.index, columns = ['Result'])
# Class distribution before any resampling.
unique, counts = np.unique(outcomes, return_counts=True)

In [65]:
# Features plus the mapped risk level in one frame; assign() works on a copy, so X is left untouched.
data_rev = X.assign(Result=Y_pred)
data_rev.head()

Unnamed: 0,Locale Reference,State Reference,Relative Position.Angle.Radial,Relative Position.Distance.Nautical Miles,Altitude.AGL.Single Value,Altitude.MSL.Single Value,Flight Conditions,Weather Elements / Visibility,Work Environment Factor,Light,...,Miss Distance,Were Passengers Involved In Event,Detector,When Detected,Contributing Factors / Situations,Primary Problem,Narrative,Narrative.1,Synopsis,Result
0,ZZZ.Airport,US,,,,1500.0,,,,Night,...,,,Automation Aircraft Other Automation,,Human Factors; Aircraft,Aircraft,ON CLBOUT OUT OF ZZZ; LNDG GEAR CTL AND INTERF...,,AN A320 WITH DUEL LGCIU FAILURES EXECUTED A GA...,1
1,DFW.Airport,TX,,,,1000.0,VMC,,,,...,Vertical 400,,Automation Aircraft RA; Person Flight Crew,,Human Factors,Human Factors,NORMAL OPS INTO DFW RWY 18R. ON FINAL AT ABOUT...,,A320 EXPERIENCED CONFLICT WITH UNRPTED HELI WH...,3
2,ZAU.ARTCC,IL,,,,20500.0,IMC,,,Daylight,...,,,Person Flight Crew,,Environment - Non Weather Related; Weather,Weather,ABOUT 40 MI N OF RBS VOR ON CLBOUT; ENCOUNTERE...,,A B737-300 CREW ENCOUNTERED A 20 SECOND SEVERE...,4
3,ZZZ.Airport,US,,,0.0,,IMC,Fog; 1,,Night,...,,,Person Flight Crew,,Human Factors; Weather; Aircraft; Chart Or Pub...,Aircraft,AFTER BLOCK OUT; PRIOR TO DEP; HOLDING SHORT O...,,AN A300 FLT CREW NOTED LOUD RUMBLING AND SHUDD...,2
4,SNA.Airport,CA,,,,5000.0,IMC,Turbulence; Thunderstorm; Rain; Windshear; 3,,Daylight,...,Horizontal 9500; Vertical 600,,Automation Aircraft RA; Person Flight Crew,,Airport; Weather; Human Factors; Airspace Stru...,Human Factors,FLT OPERATED TO SNA. WX AT SNA WAS ABOUT 800 F...,,A B737-700 ON AN SNA IMC APCH DEVIATED FROM TH...,3


## Upsampling the minority categories

In [66]:
print (unique)
print (counts)

from sklearn.utils import resample

# Row count each minority class is resampled to. It is close to, not equal to,
# the size of the two large classes (levels 1 and 3), which are kept as they are.
UPSAMPLE_TARGET = 20000
# Same seed for every class so the cell is reproducible.
UPSAMPLE_SEED = 123


def upsample_class(frame, level, n_samples=UPSAMPLE_TARGET, seed=UPSAMPLE_SEED):
    """Return `n_samples` rows drawn with replacement from the rows of `frame` whose 'Result' equals `level`."""
    return resample(frame[frame['Result'] == level],
                    replace=True,
                    n_samples=n_samples,
                    random_state=seed)


df_majority_1 = data_rev[data_rev['Result']==1]
df_majority_3 = data_rev[data_rev['Result']==3]
df_minority_2 = data_rev[data_rev['Result']==2]
df_minority_4 = data_rev[data_rev['Result']==4]
df_minority_5 = data_rev[data_rev['Result']==5]

# Upsample minority classes
df_minority_2_upsampled = upsample_class(data_rev, 2)
df_minority_4_upsampled = upsample_class(data_rev, 4)
df_minority_5_upsampled = upsample_class(data_rev, 5)

# The concatenation order is kept fixed because the seeded train/test split
# that follows depends on row order.
df_upsampled = pd.concat([df_majority_1, df_majority_3, df_minority_2_upsampled, df_minority_4_upsampled, df_minority_5_upsampled])

# NOTE(review): rows are duplicated here *before* the train/test split in the
# next cell, so copies of one report can end up in both the training and the
# test set and inflate the reported scores. Consider splitting first and
# resampling only the training part.
X = df_upsampled.drop(columns = 'Result')
Y_pred = df_upsampled['Result']

# Last expression of the cell so the balanced class counts are actually displayed.
df_upsampled['Result'].value_counts()

[1 2 3 4 5]
[20985  9359 20848  8009  5372]


In [67]:
from sklearn.model_selection import train_test_split

# 80/20 split of the synopsis text and its risk level, with a fixed seed.
X_train, X_test, Y_train, Y_test = train_test_split(
    X['Synopsis'],
    Y_pred,
    test_size=0.2,
    random_state=100,
)


from sklearn.feature_extraction.text import CountVectorizer
# Token counts; the vocabulary is learned from the training synopses only.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)

from sklearn.feature_extraction.text import TfidfTransformer
# Term-frequency-only variant (no idf weighting); the classifier fitted below does not use it.
tf_transformer = TfidfTransformer(use_idf=False)
tf_transformer.fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)


# tf-idf weighted features: these are what the classifier is trained on.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(X_train_tfidf, Y_train)

In [68]:
# Push the held-out synopses through the vectoriser and tf-idf transformer that
# were fitted on the training set (transform only, no refit), then predict.
X_new_counts = count_vect.transform(X_test)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
pred_label = clf.predict(X_new_tfidf)

In [69]:
from sklearn.metrics import classification_report
# Display names for risk levels 1 through 5.
target_names = list(map(str, range(1, 6)))
print(classification_report(Y_test, pred_label, target_names=target_names))

             precision    recall  f1-score   support

          1       0.53      0.46      0.49      4206
          2       0.55      0.63      0.59      3937
          3       0.46      0.40      0.43      4174
          4       0.53      0.56      0.54      4032
          5       0.63      0.70      0.66      4018

avg / total       0.54      0.55      0.54     20367



## Pipeline: Naive Bayes

In [70]:
from sklearn.pipeline import Pipeline
# Counts -> tf-idf -> multinomial Naive Bayes, chained so the raw synopsis text
# can be passed straight to fit/predict.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB(alpha = 1, fit_prior=True)),
])

# fit() returns the pipeline itself, so training and prediction can be chained.
pred_label = text_clf.fit(X_train, Y_train).predict(X_test)

from sklearn.metrics import classification_report
target_names = list(map(str, range(1, 6)))
print(classification_report(Y_test, pred_label, target_names=target_names))

             precision    recall  f1-score   support

          1       0.53      0.46      0.49      4206
          2       0.55      0.63      0.59      3937
          3       0.46      0.40      0.43      4174
          4       0.53      0.56      0.54      4032
          5       0.63      0.70      0.66      4018

avg / total       0.54      0.55      0.54     20367



## Pipeline: Support Vector Machine with Linear Kernel

In [115]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV

# Counts (English stop words removed) -> tf-idf -> linear model trained by SGD.
# The loss/penalty/alpha/max_iter given here are only starting values; the grid
# passed to GridSearchCV below overrides them.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer(stop_words = 'english')),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='epsilon_insensitive',
                          penalty='l2',
                          alpha=1e-5,
                          random_state=40,
                          max_iter=10,
                          tol=None)),
])


# Full search space. It is kept for reference only: this cell does not pass it
# to GridSearchCV.
parameters = {
    'clf__loss': ['epsilon_insensitive', 'hinge', 'log', 'huber', 'modified_huber', 'perceptron',
                  'squared_loss', 'squared_epsilon_insensitive', 'squared_hinge'],
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'clf__alpha': (1e-2, 1e-3, 1e-4, 1e-5),
    'clf__penalty': ['l1', 'l2', 'elasticnet'],
    'clf__max_iter': (10, 20, 30, 40, 50, 60, 70, 80, 90, 100),
}

# Single combination actually evaluated. With one candidate the grid search
# only cross-validates and refits that one configuration.
optimal_parameters = {
    'clf__loss': ['modified_huber'],
    'vect__ngram_range': [(1, 2)],
    'tfidf__use_idf': [True],
    'clf__alpha': [1e-5],
    'clf__penalty': ['elasticnet'],
    'clf__max_iter': [80],
}

gs_clf = GridSearchCV(estimator=text_clf, param_grid=optimal_parameters, n_jobs=-1)

gs_clf.fit(X_train, Y_train)
pred_label = gs_clf.predict(X_test)

from sklearn.metrics import classification_report
target_names = list(map(str, range(1, 6)))
print(classification_report(Y_test, pred_label, target_names=target_names))

             precision    recall  f1-score   support

          1       0.69      0.50      0.58      4206
          2       0.80      0.91      0.85      3937
          3       0.63      0.55      0.59      4174
          4       0.78      0.91      0.84      4032
          5       0.88      0.98      0.93      4018

avg / total       0.75      0.77      0.75     20367



In [118]:
# Accuracy = share of test rows whose predicted level equals the true level.
# The denominator comes from Y_test instead of a hard-coded row count, so the
# figure stays correct if the data or the split size changes.
print ('Accuracy: ', np.sum(np.equal(Y_test, pred_label).astype(int))/len(Y_test))
print ('The best set of parameters is \n', gs_clf.best_params_)

Accuracy:  0.765503019590514
The best set of parameters is 
 {'clf__alpha': 1e-05, 'clf__loss': 'modified_huber', 'clf__max_iter': 80, 'clf__penalty': 'elasticnet', 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}


## Pipeline: Support Vector Machine with Nonlinear Kernel

In [None]:
from sklearn.pipeline import Pipeline
from sklearn import svm
# Counts -> tf-idf -> SVC with its default (RBF) kernel.
text_clf = Pipeline([('vect', CountVectorizer()),
                      ('tfidf', TfidfTransformer()),
                      ('clf', svm.SVC()),
                    ])


def level_three_vs_rest(levels):
    """Binarise risk levels: 1 where the level is 3, 0 for every other level.

    Returns a new integer Series with the same index; the input is not modified.
    """
    return (levels == 3).astype(int)


Y_train_rev = level_three_vs_rest(Y_train)
Y_test_rev = level_three_vs_rest(Y_test)

# Train on the binarised labels. Fitting on the 5-class Y_train would make the
# predictions incomparable with Y_test_rev and the two target names below.
text_clf.fit(X_train, Y_train_rev)
pred_label = text_clf.predict(X_test)

from sklearn.metrics import classification_report
target_names = [str(i) for i in range(0, 2)]
print(classification_report(Y_test_rev, pred_label, target_names=target_names))

  y = column_or_1d(y, warn=True)
