In [None]:
# Colab only: mount Google Drive at /content/drive; the dataset paths used
# below live under its MyDrive folder.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
#imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime
import os
from os import listdir
import dateutil.parser
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn import metrics
import pickle

# Import libraries and classes required for this example:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix


#########################
# configuration
#########################
# Folder holding the per-period "*_all_delays.csv" extracts.
# NOTE(review): the path is relative, while Drive is mounted at the absolute
# path /content/drive — it only resolves when the working directory is /content.
data_path = 'drive/MyDrive/DS_Project_Datasets/Networkrail/Delay/'

In [None]:
# Per-period delay extracts to load (2018/19 P05 .. 2019/20 P05).
file_list = [
    '2018_19_P05_all_delays.csv', '2018_19_P06_all_delays.csv',
    '2018_19_P07_all_delays.csv', '2018_19_P08_all_delays.csv',
    '2018_19_P09_all_delays.csv', '2018_19_P10_all_delays.csv',
    '2018_19_P11_all_delays.csv', '2018_19_P12_all_delays.csv',
    '2018_19_P13_all_delays.csv', '2019_20_P01_all_delays.csv',
    '2019_20_P02_all_delays.csv', '2019_20_P03_all_delays.csv',
    '2019_20_P04_all_delays.csv', '2019_20_P05_all_delays.csv',
]

# The only columns of the raw extracts that are used further down; all other
# columns are skipped at parse time instead of being loaded and dropped.
keep_cols = [
    'FINANCIAL_YEAR_PERIOD', 'ENGLISH_DAY_TYPE', 'PLANNED_ORIGIN_LOCATION_CODE',
    'PLANNED_DEST_LOCATION_CODE', 'APPLICABLE_TIMETABLE_FLAG', 'INCIDENT_REASON',
    'EVENT_TYPE', 'PFPI_MINUTES',
]

# Lookup table INCIDENT_REASON -> INCIDENT_CATEGORY taken from the glossary.
# Rows 365/366 are removed by label (presumably non-data rows of the sheet —
# TODO confirm). Every row without a reason code is dropped: pandas matches
# NaN keys against each other in a merge, so a NaN key left in the lookup
# would attach a category to delays that have no reason at all.
incident_df = (
    pd.read_excel(
        'drive/MyDrive/DS_Project_Datasets/Networkrail/Glossary/' + 'Historic-Delay-Attribution-Glossary (1).xlsx',
        sheet_name='Incident Reason',
    )
    .drop(labels=[365, 366], axis="index")
    .loc[:, ['Incident Reason', 'Incident JPIP Category']]
    .rename(columns={"Incident Reason": "INCIDENT_REASON", "Incident JPIP Category": "INCIDENT_CATEGORY"})
    .dropna(subset=['INCIDENT_REASON'])
)

# Read every period file once and concatenate in a single step.
# (DataFrame.append was removed in pandas 2.0 and re-copied the whole
# accumulated frame on every iteration.)
period_frames = []
for index, file in enumerate(file_list):
    period_frames.append(
        pd.read_csv(data_path + file, usecols=keep_cols, low_memory=False)
    )
    print('==> done ', str(index+1), "/", len(file_list))

delay_data = pd.concat(period_frames, ignore_index=True)
del period_frames  # release the per-file copies

# Attach the incident category; delays whose reason is not in the glossary
# get NaN here and are removed by dropna() below.
delay_data = delay_data.merge(incident_df, on="INCIDENT_REASON", how='left')
print('merge: INCIDENT_REASON')

# Period number = last two characters of FINANCIAL_YEAR_PERIOD.
delay_data['PERIOD'] = delay_data['FINANCIAL_YEAR_PERIOD'].str[-2:].astype(int)

# One-hot encode the categorical columns. The indicator columns keep the raw
# category values as names (so the PERIOD ones are the integers 1..13).
# dtype is pinned because the default changed from uint8 to bool in pandas 2.
encoded_cols = ['ENGLISH_DAY_TYPE', 'EVENT_TYPE', 'PERIOD']
dummy_frames = []
for source_col in encoded_cols:
    dummy_frames.append(pd.get_dummies(delay_data[source_col], dtype='uint8'))
    print('merge:', source_col)
delay_data = pd.concat([delay_data] + dummy_frames, axis=1)
del dummy_frames

# Y/N -> 1/0. Assigned back explicitly: replace(..., inplace=True) on a
# column selection is not guaranteed to modify delay_data itself.
delay_data['APPLICABLE_TIMETABLE_FLAG'] = (
    delay_data['APPLICABLE_TIMETABLE_FLAG'].replace({'Y': 1, 'N': 0})
)

# Final modelling table: feature columns first, the label INCIDENT_CATEGORY
# last, rows with any missing value removed.
feature_cols = [
    'PLANNED_ORIGIN_LOCATION_CODE', 'PLANNED_DEST_LOCATION_CODE',
    'APPLICABLE_TIMETABLE_FLAG', 'PFPI_MINUTES',
    'BH', 'SA', 'SU', 'WD',
    'A', 'C', 'D', 'F', 'M', 'O', 'P', 'S',
    1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
]
tmp = delay_data[feature_cols + ['INCIDENT_CATEGORY']].dropna()

==> done  1 / 14
==> done  2 / 14
==> done  3 / 14
==> done  4 / 14
==> done  5 / 14
==> done  6 / 14
==> done  7 / 14
==> done  8 / 14
==> done  9 / 14
==> done  10 / 14
==> done  11 / 14
==> done  12 / 14
==> done  13 / 14
==> done  14 / 14
merge: INCIDENT_REASON
merge: ENGLISH_DAY_TYPE
merge: EVENT_TYPE
merge: PERIOD


# Decision Tree

In [None]:
# Features = every column except the label. The label is selected by name
# rather than by the hard-coded position 29, so adding or removing a feature
# column can no longer silently shift the target.
X = tmp.drop(columns='INCIDENT_CATEGORY').values
y = tmp['INCIDENT_CATEGORY'].values

In [None]:
# Split dataset into train (80 %) and test (20 %) subsets.
# random_state makes the split - and every score computed from it -
# reproducible; stratify keeps the class proportions identical in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)

In [None]:
# Decision tree set-up: entropy as split criterion, depth limited to 9 levels
clf = tree.DecisionTreeClassifier(criterion='entropy', max_depth=9)

# Echo the configured estimator
print(clf)

DecisionTreeClassifier(criterion='entropy', max_depth=9)


In [None]:
# Train the tree on the training split. fit() returns the estimator it was
# called on, so DTree is simply a second name for the fitted clf.
clf.fit(X_train, y_train)
DTree = clf

In [None]:
# Labels predicted by the fitted tree for the test split
prediction = DTree.predict(X_test)

In [None]:
# Test-split quality: per-class precision/recall/F1 first, confusion matrix second
for report_fn in (metrics.classification_report, metrics.confusion_matrix):
    print(report_fn(y_test, prediction))

                                     precision    recall  f1-score   support

                         All Others       0.96      0.96      0.96    111734
                           External       0.19      0.28      0.23    111931
                              Fleet       0.14      0.13      0.14    112116
         Network Management / Other       0.35      0.32      0.33    111949
                   Non-Track Assets       0.21      0.12      0.15    112167
                         Operations       0.64      0.51      0.57    111675
Severe Weather, Autumn & Structures       0.37      0.44      0.40    111630
                           Stations       0.21      0.50      0.29    111830
                          TOC Other       0.26      0.09      0.13    111707
                              Track       0.86      0.41      0.55    111576
                          Traincrew       0.19      0.17      0.18    111747

                           accuracy                           0.36   12300

# K-Nearest Neighbors (KNN)

In [None]:
# Standardize features by removing mean and scaling to unit variance.
# The scaler is fitted on the training split only: fitting it on the full X
# would let the mean/variance of the test rows leak into the training features.
scaler = StandardScaler()
scaler.fit(X_train)

# NOTE: X_train / X_test are overwritten with their scaled versions, so this
# cell must be run exactly once after each train/test split.
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)



In [None]:
# Build a 5-nearest-neighbours classifier and train it on the training split
classifier = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
classifier

In [None]:
# Labels predicted by the KNN classifier for the test split
y_predict = classifier.predict(X_test)

In [None]:
# Confusion matrix first, then the per-class precision/recall/F1 report
for summarize in (confusion_matrix, classification_report):
    print(summarize(y_test, y_predict))

# Neural Network

In [None]:
from sklearn.neural_network import MLPClassifier

# Multi-layer perceptron with two hidden layers (5 and 2 units), L-BFGS
# solver, fixed seed. NOTE: this rebinds `clf`, which previously named the
# decision tree.
mlp_settings = dict(
    solver='lbfgs',
    alpha=1e-5,
    hidden_layer_sizes=(5, 2),
    random_state=1,
    verbose=True,
)
clf = MLPClassifier(**mlp_settings)

clf.fit(X_train, y_train)

MLPClassifier(alpha=1e-05, hidden_layer_sizes=(5, 2), random_state=1,
              solver='lbfgs', verbose=True)

In [None]:
# Labels predicted by the network for the test split
y_predict = clf.predict(X_test)

# Confusion matrix first, then the per-class precision/recall/F1 report
for summarize in (confusion_matrix, classification_report):
    print(summarize(y_test, y_predict))

[[     0      0      0 302015      0      0      0      0      0      0
       0]
 [     0      0      0  88702      0      0      0      0      0      0
       0]
 [     0      0      0 174679      0      0      0      0      0      0
       0]
 [     0      0      0 328448      0      0      0      0      0      0
       0]
 [     0      0      0 124898      0      0      0      0      0      0
       0]
 [     0      0      0 111611      0      0      0      0      0      0
       0]
 [     0      0      0  70112      0      0      0      0      0      0
       0]
 [     0      0      0  46089      0      0      0      0      0      0
       0]
 [     0      0      0  95869      0      0      0      0      0      0
       0]
 [     0      0      0 112905      0      0      0      0      0      0
       0]
 [     0      0      0  99091      0      0      0      0      0      0
       0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                                     precision    recall  f1-score   support

                         All Others       0.00      0.00      0.00    302015
                           External       0.00      0.00      0.00     88702
                              Fleet       0.00      0.00      0.00    174679
         Network Management / Other       0.21      1.00      0.35    328448
                   Non-Track Assets       0.00      0.00      0.00    124898
                         Operations       0.00      0.00      0.00    111611
Severe Weather, Autumn & Structures       0.00      0.00      0.00     70112
                           Stations       0.00      0.00      0.00     46089
                          TOC Other       0.00      0.00      0.00     95869
                              Track       0.00      0.00      0.00    112905
                          Traincrew       0.00      0.00      0.00     99091

                           accuracy                           0.21   15544

  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Distinct training labels together with the number of rows per label
class_stats = np.unique(y_train, return_counts=True)
unique, counts = class_stats
print(*class_stats)

['All Others' 'External' 'Fleet' 'Network Management / Other'
 'Non-Track Assets' 'Operations' 'Severe Weather, Autumn & Structures'
 'Stations' 'TOC Other' 'Track' 'Traincrew'] [1207002  353950  696135 1313216  501375  447508  281128  182794  384016
  453925  396627]


# Decision Tree (balanced dataset)

In [None]:
from sklearn.utils import resample
from statistics import median

def balanced_dataset(tmp, random_state=123):
  """Return a copy of ``tmp`` in which every incident category has the same size.

  The class whose row count is the median of all class counts serves as the
  reference: its rows are kept unchanged, and every other class is re-drawn
  with replacement to exactly that many rows (larger classes shrink, smaller
  classes are oversampled with duplicates).

  Args:
    tmp: DataFrame containing an ``INCIDENT_CATEGORY`` column. Rows whose
      category is missing do not appear in the result.
    random_state: seed used for each per-class draw (default 123).

  Returns:
    DataFrame with the columns of ``tmp``: the reference-class rows first,
    followed by the resampled rows of the remaining classes in order of
    decreasing original frequency. Original index labels are kept, so the
    index contains duplicates wherever a row was drawn more than once.
  """
  class_counts = tmp['INCIDENT_CATEGORY'].value_counts()
  if class_counts.empty:
    # Nothing to balance (no rows, or no labelled rows).
    return tmp.iloc[0:0].copy()

  # value_counts() sorts by descending frequency, so position len // 2 holds
  # the median count; for an even number of classes it is the lower of the
  # two middle counts, which guarantees the target is a real class size.
  target_size = int(class_counts.iloc[len(class_counts) // 2])
  reference_label = class_counts[class_counts == target_size].index[0]

  parts = [tmp[tmp['INCIDENT_CATEGORY'] == reference_label]]
  for label in class_counts.index:
    if label == reference_label:
      continue
    class_rows = tmp[tmp['INCIDENT_CATEGORY'] == label]
    parts.append(
        class_rows.sample(n=target_size,        # match the reference class
                          replace=True,         # sample with replacement
                          random_state=random_state)  # reproducible results
    )

  # Concatenate once instead of growing the frame inside the loop.
  return pd.concat(parts)

In [None]:
# Equalise the class sizes of the modelling table via balanced_dataset().
# NOTE(review): the resampling (with replacement) happens before the
# train/test split further down, so copies of the same original row can end
# up in both the training and the test subset — confirm this is intended.
tmp_balanced = balanced_dataset(tmp)

In [None]:
# Features = every column except the label. The label is selected by name
# rather than by the hard-coded position 29, so adding or removing a feature
# column can no longer silently shift the target.
X = tmp_balanced.drop(columns='INCIDENT_CATEGORY').values
y = tmp_balanced['INCIDENT_CATEGORY'].values

In [None]:
# Split dataset into train (80 %) and test (20 %) subsets.
# random_state makes the split - and every score computed from it -
# reproducible; stratify keeps the class proportions identical in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)

In [None]:
# Decision tree set-up: entropy as split criterion, depth limited to 9 levels
clf = tree.DecisionTreeClassifier(criterion='entropy', max_depth=9)

# Echo the configured estimator
print(clf)

DecisionTreeClassifier(criterion='entropy', max_depth=9)


In [None]:
# Train the tree on the training split. fit() returns the estimator it was
# called on, so DTree is simply a second name for the fitted clf.
clf.fit(X_train, y_train)
DTree = clf

In [None]:
# Labels predicted by the fitted tree for the test split
prediction = DTree.predict(X_test)

In [None]:
# Test-split quality: per-class precision/recall/F1 first, confusion matrix second
for report_fn in (metrics.classification_report, metrics.confusion_matrix):
    print(report_fn(y_test, prediction))

                                     precision    recall  f1-score   support

                         All Others       0.96      0.96      0.96    111734
                           External       0.19      0.28      0.23    111931
                              Fleet       0.14      0.13      0.14    112116
         Network Management / Other       0.35      0.32      0.33    111949
                   Non-Track Assets       0.21      0.12      0.15    112167
                         Operations       0.64      0.51      0.57    111675
Severe Weather, Autumn & Structures       0.37      0.44      0.40    111630
                           Stations       0.21      0.50      0.29    111830
                          TOC Other       0.26      0.09      0.13    111707
                              Track       0.86      0.41      0.55    111576
                          Traincrew       0.19      0.17      0.18    111747

                           accuracy                           0.36   12300

In [None]:
# Balanced accuracy of the tree's predictions on the test split; the value is
# shown as the cell result.
metrics.balanced_accuracy_score(y_test, prediction)

0.3572694450558537

In [None]:
# Relative frequency of every class: first among the predictions, then among
# the true test labels, separated by a marker line.
# The total is taken inline so the builtin `sum` is no longer overwritten by
# a module-level variable of the same name.
for position, labels in enumerate((prediction, y_test)):
  if position:
    print('##############')
  unique, counts = np.unique(labels, return_counts=True)
  for label, share in zip(unique, counts / counts.sum()):
    print(label, share)

All Others 0.0913246649355886
External 0.1369752093796898
Fleet 0.08561600959951612
Network Management / Other 0.08105445091385638
Non-Track Assets 0.05051452691002568
Operations 0.07284185675193608
Severe Weather, Autumn & Structures 0.10864086525719842
Stations 0.2193743079617125
TOC Other 0.03212520994876681
Track 0.04313603704528715
Traincrew 0.07839686129642245
##############
All Others 0.09083607167768779
External 0.0909962262064839
Fleet 0.09114662512946502
Network Management / Other 0.09101085961520639
Non-Track Assets 0.09118808645417874
Operations 0.09078810661576409
Severe Weather, Autumn & Structures 0.09075152309395787
Stations 0.09091411652420772
TOC Other 0.09081412156460406
Track 0.09070762286779041
Traincrew 0.09084664025065403


In [None]:
# Score the tree (trained on the balanced data) against a 10 % sample of the
# original, imbalanced table.
# NOTE(review): the balanced training data was itself drawn from `tmp`, so
# this sample can contain rows the tree has already seen during training;
# the scores below are therefore optimistic.
# The label is selected by name instead of the hard-coded position 29.
X = tmp.drop(columns='INCIDENT_CATEGORY').values
y = tmp['INCIDENT_CATEGORY'].values

# Seeded, stratified draw so the sample is reproducible and keeps the
# original class proportions.
_, X_test_1, _, y_test_1 = train_test_split(
    X, y, test_size=0.10, random_state=42, stratify=y
)

prediction_1 = DTree.predict(X_test_1)

# Per-class report first, confusion matrix second
print(metrics.classification_report(y_test_1, prediction_1))
print(metrics.confusion_matrix(y_test_1, prediction_1))

                                     precision    recall  f1-score   support

                         All Others       0.98      0.96      0.97    150371
                           External       0.14      0.29      0.19     44455
                              Fleet       0.21      0.13      0.16     86560
         Network Management / Other       0.62      0.32      0.42    164366
                   Non-Track Assets       0.22      0.12      0.15     62474
                         Operations       0.50      0.51      0.51     56048
Severe Weather, Autumn & Structures       0.25      0.44      0.32     35272
                           Stations       0.08      0.50      0.14     23010
                          TOC Other       0.20      0.09      0.12     48235
                              Track       0.81      0.41      0.55     56642
                          Traincrew       0.16      0.17      0.16     49777

                           accuracy                           0.41    7772

# save model

In [None]:
# Persist the fitted decision tree. The previous call used an undefined name
# (`model`) and a .save() method that scikit-learn estimators do not have.
# NOTE(review): DTree (the tree trained on the balanced data) is assumed to
# be the model meant here — confirm.
# The file content is a binary pickle despite the .txt name; only unpickle
# files from a trusted source.
with open('drive/MyDrive/DS_Project_Code/Jeremias/results/model.txt', 'wb') as model_file:
    pickle.dump(DTree, model_file)