In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
import numpy as np

def reformat(flight_df, df_type):
    """Return a cleaned copy of a raw flights frame with numeric time/date columns.

    The caller's frame is never modified, so this function can safely be
    re-run on the same raw data.

    Parameters
    ----------
    flight_df : pandas.DataFrame
        Raw flights data containing ``departure_time``, ``month``,
        ``day_of_month`` and ``day_of_week``. The three date columns must hold
        strings whose first two characters are a prefix to discard
        (presumably values such as ``'c-7'`` -- confirm against the CSV).
    df_type : bool
        Truthy for labelled (training) data: ``is_delayed`` is then replaced
        by its integer category codes. Falsy for unlabelled data, in which
        case ``is_delayed`` is not touched.

    Returns
    -------
    pandas.DataFrame
        A new frame in which ``departure_time``, ``month``, ``day_of_month``
        and ``day_of_week`` have been converted with ``pd.to_numeric``.
    """
    # Work on a copy: mutating the argument would make a second call on the
    # same frame fail, because the prefixes would already have been stripped.
    cleaned = flight_df.copy()

    # Strip the two-character prefix from each date field. No zero padding is
    # needed because the values are parsed as numbers just below.
    for col in ('month', 'day_of_month', 'day_of_week'):
        cleaned[col] = cleaned[col].str[2:]

    if df_type:
        cleaned['is_delayed'] = cleaned['is_delayed'].astype('category').cat.codes

    numeric_fields = ['departure_time', 'month', 'day_of_month', 'day_of_week']
    cleaned[numeric_fields] = cleaned[numeric_fields].apply(pd.to_numeric)

    return cleaned

def one_hot_encoding(flight_df):
    """Expand the categorical flight columns into dummy indicator columns.

    ``unique_carrier``, ``origin`` and ``destination`` are each replaced by
    one indicator column per category, minus the first category of each
    column (``drop_first=True``). All other columns are carried over.

    The input frame is left unmodified; a new frame is returned.
    """
    categorical_fields = ['unique_carrier', 'origin', 'destination']
    # pd.get_dummies builds a new frame, so the argument itself is not altered.
    return pd.get_dummies(flight_df, columns=categorical_fields, drop_first=True)

def split_df(dummy_df, test_size=0.20, random_seed=1147, target_col='target'):
    """Split an encoded frame into stratified train and test frames.

    Parameters
    ----------
    dummy_df : pandas.DataFrame
        Frame to split; must contain the column named by ``target_col``.
    test_size : float, optional
        Passed to ``train_test_split`` as ``test_size``. Defaults to 0.20.
    random_seed : int, optional
        Passed to ``train_test_split`` as ``random_state`` so the split is
        repeatable. Defaults to 1147.
    target_col : str, optional
        Column whose values are used for stratification. Defaults to
        ``'target'``.

    Returns
    -------
    tuple
        ``(train_df, test_df)``.
    """
    train_df, test_df = train_test_split(
        dummy_df,
        test_size=test_size,
        random_state=random_seed,
        stratify=dummy_df[target_col],
    )

    return train_df, test_df

def scale(train_df, numeric_cols=None):
    """Fit a ``StandardScaler`` on the numeric columns of a frame.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Frame the scaler is fitted on.
    numeric_cols : list of str, optional
        Columns to fit on. When omitted, the module-level ``numeric_cols``
        variable is used, which keeps existing ``scale(train_df)`` calls
        working; passing the columns explicitly avoids that hidden dependency.

    Returns
    -------
    StandardScaler
        The fitted scaler.

    Raises
    ------
    NameError
        If ``numeric_cols`` is omitted and no module-level ``numeric_cols``
        has been defined.
    """
    if numeric_cols is None:
        # Backward-compatible fallback to the notebook-level variable. The
        # lookup goes through globals() because the parameter shadows it.
        try:
            numeric_cols = globals()['numeric_cols']
        except KeyError:
            raise NameError(
                "name 'numeric_cols' is not defined; pass numeric_cols explicitly"
            ) from None

    scaler = StandardScaler()
    scaler.fit(train_df[numeric_cols])

    return scaler

def get_features_and_target_arrays(df, numeric_cols, cat_cols, scaler, df_type):
    """Build the model input matrix (and, for labelled data, the target).

    The feature matrix is the categorical columns followed by the scaled
    numeric columns, stacked horizontally. ``df`` is never modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Encoded frame to turn into arrays.
    numeric_cols : list of str
        Columns passed through ``scaler.transform``.
    cat_cols : list of str
        Categorical (indicator) columns, in the order they should appear in
        the feature matrix.
    scaler : object
        Already-fitted scaler exposing ``transform``.
    df_type : bool
        Truthy for labelled data: every column in ``cat_cols`` must be
        present and the ``target`` column is returned as well. Falsy for
        unlabelled data: columns of ``cat_cols`` missing from ``df`` are
        filled with 0.

    Returns
    -------
    (numpy.ndarray, pandas.Series) or numpy.ndarray
        ``(X, y)`` when ``df_type`` is truthy, otherwise ``X`` alone.
    """
    X_numeric_scaled = scaler.transform(df[numeric_cols])

    if df_type:
        X_categorical = df[cat_cols].to_numpy()
    else:
        # reindex returns a new frame with exactly cat_cols, creating any
        # absent column filled with 0, so the caller's frame is not mutated
        # and no columns are inserted one at a time.
        X_categorical = df.reindex(columns=cat_cols, fill_value=0).to_numpy()

    X = np.hstack((X_categorical, X_numeric_scaled))

    if df_type:
        return X, df['target']
    return X


In [2]:
# Pre-process the training data. The whole training file is used for fitting;
# no hold-out split is taken in this cell.

flight_df = reformat(pd.read_csv("flights_historical/train_1.csv"), True)

# One-hot encode the categorical fields and call the label column 'target'.
dummy_df = one_hot_encoding(flight_df).rename(columns={'is_delayed': 'target'})

# Every column that is neither numeric nor the label is treated as
# categorical; the names are sorted so the feature order is stable.
numeric_cols = ['departure_time', 'month', 'day_of_month', 'day_of_week', 'distance']
category_cols = sorted(set(dummy_df.columns) - set(numeric_cols) - {'target'})

train_df = dummy_df
scaler = scale(train_df)

X_train, y_train = get_features_and_target_arrays(train_df, numeric_cols, category_cols, scaler, True)

In [3]:
# Pre-process the test data using the transformations fitted on the training data.

test_flight_df = pd.read_csv("flights_historical/test_1.csv")
test_flight_df = reformat(test_flight_df, False)

# Encode WITHOUT drop_first. Dropping the first category independently on the
# test set can remove a different baseline than the one dropped on the
# training set, which would encode those rows as the training baseline.
# Keeping every indicator is safe: only the columns listed in category_cols
# are selected below, and any that are absent from the test set are set to 0.
test_dummy_df = pd.get_dummies(test_flight_df, columns=['unique_carrier', 'origin', 'destination'])

test_df = test_dummy_df

# Reuse the scaler fitted on the training data. Fitting a second scaler on the
# test set would scale the test features differently from the features the
# model is trained on.
X_test = get_features_and_target_arrays(test_df, numeric_cols, category_cols, scaler, False)

In [4]:
# Train the decision tree.
# A fixed random_state makes the fitted tree, and therefore the predictions,
# reproducible across kernel restarts.

dtree = DecisionTreeClassifier(random_state=1147)
dtree.fit(X_train, y_train)

DecisionTreeClassifier()

In [6]:
# Predict on the test features, attach the predictions to a copy of the
# reformatted test frame as 'is_delayed', save it to final.csv and display it.

y_preds = dtree.predict(X_test)

final_df = test_flight_df.assign(is_delayed=y_preds)
final_df.to_csv('final.csv', index=False)
final_df

Unnamed: 0,unique_carrier,origin,destination,departure_time,month,day_of_month,day_of_week,distance,is_delayed
0,RQ,OLV,TVQ,1605,7,24,7,952,0
1,NB,CLO,PRI,922,12,31,7,323,0
2,NB,PVQ,IQQ,1649,12,14,4,1056,0
3,WP,RJB,ULQ,1404,2,11,5,1557,0
4,NB,QTP,PQM,630,11,10,4,368,0
...,...,...,...,...,...,...,...,...,...
9995,I9,TCZ,RJB,720,2,12,7,349,0
9996,XJ,YTL,OPU,2045,6,8,3,1034,0
9997,HH,PBT,QTX,1143,7,8,6,36,0
9998,HH,VWQ,PQM,2016,10,31,1,926,1
