### Imports, Globals, and Configuration

In [1]:
# Imports and globals
import pandas as pd
import numpy as np
import json
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

# Location of the JSON configuration file that is opened and parsed below.
# The path is relative, so it resolves against the current working directory.
CONFIG_PATH="../config/config.json"

### Define Functions

In [4]:
# Define functions
# Load configuration
# The encoding is given explicitly: JSON files are UTF-8 by specification,
# while open() would otherwise fall back to the platform's locale encoding.
with open(CONFIG_PATH, encoding="utf-8") as fp:
    config = json.load(fp)

def get_df(file_path, year=None):
    """
    Read a CSV file into a dataframe.

    When `year` is given (and truthy), `file_path` is treated as a directory
    and the file `<file_path>/<year>.csv` is read. Otherwise `file_path`
    itself is read as the CSV file.
    """
    csv_location = f"{file_path}/{year}.csv" if year else file_path
    return pd.read_csv(csv_location)
    


### Load DataFrame (heavy compute for international files)

In [6]:
# Read <international_data_path>/2009.csv into the working dataframe.
df = get_df(config["international_data_path"], year=2009)

### Data Preprocessing

In [9]:
def preprocess_data(X):
    """
    Label-encode the columns of `X` in place and add a departure-delay flag.

    If `X` has a `DEP_DELAY` column, a `BOOL_DEP_DELAY` column is added that
    is 1 where `DEP_DELAY > 15` and 0 otherwise. The flag is derived once,
    from the raw `DEP_DELAY` values and before any encoding: after encoding
    that column holds integer label codes, so comparing it to 15 would no
    longer test the original values.

    Every other column is then replaced by the integer codes produced by a
    fresh `LabelEncoder` fitted on that column. `BOOL_DEP_DELAY` itself is
    not encoded.

    NOTE(review): numeric columns are label-encoded as well, exactly as
    before; confirm that this is intended for the downstream models.

    Parameters:
        X: dataframe to preprocess. It is modified in place.

    Returns:
        dict mapping each encoded column name to a dict of
        {original label: integer code}.
    """
    delay_column = 'DEP_DELAY'
    flag_column = 'BOOL_DEP_DELAY'

    # Fix the set of columns to encode before the flag column is added.
    columns_to_encode = [column for column in X.columns if column != flag_column]

    # Derive the flag from the parameter (not a module-level dataframe) while
    # the delay column still holds its raw values.
    if delay_column in X.columns:
        X[flag_column] = np.where(X[delay_column] > 15, 1, 0)

    label_mappings = {}
    for column in columns_to_encode:
        le = LabelEncoder()
        X[column] = le.fit_transform(X[column])

        # Store the mapping for each column
        label_mappings[column] = dict(zip(le.classes_, le.transform(le.classes_)))

    return label_mappings

# Label-encode the columns of df in place (see preprocess_data) and keep the
# returned per-column {original label: code} mappings.
mapping = preprocess_data(df)

# Create the scaler instance; it is only constructed here, not fitted.
scaler = StandardScaler()

### Naive Bayes Classification