## Modeling functions

In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(style='whitegrid')

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import log_loss, roc_auc_score, recall_score, precision_score, f1_score, classification_report, accuracy_score, plot_roc_curve, plot_confusion_matrix, plot_precision_recall_curve
from sklearn.metrics import precision_recall_curve

def clf_metrics(model,X_test,y_test):
    """
    Evaluates a fitted classifier on held-out data

        Parameters:
            model ():         fitted classifier; must provide predict()
            X_test (df):      test features handed to the model
            y_test (series):  true labels the predictions are compared against

        Returns:
            Tuple of (classification report, ROC curve plot, confusion matrix plot)

    """
    # Hard class predictions feed the text report; the two plotting helpers
    # take the model itself and compute what they need from it.
    predictions = model.predict(X_test)
    report = classification_report(y_test, predictions)

    roc_display = plot_roc_curve(model, X_test, y_test)
    confusion_display = plot_confusion_matrix(model, X_test, y_test)

    return report, roc_display, confusion_display


def results_interpret(model, X_train):
    """
    Returns the intercept and coefficients of a fitted logistic regression model

        Parameters:
            model ():  fitted model exposing intercept_ and coef_
            X_train (df): features table used to train the model; its column
                          names are used to label the coefficients

        Returns:
            DataFrame with a 'variable' column ('intercept' followed by the
            feature names) and a matching 'coefficient' column

    """

    # The fitted weights live in `coef_` (trailing underscore); only its first
    # row is used, so a single set of coefficients is reported.
    coefficients = np.hstack((model.intercept_, model.coef_[0]))
    results = pd.DataFrame(data={'variable': ['intercept'] + list(X_train.columns), 'coefficient': coefficients})

    return results
  
    
    
  

def weekday_cycle(Day_of_week):
    """
    Places a day-of-week number on a circle so the cyclical nature of the week is kept

        Parameters:
            Day_of_week (int): day of week as a number

        Returns:
            (sine, cosine) of the day's angle, where 7 days make one full turn
    """
    # One full turn (2*pi) is spread over the 7 days of the week.
    angle = Day_of_week * (2 * np.pi/7)

    return np.sin(angle), np.cos(angle)

    
def two_hot(x):
    """
    Encodes periods of day into four overlapping indicator columns

        Parameters:
            x (str or array-like):  period(s) of day (i.e. morning). A single
                                    value or a 1-D sequence is treated as one
                                    column; a 2-D array is used as given.

        Returns:
            Integer array with four 0/1 columns per input column, in the order
            morning|afternoon, afternoon|evening, evening|overnight,
            overnight|morning. Each recognised period sets two adjacent
            columns; any other value encodes as all zeros.

    """
    periods = np.asarray(x)
    # The column-wise concatenate below needs a 2-D input, so a scalar or
    # 1-D sequence is promoted to a single column first.
    if periods.ndim < 2:
        periods = periods.reshape(-1, 1)

    is_morning = periods == "morning"
    is_afternoon = periods == "afternoon"
    is_evening = periods == "evening"
    is_overnight = periods == "overnight"

    return np.concatenate([
        is_morning | is_afternoon,
        is_afternoon | is_evening,
        is_evening | is_overnight,
        is_overnight | is_morning,
    ], axis=1).astype(int)

# Demo: encode one example of each period that two_hot recognises.
# The last label must be "overnight"; two_hot has no "night" category and
# would encode it as a row of zeros.
x = np.array([["morning", "afternoon", "evening", "overnight"]]).T
print(x)
x = two_hot(x)
print(x)
    
    
    
    
def log_model_fit(X, y, split, estimator, parameters, cv, random_state=None):
    """
    Returns a grid-searched model fitted on a standardized training split

        Parameters:
            X(df):              numeric features data table
            y(series):          target column (also used to stratify the split)
            split (float):      split (fraction) for train_test_split
            estimator ():       algorithm
            parameters (dict):  parameters used for GridSearchCV
              eg. param_grid = {'logistic__C': [0.1,1,10], 'logistic__penalty': ['l1', 'l2']}
            cv (int):           # of folds for CV
            random_state (int): optional seed for the train/test split; the
                                default (None) leaves the split unseeded

        Returns:
            GridSearchCV object fitted on the scaled training split

        Note:
            The held-out rows and the fitted scaler are not returned. Pass
            random_state to be able to recreate the same split outside this
            function.

    """

    # Only the training part of the split is used here; the test part is
    # discarded.
    X_train, _, y_train, _ = train_test_split(
        X, y, test_size=split, stratify=y, random_state=random_state
    )

    # NOTE: the scaler is fitted on the whole training split before the
    # search runs, so every CV fold is scaled with statistics that include
    # its own validation rows.
    scaler = StandardScaler().fit(X_train)
    X_train_sc = scaler.transform(X_train)

    grid = GridSearchCV(estimator, param_grid=parameters, cv=cv)
    fit_grid = grid.fit(X_train_sc, y_train)

    return fit_grid
    
    
    
def top_categories(X, perc):
    """
    Returns dominant groups in category by share of the total count

        Parameters:
            X (dict):      mapping of group -> count
            perc (float):  cumulative share of the total, as a fraction
                           (e.g. 0.8), at which to stop adding groups

        Returns:
            (val_list, key_list): counts and names of the selected groups,
            largest first. Groups are added until their cumulative share
            reaches perc, so the group that crosses the threshold is included.
            Both lists are empty when X is empty or its counts sum to zero.

    """

    val_list = []
    key_list = []
    denom = sum(X.values())

    # Without a non-zero total there is no share to compute.
    if denom == 0:
        return val_list, key_list

    # Rank by count so the result really is the dominant groups, whatever the
    # mapping's insertion order. The sort is stable, so ties keep their order.
    ranked = sorted(X.items(), key=lambda item: item[1], reverse=True)

    running_total = 0
    for key, value in ranked:
        # Stop as soon as the groups gathered so far cover the requested share.
        if running_total / denom >= perc:
            break
        key_list.append(key)
        val_list.append(value)
        running_total += value

    return val_list, key_list
    
    
    
def clf_threshold(y_predict, y_prob, y_test):
    """
    Plots precision and recall against the decision threshold

        Parameters:
            y_predict(array):  y_predict from fitted model (not used by the
                               plot; kept so existing calls keep working)
            y_prob(array):     y_probability from fitted model (predict_proba);
                               for a 2-D array column 1 is used as the score,
                               a 1-D array of scores is used as is
            y_test(array):     target values from test set

        Output:
            Plot drawn with matplotlib.pyplot; nothing is returned

    """

    scores = np.asarray(y_prob)
    if scores.ndim == 2:
        # Column 1 of the probability table is the score that gets thresholded.
        scores = scores[:, 1]

    precision, recall, thresholds = precision_recall_curve(y_test, scores)

    plt.title("Precision-Recall vs Threshold Chart")
    # precision and recall carry one more entry than thresholds, so the last
    # point is dropped to line the arrays up.
    plt.plot(thresholds, precision[:-1], "b--", label="Precision")
    plt.plot(thresholds, recall[:-1], "r--", label="Recall")
    plt.ylabel("Precision, Recall")
    plt.xlabel("Threshold")
    plt.legend(loc="lower left")
    plt.ylim([0,1])

    return
    
    
    

In [6]:
# Load flights_cleanDates.csv into df.
# NOTE(review): this is an absolute, machine-specific path, so the cell only
# runs where that file exists — consider a configurable data directory.
df = pd.read_csv('/Users/wjdol/Desktop/LighthouseLabs/Flight_delays/data/flights_cleanDates.csv')

In [15]:
# Print df's column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49133 entries, 0 to 49132
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Unnamed: 0          49133 non-null  int64  
 1   mkt_unique_carrier  49133 non-null  object 
 2   origin              49133 non-null  object 
 3   origin_city         49133 non-null  object 
 4   dest                49133 non-null  object 
 5   dest_city           49133 non-null  object 
 6   distance            49133 non-null  float64
 7   fl_date             49133 non-null  object 
 8   fl_day              49133 non-null  object 
 9   week_num            49133 non-null  int64  
 10  dep_time_of_day     49133 non-null  object 
 11  arr_time_of_day     49133 non-null  object 
 12  arr_delay           49133 non-null  float64
 13  delay_binary        49133 non-null  float64
dtypes: float64(3), int64(2), object(9)
memory usage: 5.2+ MB


In [16]:
# Preview the first rows of df.
df.head()

Unnamed: 0.1,Unnamed: 0,mkt_unique_carrier,origin,origin_city,dest,dest_city,distance,fl_date,fl_day,week_num,dep_time_of_day,arr_time_of_day,arr_delay,delay_binary
0,0,DL,FLL,Fort Lauderdale,ATL,Atlanta,581.0,2019-01-13,Sunday,2,evening,evening,-8.0,0.0
1,1,AA,DCA,Washington,DFW,Dallas/Fort Worth,1192.0,2019-01-14,Monday,3,morning,morning,75.0,1.0
2,2,AA,ORD,Chicago,IND,Indianapolis,177.0,2019-01-10,Thursday,2,afternoon,afternoon,-7.0,0.0
3,3,UA,PIT,Pittsburgh,SFO,San Francisco,2254.0,2019-01-11,Friday,2,afternoon,evening,-7.0,0.0
4,4,DL,MSP,Minneapolis,GFK,Grand Forks,284.0,2019-01-03,Thursday,1,evening,overnight,-15.0,0.0


In [18]:
# Build df_test from df without 'Unnamed: 0' and the two city-name columns;
# drop returns a new frame, so df itself is left unchanged.
df_test = df.drop(columns=['Unnamed: 0', 'origin_city', 'dest_city'])
