Import Libraries

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns 
# Matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8*' and
# later releases dropped the old names, so prefer the new name and fall back to
# the legacy one only when the installed Matplotlib does not know it.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
import os
%matplotlib inline

Cleaning the Data and Getting the Big Picture

In [None]:
# Load the flight records into a DataFrame.
FLIGHTS_CSV = '../input/flight-delays/flights.csv'
flights = pd.read_csv(FLIGHTS_CSV)

In [None]:
# Peek at the first five rows to see which columns are available.
flights.head(n=5)

**Correlations**

Let's look at how much each attribute correlates with the arrival delay:

In [None]:
# Pairwise correlation between the numeric columns, then the ARRIVAL_DELAY
# column of that matrix sorted from most positively to most negatively correlated.
# Non-numeric columns (e.g. AIRLINE) are filtered out explicitly: since pandas 2.0
# DataFrame.corr() no longer drops them silently and raises instead.
corr_matrix = flights.select_dtypes(include='number').corr()
corr_matrix['ARRIVAL_DELAY'].sort_values(ascending=False)

In [None]:
# Number of missing values in each column.
flights.isna().sum()

In [None]:
# (rows, columns) of the frame as loaded, before any column or row is dropped.
flights.shape

**Summary of the numerical attributes**

In [None]:
# Summary statistics of the frame's columns (pandas' default describe() output).
flights.describe()

**Distribution of the arrival delay attribute**

In [None]:
# Histogram of arrival delays between -10 and 1000 minutes.
fig, ax = plt.subplots()
flights.ARRIVAL_DELAY.hist(ax=ax, bins=1000, range=(-10, 1000))
# A plain 'log' axis cannot represent zero or negative values, so the
# early-arrival bins (-10..0) requested through `range` were cut off.
# 'symlog' is linear around zero and logarithmic further out, so they stay visible.
ax.set_xscale('symlog')
# Cap the y axis so the tallest bins do not flatten the rest of the distribution.
ax.set_ylim(0, 150000)
ax.set_xlabel('Delay (minutes)')
ax.set_ylabel('Number of flights')
ax.set_title('Distribution of arrival delay');

# Exploration with Regard to the Mean Delay

In [None]:
def delay_by_attribute(attribute, df=flights, figsize=(10, 7)):
    """Plot flight counts per value of `attribute`, split by departure-delay class.

    Side effect: adds (or overwrites) a ``DELAY_TYPE`` column on `df` in place,
    holding 0 when ``DEPARTURE_DELAY`` is below 10 minutes and 1 otherwise.

    Parameters
    ----------
    attribute : str
        Column of `df` drawn on the y axis (one group of bars per distinct value).
    df : pandas.DataFrame, optional
        Frame to classify and plot. Defaults to the module-level ``flights``
        object that existed when this function was defined.
    figsize : tuple, optional
        Figure size handed to matplotlib.
    """
    # Classify on `df` itself. The previous version always read and wrote the
    # global `flights`, so a caller-supplied frame was plotted without ever
    # receiving a DELAY_TYPE column.
    # `~(x < 10)` (rather than `x >= 10`) keeps missing delays in class 1, exactly
    # like the former per-row lambda `0 if x < 10 else 1`.
    # NOTE(review): counting missing departure delays as "large" looks unintended — confirm.
    df['DELAY_TYPE'] = (~(df['DEPARTURE_DELAY'] < 10)).astype(int)

    # A fresh figure per call: plt.figure(1, ...) would reuse figure 1 if it
    # already exists and keep drawing on top of it.
    fig, ax = plt.subplots(figsize=figsize)
    sns.countplot(y=attribute, hue='DELAY_TYPE', data=df, ax=ax)
    ax.set_xlabel('Flight count')
    ax.set_ylabel(attribute)
    ax.set_title(f'Delay by {attribute}')
    ax.legend()


delay_by_attribute('AIRLINE')

We can see that the proportion of small to large delays depends on the airline. For example, almost 50% of the flights of **UA** (United Air Lines Inc.) have a large delay, whereas only ~25% of the flights of **DL** (Delta Air Lines Inc.) do.

**Statistics of outliers**

If we consider delays of more than 10 minutes to be significant, let's see what percentage of the dataset these delays represent.

In [None]:
# Count the flights whose arrival delay exceeds 10 minutes and express that
# count as a percentage of all rows, rounded to two decimals.
large_delay_mask = flights['ARRIVAL_DELAY'] > 10
nb_of_large_delays = large_delay_mask.sum()
percent_of_large_delays = np.round(100 * nb_of_large_delays / len(flights), 2)
print(f'There are {nb_of_large_delays} flights with large delays (more than 10min), '
      f'which represent {percent_of_large_delays}% of the flights')

In [None]:
# Number of flights that arrive more than 150 min (2.5 h) late
RARE_DELAY_MINUTES = 150
nb_of_rare_delays = (flights.ARRIVAL_DELAY > RARE_DELAY_MINUTES).sum()
percent_of_rare_delays = np.round(nb_of_rare_delays * 100 / len(flights), 1)

# Share of the large delays (nb_of_large_delays, computed in the previous cell)
# that are also rare delays
percent_rare_large = np.round((nb_of_rare_delays * 100 / nb_of_large_delays), 1)
# The message used to say "> 5h", which contradicted the 150-minute threshold
# actually applied above; it is now derived from the threshold itself.
print(
    'There are {} flights with rare delays (> {}min) which represent {}% of all flights, which also represent {}% of large delays'
     .format(nb_of_rare_delays, RARE_DELAY_MINUTES, percent_of_rare_delays, percent_rare_large))

 <table style="width:50%; border: 1px solid black; border-collapse: collapse;">
  <tr>
    <th></th>
    <th>All flights</th>
    <th>Large delays (> 10min)</th>
    <th>Rare delays (> 150min)</th>
  </tr>
    <tr>
    <td>All flights</td>
    <td>100%</td>
    <td>/</td>
    <td>/</td>
  </tr>
  <tr>
    <td>Large delays (> 10min)</td>
    <td>22%</td>
    <td>100%</td>
    <td>/</td>
  </tr>
  <tr>
    <td>Rare delays (> 150min)</td>
    <td>1.3%</td>
    <td>5.8%</td>
    <td>100%</td>
  </tr>
</table> 

In [None]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error,mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
# Remove the cancellation reason, the per-cause delay columns and the helper
# DELAY_TYPE column that delay_by_attribute() adds to the frame.
# errors='ignore' keeps this cell re-runnable: without it, a second execution
# (or running it before DELAY_TYPE exists) fails with a KeyError.
flights.drop(
    columns=[
        'CANCELLATION_REASON',
        'AIR_SYSTEM_DELAY',
        'SECURITY_DELAY',
        'AIRLINE_DELAY',
        'LATE_AIRCRAFT_DELAY',
        'WEATHER_DELAY',
        'DELAY_TYPE',
    ],
    errors='ignore',
    inplace=True,
)

In [None]:
# Discard every row that still has at least one missing value, then show the
# remaining (rows, columns).
flights.dropna(axis=0, how='any', inplace=True)
flights.shape

In [None]:
# Predictors and target for the arrival-delay regression.
# NOTE(review): TAXI_IN and ARRIVAL_TIME are presumably only known once the
# flight has landed — confirm they are acceptable as predictors.
feature_columns = ['TAXI_IN', 'ARRIVAL_TIME', 'SCHEDULED_DEPARTURE',
                   'SCHEDULED_ARRIVAL', 'DISTANCE', 'SCHEDULED_TIME']
x = flights[feature_columns]
y = flights['ARRIVAL_DELAY']
# Hold out 33% for testing. The fixed seed makes the split, and every metric
# computed from it further down, reproducible across kernel restarts.
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.33, random_state=42)

In [None]:
# Ridge regression with the estimator's default settings, fitted on the
# training split.
ridge_reg = Ridge()
ridge_reg.fit(X=xtrain, y=ytrain)

In [None]:
# Predicted arrival delay for every row of the held-out test split.
pred = ridge_reg.predict(X=xtest)

In [None]:
# Test-set errors, printed in this order: mean absolute error, then mean squared error.
for metric in (mean_absolute_error, mean_squared_error):
    print(metric(ytest, pred))

In [None]:
# Fitted coefficients of the ridge model; the next cell pairs them with x.columns.
ridge_reg.coef_

In [None]:
# Same coefficients as a one-column table indexed by feature name.
pd.DataFrame(data=ridge_reg.coef_, index=x.columns, columns=['Coefficient'])

In [None]:
# Predicted vs. actual arrival delay on the test split. Axis labels and a title
# are set so the figure can be read on its own.
fig, ax = plt.subplots()
ax.scatter(ytest, pred)
ax.set_xlabel('Actual arrival delay (minutes)')
ax.set_ylabel('Predicted arrival delay (minutes)')
ax.set_title('Ridge regression: predicted vs. actual arrival delay');

In [None]:
# NOTE(review): disabled experiment. The whole cell is one string literal, so none
# of the code below runs; the cell merely evaluates to (and displays) that string.
# If it is ever re-enabled, check the `else` branch first: it ignores `data` and
# always uses 'DEPARTURE_DELAY' as the target.
'''def prediction(data):
    if data == 'ARRIVAL_DELAY':
        x = flights[['TAXI_IN','ARRIVAL_TIME','SCHEDULED_DEPARTURE','SCHEDULED_ARRIVAL','DISTANCE','SCHEDULED_TIME']]
        y = flights[data]
    else:
        x = flights[['DEPARTURE_TIME','SCHEDULED_DEPARTURE','SCHEDULED_ARRIVAL','DISTANCE','SCHEDULED_TIME','ELAPSED_TIME','AIR_TIME','WHEELS_ON']]
        y = flights['DEPARTURE_DELAY']
    xtrain,xtest,ytrain,ytest = train_test_split(x,y,test_size=0.33)
    ridge_reg = Ridge()
    ridge_reg.fit(xtrain,ytrain)
    pred = ridge_reg.predict(xtest)
    print('mean_absolute_error:',mean_absolute_error(ytest,pred))
    print('mean_squared_error:',mean_squared_error(ytest,pred))
    print()
    print(pd.DataFrame(ridge_reg.coef_,x.columns,columns=['Coefficient']))

prediction('DEPARTURE_DELAY')'''

In [None]:
from sklearn.model_selection import GridSearchCV

# param_grid is a list of two dicts, so the two sub-grids are explored
# independently: the alpha values are tried with the default solver and the
# solvers with the default alpha. They are not crossed with each other.
alpha_grid = {'alpha': [0.01, 0.1, 1, 10, 100]}
solver_grid = {'solver': ['cholesky', 'lsqr']}
param_grid = [alpha_grid, solver_grid]

# 4-fold cross-validation scored by negated MSE (higher is better).
grid_search = GridSearchCV(
    estimator=ridge_reg,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=4,
    return_train_score=True,
    verbose=2,
)
grid_search.fit(xtrain, ytrain)

In [None]:
# Parameter combination that achieved the best cross-validated score.
grid_search.best_params_

In [None]:
# best_score_ is a negated MSE (scoring='neg_mean_squared_error'), so flip the
# sign before taking the square root to obtain the cross-validated RMSE.
best_cv_mse = -grid_search.best_score_
np.sqrt(best_cv_mse)

In [None]:
# Score the best estimator found by the grid search on the held-out test split:
# predictions -> MSE -> RMSE.
model = grid_search.best_estimator_
test_predictions = model.predict(X=xtest)
test_mse = mean_squared_error(y_true=ytest, y_pred=test_predictions)
test_rmse = np.sqrt(test_mse)

In [None]:
# Root-mean-squared error of the tuned model on the test split.
test_rmse