In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Only the first 50,000 rows of the concatenated frame are kept by the sampling
# cell further down, so rows beyond SAMPLE_ROWS in either file can never reach
# that slice. Limiting the read avoids parsing the full yearly files and keeps
# peak memory low. Keep SAMPLE_ROWS >= the size of that downstream slice.
SAMPLE_ROWS = 50000
d1 = pd.read_csv("../input/new-dataset/2007.csv", nrows=SAMPLE_ROWS)
d2 = pd.read_csv("../input/new-dataset/2008.csv", nrows=SAMPLE_ROWS)

In [None]:
# Preview the first rows of the frame loaded from 2007.csv.
d1.head()

In [None]:
# Preview the first rows of the frame loaded from 2008.csv.
d2.head()

In [3]:
# Stack the two yearly frames into one. ignore_index=True gives the result a
# fresh 0..n-1 index; without it both inputs keep their own 0-based labels, the
# combined index contains duplicates, and later index-aligned assignments
# (such as attaching the label column onto the scaled frame) can fail or
# pair rows with the wrong values.
combined = [d1, d2]
dataset = pd.concat(combined, ignore_index=True)

Fetching only a sample of the data due to an insufficient amount of RAM.

In [4]:
# Keep only the first 50,000 rows. The explicit .copy() makes `dataset` an
# independent frame rather than a slice of the concatenated one, so adding or
# dropping columns on it later does not trigger pandas' SettingWithCopyWarning.
dataset = dataset.iloc[:50000].copy()

In [5]:
# Number of columns in the sampled frame (second entry of the shape tuple).
dataset.shape[1]

In [6]:
# Preview the first rows of the sampled frame.
dataset.head()

To predict delays, let's take the mean of all the delays and aggregate them into one column. First, let's extract all delay columns.

* ArrDelay
* DepDelay
* CarrierDelay
* WeatherDelay
* NASDelay
* SecurityDelay
* LateAircraftDelay

In [7]:
# Snapshot of the column labels at this point, before any column is added or dropped.
features = dataset.columns

In [None]:
# Display the column labels captured above.
features

In [8]:
# Names of every column whose label contains the substring "Delay".
delay_features = [column for column in features if "Delay" in column]

In [9]:
# Sub-frame holding all rows but only the delay columns.
newdataset = dataset.loc[:, delay_features]

Now, we have extracted the necessary columns including all the delay columns.

In [None]:
# Preview the first rows of the delay-only frame.
newdataset.head()

Now, let's take the row-wise mean of all the delay columns.

In [10]:
# Row-wise mean (axis=1) over the delay columns, stored as a new column; this
# column is what the later cells use as the prediction target.
dataset["AverageDelay"] = newdataset.mean(axis=1)

['Year', 'Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'CRSDepTime',
       'ArrTime', 'CRSArrTime', 'UniqueCarrier', 'FlightNum', 'TailNum',
       'ActualElapsedTime', 'CRSElapsedTime', 'AirTime', 'ArrDelay',
       'DepDelay', 'Origin', 'Dest', 'Distance', 'TaxiIn', 'TaxiOut',
       'Cancelled', 'CancellationCode', 'Diverted', 'CarrierDelay',
       'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay'],

In [11]:
# The individual delay columns are now summarised by AverageDelay, so remove
# them; the result is bound back to `dataset`.
dataset = dataset.drop(columns=delay_features)

Let's drop some other useless variables for predictions

In [12]:
# Remove three columns that are not used as predictors.
unused_columns = ['CancellationCode', 'UniqueCarrier', 'TailNum']
dataset = dataset.drop(columns=unused_columns)

In [13]:
# Encode each origin airport as an integer code.
# - The codes are assigned over the *sorted* unique values, so the mapping is
#   the same on every run (iterating a set of strings is not order-stable
#   across interpreter sessions).
# - The column is reassigned explicitly instead of calling
#   replace(..., inplace=True) on a column selection, which is chained
#   assignment and is not guaranteed to update `dataset`.
# Missing values, if any, are left as NaN rather than being given a code.
Origin = sorted(dataset["Origin"].dropna().unique())
airportNumber = list(range(len(Origin)))
dataset["Origin"] = dataset["Origin"].map(dict(zip(Origin, airportNumber)))

In [14]:
# Encode each destination airport as an integer code.
# - The codes are assigned over the *sorted* unique values, so the mapping is
#   the same on every run (iterating a set of strings is not order-stable
#   across interpreter sessions).
# - The column is reassigned explicitly instead of calling
#   replace(..., inplace=True) on a column selection, which is chained
#   assignment and is not guaranteed to update `dataset`.
# Missing values, if any, are left as NaN rather than being given a code.
Destination = sorted(dataset["Dest"].dropna().unique())
airportNumber = list(range(len(Destination)))
dataset["Dest"] = dataset["Dest"].map(dict(zip(Destination, airportNumber)))

Converting Origin and Destination to numerical codes.

In [None]:
# Preview the frame after the column drops and airport encoding.
dataset.head()

Now we have dropped all the useless columns. Let's move on and drop any NA values, if there are any.

In [15]:
# dropna() returns a new frame, so the result has to be assigned back; calling
# it on its own leaves `dataset` untouched. The index is reset so the surviving
# rows are labelled 0..n-1, which keeps them aligned with the positionally
# indexed frame that is later built from the scaler output.
dataset = dataset.dropna().reset_index(drop=True)

In [16]:
# Keep a reference to the unscaled target values; they are written back over
# the scaled AverageDelay column after normalisation.
labels = dataset["AverageDelay"]

Normalizing the dataset

In [17]:
# Fit a StandardScaler on every column of `dataset` as it stands here.
# NOTE(review): the frame still contains the AverageDelay target column at this
# point, and the fit uses all rows (it happens before the train/test split) —
# confirm this is intended.
from sklearn import preprocessing
scaler = preprocessing.StandardScaler().fit(dataset)

In [18]:
# Rebind `features` to the current column labels (after the drops, and now
# including AverageDelay); used to relabel the scaled frame.
features = dataset.columns

In [19]:
# Wrap the scaled array back into a DataFrame, carrying over the column names
# and the row index of `dataset`. Keeping the original index matters because
# the label column is later attached by index alignment; a fresh 0..n-1 index
# would silently pair rows with the wrong labels whenever `dataset`'s index is
# not exactly 0..n-1.
scaled_dataset = pd.DataFrame(
    scaler.transform(dataset),
    columns=dataset.columns,
    index=dataset.index,
)

In [20]:
# Label the scaled columns with the feature names. Plain attribute assignment
# is used because the `inplace` keyword of DataFrame.set_axis was removed in
# pandas 2.0 (passing it raises TypeError there), while this form behaves the
# same on every pandas version.
scaled_dataset.columns = features

In [21]:
# Preview the first rows of the scaled frame.
scaled_dataset.head()

In [22]:
# (rows, columns) of the scaled frame.
scaled_dataset.shape

Before modelling, let's split the dataset into train and test sets.

# Clean Dataset

In [23]:
def clean_dataset(df):
    """Return a float64 copy of ``df`` containing only fully finite rows.

    Rows holding a missing value or an infinity (+inf / -inf) in any column
    are removed, and the remaining rows are cast to ``np.float64``. The input
    frame is left unmodified; the result keeps the index labels of the
    surviving rows.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose columns can all be cast to float.

    Returns
    -------
    pd.DataFrame
        The filtered frame with every column as ``float64``.

    Raises
    ------
    AssertionError
        If ``df`` is not a ``pd.DataFrame``.
    """
    assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
    # dropna() without inplace=True works on a copy, so the caller's frame is
    # not mutated as a side effect of cleaning.
    without_missing = df.dropna()
    # axis has to be passed by keyword: the positional form `.any(1)` is
    # rejected by pandas 2.x.
    has_infinity = without_missing.isin([np.inf, -np.inf]).any(axis=1)
    return without_missing[~has_infinity].astype(np.float64)

In [24]:
# Drop rows with missing or infinite values from the scaled frame and cast it to float64.
scaled_dataset = clean_dataset(scaled_dataset)

In [25]:
# Re-apply the feature names to the cleaned frame. Plain attribute assignment
# is used because the `inplace` keyword of DataFrame.set_axis was removed in
# pandas 2.0 (passing it raises TypeError there), while this form behaves the
# same on every pandas version.
scaled_dataset.columns = features

In [None]:
# Preview the cleaned, scaled frame.
scaled_dataset.head()

In [None]:
#scaled_dataset.round(3)

In [26]:
# Overwrite the standardised AverageDelay column with the unscaled values kept
# in `labels`, so the models predict the delay in its original units.
# NOTE(review): pandas aligns this assignment on index labels, so it relies on
# `labels` and `scaled_dataset` sharing the same index — confirm.
scaled_dataset["AverageDelay"] = labels

In [None]:
# Preview the frame with the unscaled target column back in place.
scaled_dataset.head()

In [27]:
# Separate the target column from the predictor columns.
target_column = "AverageDelay"
Y = scaled_dataset[target_column]
X = scaled_dataset.drop(columns=[target_column])

In [28]:
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE

In [29]:
# Recursive feature elimination with a linear model as the ranking estimator,
# keeping the 5 best-ranked features. n_features_to_select is passed by
# keyword: scikit-learn 1.0+ no longer accepts it as a positional argument.
model = LinearRegression()
rfe = RFE(model, n_features_to_select=5)
fit = rfe.fit(X, Y)

# Column positions of the selected features (RFE gives every kept feature rank 1).
indicies = [index for index, rank in enumerate(fit.ranking_) if rank == 1]

In [30]:
# Keep only the columns chosen by RFE. `X` is overwritten, so running this cell
# a second time without rebuilding `X` would index into the already-reduced frame.
X = X.iloc[:,indicies]

In [31]:
from sklearn.model_selection import train_test_split

TEST_FRACTION = 0.33  # share of rows held out for evaluation
SPLIT_SEED = 42       # fixed seed so the split is the same on every run
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=TEST_FRACTION, random_state=SPLIT_SEED
)

# Random Forest

In [32]:
# import the regressor
from sklearn.ensemble import RandomForestRegressor
 
 # create regressor object
regressor = RandomForestRegressor(n_estimators = 100, random_state = 10)
 
# fit the regressor with x and y data
regressor.fit(X_train, Y_train) 

In [33]:
# Random-forest predictions for the held-out rows.
Y_pred = regressor.predict(X_test) 

In [34]:
# Test-set mean squared error of the random forest.
mean_squared_error(Y_test,Y_pred)

# Linear Regression

In [35]:
# create linear regression object
reg = LinearRegression()
 
# train the model using the training sets
reg.fit(X_train, Y_train)

In [36]:
# Linear-regression predictions for the held-out rows (rebinds `Y_pred`).
Y_pred = reg.predict(X_test)

In [37]:
# Test-set mean squared error of the linear regression.
mean_squared_error(Y_test,Y_pred)

# Support Vector Machine

In [38]:
from sklearn.svm import SVR

In [40]:
# Support-vector regressor; `reg` is rebound here and no longer refers to the
# linear model fitted earlier.
svr_settings = {"C": 1.0, "epsilon": 0.2}
reg = SVR(**svr_settings)

# Train on the training split; the fitted estimator is the cell's output.
reg.fit(X_train, Y_train)

In [41]:
# SVR predictions for the held-out rows (rebinds `Y_pred`).
Y_pred = reg.predict(X_test)

In [42]:
# Test-set mean squared error of the SVR model.
mean_squared_error(Y_test,Y_pred)