# In [None]:  (Jupyter notebook cell marker left over from the export)
import pandas as pd
import datetime
import matplotlib.pyplot as plt
from scipy.stats import skew, kurtosis
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# Reading data: one CSV file per year, stacked into a single frame with a
# fresh 0..n-1 index.
# NOTE(review): the directory below is specific to one machine; adjust
# DATA_DIR when running elsewhere.
DATA_DIR = r'C:\Users\LENOVO\Desktop\Bosch_Project'
YEARS = (2014, 2015, 2016, 2017, 2018)

# The per-year frames live only inside this temporary list, so they can be
# released once the combined frame exists instead of staying bound to
# module-level names.
df = pd.concat(
    [pd.read_csv('{}\\{}.csv'.format(DATA_DIR, year)) for year in YEARS],
    ignore_index=True,
)
# `m` is a second name for the same object as `df`; no copy is made here.
m = df

# Summary statistics of the ARR_DELAY column, each printed as soon as it
# has been computed.
arr_delay = df['ARR_DELAY']

# Central tendency: mean, then median
df_mean = arr_delay.mean()
print('mean =', df_mean)

df_median = arr_delay.median()
print('median =', df_median)

# Shape of the distribution: skewness, then kurtosis
# (pandas' own Series.skew / Series.kurtosis are used here)
df_skewness = arr_delay.skew()
print('skewness =', df_skewness)

df_kurtosis = arr_delay.kurtosis()
print('kurtosis =', df_kurtosis)

# Extracting year, month, etc. from the flight date.
# FL_DATE is parsed to datetime first so that the `.dt` accessor is available.
df['FL_DATE'] = pd.to_datetime(df['FL_DATE'])
df['Year'] = df['FL_DATE'].dt.year
df['Month'] = df['FL_DATE'].dt.month
# `Series.dt.week` was deprecated in pandas 1.1 and removed in pandas 2.0;
# `isocalendar().week` gives the same ISO-8601 week number (as a nullable
# UInt32 column rather than int64).
df['Week'] = df['FL_DATE'].dt.isocalendar().week
df['Quarter'] = df['FL_DATE'].dt.quarter
# dayofweek counts from Monday=0 to Sunday=6.
df['Day of Week'] = df['FL_DATE'].dt.dayofweek

# Mean delay time per year, month, etc.
# Each result is a Series indexed by the grouping key; the names y1 ... dest1
# are reused further down when the mean and median tables are combined.
y1 = df.groupby('Year')['ARR_DELAY'].mean()
print(y1)
m1 = df.groupby('Month')['ARR_DELAY'].mean()
print(m1)
q1 = df.groupby('Quarter')['ARR_DELAY'].mean()
print(q1)
w1 = df.groupby('Week')['ARR_DELAY'].mean()
print(w1)
d1 = df.groupby('Day of Week')['ARR_DELAY'].mean()
print(d1)
op1 = df.groupby('OP_CARRIER')['ARR_DELAY'].mean()
# Print the per-carrier means just computed (`op` does not exist yet at this
# point in the script, so printing it would raise NameError).
print(op1)
# The airport tables are printed through to_string() to get every row.
ori1 = df.groupby('ORIGIN')['ARR_DELAY'].mean()
print(ori1.to_string())
dest1 = df.groupby('DEST')['ARR_DELAY'].mean()
print(dest1.to_string())

# Median delay time per year, month, etc.
# Every grouping is computed and printed in turn. The two airport tables go
# through to_string(); the others are printed directly.
_median_by = {}
for _key in ('Year', 'Month', 'Quarter', 'Week', 'Day of Week',
             'OP_CARRIER', 'ORIGIN', 'DEST'):
    _median_by[_key] = df.groupby(_key)['ARR_DELAY'].median()
    if _key in ('ORIGIN', 'DEST'):
        print(_median_by[_key].to_string())
    else:
        print(_median_by[_key])

# Bind the results to the names used when the tables are combined later
# (dict insertion order matches the key order above).
y2, m2, q2, w2, d2, op2, ori2, dest2 = _median_by.values()

# Concatenation: for every grouping, put the mean Series and the median
# Series side by side as the two columns of one table.
# NOTE: this rebinds `m` to the per-month table.
y, m, q, w, d, op, ori, dest = (
    pd.concat([_mean_tbl, _median_tbl], axis=1)
    for _mean_tbl, _median_tbl in (
        (y1, y2),
        (m1, m2),
        (q1, q2),
        (w1, w2),
        (d1, d2),
        (op1, op2),
        (ori1, ori2),
        (dest1, dest2),
    )
)

# Plotting Mean/Median delay time per year, quarter, month, week, day of
# week and carrier.
# Each entry: (table, chart title, output file, extra plot() keywords,
# extra savefig() keywords). The charts are drawn in the listed order.
_charts = (
    (y, 'Mean/Median delay time per year', 'Year.png',
     {'xlabel': 'Year'}, {'bbox_inches': 'tight'}),
    (q, 'Mean/Median delay time per quarter of year', 'Quarter.png',
     {}, {}),
    (m, 'Mean/Median delay time per month of year', 'Month.png',
     {}, {}),
    (w, 'Mean/Median delay time per week of year', 'Week.png',
     {'figsize': (20, 10)}, {}),
    (d, 'Mean/Median delay time per day of week', 'Day.png',
     {}, {}),
    (op, 'Mean/Median delay time per carrier', 'Carrier.png',
     {'xlabel': 'Carrier'}, {}),
)

for _table, _title, _filename, _plot_kwargs, _save_kwargs in _charts:
    # Name the two columns so the legend reads Mean / Median.
    _table.columns = ['Mean', 'Median']
    _table.plot(kind='bar', **_plot_kwargs)
    plt.ylabel('Delay time [min]')
    plt.title(_title)
    # Save before show(), as in every chart of this script.
    plt.savefig(_filename, **_save_kwargs)
    plt.show()

def _plot_in_parts(table, label, n_parts=4):
    """Draw *table* as up to ``n_parts`` consecutive bar charts.

    The rows are split into contiguous, equally sized slices so that every
    row appears in exactly one chart, whatever the number of rows is.

    Parameters:
        table: frame with two columns (mean, median), one row per airport.
        label: 'Origin' or 'Destination'; used as x-axis label, in the
            chart title (lower-cased) and as the output file-name stem.
        n_parts: number of charts to split the table into.
    """
    table.columns = ['Mean', 'Median']
    # Ceiling division: the last slice may be shorter, but none is skipped.
    rows_per_part = max(1, -(-len(table) // n_parts))
    for part in range(n_parts):
        start = part * rows_per_part
        chunk = table.iloc[start:start + rows_per_part]
        if chunk.empty:
            # Fewer rows than parts: nothing left to draw.
            break
        chunk.plot(kind='bar', xlabel=label, figsize=(25, 10))
        plt.ylabel('Delay time [min]')
        plt.title('Mean/Median delay time per {} ({}/{})'.format(
            label.lower(), part + 1, n_parts))
        plt.savefig('{}_{}.png'.format(label, part + 1))
        plt.show()


# Plotting Mean/Median delay time per origin
_plot_in_parts(ori, 'Origin')

# Plotting Mean/Median delay time per destination
_plot_in_parts(dest, 'Destination')


### Gradient Boosting Regressor

# Start from the full flight table `df`. `m` cannot be used as the source
# here: by this point it holds the per-month mean/median summary, which has
# none of the columns referenced below.
# Dropping Cancellation code and the 'Unnamed: 27' column
m = df.drop(['Unnamed: 27', 'CANCELLATION_CODE'], axis=1)
# Date parts added to `df` for the exploratory plots are not model features;
# only Month and Day of Week (derived below) are used.
m = m.drop(['Year', 'Week', 'Quarter'], axis=1, errors='ignore')
# Rows with a NaN in any remaining column are removed.
m = m.dropna(axis=0)

# Extracting Month and Day of Week from Flight Date
m['FL_DATE'] = pd.to_datetime(m['FL_DATE'])
m['Month'] = m['FL_DATE'].dt.month
m['Day of Week'] = m['FL_DATE'].dt.dayofweek
m = m.drop(['FL_DATE'], axis=1)

# Features and Target
X = m.drop(['ARR_DELAY'], axis=1)
Y = m['ARR_DELAY']

# Mapping the text columns to integer category codes
X['OP_CARRIER'] = X['OP_CARRIER'].astype('category').cat.codes
X['ORIGIN'] = X['ORIGIN'].astype('category').cat.codes
X['DEST'] = X['DEST'].astype('category').cat.codes

# Train test split (80 % / 20 %, fixed seed)
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.2, random_state=0)

# Scaling: the scaler is fitted on the training part only and then applied
# unchanged to the test part.
sc = StandardScaler()
Xtrain = sc.fit_transform(Xtrain)
Xtest = sc.transform(Xtest)

# Model
model = GradientBoostingRegressor()
model.fit(Xtrain, Ytrain)

# Metrics for model evaluation
# Predict once and reuse the result for every metric.
Ypred = model.predict(Xtest)
print('Score =', model.score(Xtest, Ytest))
print('MSE =', mean_squared_error(Ytest, Ypred))
print('MAE =', mean_absolute_error(Ytest, Ypred))
print('R2 score=', r2_score(Ytest, Ypred))

# Feature importance (one value per column of X, in column order)
print(model.feature_importances_)