In [17]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics

In [2]:
# Read the dataset into the frame every later cell works from.
# The path is relative, so the CSV must sit in the notebook's working directory.
csv_path = "abe_atl_eda_2018.csv"
df = pd.read_csv(csv_path)

In [3]:
# Preview the first five rows to sanity-check the columns that were loaded.
df.head(n=5)

Unnamed: 0,origin precipitation_sum (mm),origin rain_sum (mm),origin snowfall_sum (cm),origin windspeed_10m_max (km/h),origin windgusts_10m_max (km/h),origin et0_fao_evapotranspiration (mm),dest precipitation_sum (mm),dest rain_sum (mm),dest snowfall_sum (cm),dest windspeed_10m_max (km/h),dest windgusts_10m_max (km/h),dest et0_fao_evapotranspiration (mm),delay
0,-0.536792,-0.513072,-0.15434,0.485277,0.463999,-1.021872,-0.536792,-0.513072,-0.15434,0.485277,0.463999,-1.021872,21.0
1,-0.536792,-0.513072,-0.15434,0.130566,0.129316,-0.975951,-0.536792,-0.513072,-0.15434,0.130566,0.129316,-0.975951,-22.0
2,-0.536792,-0.513072,-0.15434,0.130566,0.129316,-0.975951,-0.536792,-0.513072,-0.15434,0.130566,0.129316,-0.975951,-15.0
3,-0.536792,-0.513072,-0.15434,-1.120257,-1.13158,-0.998912,-0.536792,-0.513072,-0.15434,-1.120257,-1.13158,-0.998912,5.0
4,-0.536792,-0.513072,-0.15434,-1.120257,-1.13158,-0.998912,-0.536792,-0.513072,-0.15434,-1.120257,-1.13158,-0.998912,19.0


In [8]:
# Model inputs, listed explicitly so the column order of X is fixed.
features = [
    # columns prefixed 'origin'
    'origin precipitation_sum (mm)',
    'origin rain_sum (mm)',
    'origin snowfall_sum (cm)',
    'origin windspeed_10m_max (km/h)',
    'origin windgusts_10m_max (km/h)',
    'origin et0_fao_evapotranspiration (mm)',
    # columns prefixed 'dest'
    'dest precipitation_sum (mm)',
    'dest rain_sum (mm)',
    'dest snowfall_sum (cm)',
    'dest windspeed_10m_max (km/h)',
    'dest windgusts_10m_max (km/h)',
    'dest et0_fao_evapotranspiration (mm)',
]

# Feature matrix and regression target ('delay').
X = df[features]
y = df['delay']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [13]:
# Fit a linear regression on the training split (fit() returns the estimator).
lr = LinearRegression().fit(X_train, y_train)

# Predictions for the held-out rows.
y_pred = lr.predict(X_test)

# Error metrics on the test split; the MSE is computed once and reused for
# the RMSE.
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))

# Actual vs. predicted delay side by side; the frame takes y_test's index.
result_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
result_df.head()

Mean Absolute Error: 11.911628595639764
Mean Squared Error: 298.6851466190018
Root Mean Squared Error: 17.282509847212637


Unnamed: 0,Actual,Predicted
881,-11.0,-7.089242
406,4.0,-4.149585
14,-11.0,-0.171205
708,7.0,-2.127848
55,-10.0,-6.447376


In [22]:
# Scatter the observed delay against one feature and overlay the fitted
# model's prediction as that feature varies.
feature = 'origin windspeed_10m_max (km/h)'
x_range = np.linspace(X[feature].min(), X[feature].max(), 100)

# lr was trained on every column of X, so it cannot be evaluated on x_range
# alone. Build a 100-row frame that sweeps `feature` across x_range while
# holding every other column at its mean, then predict on that frame.
# (Previously the test-set predictions y_pred were drawn against x_range, but
# those values belong to the shuffled X_test rows, not to the x_range points,
# and generally have a different length, so the line was not a fit.)
baseline = X.mean()
grid = pd.DataFrame({col: np.full(x_range.shape, baseline[col]) for col in X.columns})
grid[feature] = x_range
y_range = lr.predict(grid)
# NOTE(review): the df.head() preview shows identical origin/dest values; if
# those columns are duplicates throughout the file, the slope attributed to a
# single column understates the combined effect -- confirm against the data.

fig = px.scatter(df, x=feature, y='delay', opacity=0.65)
fig.add_traces(go.Scatter(x=x_range, y=y_range, name='Regression Fit'))
fig.show()