In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
%pip install jupyterlab_play_cell_button 

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [3]:
df = pd.read_csv(r'C:\data\mia_data2.csv')

In [4]:
df.keys()

Index(['Unnamed: 0.1', 'Unnamed: 0', 'YEAR', 'MONTH', 'DAY_OF_WEEK',
       'OP_CARRIER_FL_NUM', 'ORIGIN', 'DEST', 'CRS_DEP_TIME', 'DEP_TIME',
       'DEP_DELAY', 'DEP_DELAY_NEW', 'CRS_ARR_TIME', 'ARR_TIME', 'ARR_DELAY',
       'ARR_DELAY_NEW', 'CANCELLED', 'CANCELLATION_CODE', 'DIVERTED',
       'CRS_ELAPSED_TIME', 'ACTUAL_ELAPSED_TIME', 'CARRIER_DELAY',
       'WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY',
       'WHY_DELAY', 'DAY', 'DATE'],
      dtype='object')

In [5]:
# I want to create dummy values based on the why delay column
dummy = pd.get_dummies(df['WHY_DELAY'])

In [6]:
dummy

Unnamed: 0,0,carrier,late aircraft,nas,security,weather
0,1,0,0,0,0,0
1,1,0,0,0,0,0
2,1,0,0,0,0,0
3,1,0,0,0,0,0
4,1,0,0,0,0,0
...,...,...,...,...,...,...
736647,1,0,0,0,0,0
736648,1,0,0,0,0,0
736649,1,0,0,0,0,0
736650,0,0,0,1,0,0


In [7]:
df = pd.concat([df, dummy], axis = 1)
df.keys()

Index(['Unnamed: 0.1', 'Unnamed: 0', 'YEAR', 'MONTH', 'DAY_OF_WEEK',
       'OP_CARRIER_FL_NUM', 'ORIGIN', 'DEST', 'CRS_DEP_TIME', 'DEP_TIME',
       'DEP_DELAY', 'DEP_DELAY_NEW', 'CRS_ARR_TIME', 'ARR_TIME', 'ARR_DELAY',
       'ARR_DELAY_NEW', 'CANCELLED', 'CANCELLATION_CODE', 'DIVERTED',
       'CRS_ELAPSED_TIME', 'ACTUAL_ELAPSED_TIME', 'CARRIER_DELAY',
       'WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY',
       'WHY_DELAY', 'DAY', 'DATE', '0', 'carrier', 'late aircraft', 'nas',
       'security', 'weather'],
      dtype='object')

In [8]:
df.drop(columns = ['Unnamed: 0.1', 'Unnamed: 0', 'OP_CARRIER_FL_NUM', 'CANCELLATION_CODE', 'DIVERTED',  'CARRIER_DELAY','WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY', 'DATE', 'WHY_DELAY', 'carrier', 'late aircraft', 'nas','security', 'OP_CARRIER_FL_NUM']  , inplace = True)

In [9]:
# I need to create two data frames to work on one that is about departures from MIA and the 
#other one that deals with arrivals.

df_arr = df[df.DEST == 'MIA']
df_dep = df[df.ORIGIN == 'MIA']

In [10]:
# In df_ arr, the DEST value is the same for all flights, I will be eliminating that column as well as everything that has to 
# deal with departures.

df_arr.drop(columns = ['DEST', 'CRS_DEP_TIME', 'DEP_TIME', 'DEP_DELAY_NEW', 'CANCELLED'], inplace = True)




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_arr.drop(columns = ['DEST', 'CRS_DEP_TIME', 'DEP_TIME', 'DEP_DELAY_NEW', 'CANCELLED'], inplace = True)


In [11]:
# Converting Origin to numerical data by using get_dummies.
temp = pd.get_dummies(df_arr.ORIGIN, prefix = 'ORIGIN')



In [12]:
temp

Unnamed: 0,ORIGIN_ATL,ORIGIN_AUS,ORIGIN_BDL,ORIGIN_BGR,ORIGIN_BHM,ORIGIN_BNA,ORIGIN_BOS,ORIGIN_BWI,ORIGIN_CAE,ORIGIN_CHS,...,ORIGIN_SLC,ORIGIN_STL,ORIGIN_STT,ORIGIN_STX,ORIGIN_TLH,ORIGIN_TPA,ORIGIN_TTN,ORIGIN_TUL,ORIGIN_TYS,ORIGIN_XNA
0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
736642,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
736644,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
736645,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
736647,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
#Join the two dataframes
df_arr = pd.concat([df_arr, temp], axis = 1)

In [14]:
y_arr = df_arr['weather']

In [15]:
df_arr.drop(columns = ['weather', 'ORIGIN'], inplace = True)

In [16]:
df_arr

Unnamed: 0,YEAR,MONTH,DAY_OF_WEEK,DEP_DELAY,CRS_ARR_TIME,ARR_TIME,ARR_DELAY,ARR_DELAY_NEW,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,...,ORIGIN_SLC,ORIGIN_STL,ORIGIN_STT,ORIGIN_STX,ORIGIN_TLH,ORIGIN_TPA,ORIGIN_TTN,ORIGIN_TUL,ORIGIN_TYS,ORIGIN_XNA
0,2010,5,6.0,-16.0,1840,1835.0,-5.0,0.0,185.0,196.0,...,0,0,1,0,0,0,0,0,0,0
1,2010,5,7.0,15.0,1840,1840.0,0.0,0.0,185.0,170.0,...,0,0,1,0,0,0,0,0,0,0
2,2010,5,1.0,15.0,1840,1846.0,6.0,6.0,185.0,176.0,...,0,0,1,0,0,0,0,0,0,0
3,2010,5,2.0,-2.0,1840,1829.0,-11.0,0.0,185.0,176.0,...,0,0,1,0,0,0,0,0,0,0
4,2010,5,3.0,22.0,1840,1848.0,8.0,8.0,185.0,171.0,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
736642,2021,9,4.0,-8.0,1451,1444.0,-7.0,0.0,181.0,182.0,...,0,0,0,0,0,0,0,0,0,0
736644,2021,9,4.0,-3.0,1125,1110.0,-15.0,0.0,210.0,198.0,...,0,0,0,0,0,0,0,0,0,0
736645,2021,9,4.0,-3.0,1123,1120.0,-3.0,0.0,173.0,173.0,...,0,0,0,0,0,0,0,0,0,0
736647,2021,9,4.0,-4.0,1016,1009.0,-7.0,0.0,171.0,168.0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
df_arr.keys()

Index(['YEAR', 'MONTH', 'DAY_OF_WEEK', 'DEP_DELAY', 'CRS_ARR_TIME', 'ARR_TIME',
       'ARR_DELAY', 'ARR_DELAY_NEW', 'CRS_ELAPSED_TIME', 'ACTUAL_ELAPSED_TIME',
       'DAY', '0', 'ORIGIN_ATL', 'ORIGIN_AUS', 'ORIGIN_BDL', 'ORIGIN_BGR',
       'ORIGIN_BHM', 'ORIGIN_BNA', 'ORIGIN_BOS', 'ORIGIN_BWI', 'ORIGIN_CAE',
       'ORIGIN_CHS', 'ORIGIN_CLE', 'ORIGIN_CLT', 'ORIGIN_CMH', 'ORIGIN_CVG',
       'ORIGIN_DAL', 'ORIGIN_DCA', 'ORIGIN_DEN', 'ORIGIN_DFW', 'ORIGIN_DSM',
       'ORIGIN_DTW', 'ORIGIN_EWR', 'ORIGIN_EYW', 'ORIGIN_GNV', 'ORIGIN_GSO',
       'ORIGIN_GSP', 'ORIGIN_HOU', 'ORIGIN_HSV', 'ORIGIN_IAD', 'ORIGIN_IAH',
       'ORIGIN_IND', 'ORIGIN_ISP', 'ORIGIN_JAN', 'ORIGIN_JAX', 'ORIGIN_JFK',
       'ORIGIN_LAS', 'ORIGIN_LAX', 'ORIGIN_LGA', 'ORIGIN_LIT', 'ORIGIN_MCI',
       'ORIGIN_MCO', 'ORIGIN_MDW', 'ORIGIN_MEM', 'ORIGIN_MKE', 'ORIGIN_MSP',
       'ORIGIN_MSY', 'ORIGIN_MYR', 'ORIGIN_OKC', 'ORIGIN_OMA', 'ORIGIN_ORD',
       'ORIGIN_ORF', 'ORIGIN_PHL', 'ORIGIN_PHX', 'ORIGIN_PIT', 'ORIGIN_P

In [18]:
X_arr_train, X_arr_test, y_arr_train, y_arr_test = train_test_split(df_arr, y_arr, test_size = .33, random_state = 42)

In [19]:
df_dep.head()

Unnamed: 0,YEAR,MONTH,DAY_OF_WEEK,ORIGIN,DEST,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,DEP_DELAY_NEW,CRS_ARR_TIME,ARR_TIME,ARR_DELAY,ARR_DELAY_NEW,CANCELLED,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,DAY,0,weather
44,2010,5,6.0,MIA,IAH,1450,1446.0,-4.0,0.0,1640,1633.0,-7.0,0.0,0.0,170.0,167.0,1,1,0
45,2010,5,7.0,MIA,IAH,1450,1447.0,-3.0,0.0,1640,1633.0,-7.0,0.0,0.0,170.0,166.0,2,1,0
46,2010,5,1.0,MIA,IAH,1450,1447.0,-3.0,0.0,1640,1622.0,-18.0,0.0,0.0,170.0,155.0,3,1,0
47,2010,5,2.0,MIA,IAH,1450,1456.0,6.0,6.0,1640,1639.0,-1.0,0.0,0.0,170.0,163.0,4,1,0
48,2010,5,3.0,MIA,IAH,1450,1553.0,63.0,63.0,1640,1732.0,52.0,52.0,0.0,170.0,159.0,5,0,0


In [20]:
#Going to do the same things to df_dep that I just did with df_arr
df_dep.drop(columns = ['ORIGIN', 'CRS_ARR_TIME', 'ARR_TIME', 'ARR_DELAY', 'ARR_DELAY_NEW', 'CANCELLED'], inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_dep.drop(columns = ['ORIGIN', 'CRS_ARR_TIME', 'ARR_TIME', 'ARR_DELAY', 'ARR_DELAY_NEW', 'CANCELLED'], inplace = True)


In [21]:
# Converting DEST to numerical data by using get_dummies.
temp = pd.get_dummies(df_dep.DEST, prefix = 'ORIGIN')

In [22]:
temp

Unnamed: 0,ORIGIN_ATL,ORIGIN_AUS,ORIGIN_BDL,ORIGIN_BGR,ORIGIN_BHM,ORIGIN_BNA,ORIGIN_BOS,ORIGIN_BWI,ORIGIN_CAE,ORIGIN_CHS,...,ORIGIN_SLC,ORIGIN_STL,ORIGIN_STT,ORIGIN_STX,ORIGIN_TLH,ORIGIN_TPA,ORIGIN_TTN,ORIGIN_TUL,ORIGIN_TYS,ORIGIN_XNA
44,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
45,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
46,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
47,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
48,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
736643,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
736646,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
736649,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
736650,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
#Join the two dataframes
df_dep = pd.concat([df_dep, temp], axis = 1)

In [24]:
y_dep = df_dep['weather']

In [25]:
df_dep.drop(columns = ['weather', 'DEST'], inplace = True)

In [26]:
df_dep.keys()

Index(['YEAR', 'MONTH', 'DAY_OF_WEEK', 'CRS_DEP_TIME', 'DEP_TIME', 'DEP_DELAY',
       'DEP_DELAY_NEW', 'CRS_ELAPSED_TIME', 'ACTUAL_ELAPSED_TIME', 'DAY', '0',
       'ORIGIN_ATL', 'ORIGIN_AUS', 'ORIGIN_BDL', 'ORIGIN_BGR', 'ORIGIN_BHM',
       'ORIGIN_BNA', 'ORIGIN_BOS', 'ORIGIN_BWI', 'ORIGIN_CAE', 'ORIGIN_CHS',
       'ORIGIN_CLE', 'ORIGIN_CLT', 'ORIGIN_CMH', 'ORIGIN_CVG', 'ORIGIN_DAL',
       'ORIGIN_DCA', 'ORIGIN_DEN', 'ORIGIN_DFW', 'ORIGIN_DSM', 'ORIGIN_DTW',
       'ORIGIN_EGE', 'ORIGIN_EWR', 'ORIGIN_EYW', 'ORIGIN_GNV', 'ORIGIN_GSO',
       'ORIGIN_GSP', 'ORIGIN_HOU', 'ORIGIN_HSV', 'ORIGIN_IAD', 'ORIGIN_IAH',
       'ORIGIN_IND', 'ORIGIN_ISP', 'ORIGIN_JAN', 'ORIGIN_JAX', 'ORIGIN_JFK',
       'ORIGIN_LAS', 'ORIGIN_LAX', 'ORIGIN_LGA', 'ORIGIN_LIT', 'ORIGIN_MCI',
       'ORIGIN_MCO', 'ORIGIN_MDW', 'ORIGIN_MEM', 'ORIGIN_MKE', 'ORIGIN_MSP',
       'ORIGIN_MSY', 'ORIGIN_MYR', 'ORIGIN_OKC', 'ORIGIN_OMA', 'ORIGIN_ORD',
       'ORIGIN_ORF', 'ORIGIN_PBI', 'ORIGIN_PHL', 'ORIGIN_PHX', 'ORIGIN_

In [38]:
df_dep.isnull().sum()

YEAR                0
MONTH               0
DAY_OF_WEEK     32329
CRS_DEP_TIME        0
DEP_TIME        36956
                ...  
ORIGIN_TPA          0
ORIGIN_TTN          0
ORIGIN_TUL          0
ORIGIN_TYS          0
ORIGIN_XNA          0
Length: 90, dtype: int64

In [29]:
X_dep_train, X_dep_test, y_dep_train, y_dep_test = train_test_split(df_dep, y_dep, test_size = .33, random_state = 42)

In [30]:
# The data is not balanced
X_dep_train['0'].value_counts()

1    186970
0     59835
Name: 0, dtype: int64

In [31]:
y_dep_test.value_counts()

0    117333
1      4228
Name: weather, dtype: int64

In [32]:
!pip install imbalanced-learn

Defaulting to user installation because normal site-packages is not writeable


In [33]:
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state = 42)
X_dep_train, y_dep_train = sm.fit_resample(X_dep_train, y_dep_train)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').