## Data Prep 

Imports and Libraries 

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [2]:
np.random.seed(42)

In [3]:
airline = pd.read_csv("airline.csv")
airline.head()

Unnamed: 0,Aircraft,Number_Objects,Engines,Origin State,Phase,Description,Object Size,Weather,Warning,Altitude,Total Cost
0,PA,37,1.0,Florida,Descent,"BIRD, BROWN BIRD. A/C WAS DESCENDING INTO PATT...",Large,No Cloud,N,1500.0,6.536692
1,C,43,1.0,Florida,Approach,BIRD SHATTERED L SIDE OF WINDHSLD. STUDENT REC...,Large,No Cloud,Y,2000.0,6.345636
2,B-737,71,2.0,Oklahoma,Climb,MEDIUM SIZED BLACK BIRDS. CONTRACT MX INSPN OF...,Medium,No Cloud,N,1100.0,6.934397
3,Airbus,29,2.0,Wisconsin,Approach,"ID BY SMITHSONIAN, FAA 3881. DNA. 2 CRACKS IN ...",Large,No Cloud,Y,200.0,11.257762
4,B-737,32,2.0,Texas,Approach,BIRD SEEN AND HEARD THAT STRUCK RADOME. UPON I...,Small,No Cloud,N,1000.0,6.018593


In [4]:
train, test = train_test_split(airline, test_size=0.3)

In [5]:
train_y = train['Total Cost']
test_y = test['Total Cost']

train_x = train.drop(['Total Cost'], axis=1)
test_x = test.drop(['Total Cost'], axis=1)

## Feature Engineering 

In [6]:
def feat_eng(df):
  
    weather_mapping = {
        'No Cloud': 1,
        'Some Cloud': 2,
        'Overcast': 3
    }
    df['Weather_Code'] = df['Weather'].map(weather_mapping)

    df['Weather_Code'] = df['Weather_Code'].fillna(1)

    df['Weather_Impact'] = df['Number_Objects'] / df['Weather_Code']
    
    return df[['Weather_Impact']]

The weather impact feature captures the interaction between the number of objects and whether severity, this enhances the model's ability to interpret and predict their combined effect

In [7]:
feat_eng(train_x)


Unnamed: 0,Weather_Impact
354,12.333333
1055,28.000000
879,46.000000
256,38.000000
291,38.000000
...,...
1044,42.000000
1095,25.000000
1130,14.500000
860,24.000000


In [8]:
feat_eng(test_x)

Unnamed: 0,Weather_Impact
101,64.000000
260,17.000000
1083,33.000000
109,21.500000
649,33.000000
...,...
789,50.000000
754,47.000000
471,49.000000
1085,46.500000


## Text Mining 

In [9]:
def text_eng(df):
    df1 = df.copy()
    
    return np.array(df).ravel()

In [10]:
text_eng(train_x)

array(['B-737', 37, 2.0, ..., 2000.0, 1, 45.0], dtype=object)

## Identify Columns 

In [11]:
train_x.dtypes

Aircraft           object
Number_Objects      int64
Engines           float64
Origin State       object
Phase              object
Description        object
Object Size        object
Weather            object
Altitude          float64
Weather_Code        int64
Weather_Impact    float64
dtype: object

In [12]:
numeric_columns = train_x.select_dtypes(include=[np.number]).columns.to_list()

In [13]:
categorical_columns = train_x.select_dtypes('object').columns.to_list()

In [14]:
text_columns = ['Description']

In [15]:
for col in text_columns:
    categorical_columns.remove(col)

In [16]:
binary_columns = [ 'Warning']

In [17]:
for col in binary_columns:
    categorical_columns.remove(col)

In [18]:
feat_eng_columns = ['Number_Objects','Weather']

In [19]:
numeric_columns

['Number_Objects', 'Engines', 'Altitude', 'Weather_Code', 'Weather_Impact']

In [20]:
categorical_columns

['Aircraft', 'Origin State', 'Phase', 'Object Size', 'Weather']

In [21]:
binary_columns



In [22]:
text_columns

['Description']

In [23]:
feat_eng_columns

['Number_Objects', 'Weather']

## Column Transformers

In [24]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

In [25]:
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='unknown')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [26]:
binary_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [27]:
feat_eng_transformer = Pipeline(steps=[
    ('feat_eng', FunctionTransformer(feat_eng, validate=False)),
    ('scaler', StandardScaler())
])

In [28]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

number_svd_components = 300

text_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='')),
    ('function_transformer', FunctionTransformer(text_eng, validate=False)),
    ('text', TfidfVectorizer(stop_words='english')),
    ('svd', TruncatedSVD(n_components=number_svd_components, n_iter=10))
])

## Pipeline

In [29]:
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_columns),
    ('cat', categorical_transformer, categorical_columns),
    ('binary', binary_transformer, binary_columns),
    ('feat_eng', feat_eng_transformer, feat_eng_columns), 
    ('text', text_transformer, text_columns)
], remainder='drop')

# Transform: fit_transform() for TRAIN

In [30]:
#Fit and transform the train data
train_x = preprocessor.fit_transform(train_x)

train_x

array([[-9.32546683e-01,  1.41071687e-01,  1.98058746e-01, ...,
         6.26340769e-02, -6.81171794e-03, -3.00378851e-02],
       [ 4.30778926e-01, -1.74884108e+00, -4.44313565e-01, ...,
         1.09163723e-02,  1.93534389e-02,  2.95093425e-02],
       [-2.86760868e-01,  1.41071687e-01,  1.92752266e+00, ...,
        -6.14546649e-03, -1.67735345e-03,  5.57589332e-02],
       ...,
       [-1.50657852e+00,  1.41071687e-01, -5.40669411e-01, ...,
        -3.78141203e-02, -2.38732565e-03, -5.85514762e-02],
       [-1.43252909e-01,  1.41071687e-01, -4.44313565e-01, ...,
         3.85521229e-02,  1.47565573e-02,  4.52871344e-02],
       [-3.58514848e-01, -1.74884108e+00,  4.45125019e-01, ...,
         3.58122717e-02,  3.23087349e-03,  3.25371375e-02]])

In [31]:
train_x.shape

(844, 437)

## Tranform: transform() for TEST

In [32]:
# Transform the test data
test_x = preprocessor.transform(test_x)

test_x

array([[ 1.00481076e+00,  1.41071687e-01, -5.43140074e-01, ...,
        -7.88235033e-03,  1.50297774e-02,  1.49796770e-02],
       [ 7.20090291e-02,  1.41071687e-01,  1.43339011e+00, ...,
        -1.07829969e-02,  1.42639113e-02,  2.30995242e-02],
       [-1.21956260e+00,  1.41071687e-01, -5.43140074e-01, ...,
         4.26758431e-02,  9.36716051e-04,  7.21783614e-03],
       ...,
       [-7.14989298e-02, -1.74884108e+00, -4.90075273e-02, ...,
         1.31847867e-03,  3.80207817e-02, -1.13393737e-03],
       [ 3.08567617e+00,  1.41071687e-01, -4.93726819e-01, ...,
         1.12584533e-02, -2.39515553e-02,  7.64323075e-03],
       [ 3.59024947e-01,  1.41071687e-01, -4.93726819e-01, ...,
        -2.94355016e-02,  5.90788831e-03,  7.91192200e-02]])

In [33]:
test_x.shape

(363, 437)

## Find the Baseline 

In [34]:
from sklearn.dummy import DummyRegressor

dummy_regr = DummyRegressor(strategy="mean")

dummy_regr.fit(train_x, train_y)

In [35]:
from sklearn.metrics import mean_squared_error

In [36]:
#Baseline Train RMSE
dummy_train_pred = dummy_regr.predict(train_x)

baseline_train_mse = mean_squared_error(train_y, dummy_train_pred)

baseline_train_rmse = np.sqrt(baseline_train_mse)

print('Baseline Train RMSE: {}' .format(baseline_train_rmse))

Baseline Train RMSE: 2.739665221683705


In [37]:
#Baseline Test RMSE
dummy_test_pred = dummy_regr.predict(test_x)

baseline_test_mse = mean_squared_error (test_y, dummy_test_pred)

baseline_test_rmse = np.sqrt(baseline_test_mse)

print('Baseline Test RMSE: {}' .format(baseline_test_rmse))

Baseline Test RMSE: 2.5730518279317


## Decision Tree

In [38]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor(max_depth=5) 

tree_reg.fit(train_x, train_y)

In [39]:
#Train RMSE
train_pred = tree_reg.predict(train_x)

train_mse = mean_squared_error(train_y, train_pred)

train_rmse = np.sqrt(train_mse)

print('Train RMSE: {}' .format(train_rmse))

Train RMSE: 1.84707548814799


In [40]:
#Test RMSE
test_pred = tree_reg.predict(test_x)

test_mse = mean_squared_error(test_y, test_pred)

test_rmse = np.sqrt(test_mse)

print('Test RMSE: {}' .format(test_rmse))

Test RMSE: 2.209028773668436


## Voting regressor 



In [41]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import SGDRegressor 
from sklearn.svm import SVR 
from sklearn.ensemble import VotingRegressor


dtree_reg = DecisionTreeRegressor(max_depth=1)
svm_reg = SVR(kernel="rbf", C=1, epsilon=0.01, gamma='scale') 
sgd_reg = SGDRegressor(max_iter=10000, tol=1e-3)

voting_reg = VotingRegressor(
            estimators=[('dt', dtree_reg), 
                        ('svr', svm_reg), 
                        ('sgd', sgd_reg)])

voting_reg.fit(train_x, train_y)

In [42]:
#Train RMSE
train_pred = voting_reg.predict(train_x)

train_mse = mean_squared_error(train_y, train_pred)

train_rmse = np.sqrt(train_mse)

print('Train RMSE: {}' .format(train_rmse))

Train RMSE: 1.8762079446510955


In [43]:
#Test RMSE
test_pred = voting_reg.predict(test_x)

test_mse = mean_squared_error(test_y, test_pred)

test_rmse = np.sqrt(test_mse)

print('Test RMSE: {}' .format(test_rmse))

Test RMSE: 1.9816228874097128


## A Boosting model


In [44]:
from sklearn.ensemble import AdaBoostRegressor 


ada_reg = AdaBoostRegressor( 
            DecisionTreeRegressor(max_depth=5), n_estimators=250, 
            learning_rate=0.25) 

ada_reg.fit(train_x, train_y)

In [45]:
#Train RMSE
train_pred = ada_reg.predict(train_x)

train_mse = mean_squared_error(train_y, train_pred)

train_rmse = np.sqrt(train_mse)

print('Train RMSE: {}' .format(train_rmse))

Train RMSE: 1.2210204993394298


In [46]:
#Test RMSE
test_pred = ada_reg.predict(test_x)

test_mse = mean_squared_error(test_y, test_pred)

test_rmse = np.sqrt(test_mse)

print('Test RMSE: {}' .format(test_rmse))

Test RMSE: 1.8419022934171796


## Neural network

In [47]:
from sklearn.neural_network import MLPRegressor

#Default settings create 1 hidden layer with 100 neurons
mlp_reg = MLPRegressor(hidden_layer_sizes=(25,))

mlp_reg.fit(train_x, train_y)



In [48]:
#Train RMSE
train_pred = mlp_reg.predict(train_x)

train_mse = mean_squared_error(train_y, train_pred)

train_rmse = np.sqrt(train_mse)

print('Train RMSE: {}' .format(train_rmse))

Train RMSE: 1.1843938483131071


In [49]:
#Test RMSE
test_pred = mlp_reg.predict(test_x)

test_mse = mean_squared_error(test_y, test_pred)

test_rmse = np.sqrt(test_mse)

print('Test RMSE: {}' .format(test_rmse))

Test RMSE: 1.946331331699112


In [50]:
mlp_reg = MLPRegressor(hidden_layer_sizes=(25,), 
                       alpha=2,  # L2 regularization strength
                       max_iter=500, 
                       random_state=42)
mlp_reg.fit(train_x, train_y)



In [51]:
#Train RMSE
train_pred = mlp_reg.predict(train_x)

train_mse = mean_squared_error(train_y, train_pred)

train_rmse = np.sqrt(train_mse)

print('Train RMSE: {}' .format(train_rmse))

Train RMSE: 1.2160458754219687


In [52]:
#Test RMSE
test_pred = mlp_reg.predict(test_x)

test_mse = mean_squared_error(test_y, test_pred)

test_rmse = np.sqrt(test_mse)

print('Test RMSE: {}' .format(test_rmse))

Test RMSE: 1.8461226850074564
