<a href="https://colab.research.google.com/github/youssefjedidi/Aircraft_Noise_Predictor/blob/main/aircraft_noise_predictor_ML_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ML Regression models

## Importing the libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [None]:
# Load the raw table. The features are every column except the first and the
# last; the last column is the regression target.
dataset = pd.read_csv('aircraft_noise.csv')
X = dataset.iloc[:, 1:-1].values
y = dataset.iloc[:, -1].values

In [None]:
# Inspect the raw feature matrix before any encoding is applied.
print(X)

[['LAmax' 'A' 1500 ... 1 1695 2325]
 ['LAmax' 'A' 1600 ... 1 1695 2325]
 ['PNLTM' 'A' 1500 ... 1 1695 2325]
 ...
 ['EPNL' 'D' 80000 ... 4 6752 1254430]
 ['SEL' 'D' 80000 ... 4 6837 1254430]
 ['EPNL' 'D' 80000 ... 4 6837 1254430]]


## Encoding categorical data

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode feature columns 1, 0, 3 and 4 (in that order); all remaining
# columns are passed through unchanged.
# NOTE: this cell overwrites X, so re-running it without re-running the
# loading cell would try to encode an already-encoded matrix.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [1, 0, 3, 4])],
    remainder='passthrough',
)
X = np.array(ct.fit_transform(X))

In [None]:
# Inspect the feature matrix after the encoding step.
print(X)

[[1.0 0.0 0.0 ... 1 1695 2325]
 [1.0 0.0 0.0 ... 1 1695 2325]
 [1.0 0.0 0.0 ... 1 1695 2325]
 ...
 [0.0 1.0 1.0 ... 4 6752 1254430]
 [0.0 1.0 0.0 ... 4 6837 1254430]
 [0.0 1.0 1.0 ... 4 6837 1254430]]


## Splitting the dataset into the Training set and Test set

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

## Multiple Linear Regression


### Training the Multiple Linear Regression model on the Training set

In [None]:
from sklearn.linear_model import LinearRegression
# Fit an ordinary least-squares model on the training split only.
regressor = LinearRegression()
regressor.fit(X_train, y_train)

### Predicting the Test set results

In [None]:
# Predict on the held-out rows, then print predictions (left column) beside
# the true targets (right column), rounded to two decimals for display.
y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(-1, 1), y_test.reshape(-1, 1)), axis=1))

[[52.65 49.1 ]
 [71.42 69.8 ]
 [68.44 61.  ]
 ...
 [67.17 62.9 ]
 [61.12 65.2 ]
 [50.41 45.2 ]]


### Testing accuracy

In [None]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE), in percent.

    Both inputs are flattened to 1-D before they are compared. Without this,
    a flat vector of shape (n,) paired with a column vector of shape (n, 1)
    would be broadcast into an (n, n) matrix of pairwise differences and the
    result would no longer be an element-wise error.

    Parameters
    ----------
    y_true : array-like
        Observed values. Each error is divided by the matching observed
        value, so zeros here produce infinite or NaN results.
    y_pred : array-like
        Predicted values, with as many elements as ``y_true``.

    Returns
    -------
    float
        Mean of ``|y_true - y_pred| / |y_true|`` multiplied by 100.
    """
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    return (np.mean(np.abs((y_true - y_pred) / y_true)) * 100)

from sklearn.metrics import r2_score

# Relative error of the linear model on the test split, in percent.
mape = mean_absolute_percentage_error(y_test, y_pred)
print(f"Mean Absolute Percentage Error (MAPE): {mape:.2f}%")

# R squared on the test split, followed by the adjusted variant, which
# penalises the score for the number of predictors.
r2 = r2_score(y_test, y_pred)
n, p = len(y_test), X.shape[1]  # test-sample count, encoded feature count
adj_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)

print(f"R squared (r2): {r2:.4f}")
print(f"Adjusted R squared (adj_r2): {adj_r2:.4f}")

Mean Absolute Percentage Error (MAPE): 8.36%
R squared (r2): 0.6604
Adjusted R squared (adj_r2): 0.6504


## Polynomial Regression

### Training

In [None]:
from sklearn.preprocessing import PolynomialFeatures
# Expand the training features into degree-2 polynomial terms, then fit a
# linear regression on the expanded matrix.
poly_reg = PolynomialFeatures(degree = 2)
X_poly = poly_reg.fit_transform(X_train)
lin_reg_2 = LinearRegression()
lin_reg_2.fit(X_poly, y_train)

### Predicting the Test set results

In [None]:
# Expand the test features with the transformer that was already fitted on
# the training split. Only transform() is used here: the test data must not
# be used to (re)fit any preprocessing step.
y_poly = lin_reg_2.predict(poly_reg.transform(X_test))
np.set_printoptions(precision=2)
# Predictions in the left column, true targets in the right column.
print(np.concatenate((y_poly.reshape(len(y_poly),1), y_test.reshape(len(y_test),1)),1))

[[50.5  49.1 ]
 [72.2  69.8 ]
 [61.8  61.  ]
 ...
 [69.3  62.9 ]
 [60.68 65.2 ]
 [51.17 45.2 ]]


### Testing accuracy

In [None]:

from sklearn.metrics import r2_score

# Relative error of the polynomial model on the test split, in percent.
mape = mean_absolute_percentage_error(y_test, y_poly)

print(f"Mean Absolute Percentage Error (MAPE): {mape:.2f}%")

# Calculate R squared on the test split
r2_poly = r2_score(y_test, y_poly)

# Calculate adjusted R squared.
# The polynomial model was fitted on X_poly, so its predictor count is the
# number of polynomial terms, not the width of the original X. One column is
# subtracted for the constant bias term that PolynomialFeatures adds by
# default; the intercept is already accounted for by the "- 1" in the formula.
n = len(y_test)          # Number of samples
p = X_poly.shape[1] - 1  # Number of predictors actually used by the model
adj_r2 = 1 - (1 - r2_poly) * (n - 1) / (n - p - 1)

# Print R-squared and adjusted R-squared
print(f"R squared (r2): {r2_poly:.4f}")
print(f"Adjusted R squared (adj_r2): {adj_r2:.4f}")

Mean Absolute Percentage Error (MAPE): 7.07%
R squared (r2): 0.7528
Adjusted R squared (adj_r2): 0.7455


## Support Vector Regression (SVR)

### Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the features: statistics are learned from the training split
# and then reused, unchanged, on the test split.
sc_X = StandardScaler()
X_scaled = sc_X.fit_transform(X_train)
X_test_scaled = sc_X.transform(X_test)

# Put the unscaled values back into every column except the last four, so
# only the last four columns end up standardised (the leading columns are
# presumably the one-hot indicators and are kept as 0/1).
X_scaled[:, :-4] = X_train[:, :-4]
X_test_scaled[:, :-4] = X_test[:, :-4]

# Standardise the target as well; StandardScaler needs a 2-D column.
sc_y = StandardScaler()
y_scaled = sc_y.fit_transform(y_train.reshape(-1, 1))
y_test_scaled = sc_y.transform(y_test.reshape(-1, 1))

# Show the scaled training features and target.
print("Scaled X:", X_scaled, sep="\n")
print("\nScaled y:", y_scaled, sep="\n")

Scaled X:
[[ 0.    1.    0.   ... -0.44 -0.54 -0.74]
 [ 0.    1.    0.   ...  0.83  0.37 -0.13]
 [ 0.    1.    0.   ...  0.83  0.08 -0.26]
 ...
 [ 1.    0.    0.   ... -0.44 -0.99 -0.53]
 [ 0.    1.    1.   ... -0.44  0.22 -0.45]
 [ 1.    0.    0.   ... -0.44 -0.04 -0.64]]

Scaled y:
[[ 0.64]
 [ 1.69]
 [ 0.13]
 ...
 [-0.51]
 [ 1.03]
 [-1.56]]


### Training the SVR model

In [None]:
from sklearn.svm import SVR
sv_regressor = SVR(kernel = 'rbf')
# y_scaled is a column of shape (n, 1) because StandardScaler works on 2-D
# input, but SVR.fit expects a 1-D target. Flattening it avoids the
# DataConversionWarning the column vector would otherwise trigger.
sv_regressor.fit(X_scaled, y_scaled.ravel())

  y = column_or_1d(y, warn=True)


### Predicting

In [None]:
# Perform predictions using the SVR model on the scaled test features
y_svr_scaled = sv_regressor.predict(X_test_scaled)

# Inverse transform the scaled predictions to get them back to the original
# scale. inverse_transform needs (and returns) a 2-D column, so the result is
# flattened back to shape (n,) to match y_test. Leaving it as (n, 1) makes
# element-wise expressions such as `y_test - y_svr` broadcast to an (n, n)
# matrix and silently corrupts any metric built on them.
y_svr = sc_y.inverse_transform(y_svr_scaled.reshape(-1, 1)).ravel()

np.set_printoptions(precision=2)
# Predictions in the left column, true targets in the right column.
print(np.concatenate((y_svr.reshape(len(y_svr),1), y_test.reshape(len(y_test),1)),1))

[[47.24 49.1 ]
 [67.39 69.8 ]
 [62.66 61.  ]
 ...
 [67.09 62.9 ]
 [59.93 65.2 ]
 [49.33 45.2 ]]


### Testing accuracy

In [None]:

from sklearn.metrics import r2_score

# Relative error of the SVR predictions (already back on the original scale).
mape = mean_absolute_percentage_error(y_test, y_svr)
print(f"Mean Absolute Percentage Error (MAPE): {mape:.2f}%")

# R squared on the test split, followed by the adjusted variant, which
# penalises the score for the number of predictors.
r2_svr = r2_score(y_test, y_svr)
n, p = len(y_test), X.shape[1]  # test-sample count, encoded feature count
adj_r2 = 1 - (1 - r2_svr) * (n - 1) / (n - p - 1)

print(f"R squared (r2): {r2_svr:.4f}")
print(f"Adjusted R squared (adj_r2): {adj_r2:.4f}")

Mean Absolute Percentage Error (MAPE): 20.13%
R squared (r2): 0.7955
Adjusted R squared (adj_r2): 0.7894


## Decision Tree



### Training the Decision Tree model on the Training set

In [None]:
from sklearn.tree import DecisionTreeRegressor
# Seed the tree so that tie-breaking between equally good splits is
# repeatable and the reported metrics can be reproduced on a fresh run.
# 42 matches the seed used for the train/test split.
tree_regressor = DecisionTreeRegressor(random_state = 42)
tree_regressor.fit(X_train, y_train)

### Predicting

In [None]:
# Predict on the held-out rows, then print predictions (left column) beside
# the true targets (right column).
y_tree = tree_regressor.predict(X_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_tree.reshape(-1, 1), y_test.reshape(-1, 1)), axis=1))

[[49.1  49.1 ]
 [64.6  69.8 ]
 [61.   61.  ]
 ...
 [67.6  62.9 ]
 [68.55 65.2 ]
 [45.2  45.2 ]]


### Testing accuracy

In [None]:
from sklearn.metrics import r2_score

# Relative error of the decision tree on the test split, in percent.
mape = mean_absolute_percentage_error(y_test, y_tree)
print(f"Mean Absolute Percentage Error (MAPE): {mape:.2f}%")

# R squared on the test split, followed by the adjusted variant, which
# penalises the score for the number of predictors.
r2 = r2_score(y_test, y_tree)
n, p = len(y_test), X.shape[1]  # test-sample count, encoded feature count
adj_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)

print(f"R squared (r2): {r2:.4f}")
print(f"Adjusted R squared (adj_r2): {adj_r2:.4f}")

Mean Absolute Percentage Error (MAPE): 4.06%
R squared (r2): 0.8898
Adjusted R squared (adj_r2): 0.8866


## Random Forest

### Training the Random Forest model on the Training set

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Seed the forest so that bootstrap sampling and feature selection are
# repeatable and the reported metrics can be reproduced on a fresh run.
# 42 matches the seed used for the train/test split.
forest_regressor = RandomForestRegressor(n_estimators = 100, random_state = 42)
forest_regressor.fit(X_train, y_train)

### Predicting

In [None]:
# Predict on the held-out rows, then print predictions (left column) beside
# the true targets (right column).
y_forest = forest_regressor.predict(X_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_forest.reshape(-1, 1), y_test.reshape(-1, 1)), axis=1))

[[46.23 49.1 ]
 [66.46 69.8 ]
 [60.59 61.  ]
 ...
 [65.07 62.9 ]
 [66.76 65.2 ]
 [48.29 45.2 ]]


### Testing accuracy

In [None]:

from sklearn.metrics import r2_score

# R squared of the random forest on the test split, followed by the adjusted
# variant, which penalises the score for the number of predictors.
r2 = r2_score(y_test, y_forest)
n, p = len(y_test), X.shape[1]  # test-sample count, encoded feature count
adj_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)

print(f"R squared (r2): {r2:.4f}")
print(f"Adjusted R squared (adj_r2): {adj_r2:.4f}")

R squared (r2): 0.9258
Adjusted R squared (adj_r2): 0.9236


## XGBoost Model

In [None]:
from xgboost import XGBRegressor
# These hyperparameter values are the same ones listed in the grid-search
# cell further down in this notebook.
XGB_regressor = XGBRegressor(    max_depth = 4 , learning_rate =0.201560733941063 , subsample= 0.8 , n_estimators =558)
XGB_regressor.fit(X_train, y_train)

In [None]:
# Predict on the held-out rows, then print predictions (left column) beside
# the true targets (right column).
y_XGB = XGB_regressor.predict(X_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_XGB.reshape(-1, 1), y_test.reshape(-1, 1)), axis=1))

[[48.56 49.1 ]
 [68.01 69.8 ]
 [62.4  61.  ]
 ...
 [64.41 62.9 ]
 [66.1  65.2 ]
 [44.48 45.2 ]]


In [None]:
# R squared of the XGBoost model on the test split, followed by the adjusted
# variant, which penalises the score for the number of predictors.
# r2_score is expected to be in scope from an earlier cell.
r2 = r2_score(y_test, y_XGB)
n, p = len(y_test), X.shape[1]  # test-sample count, encoded feature count
adj_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)

print(f"R squared (r2): {r2:.4f}")
print(f"Adjusted R squared (adj_r2): {adj_r2:.4f}")

R squared (r2): 0.9757
Adjusted R squared (adj_r2): 0.9750


## CatBoost Model

In [None]:
!pip install catboost

from catboost import CatBoostRegressor

cat_regressor = CatBoostRegressor()
cat_regressor.fit(X_train, y_train)

In [None]:
# Predict on the held-out rows, then print predictions (left column) beside
# the true targets (right column).
y_cat = cat_regressor.predict(X_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_cat.reshape(-1, 1), y_test.reshape(-1, 1)), axis=1))

[[47.85 49.1 ]
 [67.73 69.8 ]
 [61.36 61.  ]
 ...
 [65.24 62.9 ]
 [66.88 65.2 ]
 [47.41 45.2 ]]


In [None]:
# R squared of the CatBoost model on the test split, followed by the adjusted
# variant, which penalises the score for the number of predictors.
# r2_score is expected to be in scope from an earlier cell.
r2 = r2_score(y_test, y_cat)
n, p = len(y_test), X.shape[1]  # test-sample count, encoded feature count
adj_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)

print(f"R squared (r2): {r2:.4f}")
print(f"Adjusted R squared (adj_r2): {adj_r2:.4f}")

R squared (r2): 0.9713
Adjusted R squared (adj_r2): 0.9704


## Validation


In [None]:
# One hand-written, already-encoded input row (16 values), scored by the
# random forest first and by XGBoost second.
sample_row = [[1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 300, 2, 2624, 13000]]
for model in (forest_regressor, XGB_regressor):
    print(model.predict(sample_row))

[41.63]
[42.83]


In [None]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid. Every entry holds a single candidate, so the "search"
# evaluates exactly one configuration with cross-validation.
param_grid = dict(
    max_depth=[4],
    learning_rate=[0.201560733941063],
    subsample=[0.8],
    n_estimators=[558],
)

# Untuned base estimator; the grid supplies the hyperparameters.
xgb_model = xgb.XGBRegressor()

# 5-fold cross-validated search scored by R squared, run on the training split.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=5, scoring='r2')
grid_search.fit(X_train, y_train)

# Report the winning configuration and its mean cross-validated score.
print("Best set of hyperparameters: ", grid_search.best_params_)
print("Best score: ", grid_search.best_score_)


Best set of hyperparameters:  {'learning_rate': 0.201560733941063, 'max_depth': 4, 'n_estimators': 558, 'subsample': 0.8}
Best score:  0.9735142844200235


In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation on the training split. No `scoring` is given, so
# the estimator's own score method is used; for a regressor that is
# presumably R squared rather than a classification accuracy, despite the
# label printed below.
accuracies = cross_val_score(XGB_regressor, X_train, y_train, cv=10)
print(f"Accuracy: {accuracies.mean() * 100:.2f} %")
print(f"Standard Deviation: {accuracies.std() * 100:.2f} %")

Accuracy: 97.45 %
Standard Deviation: 0.71 %
