In [1]:
# Use the PRS dataset to create a dataframe

import pandas as pd
import numpy as np

# Read the prepared dataset from disk and preview its first five rows.
df = pd.read_csv("Final.csv")
df.head()

Unnamed: 0,CUSTOMER_ORDER_ID,SALES_ORG,DISTRIBUTION_CHANNEL,DIVISION,RELEASED_CREDIT_VALUE,PURCHASE_ORDER_TYPE,COMPANY_CODE,ORDER_CREATION_DATE,ORDER_CREATION_TIME,CREDIT_CONTROL_AREA,SOLD_TO_PARTY,ORDER_AMOUNT,REQUESTED_DELIVERY_DATE,ORDER_CURRENCY,CREDIT_STATUS,CUSTOMER_NUMBER
0,946851639,3537,United States of America,South-Region,0,1000,3220,20220101,43012,SR02,756141537,95461,20220113,EUR,,12311807
1,963432061,3449,Martinique,South-Region,0,1000,3220,20220101,43114,NR03,798847812,78736,20220111,EUR,,12311807
2,971991639,3238,Moldova,South-Region,8234202,I200,3260,20220101,110019,NR01,960984659,6749346,20220112,EUR,93.0,12118758
3,754349803,3911,United Arab Emirates,South-Region,147124,N000,3290,20220101,153013,SR02,925857642,140554,20220106,EUR,93.0,1210499770
4,930253442,2381,Greece,South-Region,0,N000,3290,20220101,160020,SR01,947942786,0,20220106,EUR,64.0,1210351400


In [42]:
# Modify the dataset to pass into any type of machine learning models.

In [43]:
# Collect the labels of every integer- or float-typed column and show them.
numerical_columns = (
    df
    .select_dtypes(include=['int', 'float'])
    .columns
)
print("Numerical colums: \n", numerical_columns)

Numerical colums: 
 Index(['CUSTOMER_ORDER_ID', 'SALES_ORG', 'DISTRIBUTION_CHANNEL', 'DIVISION',
       'RELEASED_CREDIT_VALUE', 'PURCHASE_ORDER_TYPE', 'COMPANY_CODE',
       'ORDER_CREATION_TIME', 'CREDIT_CONTROL_AREA', 'SOLD_TO_PARTY',
       'ORDER_AMOUNT', 'ORDER_CURRENCY', 'CREDIT_STATUS', 'CUSTOMER_NUMBER',
       'amount_in_usd', 'unique_cust_id'],
      dtype='object')


In [44]:
# Print the dtype of every column currently in `df`.
print(df.dtypes)

CUSTOMER_ORDER_ID                   int64
SALES_ORG                           int64
DISTRIBUTION_CHANNEL                int32
DIVISION                            int32
RELEASED_CREDIT_VALUE             float64
PURCHASE_ORDER_TYPE                 int32
COMPANY_CODE                        int64
ORDER_CREATION_DATE        datetime64[ns]
ORDER_CREATION_TIME                 int64
CREDIT_CONTROL_AREA                 int32
SOLD_TO_PARTY                       int64
ORDER_AMOUNT                      float64
REQUESTED_DELIVERY_DATE    datetime64[ns]
ORDER_CURRENCY                      int32
CREDIT_STATUS                       int32
CUSTOMER_NUMBER                     int64
amount_in_usd                     float64
unique_cust_id                      int32
dtype: object


In [45]:
# Remove the columns that are not used as model inputs.

irrelevant_columns = [
    'CREDIT_CONTROL_AREA',
    'SOLD_TO_PARTY',
    'CUSTOMER_NUMBER',
    'COMPANY_CODE',
    'ORDER_AMOUNT',
]
df = df.drop(columns=irrelevant_columns)

In [46]:
# Separate the target (`amount_in_usd`) from the predictors:
# y is the dependent variable, X holds every remaining column.

y = df['amount_in_usd']
X = df.drop(columns='amount_in_usd')

In [47]:
# List the feature columns left in X after the target column was removed.
X.columns

Index(['CUSTOMER_ORDER_ID', 'SALES_ORG', 'DISTRIBUTION_CHANNEL', 'DIVISION',
       'RELEASED_CREDIT_VALUE', 'PURCHASE_ORDER_TYPE', 'ORDER_CREATION_DATE',
       'ORDER_CREATION_TIME', 'REQUESTED_DELIVERY_DATE', 'ORDER_CURRENCY',
       'CREDIT_STATUS', 'unique_cust_id'],
      dtype='object')

In [48]:
# One hot Encoding on all the independent features' columns

from sklearn.preprocessing import OneHotEncoder

# One-hot encode every feature column of X.
# handle_unknown='ignore' makes the fitted encoder emit all-zero indicator
# columns for categories it did not see during fit, instead of raising as the
# default ('error') does. The encoding of X itself is unchanged.
# NOTE(review): this also one-hot encodes identifier, date and continuous
# columns (e.g. CUSTOMER_ORDER_ID, ORDER_CREATION_DATE, RELEASED_CREDIT_VALUE),
# giving roughly one column per distinct value. Confirm this is intended.
encoder = OneHotEncoder(handle_unknown='ignore')
x_encoded = encoder.fit_transform(X)

In [49]:
# Split the dataset into training and testing subsets

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x_encoded,
    y,
    test_size=0.2,
    random_state=42,
)

In [50]:
# Standardizing data with StandardScaler() function.

from sklearn.preprocessing import StandardScaler
 
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits. with_mean=False scales without centering.
scaler = StandardScaler(with_mean=False)
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [51]:
# Try different machine learning models like -
#        a. Linear Regression
#        b. Support Vector Machine
#        c. Decision Tree
#        d. Random Forest
#        e. AdaBoost
#        f. Xgboost etc..

In [52]:
# a. Linear Regression

from sklearn.linear_model import LinearRegression

# Fit an ordinary least-squares model on the scaled training split.
linear_regression = LinearRegression()
linear_regression.fit(X=X_train, y=y_train)

LinearRegression()

In [None]:
# from sklearn.linear_model import LinearRegression
# from sklearn.tree import DecisionTreeRegressor
# from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
# from xgboost import XGBRegressor
# from sklearn.metrics import mean_squared_error
# from sklearn.svm import SVR

# linear_regression = LinearRegression()
# decision_tree = DecisionTreeRegressor()
# random_forest = RandomForestRegressor()
# adaboost = AdaBoostRegressor()
# xgboost = XGBRegressor()
# support_vector_machine = SVR()

# models = [linear_regression, decision_tree, random_forest, adaboost, xgboost]
# model_names = ['Linear Regression', 'Decision Tree', 'Random Forest', 'AdaBoost', 'XGBoost']

# for model, name in zip(models, model_names):
#     model.fit(X_train, y_train)

In [None]:
# b. Support Vector Machine

from sklearn.svm import SVR

# Fit a support vector regressor with its default settings on the training split.
support_vector_machine = SVR()
support_vector_machine.fit(X=X_train, y=y_train)

In [None]:
# c. Decision Tree

from sklearn.tree import DecisionTreeRegressor

# Fit a decision tree on the training split.
# random_state is fixed (same seed as the train/test split) so that ties
# between equally good splits are broken the same way on every run and the
# reported metrics are reproducible.
decision_tree = DecisionTreeRegressor(random_state=42)
decision_tree.fit(X_train, y_train)

In [None]:
# d. Random Forest

from sklearn.ensemble import RandomForestRegressor

# Fit a random forest on the training split.
# random_state is fixed (same seed as the train/test split) so the bootstrap
# samples and feature draws, and therefore the metrics, are reproducible.
random_forest = RandomForestRegressor(random_state=42)
random_forest.fit(X_train, y_train)

In [None]:
# e. AdaBoost

from sklearn.ensemble import AdaBoostRegressor

# Fit an AdaBoost regressor on the training split.
# random_state is fixed (same seed as the train/test split) so the boosting
# run, and therefore the metrics, are reproducible.
adaboost = AdaBoostRegressor(random_state=42)
adaboost.fit(X_train, y_train)

In [54]:
# f. Xgboost 

from xgboost import XGBRegressor

# Fit a gradient-boosted tree regressor with default settings on the training split.
xgboost = XGBRegressor()
xgboost.fit(X=X_train, y=y_train)

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=None, num_parallel_tree=None,
             predictor=None, random_state=None, ...)

In [None]:
# Perform Regression model evaluations like MSE, RMSE, R-Square etc.

# from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# models = [linear_regression, decision_tree, random_forest, adaboost, xgboost]
# model_names = ['Linear Regression', 'Decision Tree', 'Random Forest', 'AdaBoost', 'XGBoost']

# for model, name in zip(models, model_names):
#     model.fit(X_train, y_train)
    
#     y_pred = model.predict(X_test)

#     mse = mean_squared_error(y_test, y_pred)

#     rmse = np.sqrt(mse)

#     r2 = r2_score(y_test, y_pred)
    
#     mae = mean_absolute_error(y_test, y_pred)
    
#     print(f'{name}:')
#     print(f'Mean Squared Error (MSE): {mse:.4f}')
#     print(f'Root Mean Squared Error (RMSE): {rmse:.4f}')
#     print(f'R-squared: {r2:.4f}')
#     print(f'Mean Absoluate Error: {mae:.4f}')
#     print()

In [53]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# a. Linear Regression: score the held-out test split

y_pred = linear_regression.predict(X_test)

# The *_lr metrics are kept under their own names for the later model comparison.
mse_lr = mean_squared_error(y_test, y_pred)
rmse_lr = np.sqrt(mse_lr)
r2_lr = r2_score(y_test, y_pred)
mae_lr = mean_absolute_error(y_test, y_pred)

print(
    'Linear Regression:',
    f'Mean Squared Error (MSE): {mse_lr:.4f}',
    f'Root Mean Squared Error (RMSE): {rmse_lr:.4f}',
    f'R-squared: {r2_lr:.4f}',
    f'Mean Absoluate Error: {mae_lr:.4f}',
    sep='\n',
)

Linear Regression:
Mean Squared Error (MSE): 4.9678
Root Mean Squared Error (RMSE): 2.2288
R-squared: 0.4698
Mean Absoluate Error: 1.7068


In [None]:
# b. Support Vector Machine model evaluations
# (this heading must be a comment; as bare text it made the cell a SyntaxError)

# Predict on the held-out test split and compute the regression metrics.
y_pred = support_vector_machine.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE is in the same units as the target
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print('Support Vector Machine:')
print(f'Mean Squared Error (MSE): {mse:.4f}')
print(f'Root Mean Squared Error (RMSE): {rmse:.4f}')
print(f'R-squared: {r2:.4f}')
print(f'Mean Absoluate Error: {mae:.4f}')

In [None]:
# c. Decision Tree: score the held-out test split

y_pred = decision_tree.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print(
    'Decision Tree:',
    f'Mean Squared Error (MSE): {mse:.4f}',
    f'Root Mean Squared Error (RMSE): {rmse:.4f}',
    f'R-squared: {r2:.4f}',
    f'Mean Absoluate Error: {mae:.4f}',
    sep='\n',
)

In [None]:
# d. Random Forest: score the held-out test split

y_pred = random_forest.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print(
    'Random Forest:',
    f'Mean Squared Error (MSE): {mse:.4f}',
    f'Root Mean Squared Error (RMSE): {rmse:.4f}',
    f'R-squared: {r2:.4f}',
    f'Mean Absoluate Error: {mae:.4f}',
    sep='\n',
)

In [None]:
# e. AdaBoost: score the held-out test split

y_pred = adaboost.predict(X_test)

# The *_ada metrics are kept under their own names for the later model comparison.
mse_ada = mean_squared_error(y_test, y_pred)
rmse_ada = np.sqrt(mse_ada)
r2_ada = r2_score(y_test, y_pred)
mae_ada = mean_absolute_error(y_test, y_pred)

print(
    'AdaBoost Prediction:',
    f'Mean Squared Error (MSE): {mse_ada:.4f}',
    f'Root Mean Squared Error (RMSE): {rmse_ada:.4f}',
    f'R-squared: {r2_ada:.4f}',
    f'Mean Absoluate Error: {mae_ada:.4f}',
    sep='\n',
)

In [55]:
# f. XGBoost: score the held-out test split

y_pred = xgboost.predict(X_test)

# The *_xg metrics are kept under their own names for the later model comparison.
mse_xg = mean_squared_error(y_test, y_pred)
rmse_xg = np.sqrt(mse_xg)
r2_xg = r2_score(y_test, y_pred)
mae_xg = mean_absolute_error(y_test, y_pred)

print(
    'XGBoost Prediction:',
    f'Mean Squared Error (MSE): {mse_xg:.4f}',
    f'Root Mean Squared Error (RMSE): {rmse_xg:.4f}',
    f'R-squared: {r2_xg:.4f}',
    f'Mean Absoluate Error: {mae_xg:.4f}',
    sep='\n',
)

XGBoost Prediction:
Mean Squared Error (MSE): 3.0818
Root Mean Squared Error (RMSE): 1.7555
R-squared: 0.6711
Mean Absoluate Error: 1.2288


In [None]:
# Compare the accuracies of all the models

In [None]:
# Gather the per-model test metrics computed above into one table, one row per model.
labels = ['Linear Regression', 'AdaBoost', 'XGBoost']
mse_values = [mse_lr, mse_ada, mse_xg]
rmse_values = [rmse_lr, rmse_ada, rmse_xg]
r2_values = [r2_lr, r2_ada, r2_xg]

data = {'Model': labels, 'MSE': mse_values, 'RMSE': rmse_values, 'R-Squared': r2_values}

# Index the table by model name.
df_info = pd.DataFrame(data).set_index('Model')
df_info

In [None]:
from sklearn.metrics import r2_score

# models = [linear_regression, decision_tree, random_forest, adaboost, xgboost, support_vector_machine]
# model_names = ['Linear Regression', 'Decision Tree', 'Random Forest', 'AdaBoost', 'XGBoost', 'Support Vector Machine']

# `plt` is not imported anywhere in the visible notebook, so import it here to
# keep this cell runnable after a kernel restart.
import matplotlib.pyplot as plt

models = [linear_regression, adaboost, xgboost]
model_names = ['Linear Regression', 'AdaBoost', 'XGBoost']

# Bar chart of the test-set R-squared of each model; `r2_values` comes from the
# comparison-table cell and is ordered like `model_names`.
plt.bar(model_names, r2_values)
plt.xlabel('Model')
plt.ylabel('R-squared')
plt.title('Comparison of Model Accuracies')
plt.show()

In [None]:
# Select the best possible model

The regression models above can be compared on their performance metrics: R-squared, mean squared error (MSE),
and root mean squared error (RMSE). Generally, a higher R-squared value and a lower MSE/RMSE indicate better
model performance.

To select the best model, compare these metrics across the models and choose the one that achieves
the highest R-squared value and the lowest MSE/RMSE.

In [None]:
from sklearn.model_selection import cross_val_score

# Candidate models keyed by display name (insertion order is preserved).
models = dict(
    zip(
        ['Linear Regression', 'AdaBoost', 'XGBoost'],
        [linear_regression, adaboost, xgboost],
    )
)

# One {model name: mean cross-validated score} mapping per metric.
cv_rmse_scores, cv_mse_scores, cv_r2_scores, cv_mae_scores = {}, {}, {}, {}

In [None]:
from sklearn.model_selection import cross_validate

# Score every metric from a single 3-fold cross-validation run per model.
# Calling cross_val_score once per metric refits each model on every fold each
# time, and for stochastic models the MSE and RMSE would come from different fits.
cv_scoring = ['neg_mean_squared_error', 'r2', 'neg_mean_absolute_error']

for model_name, model in models.items():
    cv_results = cross_validate(model, x_encoded, y, cv=3, scoring=cv_scoring)

    # scikit-learn reports error metrics negated ("higher is better"); flip the sign.
    cv_mse = -cv_results['test_neg_mean_squared_error']
    cv_mse_scores[model_name] = cv_mse.mean()

    # RMSE is computed per fold and then averaged.
    cv_rmse_scores[model_name] = np.sqrt(cv_mse).mean()

    cv_r2_scores[model_name] = cv_results['test_r2'].mean()

    # Fill the MAE mapping, which is created alongside the others but was never populated.
    cv_mae_scores[model_name] = (-cv_results['test_neg_mean_absolute_error']).mean()

In [None]:
# Pick the winning model name per metric: smallest error, largest R2.
best_model_rmse = min(cv_rmse_scores, key=lambda name: cv_rmse_scores[name])
best_model_mse = min(cv_mse_scores, key=lambda name: cv_mse_scores[name])
best_model_r2 = max(cv_r2_scores, key=lambda name: cv_r2_scores[name])

# Raw cross-validated scores first, then the per-metric winners.
print(f"Cross-Validation Scores:\nRMSE: {cv_rmse_scores}\nMSE: {cv_mse_scores}\nR2: {cv_r2_scores}")

print(f"Best model based on \nRMSE: {best_model_rmse}\n MSE: {best_model_mse}\n R2: {best_model_r2}")

In [None]:
# Perform Hyperparameter tuning, select best hyperparameters by using appropriate algorithms.


In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint

# Search space: number of trees drawn from randint(50, 200) and learning rate
# from uniform(loc=0.0001, scale=0.1).
param_dist_xg = {'n_estimators': randint(50, 200), 'learning_rate': uniform(0.0001, 0.1)}

# random_state fixes which 10 candidates are sampled, so the search (and the
# best parameters it reports) is repeatable; 42 matches the train/test split seed.
random_search_xg = RandomizedSearchCV(
    xgboost,
    param_dist_xg,
    cv=5,
    scoring='neg_mean_squared_error',
    n_iter=10,
    random_state=42,
)

random_search_xg.fit(X_train, y_train)

In [None]:
# Keep the best parameter combination found by the randomized search.
best_params_xg = random_search_xg.best_params_

In [None]:
from sklearn.model_selection import GridSearchCV

def hyperparam(model, X_train, y_train, X_test, y_test, param_grid=None, scoring=None, cv=5):
    """Grid-search a model's hyperparameters and score the refit winner on the test split.

    Parameters
    ----------
    model : estimator
        Estimator passed to ``GridSearchCV``.
    X_train, y_train
        Data the grid search is fitted on.
    X_test, y_test
        Held-out data used for the final MSE / RMSE / R2.
    param_grid : dict, optional
        Grid to search. Defaults to ``n_estimators`` from 10 to 100 in steps of 10.
    scoring : optional
        Forwarded to ``GridSearchCV``; ``None`` uses the estimator's own ``score`` method.
    cv : int, optional
        Number of cross-validation folds (default 5).

    Returns
    -------
    tuple
        ``(best_estimator, mse, rmse, r2)``, where the metrics are computed on the test split.
    """
    if param_grid is None:
        param_grid = {'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]}

    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring=scoring,
        cv=cv,
        n_jobs=-1,
        verbose=5,
    )
    grid_search.fit(X_train, y_train)

    # Report what the search selected.
    print(grid_search.best_params_)
    print(grid_search.best_score_)
    print(grid_search.best_estimator_)

    # Predicting through the search object uses the refit best estimator.
    y_pred = grid_search.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    print('MSE: ', mse)
    print('RMSE: ', rmse)
    print('R2: ', r2)
    return grid_search.best_estimator_, mse, rmse, r2

hyperparam(xgboost, X_train, y_train, X_test, y_test)

In [None]:
# Come up with the best possible model accuracy.

In [None]:
# Score the tuned XGBoost model from the randomized search on the test split.
# The predictions are recomputed here: reusing the global `y_pred` would report
# whichever model an earlier cell last predicted with, not the tuned one.
y_pred = random_search_xg.best_estimator_.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
print('XGBoost:')
print(f'Mean Squared Error (MSE): {mse:.4f}')
print(f'Root Mean Squared Error (RMSE): {rmse:.4f}')
print(f'R-squared: {r2:.4f}')