In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from sklearn.model_selection import train_test_split


# Preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder


# Models
import xgboost as xgb

from sklearn.metrics import mean_absolute_error, mean_squared_error

In [2]:
# Load the dataset
saudi_pro_dataset = pd.read_csv("dataset.csv")

In [3]:
# Handling missing values
saudi_pro_dataset = saudi_pro_dataset.drop(['Unnamed: 0'],axis=1)
cols = ['region_project']
saudi_pro_dataset[cols] = saudi_pro_dataset[cols].fillna(saudi_pro_dataset.mode().iloc[0])
mean = ['project_area']
saudi_pro_dataset[mean] = saudi_pro_dataset[mean].fillna(saudi_pro_dataset.mean().iloc[0])
saudi_pro_dataset = saudi_pro_dataset.dropna(subset=['budget_project'])
saudi_pro_dataset = saudi_pro_dataset[saudi_pro_dataset['budget_project'] <= 86537642000]

  saudi_pro_dataset[mean] = saudi_pro_dataset[mean].fillna(saudi_pro_dataset.mean().iloc[0])


In [4]:
# Split the dataset 

train_saudiPro, test_saudiPro = train_test_split(
    saudi_pro_dataset,
    test_size=0.2,
    train_size = 0.8,
    random_state=9000
)

In [5]:
X_train_saudiPro = saudi_pro_dataset.drop(['budget_project','end_month','end_year','enddate_project','start_month', 'start_year','startday_project','duration_project'],axis = 1)
y_train_saudiPro = saudi_pro_dataset['budget_project']

X_test_saudiPro = saudi_pro_dataset.drop(['budget_project','end_month','end_year','enddate_project','start_month', 'start_year','startday_project','duration_project'],axis = 1)
y_test_saudiPro = saudi_pro_dataset['budget_project']

X_train_saudiPro.shape

(835, 6)

In [6]:
# Define a function to calculate the cost for each model
def reg_cost(method, actual, preds):
    mae = mean_absolute_error(y_true=actual, y_pred=preds)
    mse = mean_squared_error(y_true=actual, y_pred=preds)
    
    print(f'Cost functions for the {method} regression is:')
    print(f'Mean Square Error: {round(mse,2)}')
    print(f'Mean Absolute Error: {round(mae,2)}\n\n')

In [7]:
#ordinal encoder
ordinal_saudiPro = OrdinalEncoder()

col_names_saudiPro = ["sectors","type_project","region_project","status_project"]

## Ordinal encode the column
ordinal_ls_saudiPro = ordinal_saudiPro.fit_transform(X_train_saudiPro[col_names_saudiPro])
ordinal_ls_val_saudiPro = ordinal_saudiPro.transform(X_test_saudiPro[col_names_saudiPro])

In [8]:
ordinal_saudiPro.categories_

[array([' Charity', ' Commercial', ' Educational', ' Governmental',
        ' Health', ' Industrial', ' Residential', ' Scientific', ' Sports',
        ' Tourist'], dtype=object),
 array(['Charity', 'Charity, Health', 'Charity, Residential', 'Commercial',
        'Commercial, Educational',
        'Commercial, Educational, Charity, Residential, Health',
        'Commercial, Educational, Governmental, Charity, Residential, Health',
        'Commercial, Educational, Health',
        'Commercial, Educational, Residential', 'Commercial, Governmental',
        'Commercial, Health', 'Commercial, Industrial',
        'Commercial, Residential', 'Commercial, Tourist',
        'Commercial, Tourist, Governmental, Sports',
        'Commercial, Tourist, Residential', 'Educational',
        'Educational, Charity', 'Educational, Charity, Sports',
        'Educational, Governmental', 'Educational, Governmental, Health',
        'Educational, Governmental, Residential', 'Educational, Health',
        '

In [9]:
X_train_saudiPro[col_names_saudiPro] = ordinal_ls_saudiPro
X_test_saudiPro[col_names_saudiPro] = ordinal_ls_val_saudiPro

X_train_saudiPro.sample(40)

Unnamed: 0,sectors,sector_budgets,type_project,project_area,region_project,status_project
1058,3.0,1436611825301,33.0,919853200000.0,48.0,6.0
1731,6.0,709916354518,77.0,1999999.0,33.0,3.0
1092,3.0,1436611825301,6.0,919853200000.0,36.0,3.0
2103,4.0,549859453510,29.0,2000000.0,49.0,6.0
827,2.0,567443636867,16.0,919853200000.0,48.0,2.0
605,9.0,672120928423,13.0,180000.0,30.0,6.0
1079,3.0,1436611825301,77.0,999774.0,0.0,3.0
1372,3.0,1436611825301,58.0,60000.0,40.0,3.0
1263,3.0,1436611825301,76.0,440000000.0,55.0,6.0
421,1.0,1058790791316,50.0,919853200000.0,23.0,0.0


In [10]:
X_train_saudiPro.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 835 entries, 0 to 2176
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   sectors         835 non-null    float64
 1   sector_budgets  835 non-null    int64  
 2   type_project    835 non-null    float64
 3   project_area    835 non-null    float64
 4   region_project  835 non-null    float64
 5   status_project  835 non-null    float64
dtypes: float64(5), int64(1)
memory usage: 45.7 KB


In [11]:
X_train_saudiPro.shape

(835, 6)

In [12]:
scaler_saudiPro = StandardScaler()

## Fit & transform data.
X_train_saudiPro_sc = scaler_saudiPro.fit_transform(X_train_saudiPro)
X_test_saudiPro_sc = scaler_saudiPro.transform(X_test_saudiPro)

In [13]:
# Random Forest Regressor
from sklearn.ensemble import RandomForestRegressor

reg_forest = RandomForestRegressor(n_estimators = 10, random_state = 0, criterion = 'squared_error')
reg_forest.fit(X_train_saudiPro_sc, y_train_saudiPro)

preds_forest = reg_forest.predict(X_test_saudiPro_sc)
mean_absolute_error(y_true=y_test_saudiPro, y_pred=preds_forest)

991374043.9535304

In [14]:
# Save the model
import pickle
filename_RF = 'saudi_projects_regression_RF.pkl'
pickle.dump(reg_forest, open(filename_RF, 'wb'))

In [15]:
# load the model from disk
loaded_model = pickle.load(open(filename_RF, 'rb'))
result_saudiPro = loaded_model.score(X_test_saudiPro_sc, y_test_saudiPro)
print(result_saudiPro)

0.9104971863733573


In [20]:
X_test_saudiPro_sc

array([[-1.27972782,  0.27808702, -1.48840716, -0.70657652, -1.83024486,
         1.38025705],
       [-1.27972782,  0.27808702, -1.16756903, -0.70657649,  0.98492705,
        -0.70009428],
       [-1.27972782,  0.27808702, -0.20505463, -0.70657653,  0.02520935,
         1.38025705],
       ...,
       [-0.05336279, -1.06860613,  0.36532427, -0.70657285,  0.72900233,
        -0.18000645],
       [-0.05336279, -1.06860613,  0.22272955, -0.70656546,  0.72900233,
         1.38025705],
       [-0.05336279, -1.06860613, -1.23886639, -0.70657647, -1.0624707 ,
        -0.18000645]])

In [16]:
#################################################

In [17]:
# SVR
from sklearn.svm import SVR
reg_svr = SVR(kernel = 'linear')
reg_svr.fit(X_train_saudiPro_sc, y_train_saudiPro)

preds_svr = reg_svr.predict(X_test_saudiPro_sc)
mean_absolute_error(y_true=y_test_saudiPro, y_pred=preds_svr)

3085830004.047136

In [18]:
# Save the model
import pickle
filename_SVM = 'saudi_projects_regression_SVM.pkl'
pickle.dump(reg_svr, open(filename_SVM, 'wb'))

In [19]:
# load the model from disk
loaded_model = pickle.load(open(filename_SVM, 'rb'))
result_saudiPro = loaded_model.score(X_test_saudiPro_sc, y_test_saudiPro)
print(result_saudiPro)

-0.10157225065668163
