# Operations Perfomed:
> 1. Reading in the Upper_Secondary school data
> 2. Rescaling the data to prevent data redundancy and to get optimal model performance
> 3. Hypertuning of models' performance for optimal model perfomance
> 4. Model building on the training datasets, evaluation on the testing sets using various regression techniques such as: Linear, Ridge, Lasso, and Gradient boost regression
> 5. Evaluation of models using various metrics in regression such as mean_absolute_error, mean_squared_error root_mean_squared_error, r2_score, residual_sum of squares to evaluate model performance
> 6. Performed comparison on the model between the actual values and the predicted values




In [1]:
import pandas as pd
import pickle
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
import warnings
warnings.filterwarnings('ignore')


In [89]:
df= pd.read_csv(r'Scaled_Upper_Secondary_edu.csv')

In [90]:
df.head(3)

Unnamed: 0,Year,Completion Rate for upper secondary Edu,Childhood Education GER,"Gross enrolment ratio, early childhood educational development programmes",Literacy rate for 25-64 years old,Expenditure on education as a percentage of total government expenditure (%),Government expenditure on education as a percentage of GDP (%),Central Asia,Central and Southern Asia,Eastern and South-Eastern Asia,Europe and Northern America,Latin America and the Caribbean,Northern Africa and Western Asia,Oceania,Southern Asia,Sub-Saharan Africa,Gender_numerical
0,2012,96.9,47.348558,5.871662,68.735,19.25,3.9,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,2012,96.2,47.348558,5.871662,68.735,19.25,3.9,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,2013,97.0,47.348558,5.871662,68.735,17.96,3.44,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [91]:
df.rename(columns = {'Childhood Education GER':'Childhood_Education_GER', 'Gross enrolment ratio, early childhood educational development programmes':'Gross_enrolment_ratio_early_childhood_educational_development_programmes',
                    'Literacy rate for 25-64 years old':'Literacy_rate_for_25_64_years_old', 'Expenditure on education as a percentage of total government expenditure (%)':'Expenditure_on_education_as_a_percentage_of_total_government_expenditure',
                    'Government expenditure on education as a percentage of GDP (%)':'Government_expenditure_on_education_as_a_percentage_of_GDP',
                    'Central Asia':'Central_Asia', 'Central and Southern Asia':'Central_and_Southern_Asia', 'Eastern and South-Eastern Asia':'Eastern_and_South_Eastern_Asia',
                    'Europe and Northern America':'Europe_and_Northern_America', 'Latin America and the Caribbean':'Latin_America_and_the_Caribbean',
                    'Northern Africa and Western Asia':'Northern_Africa_and_Western_Asia', 'Oceania':'Oceania', 'Southern Asia': 'Southern_Asia', 'Sub-Saharan_Africa':'Sub_Saharan_Africa','Gender_numerical':'Gender_numerical'}, inplace = True)

In [55]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1876 entries, 0 to 1875
Data columns (total 17 columns):
 #   Column                                                                    Non-Null Count  Dtype  
---  ------                                                                    --------------  -----  
 0   Year                                                                      1876 non-null   int64  
 1   Completion Rate for upper secondary Edu                                   1876 non-null   float64
 2   Childhood_Education_GER                                                   1876 non-null   float64
 3   Gross_enrolment_ratio_early_childhood_educational_development_programmes  1876 non-null   float64
 4   Literacy_rate_for_25_64_years_old                                         1876 non-null   float64
 5   Expenditure_on_education_as_a_percentage_of_total_government_expenditure  1876 non-null   float64
 6   Government_expenditure_on_education_as_a_percentage_of_GDP      

## Data Transformation

#### 1. Data Rescaling

In [92]:
# Data Rescaling

data_col = df['Year']
df.set_index('Year',inplace =True)

df.head()


Unnamed: 0_level_0,Completion Rate for upper secondary Edu,Childhood_Education_GER,Gross_enrolment_ratio_early_childhood_educational_development_programmes,Literacy_rate_for_25_64_years_old,Expenditure_on_education_as_a_percentage_of_total_government_expenditure,Government_expenditure_on_education_as_a_percentage_of_GDP,Central_Asia,Central_and_Southern_Asia,Eastern_and_South_Eastern_Asia,Europe_and_Northern_America,Latin_America_and_the_Caribbean,Northern_Africa_and_Western_Asia,Oceania,Southern_Asia,Sub-Saharan Africa,Gender_numerical
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2012,96.9,47.348558,5.871662,68.735,19.25,3.9,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2012,96.2,47.348558,5.871662,68.735,19.25,3.9,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2013,97.0,47.348558,5.871662,68.735,17.96,3.44,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2013,96.4,47.348558,5.871662,68.735,17.96,3.44,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2014,97.2,36.65,0.0,68.735,17.32,3.42,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [93]:
scaler = MinMaxScaler()
normalized_df = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
normalized_df['Year'] = data_col
normalized_df.set_index('Year', inplace =True)
normalized_df.head(3)

Unnamed: 0_level_0,Completion Rate for upper secondary Edu,Childhood_Education_GER,Gross_enrolment_ratio_early_childhood_educational_development_programmes,Literacy_rate_for_25_64_years_old,Expenditure_on_education_as_a_percentage_of_total_government_expenditure,Government_expenditure_on_education_as_a_percentage_of_GDP,Central_Asia,Central_and_Southern_Asia,Eastern_and_South_Eastern_Asia,Europe_and_Northern_America,Latin_America_and_the_Caribbean,Northern_Africa_and_Western_Asia,Oceania,Southern_Asia,Sub-Saharan Africa,Gender_numerical
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2012,0.985625,0.291174,0.062772,0.657782,0.482729,0.286765,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2012,0.978437,0.291174,0.062772,0.657782,0.482729,0.286765,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2013,0.986652,0.291174,0.062772,0.657782,0.446206,0.252941,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [94]:
normalized_df.corr().head(3)

Unnamed: 0,Completion Rate for upper secondary Edu,Childhood_Education_GER,Gross_enrolment_ratio_early_childhood_educational_development_programmes,Literacy_rate_for_25_64_years_old,Expenditure_on_education_as_a_percentage_of_total_government_expenditure,Government_expenditure_on_education_as_a_percentage_of_GDP,Central_Asia,Central_and_Southern_Asia,Eastern_and_South_Eastern_Asia,Europe_and_Northern_America,Latin_America_and_the_Caribbean,Northern_Africa_and_Western_Asia,Oceania,Southern_Asia,Sub-Saharan Africa,Gender_numerical
Completion Rate for upper secondary Edu,1.0,0.232376,0.350079,0.31273,-0.05501,0.100356,0.250122,0.046776,0.0649,0.45088,0.176922,0.088123,0.003803,-0.119431,-0.601039,0.031859
Childhood_Education_GER,0.232376,1.0,0.298705,0.118609,-0.182103,-0.017314,-0.064456,0.087568,0.157829,0.185646,0.024161,-0.050692,0.053348,0.149808,-0.346218,0.001945
Gross_enrolment_ratio_early_childhood_educational_development_programmes,0.350079,0.298705,1.0,0.122515,0.00237,-0.017604,-0.008424,-0.093573,0.091822,0.362286,0.14447,-0.110872,-0.091845,-0.105898,-0.195579,0.000438


In [95]:
# spliting the dataset into training and testing set
x = normalized_df.drop('Completion Rate for upper secondary Edu', axis = 1)
y = normalized_df['Completion Rate for upper secondary Edu']


# making the train set to be 70% of the entire dataset.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state=42)


### Parameters hypertuning

In [96]:
def parameters_hypertuning(model, parameters = {}):
  gs = GridSearchCV(model, parameters)
  gs.fit(x_train, y_train)
  hypertuned_parameters = gs.best_params_

  return hypertuned_parameters


In [119]:
#hypertuning of paramters for the linear regression model
parameters_hypertuning(linear_model, {'n_jobs': range(0, 50, 5)})

{'n_jobs': 0}

In [120]:
#hypertuning of paramters for the ridge regression model

parameters_hypertuning(ridge_model,
 {'alpha': np.arange(1.0, 20.0),
  'max_iter': range(0, 1500, 50)
  })

{'alpha': 1.0, 'max_iter': 50}

In [121]:
#hypertuning of paramters for the lasso regression model
parameters_hypertuning(lasso_model, {'alpha': np.arange(1.0, 20.0),
  'max_iter': range(0, 1500, 50)})

{'alpha': 1.0, 'max_iter': 50}

In [87]:
#hypertuning of paramters for the Gradient boosting regression model
parameters_hypertuning(Gbr, {'loss': ['squared_error', 'absolute_error', 'huber', 'quantile'],
                             'learning_rate': np.arange(1.0, 20.0, 2.0),
                             'n_estimators': range(0, 50, 5)})

{'learning_rate': 1.0, 'loss': 'squared_error', 'n_estimators': 40}

## Model Training, Testing and evaluation


 Because the target variable been 'Completion rate for primary school Edu' is a continoues data and not categorical. therfore, the model algorithm to be use is the regression and not classifiers.

#### 1. Linear Regression


In [97]:

import time

# Record start time
start_time = time.time()

# Create a Linear Regression model
linear_model = LinearRegression()

# Fit the model to the training dataset
linear_model.fit(x_train, y_train)

# Record end time
end_time = time.time()

# Calculate elapsed time
elapsed_time = end_time - start_time

# Display training time
print("Training took {:.2f} seconds.".format(elapsed_time))





Training took 0.03 seconds.


In [98]:
# Obtain predictions
predicted_values = linear_model.predict(x_test)

#Measuring regression performance
MAE = mean_absolute_error(y_test, predicted_values)
MSE = mean_squared_error(y_test, predicted_values)
RMSE =np.sqrt(mean_squared_error(y_test, predicted_values))
RSS =np.sum(np.square(y_test - predicted_values))
r2=r2_score(y_test, predicted_values)




In [63]:
# getting the linear regression evaluation results

results = pd.DataFrame([['Linear regression', MAE, MSE, RMSE, RSS,r2]],
               columns = ['Model', 'MAE', 'MSE', 'RMSE', 'RSS','r2_score'])
results

Unnamed: 0,Model,MAE,MSE,RMSE,RSS,r2_score
0,Linear regression,0.153095,0.038898,0.197225,21.899426,0.559145


In [99]:
lreg_df = pd.DataFrame(data ={"actual_values": y_test * 100, "predictions":predicted_values * 100})
lreg_df


Unnamed: 0_level_0,actual_values,predictions
Year,Unnamed: 1_level_1,Unnamed: 2_level_1
2018,19.601602,48.437500
2012,16.315844,17.773438
2019,84.289968,88.867188
2018,15.289044,51.757812
2014,33.771434,28.906250
...,...,...
2013,9.538967,22.851562
2013,50.508266,51.367188
2018,65.766506,53.613281
2015,18.061403,54.687500


## Ridge Regression

In [100]:
# ridge regression
start_time = time.time()

ridge_model = Ridge(alpha = 1.0, max_iter = 50)
ridge_model.fit(x_train, y_train)
# Record end time
end_time = time.time()

# Calculate elapsed time
elapsed_time = end_time - start_time

# Display training time
print("Training took {:.2f} seconds.".format(elapsed_time))


Training took 0.02 seconds.


In [101]:
y_preds = ridge_model.predict(x_test)

In [102]:
# getting the ridge model evaluation results
mae= mean_absolute_error(y_test, y_preds)
mse= mean_squared_error(y_test, y_preds)
rmse=np.sqrt(mean_squared_error(y_test, y_preds))
rss=np.sum(np.square(y_test - y_preds))
r_squared=r2_score(y_test, y_preds)

In [103]:
model_results = pd.DataFrame([['Ridge regression', mae, mse, rmse, rss,r_squared]],
               columns = ['Model', 'MAE', 'MSE', 'RMSE', 'RSS','r2_score'])
results = pd.concat([results, model_results], ignore_index=True)
model_results

Unnamed: 0,Model,MAE,MSE,RMSE,RSS,r2_score
0,Ridge regression,0.153265,0.038952,0.197364,21.930257,0.558524


In [104]:
ridg_df = pd.DataFrame(data ={"actual_values": y_test * 100, "predictions":predicted_values * 100})
ridg_df


Unnamed: 0_level_0,actual_values,predictions
Year,Unnamed: 1_level_1,Unnamed: 2_level_1
2018,19.601602,48.437500
2012,16.315844,17.773438
2019,84.289968,88.867188
2018,15.289044,51.757812
2014,33.771434,28.906250
...,...,...
2013,9.538967,22.851562
2013,50.508266,51.367188
2018,65.766506,53.613281
2015,18.061403,54.687500


#### Lasso regression

In [105]:

# Record start time
start_time = time.time()

lasso_model = Lasso(alpha = 1.0, max_iter = 50)

lasso_model.fit(x_train, y_train)

# Record end time
end_time = time.time()

# Calculate elapsed time
elapsed_time = end_time - start_time

# Display training time
print("Training took {:.2f} seconds.".format(elapsed_time))




Training took 0.01 seconds.


In [106]:
lasso_predictions  = lasso_model.predict(x_test)

In [107]:
print(lasso_predictions.shape, y_test.shape)

(563,) (563,)


In [108]:
# getting the lasso model evaluation results
lasso_r2 = r2_score(y_test, lasso_predictions)
lasso_mse = mean_squared_error( y_test,lasso_predictions)
lasso_rmse = np.sqrt(lasso_mse)
lasso_mae = mean_absolute_error(y_test,lasso_predictions)
lasso_rss = np.sum(np.square(y_test - lasso_predictions))

In [109]:
model_results = pd.DataFrame([['lasso regression', lasso_mae, lasso_mse, lasso_rmse, lasso_rss, lasso_r2]],
               columns = ['Model', 'MAE', 'MSE', 'RMSE', 'RSS','r2_score'])
results = pd.concat([results, model_results], ignore_index=True)

In [110]:
model_results

Unnamed: 0,Model,MAE,MSE,RMSE,RSS,r2_score
0,lasso regression,0.263145,0.088245,0.297061,49.682155,-0.000147


In [111]:
lasso_df = pd.DataFrame(data ={"actual_values": y_test * 100, "predictions":lasso_predictions * 100})
lasso_df.sort_index(ascending=True, inplace=True)
lasso_df

Unnamed: 0_level_0,actual_values,predictions
Year,Unnamed: 1_level_1,Unnamed: 2_level_1
2012,11.181846,47.193455
2012,51.227025,47.193455
2012,98.562481,47.193455
2012,48.742171,47.193455
2012,29.561557,47.193455
...,...,...
2021,88.048054,47.193455
2021,88.725742,47.193455
2021,54.307424,47.193455
2021,73.518842,47.193455


#### GradientBoostingRegressor

In [112]:
from sklearn.ensemble import GradientBoostingRegressor


# Record start time
start_time = time.time()

Gbr = GradientBoostingRegressor(learning_rate= 1.0, loss= 'squared_error', n_estimators= 40)
Gbr.fit(x_train,y_train)

# Record end time
end_time = time.time()

# Calculate elapsed time
elapsed_time = end_time - start_time

# Display training time
print("Training took {:.2f} seconds.".format(elapsed_time))



Training took 0.42 seconds.


In [113]:
Gbr_predictions  = Gbr.predict(x_test)

In [114]:
# getting the Gradient boosting Model evaluation results
Gbr_r2 = r2_score(y_test, Gbr_predictions)
Gbr_mse = mean_squared_error( y_test,Gbr_predictions)
Gbr_rmse = np.sqrt(Gbr_mse)
Gbr_mae = mean_absolute_error(y_test,Gbr_predictions)
Gbr_rss = np.sum(np.square(y_test - Gbr_predictions))

In [115]:
model_results = pd.DataFrame([['GBR regression', Gbr_mae, Gbr_mse, Gbr_rmse, Gbr_rss, Gbr_r2]],
               columns = ['Model', 'MAE', 'MSE', 'RMSE', 'RSS','r2_score'])
results = pd.concat([results, model_results], ignore_index=True)

In [116]:
model_results

Unnamed: 0,Model,MAE,MSE,RMSE,RSS,r2_score
0,GBR regression,0.090996,0.018369,0.135531,10.341571,0.791815


In [117]:
Gbr_df = pd.DataFrame(data ={"actual_values": y_test * 100, "predictions":lasso_predictions * 100})
Gbr_df.sort_index(ascending=True, inplace=True)
Gbr_df

Unnamed: 0_level_0,actual_values,predictions
Year,Unnamed: 1_level_1,Unnamed: 2_level_1
2012,11.181846,47.193455
2012,51.227025,47.193455
2012,98.562481,47.193455
2012,48.742171,47.193455
2012,29.561557,47.193455
...,...,...
2021,88.048054,47.193455
2021,88.725742,47.193455
2021,54.307424,47.193455
2021,73.518842,47.193455


In [118]:
# Printing the stored results of all the three models
results.drop_duplicates().head(4)


Unnamed: 0,Model,MAE,MSE,RMSE,RSS,r2_score
0,Linear regression,0.153095,0.038898,0.197225,21.899426,0.559145
1,Ridge regression,0.153265,0.038952,0.197364,21.930257,0.558524
2,lasso regression,0.263145,0.088245,0.297061,49.682155,-0.000147
3,GBR regression,0.09107,0.018378,0.135566,10.34684,0.791709


### conclusion

> Based on the results presented above, it is evident that the Ridge regression model exhibits the most favorable performance. This conclusion is drawn from the fact that the model achieves a mean_absolute_error of 0.1532 which is slightly lower than the linear regression having a mean_absolute error value of 0.1539. This value approaching zero signifies a strong indication of optimal performance within the context of regression modeling.






In [122]:
# creating pickle file for deployment
file = open('pickledUpper_Gbr.pkl', 'wb')
pickle.dump(Gbr, file)