# Operations Performed:
> 1. Reading in the primary school data
> 2. Rescaling the data so that all columns share a common range, to get optimal model performance
> 3. Hyperparameter tuning of the models for optimal model performance
> 4. Model building on the training set and evaluation on the test set using various regression techniques: Linear, Ridge, Lasso, and Gradient Boosting regression
> 5. Evaluation of the models using regression metrics such as mean_absolute_error, mean_squared_error, root_mean_squared_error, r2_score, and residual sum of squares
> 6. Comparison of each model's predicted values against the actual values




In [None]:
import pandas as pd
import pickle
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
import warnings
warnings.filterwarnings('ignore')


In [None]:
# Load the primary-school dataset into a DataFrame.
# NOTE(review): the path is absolute and the file name ends in "csv_" — confirm it matches the actual file.
df= pd.read_csv('/content/Scaled_Primary.csv_')

In [None]:
# Preview the first three rows of the loaded data.
df.head(3)

Unnamed: 0,Year,Completion Rate for Primary Edu,Childhood Education GER,"Gross enrolment ratio, early childhood educational development programmes,","Gross intake ratio to the last grade of primary education,",Literacy rate for 25-64 years old,Expenditure on education as a percentage of total government expenditure (%),Government expenditure on education as a percentage of GDP (%),Central Asia,Central and Southern Asia,Eastern and South-Eastern Asia,Europe and Northern America,Latin America and the Caribbean,Northern Africa and Western Asia,Oceania,Gender_numerical
0,2012,99.8,46.379381,6.361501,108.19,73.7825,19.25,3.9,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,2012,99.7,46.379381,6.361501,106.88,73.7825,19.25,3.9,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,2013,99.9,46.379381,6.361501,109.76,73.7825,17.96,3.44,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1


## Data Transformation

#### 1. Data Rescaling

In [None]:
# Preparation for rescaling: move 'Year' out of the columns so it is not scaled.

# Keep a handle on the 'Year' values first; they are re-attached to the scaled
# frame later, because the scaler output is rebuilt into a new DataFrame.
data_col = df['Year']
df.set_index('Year',inplace =True)

df.head()


Unnamed: 0_level_0,Completion Rate for Primary Edu,Childhood Education GER,"Gross enrolment ratio, early childhood educational development programmes,","Gross intake ratio to the last grade of primary education,",Literacy rate for 25-64 years old,Expenditure on education as a percentage of total government expenditure (%),Government expenditure on education as a percentage of GDP (%),Central Asia,Central and Southern Asia,Eastern and South-Eastern Asia,Europe and Northern America,Latin America and the Caribbean,Northern Africa and Western Asia,Oceania,Gender_numerical
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2012,99.8,46.379381,6.361501,108.19,73.7825,19.25,3.9,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2012,99.7,46.379381,6.361501,106.88,73.7825,19.25,3.9,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2013,99.9,46.379381,6.361501,109.76,73.7825,17.96,3.44,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2013,99.8,46.379381,6.361501,108.9,73.7825,17.96,3.44,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2014,99.9,36.65,0.0,115.17,73.7825,17.32,3.42,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [None]:
# Min-max scale every column of df, then restore 'Year' as the index.
# NOTE(review): the scaler is fitted on all rows and all columns (including the
# column later used as the target) before the train/test split.
scaler = MinMaxScaler()
normalized_df = pd.DataFrame(
    scaler.fit_transform(df),
    columns=df.columns,
)
normalized_df['Year'] = data_col
normalized_df = normalized_df.set_index('Year')
normalized_df.head(3)

Unnamed: 0_level_0,Completion Rate for Primary Edu,Childhood Education GER,"Gross enrolment ratio, early childhood educational development programmes,","Gross intake ratio to the last grade of primary education,",Literacy rate for 25-64 years old,Expenditure on education as a percentage of total government expenditure (%),Government expenditure on education as a percentage of GDP (%),Central Asia,Central and Southern Asia,Eastern and South-Eastern Asia,Europe and Northern America,Latin America and the Caribbean,Northern Africa and Western Asia,Oceania,Gender_numerical
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2012,0.997413,0.285122,0.068008,0.845059,0.713031,0.482729,0.286765,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2012,0.99612,0.285122,0.068008,0.831545,0.713031,0.482729,0.286765,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2013,0.998707,0.285122,0.068008,0.861254,0.713031,0.446206,0.252941,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [None]:
# Pairwise correlations between all scaled columns (first three rows of the matrix).
normalized_df.corr().head(3)

Unnamed: 0,Completion Rate for Primary Edu,Childhood Education GER,"Gross enrolment ratio, early childhood educational development programmes,","Gross intake ratio to the last grade of primary education,",Literacy rate for 25-64 years old,Expenditure on education as a percentage of total government expenditure (%),Government expenditure on education as a percentage of GDP (%),Central Asia,Central and Southern Asia,Eastern and South-Eastern Asia,Europe and Northern America,Latin America and the Caribbean,Northern Africa and Western Asia,Oceania,Gender_numerical
Completion Rate for Primary Edu,1.0,0.351886,0.305259,0.671671,0.398736,0.006356,0.284199,0.056772,0.108787,0.316906,0.267968,0.129879,0.126447,-0.691665,0.055559
Childhood Education GER,0.351886,1.0,0.334212,0.377295,0.143463,-0.168636,-0.001142,0.101699,0.177365,0.208522,0.043067,-0.049118,0.063413,-0.355638,0.004159
"Gross enrolment ratio, early childhood educational development programmes,",0.305259,0.334212,1.0,0.200007,0.123545,-0.011862,-0.025302,-0.104493,0.085812,0.359931,0.132204,-0.121722,-0.100788,-0.224524,0.00039


In [None]:
# Separate the features (x) from the target column (y).
x = normalized_df.drop('Completion Rate for Primary Edu', axis = 1)
y = normalized_df['Completion Rate for Primary Edu']


# Hold out 30% of the rows for testing; the remaining 70% form the training set.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state=42)

# NOTE: only a train/test split is created here; there is no separate validation set.
# random_state=42 keeps the split identical between runs.

## Model Training, Testing and evaluation


 Because the target variable, 'Completion Rate for Primary Edu', is continuous rather than categorical, regression algorithms are used instead of classifiers.

#### 1. Linear Regression


In [None]:
def parameters_hypertuning(model, parameters=None):
  """Grid-search `parameters` for `model` and return the best combination.

  The search is fitted on the module-level training split (x_train, y_train)
  using GridSearchCV with its default settings.

  Args:
    model: Estimator instance to tune.
    parameters: Mapping of parameter name to the candidate values to try.
      When omitted, an empty grid is used, i.e. only the estimator's current
      settings are evaluated.

  Returns:
    The `best_params_` dict of the fitted search.
  """
  # Build a fresh dict per call instead of sharing one mutable default object.
  param_grid = {} if parameters is None else parameters

  gs = GridSearchCV(model, param_grid)
  gs.fit(x_train, y_train)

  return gs.best_params_


In [None]:
# Hyperparameter tuning for the linear regression model.
# A fresh estimator is passed because `linear_model` is only created in a later
# cell, so referencing it here fails when the notebook is run top to bottom.
# NOTE(review): n_jobs only controls parallelism, so this search is not expected
# to change the fitted model — consider tuning fit_intercept/positive instead.
parameters_hypertuning(LinearRegression(), {'n_jobs': range(0, 50, 5)})

{'n_jobs': 0}

In [None]:
# Hyperparameter tuning for the ridge regression model.
# A fresh Ridge() is passed because `ridge_model` is only created in a later
# cell; both searched parameters are overridden by the grid anyway.
# max_iter starts at 50 because 0 is not a valid iteration count.
parameters_hypertuning(
    Ridge(),
    {
        'alpha': np.arange(1.0, 20.0),
        'max_iter': range(50, 1500, 50),
    },
)

{'alpha': 1.0, 'max_iter': 50}

In [None]:
# Hyperparameter tuning for the lasso regression model.
# A fresh Lasso() is passed because `lasso_model` is only created in a later
# cell; both searched parameters are overridden by the grid anyway.
# max_iter starts at 50 because 0 is not a valid iteration count.
# NOTE(review): the selected alpha sits at the lower edge of this grid and the
# data is min-max scaled, so smaller alphas (< 1.0) are probably worth searching.
parameters_hypertuning(
    Lasso(),
    {
        'alpha': np.arange(1.0, 20.0),
        'max_iter': range(50, 1500, 50),
    },
)

{'alpha': 1.0, 'max_iter': 50}

In [None]:
# Hyperparameter tuning for the gradient boosting regression model.
# The estimator class is imported here and a fresh instance is passed because
# `Gbr` (and its import) only appear in a later cell.
from sklearn.ensemble import GradientBoostingRegressor

# n_estimators starts at 5: at least one boosting stage is required, so 0 is
# not a valid candidate.
# NOTE(review): the selected learning_rate sits at the lower edge of this grid;
# values below 1.0 are probably worth searching.
parameters_hypertuning(
    GradientBoostingRegressor(),
    {
        'loss': ['squared_error', 'absolute_error', 'huber', 'quantile'],
        'learning_rate': np.arange(1.0, 20.0, 2.0),
        'n_estimators': range(5, 50, 5),
    },
)

{'learning_rate': 1.0, 'loss': 'squared_error', 'n_estimators': 45}

In [None]:

import time

# Fit an ordinary linear regression on the training split and time the fit.
start_time = time.time()
linear_model = LinearRegression().fit(x_train, y_train)
end_time = time.time()

# Report the elapsed wall-clock time.
elapsed_time = end_time - start_time
print("Training took {:.2f} seconds.".format(elapsed_time))





Training took 0.01 seconds.


In [None]:
# Predict on the held-out test set with the fitted linear model.
predicted_values = linear_model.predict(x_test)

# Linear regression evaluation metrics (computed on the scaled target).
r2 = r2_score(y_test, predicted_values)
MAE = mean_absolute_error(y_test, predicted_values)
MSE = mean_squared_error(y_test, predicted_values)
RMSE = np.sqrt(MSE)
RSS = np.square(y_test - predicted_values).sum()




In [None]:
# Start the model-comparison table with the linear regression scores.
results = pd.DataFrame({
    'Model': ['Linear regression'],
    'MAE': [MAE],
    'MSE': [MSE],
    'RMSE': [RMSE],
    'RSS': [RSS],
    'r2_score': [r2],
})
results

Unnamed: 0,Model,MAE,MSE,RMSE,RSS,r2_score
0,Linear regression,0.117826,0.023224,0.152394,11.844171,0.685425


In [None]:
# Side-by-side view of actual vs. linear-regression predictions, both times 100.
# Both columns are min-max-scaled values, so they are not the original rates.
lreg_df = pd.DataFrame({
    "actual_values": y_test * 100,
    "predictions": predicted_values * 100,
})
lreg_df


Unnamed: 0_level_0,actual_values,predictions
Year,Unnamed: 1_level_1,Unnamed: 2_level_1
2018,96.546818,91.308594
2019,89.653389,76.464844
2019,98.189343,75.488281
2019,100.000000,127.490234
2012,6.751164,40.087891
...,...,...
2014,97.284014,98.486328
2016,15.028453,14.746094
2014,35.851009,38.671875
2020,87.454734,79.833984


## Ridge Regression

In [None]:
# Fit a ridge regression on the training split and time the fit.
start_time = time.time()
ridge_model = Ridge(alpha = 1.0, max_iter = 50).fit(x_train, y_train)
end_time = time.time()

# Report the elapsed wall-clock time.
elapsed_time = end_time - start_time
print("Training took {:.2f} seconds.".format(elapsed_time))


Training took 0.01 seconds.


In [None]:
# Predict on the held-out test set with the fitted ridge model.
y_preds = ridge_model.predict(x_test)

In [None]:
# Ridge regression evaluation metrics (computed on the scaled target).
r_squared = r2_score(y_test, y_preds)
mae = mean_absolute_error(y_test, y_preds)
mse = mean_squared_error(y_test, y_preds)
rmse = np.sqrt(mse)
rss = np.square(y_test - y_preds).sum()

In [None]:
# One-row frame with the ridge scores, appended to the running comparison table.
model_results = pd.DataFrame({
    'Model': ['Ridge regression'],
    'MAE': [mae],
    'MSE': [mse],
    'RMSE': [rmse],
    'RSS': [rss],
    'r2_score': [r_squared],
})
results = pd.concat((results, model_results), ignore_index=True)
model_results

Unnamed: 0,Model,MAE,MSE,RMSE,RSS,r2_score
0,Ridge regression,0.117327,0.023119,0.152051,11.790922,0.686839


In [None]:
# Side-by-side view of actual vs. ridge-regression predictions, both times 100.
# Uses y_preds (the ridge model's output); predicted_values holds the linear
# regression predictions and would just repeat the previous table.
ridg_df = pd.DataFrame(data ={"actual_values": y_test * 100, "predictions": y_preds * 100})
ridg_df


Unnamed: 0_level_0,actual_values,predictions
Year,Unnamed: 1_level_1,Unnamed: 2_level_1
2018,96.546818,91.308594
2019,89.653389,76.464844
2019,98.189343,75.488281
2019,100.000000,127.490234
2012,6.751164,40.087891
...,...,...
2014,97.284014,98.486328
2016,15.028453,14.746094
2014,35.851009,38.671875
2020,87.454734,79.833984


#### Lasso regression

In [None]:

# Fit a lasso regression on the training split and time the fit.
start_time = time.time()
lasso_model = Lasso(alpha = 1.0, max_iter = 50).fit(x_train, y_train)
end_time = time.time()

# Report the elapsed wall-clock time.
elapsed_time = end_time - start_time
print("Training took {:.2f} seconds.".format(elapsed_time))




Training took 0.01 seconds.


In [None]:
# Predict on the held-out test set with the fitted lasso model.
lasso_predictions  = lasso_model.predict(x_test)

In [None]:
# Sanity check: predictions and test targets should have the same shape.
print(lasso_predictions.shape, y_test.shape)

(510,) (510,)


In [None]:
# Lasso regression evaluation metrics (computed on the scaled target).
lasso_mae = mean_absolute_error(y_test, lasso_predictions)
lasso_mse = mean_squared_error(y_test, lasso_predictions)
lasso_rmse = np.sqrt(lasso_mse)
lasso_rss = np.square(y_test - lasso_predictions).sum()
lasso_r2 = r2_score(y_test, lasso_predictions)

In [None]:
# One-row frame with the lasso scores, appended to the running comparison table.
model_results = pd.DataFrame({
    'Model': ['lasso regression'],
    'MAE': [lasso_mae],
    'MSE': [lasso_mse],
    'RMSE': [lasso_rmse],
    'RSS': [lasso_rss],
    'r2_score': [lasso_r2],
})
results = pd.concat((results, model_results), ignore_index=True)

In [None]:
# Display the lasso regression scores.
model_results

Unnamed: 0,Model,MAE,MSE,RMSE,RSS,r2_score
0,lasso regression,0.236072,0.074312,0.272603,37.899326,-0.006588


In [None]:
# Actual vs. lasso predictions (both times 100), ordered by year.
# Despite its name, knn_df holds the lasso model's predictions.
knn_df = pd.DataFrame({
    "actual_values": y_test * 100,
    "predictions": lasso_predictions * 100,
}).sort_index(ascending=True)
knn_df

Unnamed: 0_level_0,actual_values,predictions
Year,Unnamed: 1_level_1,Unnamed: 2_level_1
2012,84.350750,73.447856
2012,56.026901,73.447856
2012,97.258148,73.447856
2012,99.353337,73.447856
2012,96.107087,73.447856
...,...,...
2021,96.572685,73.447856
2021,98.085877,73.447856
2021,97.452147,73.447856
2021,100.000000,73.447856


#### GradientBoostingRegressor

In [None]:
from sklearn.ensemble import GradientBoostingRegressor


# Record start time
start_time = time.time()

# Use the combination selected by the grid search earlier in this notebook
# ({'learning_rate': 1.0, 'loss': 'squared_error', 'n_estimators': 45}).
# random_state is fixed so repeated runs of this cell produce the same model.
Gbr = GradientBoostingRegressor(
    learning_rate=1.0,
    loss='squared_error',
    n_estimators=45,
    random_state=42,
)
Gbr.fit(x_train, y_train)

# Record end time
end_time = time.time()

# Calculate elapsed time
elapsed_time = end_time - start_time

# Display training time
print("Training took {:.2f} seconds.".format(elapsed_time))



Training took 0.06 seconds.


In [None]:
# Predict on the held-out test set with the fitted gradient boosting model.
Gbr_predictions  = Gbr.predict(x_test)

In [None]:
# Gradient boosting evaluation metrics (computed on the scaled target).
Gbr_mae = mean_absolute_error(y_test, Gbr_predictions)
Gbr_mse = mean_squared_error(y_test, Gbr_predictions)
Gbr_rmse = np.sqrt(Gbr_mse)
Gbr_rss = np.square(y_test - Gbr_predictions).sum()
Gbr_r2 = r2_score(y_test, Gbr_predictions)

In [None]:
# One-row frame with the gradient boosting scores, appended to the comparison table.
model_results = pd.DataFrame({
    'Model': ['GBR regression'],
    'MAE': [Gbr_mae],
    'MSE': [Gbr_mse],
    'RMSE': [Gbr_rmse],
    'RSS': [Gbr_rss],
    'r2_score': [Gbr_r2],
})
results = pd.concat((results, model_results), ignore_index=True)

In [None]:
# Display the gradient boosting regression scores.
model_results

Unnamed: 0,Model,MAE,MSE,RMSE,RSS,r2_score
0,GBR regression,0.110365,0.020823,0.144301,10.619632,0.717948


In [None]:
# Actual vs. gradient-boosting predictions (both times 100), ordered by year.
# Uses Gbr_predictions; lasso_predictions would show the lasso model's constant
# output instead of this model's predictions.
Gbr_df = pd.DataFrame(data ={"actual_values": y_test * 100, "predictions": Gbr_predictions * 100})
Gbr_df.sort_index(ascending=True, inplace=True)
Gbr_df

Unnamed: 0_level_0,actual_values,predictions
Year,Unnamed: 1_level_1,Unnamed: 2_level_1
2012,84.350750,73.447856
2012,56.026901,73.447856
2012,97.258148,73.447856
2012,99.353337,73.447856
2012,96.107087,73.447856
...,...,...
2021,96.572685,73.447856
2021,98.085877,73.447856
2021,97.452147,73.447856
2021,100.000000,73.447856


In [None]:
# Show the stored results of all four models.
# drop_duplicates() removes exact repeat rows, which appear when a cell that
# appends to `results` is run more than once.
results.drop_duplicates().head(4)


Unnamed: 0,Model,MAE,MSE,RMSE,RSS,r2_score
0,Linear regression,0.117826,0.023224,0.152394,11.844171,0.685425
1,Ridge regression,0.117327,0.023119,0.152051,11.790922,0.686839
2,lasso regression,0.236072,0.074312,0.272603,37.899326,-0.006588
3,GBR regression,0.088021,0.015209,0.123327,329.54507,0.793982


### **Conclusion**
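> Based on the results presented above, the Gradient Boosting regression model exhibits the most favorable performance: it achieves the lowest mean_absolute_error (0.0880) and the highest r2_score (0.7940) of the four models. Among the linear models, Ridge regression performs best with a mean_absolute_error of 0.1173, marginally ahead of plain Linear regression (0.1178), while Lasso regression (r2_score of about -0.007) fails to explain the variance in the target. A mean_absolute_error closer to zero indicates better performance in regression modeling.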






In [None]:
def save_model(filename):
  """Pickle the four fitted regression models into a single file.

  The models are read from module scope and stored as one list in the order
  [linear_model, ridge_model, lasso_model, Gbr].

  Args:
    filename: Path of the file to write; an existing file is overwritten.

  Returns:
    None.
  """
  models = [linear_model, ridge_model, lasso_model, Gbr]
  # The context manager guarantees the handle is flushed and closed, even if
  # pickling raises part-way through.
  with open(filename, 'wb') as model_file:
    pickle.dump(models, model_file)

In [None]:
# Persist all four fitted models to a single pickle file.
# NOTE(review): the ".cav" extension looks like a typo (e.g. for ".sav") — confirm the intended file name.
save_model('regression_model.cav')