# Model Algorithm 

#### Linear Regressor
#### K Neighbors Regressor
#### Decision Tree Regressor
#### Gradient boosting Regressor



In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
import pickle
import warnings
warnings.filterwarnings('ignore')


In [3]:
df= pd.read_csv(r'Scaled_Primary.csv_')

In [4]:
df.head()

Unnamed: 0,Year,Completion Rate for Primary Edu,Childhood Education GER,"Gross enrolment ratio, early childhood educational development programmes,","Gross intake ratio to the last grade of primary education,",Literacy rate for 25-64 years old,Expenditure on education as a percentage of total government expenditure (%),Government expenditure on education as a percentage of GDP (%),Central Asia,Central and Southern Asia,Eastern and South-Eastern Asia,Europe and Northern America,Latin America and the Caribbean,Northern Africa and Western Asia,Oceania,Gender_numerical
0,2012,99.8,46.379381,6.361501,108.19,73.7825,19.25,3.9,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,2012,99.7,46.379381,6.361501,106.88,73.7825,19.25,3.9,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,2013,99.9,46.379381,6.361501,109.76,73.7825,17.96,3.44,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,2013,99.8,46.379381,6.361501,108.9,73.7825,17.96,3.44,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,2014,99.9,36.65,0.0,115.17,73.7825,17.32,3.42,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [5]:
df.rename(columns = {'Childhood Education GER':'Childhood_Education_GER', 'Gross enrolment ratio, early childhood educational development programmes,':'Gross_enrolment_ratio_early_childhood_educational_development_programmes', 
                    'Gross intake ratio to the last grade of primary education,':'Gross_intake_ratio_to_the_last_grade_of_primary_education',
                    'Literacy rate for 25-64 years old':'Literacy_rate_for_25_64_years_old', 'Expenditure on education as a percentage of total government expenditure (%)':'Expenditure_on_education_as_a_percentage_of_total_government_expenditure',
                    'Government expenditure on education as a percentage of GDP (%)':'Government_expenditure_on_education_as_a_percentage_of_GDP',
                    'Central Asia':'Central_Asia', 'Central and Southern Asia':'Central_and_Southern_Asia', 'Eastern and South-Eastern Asia':'Eastern_and_South_Eastern_Asia',
                    'Europe and Northern America':'Europe_and_Northern_America', 'Latin America and the Caribbean':'Latin_America_and_the_Caribbean',
                    'Northern Africa and Western Asia':'Northern_Africa_and_Western_Asia', 'Oceania':'Oceania', 'Gender_numerical':'Gender_numerical'}, inplace = True)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1698 entries, 0 to 1697
Data columns (total 16 columns):
 #   Column                                                                    Non-Null Count  Dtype  
---  ------                                                                    --------------  -----  
 0   Year                                                                      1698 non-null   int64  
 1   Completion Rate for Primary Edu                                           1698 non-null   float64
 2   Childhood_Education_GER                                                   1698 non-null   float64
 3   Gross_enrolment_ratio_early_childhood_educational_development_programmes  1698 non-null   float64
 4   Gross_intake_ratio_to_the_last_grade_of_primary_education                 1698 non-null   float64
 5   Literacy_rate_for_25_64_years_old                                         1698 non-null   float64
 6   Expenditure_on_education_as_a_percentage_of_total_government_exp

## Data Transformation

#### 1. Data Rescaling

In [6]:
# Data Rescaling

data_col = df['Year']
df.set_index('Year',inplace =True)

df.head()


Unnamed: 0_level_0,Completion Rate for Primary Edu,Childhood_Education_GER,Gross_enrolment_ratio_early_childhood_educational_development_programmes,Gross_intake_ratio_to_the_last_grade_of_primary_education,Literacy_rate_for_25_64_years_old,Expenditure_on_education_as_a_percentage_of_total_government_expenditure,Government_expenditure_on_education_as_a_percentage_of_GDP,Central_Asia,Central_and_Southern_Asia,Eastern_and_South_Eastern_Asia,Europe_and_Northern_America,Latin_America_and_the_Caribbean,Northern_Africa_and_Western_Asia,Oceania,Gender_numerical
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2012,99.8,46.379381,6.361501,108.19,73.7825,19.25,3.9,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2012,99.7,46.379381,6.361501,106.88,73.7825,19.25,3.9,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2013,99.9,46.379381,6.361501,109.76,73.7825,17.96,3.44,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2013,99.8,46.379381,6.361501,108.9,73.7825,17.96,3.44,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2014,99.9,36.65,0.0,115.17,73.7825,17.32,3.42,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [7]:
scaler = MinMaxScaler()
normalized_df = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
normalized_df['Year'] = data_col
normalized_df.set_index('Year', inplace =True)
normalized_df.head()

Unnamed: 0_level_0,Completion Rate for Primary Edu,Childhood_Education_GER,Gross_enrolment_ratio_early_childhood_educational_development_programmes,Gross_intake_ratio_to_the_last_grade_of_primary_education,Literacy_rate_for_25_64_years_old,Expenditure_on_education_as_a_percentage_of_total_government_expenditure,Government_expenditure_on_education_as_a_percentage_of_GDP,Central_Asia,Central_and_Southern_Asia,Eastern_and_South_Eastern_Asia,Europe_and_Northern_America,Latin_America_and_the_Caribbean,Northern_Africa_and_Western_Asia,Oceania,Gender_numerical
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2012,0.997413,0.285122,0.068008,0.845059,0.713031,0.482729,0.286765,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2012,0.99612,0.285122,0.068008,0.831545,0.713031,0.482729,0.286765,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2013,0.998707,0.285122,0.068008,0.861254,0.713031,0.446206,0.252941,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2013,0.997413,0.285122,0.068008,0.852383,0.713031,0.446206,0.252941,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2014,0.998707,0.224366,0.0,0.917062,0.713031,0.428086,0.251471,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [8]:
normalized_df.corr()

Unnamed: 0,Completion Rate for Primary Edu,Childhood_Education_GER,Gross_enrolment_ratio_early_childhood_educational_development_programmes,Gross_intake_ratio_to_the_last_grade_of_primary_education,Literacy_rate_for_25_64_years_old,Expenditure_on_education_as_a_percentage_of_total_government_expenditure,Government_expenditure_on_education_as_a_percentage_of_GDP,Central_Asia,Central_and_Southern_Asia,Eastern_and_South_Eastern_Asia,Europe_and_Northern_America,Latin_America_and_the_Caribbean,Northern_Africa_and_Western_Asia,Oceania,Gender_numerical
Completion Rate for Primary Edu,1.0,0.351886,0.305259,0.671671,0.398736,0.006356004,0.284199,0.05677172,0.1087869,0.3169058,0.267968,0.1298787,0.1264468,-0.6916647,0.055559
Childhood_Education_GER,0.351886,1.0,0.334212,0.377295,0.143463,-0.1686364,-0.001141602,0.1016988,0.1773651,0.208522,0.04306678,-0.04911766,0.06341314,-0.355638,0.004159161
Gross_enrolment_ratio_early_childhood_educational_development_programmes,0.305259,0.334212,1.0,0.200007,0.123545,-0.01186181,-0.02530235,-0.1044926,0.08581179,0.3599311,0.1322042,-0.1217224,-0.100788,-0.2245242,0.0003903865
Gross_intake_ratio_to_the_last_grade_of_primary_education,0.671671,0.377295,0.200007,1.0,0.329531,-0.05938324,0.2255793,0.1113258,0.1021789,0.1786076,0.2322325,0.02125456,0.1365556,-0.5428933,-0.001899245
Literacy_rate_for_25_64_years_old,0.398736,0.143463,0.123545,0.329531,1.0,0.0164774,0.09509635,-0.1187706,0.0637823,0.0202949,0.4064745,0.008498196,-0.05155189,-0.3150705,-0.07313633
Expenditure_on_education_as_a_percentage_of_total_government_expenditure,0.006356,-0.168636,-0.011862,-0.059383,0.016477,1.0,0.3856848,-0.03808543,-0.1319102,-0.2450364,0.1310693,-0.04273537,-0.02578618,0.1882934,1.117825e-16
Government_expenditure_on_education_as_a_percentage_of_GDP,0.284199,-0.001142,-0.025302,0.225579,0.095096,0.3856848,1.0,-0.09883898,-0.09509577,0.01931285,0.06713575,0.01360231,0.2169195,-0.07387889,9.778019000000001e-17
Central_Asia,0.056772,0.101699,-0.104493,0.111326,-0.118771,-0.03808543,-0.09883898,1.0,-0.102858,-0.1213554,-0.1838918,-0.0948398,-0.08893101,-0.2382951,-4.750145e-16
Central_and_Southern_Asia,0.108787,0.177365,0.085812,0.102179,0.063782,-0.1319102,-0.09509577,-0.102858,1.0,-0.1093283,-0.165667,-0.08544059,-0.0801174,-0.2146786,-9.549145000000001e-17
Eastern_and_South_Eastern_Asia,0.316906,0.208522,0.359931,0.178608,0.020295,-0.2450364,0.01931285,-0.1213554,-0.1093283,1.0,-0.1954597,-0.1008058,-0.09452529,-0.2532853,-3.643105e-17


In [9]:
# spliting the dataset into train set, test set, validation set.
datax = normalized_df.drop('Completion Rate for Primary Edu', axis = 1)
datay = normalized_df['Completion Rate for Primary Edu']

train_ratio = 0.75
val_ratio = 0.15
test_ratio = 0.10

# making the train set to be 75% of the entire dataset.
x_train, x_test, y_train, y_test = train_test_split(datax, datay, test_size = 1-train_ratio)

# test is now 10% of the entire dataset
# validation is now 15% of the entire dataset

x_val, x_test, y_val, y_test = train_test_split(x_test, y_test, test_size = test_ratio/(test_ratio + val_ratio))

print(x_train, x_val, x_test)

      Childhood_Education_GER  \
Year                            
2014                 0.645623   
2016                 0.095604   
2021                 0.200262   
2015                 0.285122   
2019                 0.208068   
...                       ...   
2019                 0.110091   
2020                 0.135382   
2016                 0.285122   
2018                 0.322156   
2019                 0.345073   

      Gross_enrolment_ratio_early_childhood_educational_development_programmes  \
Year                                                                             
2014                                           0.000000                          
2016                                           0.000000                          
2021                                           0.000000                          
2015                                           0.000000                          
2019                                           0.000000                       

[255 rows x 14 columns]       Childhood_Education_GER  \
Year                            
2015                 0.285122   
2019                 0.285122   
2015                 0.199263   
2012                 0.285122   
2012                 0.285122   
...                       ...   
2020                 0.094979   
2020                 0.285122   
2017                 0.440177   
2015                 0.195891   
2015                 0.337018   

      Gross_enrolment_ratio_early_childhood_educational_development_programmes  \
Year                                                                             
2015                                           0.068008                          
2019                                           0.068008                          
2015                                           0.000000                          
2012                                           0.068008                          
2012                                           0.00000

## Model Training, Testing and evaluation

 
 Because the target variable been 'Completion rate for primary school Edu' is a continoues data and not categorical. therfore, the model algorithm to be use is the regression and not classifiers.

#### 1. Linear Regression


In [10]:

import time

# Record start time
start_time = time.time()

# Create a Linear Regression model
linear_model = LinearRegression()

# Fit the model to the training dataset
linear_model.fit(x_train, y_train)

# Record end time
end_time = time.time()

# Calculate elapsed time
elapsed_time = end_time - start_time

# Display training time
print("Training took {:.2f} seconds.".format(elapsed_time))





Training took 2.02 seconds.


In [11]:
# Obtain predictions
predicted_values = linear_model.predict(x_test)

#Measuring regression performance
MAE = mean_absolute_error(y_test, predicted_values)
MSE = mean_squared_error(y_test, predicted_values)
RMSE =np.sqrt(mean_squared_error(y_test, predicted_values))
RSS =np.sum(np.square(y_test, predicted_values))
r2=r2_score(y_test, predicted_values)




In [12]:
# Model evaluation

results = pd.DataFrame([['Linear regression', MAE, MSE, RMSE, RSS,r2]],
               columns = ['Model', 'MAE', 'MSE', 'RMSE', 'RSS','r2_score'])
results

Unnamed: 0,Model,MAE,MSE,RMSE,RSS,r2_score
0,Linear regression,0.126248,0.025239,0.158869,99.505067,0.723679


In [13]:
lreg_df = pd.DataFrame(data ={"actual_values": y_test, "predictions":predicted_values})
lreg_df


Unnamed: 0_level_0,actual_values,predictions
Year,Unnamed: 1_level_1,Unnamed: 2_level_1
2015,0.867693,0.752891
2019,0.943999,0.891134
2015,0.804708,0.647554
2012,0.709002,0.502683
2012,0.123125,0.015160
...,...,...
2020,0.318417,0.101389
2020,0.410243,0.168299
2017,0.358510,0.128529
2015,0.643042,0.413503


#### Decition Tree Regressor

In [14]:
from sklearn.tree import DecisionTreeRegressor

import time


# Record start time
start_time = time.time()

DecisionTree_model = DecisionTreeRegressor()
DecisionTree_model.fit(x_train,y_train)

# Record end time
end_time = time.time()

# Calculate elapsed time
elapsed_time = end_time - start_time

# Display training time
print("Training took {:.2f} seconds.".format(elapsed_time))


Training took 0.04 seconds.


In [15]:
y_preds = DecisionTree_model.predict(x_test)

In [16]:
# Model Evaluation
mae= mean_absolute_error(y_test, y_preds)
mse= mean_squared_error(y_test, y_preds)
rmse=np.sqrt(mean_squared_error(y_test, y_preds))
rss=np.sum(np.square(y_test, y_preds))
r_squared=r2_score(y_test, y_preds)

In [17]:
model_results = pd.DataFrame([['Decision Tree regression', mae, mse, rmse, rss,r_squared]],
               columns = ['Model', 'MAE', 'MSE', 'RMSE', 'RSS','r2_score'])
results = pd.concat([results, model_results], ignore_index=True)

In [18]:
model_results

Unnamed: 0,Model,MAE,MSE,RMSE,RSS,r2_score
0,Decision Tree regression,0.063265,0.010886,0.104335,99.505067,0.723679


In [19]:
Dreg_df = pd.DataFrame(data ={"actual_values": y_test, "predictions":predicted_values})
Dreg_df


Unnamed: 0_level_0,actual_values,predictions
Year,Unnamed: 1_level_1,Unnamed: 2_level_1
2015,0.867693,0.752891
2019,0.943999,0.891134
2015,0.804708,0.647554
2012,0.709002,0.502683
2012,0.123125,0.015160
...,...,...
2020,0.318417,0.101389
2020,0.410243,0.168299
2017,0.358510,0.128529
2015,0.643042,0.413503


#### K-Nearest Neighbors Regression (KNN Regression)

In [20]:

# Record start time
start_time = time.time()

knn = KNeighborsRegressor()

knn.fit(x_train, y_train)

# Record end time
end_time = time.time()

# Calculate elapsed time
elapsed_time = end_time - start_time

# Display training time
print("Training took {:.2f} seconds.".format(elapsed_time))




Training took 0.03 seconds.


In [21]:
knn_predictions  = knn.predict(x_test)

In [22]:
print(knn_predictions.shape, y_test.shape)

(170,) (170,)


In [23]:
# getting model scores
knn_r2 = r2_score(y_test, knn_predictions)
knn_mse = mean_squared_error( y_test,knn_predictions)
knn_rmse = np.sqrt(knn_mse)
knn_mae = mean_absolute_error(y_test,knn_predictions)
knn_rss = np.sum(np.square(y_test, knn_predictions))

In [24]:
model_results = pd.DataFrame([['KNN regression', knn_mae, knn_mse, knn_rmse, knn_rss,knn_r2]],
               columns = ['Model', 'MAE', 'MSE', 'RMSE', 'RSS','r2_score'])
results = pd.concat([results, model_results], ignore_index=True)

In [25]:
model_results

Unnamed: 0,Model,MAE,MSE,RMSE,RSS,r2_score
0,KNN regression,0.095335,0.019552,0.139828,99.505067,0.772738


In [26]:
knn_df = pd.DataFrame(data ={"actual_values": y_test, "predictions":knn_predictions})
knn_df.sort_index(ascending=True, inplace=True)
knn_df

Unnamed: 0_level_0,actual_values,predictions
Year,Unnamed: 1_level_1,Unnamed: 2_level_1
2012,0.606829,0.368241
2012,0.857087,0.734599
2012,0.605535,0.366673
2012,0.950336,0.903139
2012,0.401190,0.160953
...,...,...
2020,0.777548,0.604581
2020,0.989653,0.979414
2020,0.405070,0.164082
2020,0.710295,0.504519


In [27]:
from sklearn.ensemble import GradientBoostingRegressor


# Record start time
start_time = time.time()

Gbr = GradientBoostingRegressor()
Gbr.fit(x_train,y_train)

# Record end time
end_time = time.time()

# Calculate elapsed time
elapsed_time = end_time - start_time

# Display training time
print("Training took {:.2f} seconds.".format(elapsed_time))



Training took 2.76 seconds.


In [28]:
Gbr_predictions  = Gbr.predict(x_test)

In [29]:
# getting model scores
Gbr_r2 = r2_score(y_test, Gbr_predictions)
Gbr_mse = mean_squared_error( y_test,Gbr_predictions)
Gbr_rmse = np.sqrt(Gbr_mse)
Gbr_mae = mean_absolute_error(y_test,Gbr_predictions)
Gbr_rss = np.sum(np.square(y_test, Gbr_predictions))

In [30]:
model_results = pd.DataFrame([['GBR regression', knn_mae, knn_mse, knn_rmse, knn_rss,knn_r2]],
               columns = ['Model', 'MAE', 'MSE', 'RMSE', 'RSS','r2_score'])
results = pd.concat([results, model_results], ignore_index=True)

In [31]:
model_results

Unnamed: 0,Model,MAE,MSE,RMSE,RSS,r2_score
0,GBR regression,0.095335,0.019552,0.139828,99.505067,0.772738


In [32]:
Gbr_df = pd.DataFrame(data ={"actual_values": y_test, "predictions":knn_predictions})
Gbr_df.sort_index(ascending=True, inplace=True)
Gbr_df

Unnamed: 0_level_0,actual_values,predictions
Year,Unnamed: 1_level_1,Unnamed: 2_level_1
2012,0.606829,0.368241
2012,0.857087,0.734599
2012,0.605535,0.366673
2012,0.950336,0.903139
2012,0.401190,0.160953
...,...,...
2020,0.777548,0.604581
2020,0.989653,0.979414
2020,0.405070,0.164082
2020,0.710295,0.504519


In [33]:
# Printing the stored results of all the three models
results.drop_duplicates()


Unnamed: 0,Model,MAE,MSE,RMSE,RSS,r2_score
0,Linear regression,0.126248,0.025239,0.158869,99.505067,0.723679
1,Decision Tree regression,0.063265,0.010886,0.104335,99.505067,0.723679
2,KNN regression,0.095335,0.019552,0.139828,99.505067,0.772738
3,GBR regression,0.095335,0.019552,0.139828,99.505067,0.772738


### conclusion

1.Linear Regression time for the model to train is 0.06 seconds

1.Decision Tree Regression seems to perform well in terms of minimizing errors and time for the model to tarin is 0.13 seconds.

2.KNN Regression and GBR Regression have higher R-squared values, suggesting better explanatory power.
also, the time for the two models to train are 0.16 and 0.68 respectively.

3.Both KNN and GBR have the same MAE, MSE, and RMSE, indicating similar performance on average.

In [34]:
# creating pickle file for deployment
file = open('pickled_Gbr.pkl', 'wb')
pickle.dump(Gbr, file)