# 6.- Modeling 

Machine learning model implementation

In [12]:
#Libraries utilized
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
#evaluating the model
from sklearn.metrics import mean_squared_error, r2_score

#Libraries for the 
from keras.models import Sequential
from keras.layers import LSTM, Dense
from sklearn.model_selection import train_test_split

from pandasgui import show

### Loading the reduced dataset

In [13]:
#Loading from the previously processed dataframe
population_data_origin  = pd.read_csv('datasets/population_merged_reduced.csv',  sep=",",low_memory=False)

# Display the DataFrame information including data types
population_data_origin.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17538 entries, 0 to 17537
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0       17538 non-null  int64  
 1   LocID            17538 non-null  int64  
 2   Time             17538 non-null  int64  
 3   TPopulation1Jan  17538 non-null  float64
 4   PopDensity       17538 non-null  float64
 5   PopSexRatio      17538 non-null  float64
 6   MedianAgePop     17538 non-null  float64
 7   PopGrowthRate    17538 non-null  float64
 8   DoublingTime     17538 non-null  float64
 9   MAC              17538 non-null  float64
 10  SRB              17538 non-null  float64
 11  CDR              17538 non-null  float64
 12  NetMigrations    17538 non-null  float64
 13  Location         17538 non-null  object 
dtypes: float64(10), int64(3), object(1)
memory usage: 1.9+ MB


In [14]:
#Deleting first attribute garbage generated
population_data_origin = population_data_origin.iloc[:, 1:]

#show the dataframe
population_data_origin.head()



Unnamed: 0,LocID,Time,TPopulation1Jan,PopDensity,PopSexRatio,MedianAgePop,PopGrowthRate,DoublingTime,MAC,SRB,CDR,NetMigrations,Location
0,108,1950,2229.322,86.8637,91.9472,18.3147,2.2,31.5067,30.995,102.5,23.546,-13.343,Burundi
1,108,1951,2278.903,88.7571,92.1448,18.0842,2.114,32.7884,30.996,102.5,23.879,-13.217,Burundi
2,108,1952,2327.593,90.6179,92.3191,17.8744,2.036,34.0446,31.026,102.5,23.815,-13.715,Burundi
3,108,1953,2375.478,92.4508,92.488,17.6693,1.969,35.203,31.03,102.5,23.604,-14.962,Burundi
4,108,1954,2422.721,94.2874,92.6503,17.4706,1.965,35.2747,31.036,102.5,23.347,-14.599,Burundi


In [15]:
# For modeling we do not need the location name varchar type
population_data =  population_data_origin.drop("Location", axis='columns')

## Applying Modeling algorithms

Part of the questions I may need to ask involve my dependent variable, which is population size. However, I am specifically interested in making predictions based on years and country codes. While regression analysis is suitable for this task, I am also considering other algorithms to explore alternative approaches.

###  Multi-variable Linear Regression
is a statistical modeling technique used to analyze the relationship between multiple independent variables (also known as predictors or features) and a single continuous dependent variable. It extends the concept of simple linear regression, which considers only one independent variable, to incorporate multiple predictors.

In [16]:
# Separate the independent variables (attributes) and the dependent variable
X = population_data.drop('TPopulation1Jan', axis=1)  # The dependent variavle drop
y = population_data['TPopulation1Jan']  # y for the dependent variable


# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
# Create an instance of the linear regression model
model = LinearRegression()

# Fit the model to the training data
model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test)

In [18]:
# Calculate mean squared error (MSE)
mse = mean_squared_error(y_test, y_pred)

# Calculate R-squared
r2 = r2_score(y_test, y_pred)

# Print the evaluation metrics
print("Mean Squared Error (MSE):", mse)
print("R-squared:", r2)

# In the above code, X_train and y_train represent the independent variables and the corresponding dependent 
# variable from the training set, respectively. X_test and y_test represent the independent variables 
# and the corresponding dependent variable from the test set, respectively.



Mean Squared Error (MSE): 6323442486.498052
R-squared: 0.0668133501326188


In [19]:
# Print the coefficients (slopes) and intercept of the linear regression model
print("Coefficients:", model.coef_)
print("Intercept:", model.intercept_)

Coefficients: [-1.64036908e+01  6.47497980e+02 -1.10975530e+00  1.25414943e+02
 -5.97829033e+02  3.07743314e+03  2.65789187e+02 -7.61267615e+03
  1.26701645e+04  1.58381051e+03 -5.67975392e+01]
Intercept: -2399521.1002219487


### Long Short-Term Memory (LSTM) Networks: 
 
LSTM is a type of recurrent neural network (RNN) that can effectively model and predict sequences, including time series data. LSTM networks can capture long-term dependencies and are commonly used for time series forecasting tasks.



In [8]:
# Separate the independent variables (attributes) and the dependent variable
X = population_data.drop('TPopulation1Jan', axis=1)  # Replace 'TPopulation1Jan' with the column name of your dependent variable
y = population_data['TPopulation1Jan']  # Replace 'TPopulation1Jan' with the column name of your dependent variable

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Reshape input data to fit the LSTM input shape [samples, time steps, features]
X_train = np.reshape(X_train.values, (X_train.shape[0], X_train.shape[1], 1))
X_test = np.reshape(X_test.values, (X_test.shape[0], X_test.shape[1], 1))

# Create the LSTM model
model = Sequential()
model.add(LSTM(units=50, input_shape=(X_train.shape[1], 1)))
model.add(Dense(units=1))  # Output layer

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32)

# Make predictions on the test data
predictions = model.predict(X_test)

# Evaluate the model
loss = model.evaluate(X_test, y_test)
print("Test Loss:", loss)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: 7154476032.0


In [9]:

# Data prepared in the following format:
# X_train: Training features (input sequence)
# y_train: Training labels (output/target)
# X_test: Test features (input sequence)
# y_test: Test labels (output/target)

# Reshape input data to fit the LSTM input shape [samples, features]
X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1]*X_train.shape[2]))
X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1]*X_test.shape[2]))

# Create the LSTM model
model = Sequential()
#model.add(LSTM(units=50, input_shape=(X_train.shape[1],)))
model.add(LSTM(units=50, input_shape=(X_train.shape[1], 1)))

model.add(Dense(units=1))  # Output layer

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32)

# Make predictions on the test data
predictions = model.predict(X_test)

# Evaluate the model
loss = model.evaluate(X_test, y_test)
print("Test Loss:", loss)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: 7154458112.0
