In [3]:
import pandas as pd
import warnings

warnings.filterwarnings('ignore')

indicatorsDf = pd.read_excel('https://github.com/tiecia/inequality-project/raw/refs/heads/main/StateIndicatorsDatabase_2024.xlsx', sheet_name='Data')

x_cols = ['necm_fundinggap_q1', 'necm_fundinggap_q2', 'necm_fundinggap_q3', 'necm_fundinggap_q4', 'necm_fundinggap_q5',
          'tchsalary25_30', 'tchsalary31_40', 'tchsalary41_50', 'tchsalary51_60',
          'predicted_tchph0_', 'predicted_tchph10_', 'predicted_tchph20_', 'predicted_tchph30_']

y_cols = ['necm_outcomegap_q1', 'necm_outcomegap_q2', 'necm_outcomegap_q3', 'necm_outcomegap_q4', 'necm_outcomegap_q5']

df = indicatorsDf[x_cols + y_cols]

df.dropna(inplace=True)

X = df[x_cols]
y = df[y_cols]


In [16]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import time
import copy

class Network(nn.Module):
    def __init__(self, input_size, output_size):
        super(Network, self).__init__()
        self.fc1 = nn.Linear(input_size, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 16)
        self.fc4 = nn.Linear(16, output_size)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.1)

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.relu(self.fc2(x))
        x = self.dropout(x)
        x = self.relu(self.fc3(x))
        x = self.fc4(x)
        return x

class Model:
  def __init__(self, input_dims, output_dims):
    self.model = Network(input_dims, output_dims)

  def train(self, xtrain, ytrain, xval, yval):
    loss_fn = nn.MSELoss(reduction="mean")
    optimizer = optim.Adam(self.model.parameters(), lr=0.001)

    n_epochs = 400
    batch_size = 15
    batch_start = torch.arange(0, len(xtrain), batch_size)

    best_loss = np.inf
    best_weights = None

    print(f"Training with {n_epochs} epochs and a batch size of {batch_size}")

    sample_times = []

    for epoch in range(n_epochs):
      self.model.train()

      for start in batch_start:
          # take a batch
          X_batch = xtrain[start:start+batch_size]
          y_batch = ytrain[start:start+batch_size]

          start_time = time.time()

          # forward pass
          outputs = self.model(X_batch)
          loss = loss_fn(outputs, y_batch)
          # backward pass
          optimizer.zero_grad()
          loss.backward()
          # update weights
          optimizer.step()

          end_time = time.time()
          sample_times.append((end_time - start_time) / batch_size)

      self.model.eval()
      outputs = self.model(xval)
      loss = loss_fn(outputs, yval)
      if loss < best_loss:
          best_loss = loss
          best_weights = copy.deepcopy(self.model.state_dict())

      if (epoch+1) % 10 == 0:
        print('Epoch [{}/{}], Loss: {:.4f}, Best Loss: {:.4f}'.format(epoch+1, n_epochs, loss, best_loss))

    print(f"Average Sample Training Time: {np.mean(sample_times)} seconds")
    self.model.load_state_dict(best_weights)

  def predict(self, value):
    return self.model(value)

In [39]:
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.6, shuffle=True)
xtrain, xval, ytrain, yval = train_test_split(xtrain, ytrain, test_size=0.2, shuffle=True)

scaler = StandardScaler()
xtrain = scaler.fit_transform(xtrain)
xtest = scaler.transform(xtest)
xval = scaler.transform(xval)

xtrain = torch.tensor(xtrain, dtype=torch.float32)
xtest = torch.tensor(xtest, dtype=torch.float32)
xval = torch.tensor(xval, dtype=torch.float32)

ytrain = torch.tensor(ytrain.values, dtype=torch.float32)
ytest = torch.tensor(ytest.values, dtype=torch.float32)
yval = torch.tensor(yval.values, dtype=torch.float32)

model = Model(X.shape[1], y.shape[1])
start_time = time.time()
model.train(xtrain, ytrain, xtest, ytest)
end_time = time.time()
print(f"Training Time: {end_time - start_time} seconds")

Training with 400 epochs and a batch size of 15
Epoch [10/400], Loss: 0.0308, Best Loss: 0.0308
Epoch [20/400], Loss: 0.0266, Best Loss: 0.0266
Epoch [30/400], Loss: 0.0250, Best Loss: 0.0250
Epoch [40/400], Loss: 0.0244, Best Loss: 0.0244
Epoch [50/400], Loss: 0.0232, Best Loss: 0.0222
Epoch [60/400], Loss: 0.0239, Best Loss: 0.0215
Epoch [70/400], Loss: 0.0211, Best Loss: 0.0204
Epoch [80/400], Loss: 0.0215, Best Loss: 0.0194
Epoch [90/400], Loss: 0.0214, Best Loss: 0.0194
Epoch [100/400], Loss: 0.0202, Best Loss: 0.0190
Epoch [110/400], Loss: 0.0194, Best Loss: 0.0187
Epoch [120/400], Loss: 0.0204, Best Loss: 0.0187
Epoch [130/400], Loss: 0.0181, Best Loss: 0.0180
Epoch [140/400], Loss: 0.0194, Best Loss: 0.0171
Epoch [150/400], Loss: 0.0176, Best Loss: 0.0171
Epoch [160/400], Loss: 0.0179, Best Loss: 0.0171
Epoch [170/400], Loss: 0.0166, Best Loss: 0.0166
Epoch [180/400], Loss: 0.0162, Best Loss: 0.0162
Epoch [190/400], Loss: 0.0170, Best Loss: 0.0162
Epoch [200/400], Loss: 0.0158,

In [40]:
model.model.eval()
outputs = model.predict(xval).detach()

loss_fn = nn.MSELoss(reduction="mean")
loss = loss_fn(outputs, yval)
print(f"Validation Loss: {loss}")

differences = []
for i in range(len(yval)):
  differences.append(yval[i] - outputs[i])

mean = np.mean(differences)
actual_stdev = yval.std()
print(f"Average Difference: {mean}")
print(f"Stdevs: {actual_stdev}")

Validation Loss: 0.019542936235666275
Average Difference: 0.043055061250925064
Stdevs: 0.27105629444122314


In [41]:
def predict(value):
    value = scaler.transform([value])
    value = torch.tensor(value, dtype=torch.float32).reshape(1, -1)
    return model.predict(value)

# Funding gap (USD)
poverty_q1 = 0
poverty_q2 = 0
poverty_q3 = 0
poverty_q4 = 0
poverty_q5 = 1000

teacher_salary = 50000
teacher_per_100_students = 60

print(predict([poverty_q1, poverty_q2, poverty_q3, poverty_q4, poverty_q5,
               teacher_salary, teacher_salary, teacher_salary, teacher_salary,
               teacher_per_100_students, teacher_per_100_students, teacher_per_100_students, teacher_per_100_students]))

tensor([[3.3540, 0.9323, 0.7792, 2.1304, 2.4886]], grad_fn=<AddmmBackward0>)
