<a href="https://colab.research.google.com/github/yshin1209/Awesome-CGM/blob/master/Glucose_Prediction_Simple_Working_GPT2_XL_Batch_GPU.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Train empty GPT-2 for glucose level prediction.

In [1]:
!pip install torch
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import torch
from transformers import GPT2Tokenizer, GPT2Config, GPT2LMHeadModel


In [20]:
# Preprocess the training data
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-xl')
# train_data = [" 87 97 105 117 123 132 154 176 192", " 100 98 106 118 124 131 156 179 193", " 102 99 108 113 129 132 156 178 199"]
# input_ids = [tokenizer.encode(text) for text in train_data

# max_length: 864 measurements = 12 measurements/hour * 24 hours/day * 3 days
# Open the file in read mode
input_ids = []
with open('data.txt', 'r') as file:
    # Loop over each line in the file
    for line in file:
        line = ' ' + line # Add a single space to the first measurement
        input_ids.append(tokenizer.encode(line, max_length = 864, truncation=True ))

inputs = torch.tensor(input_ids)
# print(inputs)

'''
# Set up the GPT-2 model configuration
model_config = GPT2Config(
    vocab_size=len(tokenizer),
    n_embd=300,
    n_layer=36,
    n_head=12,
    attn_pdrop=0.5,
    resid_pdrop=0.5,
    initializer_range=0.02,
    repetition_penalty=1,  # No penalty
    temperature = 0
)
'''

# GPT2-XL
model_config = GPT2Config(
    vocab_size=50257,
    n_positions=2048,
    n_ctx=2048,
    n_embd=1600,
    n_layer=48,
    n_head=25,
    resid_pdrop=0.1,
    embd_pdrop=0.1,
    attn_pdrop=0.1,
    layer_norm_epsilon=1e-5)


# Set up the GPT-2 model with language modeling head
model = GPT2LMHeadModel(model_config)

# Train the model
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
for epoch in range(5):
    model.train()
    total_loss = 0
    for input_seq in inputs:
        print (f'total_loss b: {total_loss}')
        optimizer.zero_grad()
        print (f'total_loss a: {total_loss}')
        input_seq = input_seq.unsqueeze(0)  # Reshape to 2D tensor
        outputs = model(input_seq[:, :-1], labels=input_seq[:, 1:])
        #print (f'outputs {outputs}')
        loss = outputs.loss
        # print (f'loss: {loss}')
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch + 1} loss: {total_loss / len(inputs)}")



total_loss b: 0
total_loss a: 0
total_loss b: 11.139847755432129
total_loss a: 11.139847755432129
total_loss b: 20.84402561187744
total_loss a: 20.84402561187744
total_loss b: 29.676016807556152
total_loss a: 29.676016807556152
total_loss b: 38.19645881652832
total_loss a: 38.19645881652832
total_loss b: 46.78644847869873
total_loss a: 46.78644847869873
total_loss b: 55.02399253845215
total_loss a: 55.02399253845215
total_loss b: 63.32712936401367
total_loss a: 63.32712936401367
total_loss b: 71.4927568435669
total_loss a: 71.4927568435669
total_loss b: 78.94521903991699
total_loss a: 78.94521903991699
total_loss b: 86.71383333206177
total_loss a: 86.71383333206177
total_loss b: 94.33285570144653
total_loss a: 94.33285570144653
total_loss b: 101.08673477172852
total_loss a: 101.08673477172852
total_loss b: 108.00678062438965
total_loss a: 108.00678062438965
total_loss b: 114.80997943878174
total_loss a: 114.80997943878174
total_loss b: 121.7494592666626
total_loss a: 121.7494592666626


In [6]:
tokenizer.decode (inputs[0])

' 160 160 142 133 128 128 126 119 113 106 103 101 100 101 102 103 103 100 98 118 127 129 131 133 133 133 133 132 131 130 129 128 128 127 128 129 129 129 128 125 122 121 119 120 120 120 121 122 122 124 125 128 136 138 140 139 137 135 132 126 118 114 111 110 113 114 113 111 108 109 112 112 111 109 104 102 101 100 99 97 95 94 93 93 92 93 96 95 92 91 92 98 157 168 166 165 158 148 137 135 132 126 119 117 114 114 109 99 90 84 80 79 80 82 83 83 84 84 86 87 85 85 88 93 95 95 95 95 99 106 105 105 104 101 99 94 89 93 96 97 87 83 84 87 96 110 128 141 152 166 180 187 199 206 208 213 209 217 220 216 213 209 202 194 189 186 181 177 170 164 165 161 156 154 150 148 146 141 132 121 116 109 103 100 94 90 96 101 104 98 91 99 109 124 133 136 136 131 129 139 143 152 174 184 191 197 201 203 204 205 203 199 194 187 177 165 152 142 135 128 123 126 132 137 136 133 128 129 135 142 145 145 139 128 122 117 111 109 106 105 107 113 129 134 133 129 121 118 114 109 107 107 104 99 99 102 110 120 127 127 127 132 143 15

In [25]:
# Make a prediction
# 136 136 131 129 139 143 152 174 184 191 197 201 203 204 205 203 199 194 187 177 165 152 142 135
text = " 136 136 131 129 139 143 152"
input_ids = tokenizer.encode(text, return_tensors='pt')
outputs = model.generate(input_ids, max_length=20)
predicted_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(predicted_text)
#epoch = 1: 136 136 131 129 139 143 152 174 184 117 117 117 117 117 117 132 117 117 117 117
#epoch = 5 (start: 337.21 end:307.81 compute units: 29.4, approx. 2 hours)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


 136 136 131 129 139 143 152 163 157 163 157 164 163 163 163 163 163 163 163 163


In [21]:
337.21 - 307.81

29.399999999999977