In [None]:
%pip install openai nltk transformers tensorflow


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting openai
  Downloading openai-0.27.6-py3-none-any.whl (71 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.9/71.9 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m43.3 MB/s[0m eta [36m0:00:00[0m
Collecting aiohttp
  Downloading aiohttp-3.8.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Do

In [None]:
import os
import re
import string

# Define the input and output file paths
input_file = "mywords.txt"
output_file = "training_data.txt"

# Pattern matching every run of non-alphanumeric characters. `\W` on its own
# does not match the underscore (it counts as a "word" character), so it is
# listed explicitly to make the pattern agree with its stated purpose.
pattern = re.compile(r'[\W_]+', re.UNICODE)

# Define the list of stopwords to remove
stopwords = ["a", "an", "the", "and", "or", "but", "is", "are", "was", "were", "be", "been", "am", "as", "at", "by", "for", "from", "in", "into", "of", "off", "on", "onto", "out", "over", "to", "up", "with"]

def preprocess_text(text):
    """Normalise one piece of text for use as training data.

    The text is lowercased, every run of non-alphanumeric characters
    (underscores included) is collapsed into a single space, and any word
    found in ``stopwords`` is dropped.

    Args:
        text: The raw string to clean.

    Returns:
        The remaining words joined by single spaces, with no leading or
        trailing whitespace. An input with no surviving words yields "".
    """
    # Convert the text to lowercase so stopword matching is case-insensitive
    text = text.lower()
    # Replace all non-alphanumeric runs with a space to keep words separated
    text = pattern.sub(' ', text)
    # split() with no argument also discards the padding spaces added above
    return ' '.join(word for word in text.split() if word not in stopwords)

# Stream the raw corpus through the preprocessor: every input line produces
# exactly one cleaned, newline-terminated line in the output file.
with open(input_file, "r", encoding="utf-8") as source, \
        open(output_file, "w", encoding="utf-8") as sink:
    sink.writelines(preprocess_text(raw_line) + "\n" for raw_line in source)


In [None]:

import tensorflow as tf
import numpy as np
from transformers import GPT2Tokenizer, TFGPT2LMHeadModel

# Load the tokenizer published with the GPT-Neo 2.7B checkpoint
tokenizer = GPT2Tokenizer.from_pretrained('EleutherAI/gpt-neo-2.7B')

# Load the pre-trained GPT-Neo 2.7B checkpoint, converting from PyTorch weights
# NOTE(review): TFGPT2LMHeadModel is the GPT-2 architecture class while the
# checkpoint is GPT-Neo -- confirm the weights really load into this class
# instead of being silently re-initialised.
model = TFGPT2LMHeadModel.from_pretrained('EleutherAI/gpt-neo-2.7B', from_pt=True)

# Load the training data
with open('training_data.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Define the training parameters
batch_size = 2
epochs = 2
learning_rate = 5e-5
block_size = 128  # tokens per training example (tunable)

# Encode the training data and cut it into fixed-length rows. Encoding the
# whole corpus as a single sequence would give the dataset exactly one
# element, which batch(..., drop_remainder=True) then discards entirely.
token_ids = np.asarray(tokenizer.encode(text), dtype=np.int32)
num_blocks = len(token_ids) // block_size
if num_blocks < batch_size:
    raise ValueError(
        f"training_data.txt yields {len(token_ids)} tokens, which is fewer than the "
        f"{batch_size * block_size} needed for one batch of {batch_size} x {block_size} tokens"
    )
# Trailing tokens that do not fill a complete block are dropped
inputs = tf.constant(token_ids[:num_blocks * block_size].reshape(num_blocks, block_size))

# Create the training dataset: one element per block, shuffled over all blocks
dataset = tf.data.Dataset.from_tensor_slices(inputs)
dataset = dataset.shuffle(num_blocks)
dataset = dataset.batch(batch_size, drop_remainder=True)

# Define the optimizer and loss function
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# Define the training step
@tf.function
def train_step(inputs):
    """Run one gradient update of next-token prediction on a batch.

    Args:
        inputs: Token-id tensor indexed as (batch, position). Each row must
            hold at least two tokens, because the row is split into model
            inputs (all but the last token) and targets (all but the first).

    Returns:
        The loss computed by ``loss_fn`` for this batch.
    """
    # Position t of the model input is trained to predict token t + 1, so the
    # targets are the same rows shifted left by one token.
    input_ids = inputs[:, :-1]
    labels = inputs[:, 1:]
    features = {'input_ids': input_ids, 'attention_mask': tf.ones_like(input_ids)}
    with tf.GradientTape() as tape:
        outputs = model(features)
        logits = outputs.logits
        loss = loss_fn(labels, logits)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    return loss

In [None]:
from transformers import (
    DataCollatorForLanguageModeling,
    GPTNeoForCausalLM,
    Trainer,
    TrainingArguments,
)

# The collator has to pad batches, and padding raises when the tokenizer has
# no pad token. Reuse the end-of-sequence token when none is configured.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Fine-tune the GPT-Neo model
model = GPTNeoForCausalLM.from_pretrained('EleutherAI/gpt-neo-2.7B')
model.resize_token_embeddings(len(tokenizer))

# NOTE(review): train_dataset and val_dataset are not defined in the cells
# visible here -- they must be created (as tokenized datasets) before this
# cell runs.
trainer = Trainer(
    model=model,
    # mlm=False makes the collator copy input_ids into labels (padding is
    # ignored by the loss), which a causal LM needs in order to return a loss.
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    args=TrainingArguments(
        output_dir='./results',
        evaluation_strategy='steps',
        eval_steps=500,
        save_total_limit=2,
        learning_rate=2e-5,
        per_device_train_batch_size=2,
        per_device_eval_batch_size=2,
        num_train_epochs=2,
        weight_decay=0.01,
        push_to_hub=False,
    ),
)

trainer.train()

# Save the fine-tuned model together with its tokenizer so the directory can
# be reloaded on its own with from_pretrained('my_chat_model')
model.save_pretrained('my_chat_model')
tokenizer.save_pretrained('my_chat_model')
