In [None]:
! pip3 install datasets transformers matplotlib

In [None]:
from datasets import load_dataset
# Load the SQuAD dataset using Hugging Face Datasets.
# The "train" and "validation" splits of the returned object are read below.
squad = load_dataset("squad")

# Turn question-answering examples into flat training strings
# (adapt the layout below for other needs).
def process_data(data):
  """Render each example as "context\\nquestion\\nfirst answer[END]".

  Args:
      data: Iterable of mappings that provide "context" and "question"
          strings and an "answers" mapping whose "text" entry holds at
          least one answer string.

  Returns:
      A list with one formatted string per example, in input order.
  """
  return [
      "\n".join((row["context"], row["question"], row["answers"]["text"][0])) + "[END]"
      for row in data
  ]

# Process training and validation data.
# The training examples are joined into a single newline-separated string,
# whereas the validation examples are kept as a list of strings.
# NOTE(review): validation_data_processed is not used in the cells visible
# here - confirm whether the list form is what downstream code expects.
train_data_processed = "\n".join(process_data(squad["train"]))
validation_data_processed = process_data(squad["validation"])

In [None]:
def tokenizer(text):
  """
  Split a string into lowercase word tokens.

  Characters that are neither alphanumeric nor whitespace are dropped without
  leaving a gap (so "with!" becomes "with" and "[END]" becomes "end"), the
  remaining characters are lowercased, and the result is split on runs of
  whitespace.

  Args:
      text: The string to tokenize.

  Returns:
      A list of the lowercase tokens, in order of appearance.
  """
  kept_chars = []
  for ch in text:
    # The filter looks at the original character; only kept ones are lowercased.
    if ch.isalnum() or ch.isspace():
      kept_chars.append(ch.lower())
  return "".join(kept_chars).split()

# Example usage on a short string: punctuation is stripped and case is folded.
text = "This is some text with! punctuation & symbols."
print(tokenizer(text))

# Tokenize the full training corpus; `tokens` feeds the windowing step below.
tokens = tokenizer(train_data_processed)
# Show only the size and a short preview - printing every token of the corpus
# would flood the notebook output.
print(len(tokens), tokens[:20])

In [None]:
# Number of consecutive tokens used as the input window when building
# (window, next token) training pairs.
WINDOW_LENGTH = 10

import pandas as pd
def generate_training_pairs(tokens, window_length=None):
    """Build sliding-window (context, next token) training pairs.

    Args:
        tokens: Sequence of tokens supporting ``len()`` and slicing.
        window_length: Number of consecutive tokens in each input window.
            When omitted, the module-level ``WINDOW_LENGTH`` is used, which
            matches the previous single-argument behaviour.

    Returns:
        A tuple ``(inputs, outputs)`` of equal-length lists.
        ``inputs[i]`` is ``tokens[i:i + window_length]`` and ``outputs[i]`` is
        a one-element list holding the token that follows that window. Both
        lists are empty when ``tokens`` has no more than ``window_length``
        items.
    """
    # Resolve the default at call time so later changes to the global are seen.
    if window_length is None:
        window_length = WINDOW_LENGTH

    # A window needs one following token as its target, hence the subtraction.
    n_pairs = max(len(tokens) - window_length, 0)
    inputs = [tokens[start:start + window_length] for start in range(n_pairs)]
    outputs = [[tokens[start + window_length]] for start in range(n_pairs)]
    return inputs, outputs

# Build (input window, next token) pairs over the whole tokenized corpus.
# NOTE(review): the cells visible here only index pairs up to ~10k; if that is
# all that is needed, slicing `tokens` first would save considerable memory.
training_data_X, training_data_y = generate_training_pairs(tokens)

In [None]:
# Inspect the first training pair: a window of input tokens and the
# one-element list holding the token that follows it.
training_data_X[0], training_data_y[0]

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import BaseEstimator, TransformerMixin

class EmbeddingModel(BaseEstimator, TransformerMixin):
    """Map tokens to integer ids so token windows can feed a numeric estimator.

    The vocabulary is learned in ``fit`` from every token found in ``X`` (and
    in ``y`` when it is given). Ids are assigned in order of first appearance,
    so the same input always produces the same mapping.

    Attributes:
        tokens_to_int: dict mapping each known token to its integer id.
        int_to_tokens: dict mapping each integer id back to its token.
        unknown_token_id: id emitted by ``transform`` for tokens that were not
            seen during ``fit``.
    """

    # Reserved id for out-of-vocabulary tokens; real ids start at 0, so -1
    # can never collide with a fitted token.
    unknown_token_id = -1

    def __init__(self):
        self.tokens_to_int = {}
        self.int_to_tokens = {}

    def fit(self, X, y=None):
        """Learn the token vocabulary.

        Args:
            X: Iterable of token sequences.
            y: Optional iterable of token sequences (e.g. one-element target
                lists) whose tokens are added to the vocabulary as well.

        Returns:
            self, to allow call chaining.
        """
        sequences = list(X)
        if y is not None:
            sequences.extend(y)

        # dict.fromkeys de-duplicates while keeping first-seen order; a set of
        # strings iterates in an order that can change between interpreter
        # runs, which made the ids non-reproducible.
        vocabulary = list(
            dict.fromkeys(token for sequence in sequences for token in sequence)
        )
        self.tokens_to_int = {token: index for index, token in enumerate(vocabulary)}
        self.int_to_tokens = dict(enumerate(vocabulary))

        return self

    def transform(self, X):
        """Encode token sequences as lists of integer ids.

        Args:
            X: Iterable of token sequences.

        Returns:
            A list of lists of ints with the same nesting as ``X``; tokens
            missing from the fitted vocabulary become ``unknown_token_id``
            instead of ``None``.
        """
        unknown = self.unknown_token_id
        return [
            [self.tokens_to_int.get(token, unknown) for token in sequence]
            for sequence in X
        ]


# Number of leading training pairs used to fit the model.
N_TRAINING_PAIRS = 10000
# Fixed seed so the random forest is built the same way on every run.
RANDOM_STATE = 42

# Two-step model: integer-encode each token window, then classify the next token.
pipeline = Pipeline(
    [
        ("sentence_embedding", EmbeddingModel()),
        (
            "language_model",
            RandomForestClassifier(
                n_estimators=100,
                max_depth=10,
                random_state=RANDOM_STATE,
            ),
        ),
    ]
)

pipeline.fit(
    training_data_X[:N_TRAINING_PAIRS],
    training_data_y[:N_TRAINING_PAIRS],
)

In [None]:
# Inspect a pair at index 10001, which lies outside the [0:10000] slice the
# pipeline was fitted on.
(training_data_X[10001],training_data_y[10001])

In [None]:
# Predict the next token for the window at index 10001 (not part of the slice
# used for fitting). predict expects a batch, hence the wrapping list.
# NOTE(review): the window may contain tokens absent from the fitted slice;
# check how EmbeddingModel.transform encodes them.
pipeline.predict([training_data_X[10001]])

In [None]:
from sklearn.tree import plot_tree

# Plot the tree at index 1 of the fitted random forest (the "language_model" step).
plot_tree(pipeline.named_steps["language_model"].estimators_[1])

In [None]:
# Pipeline.transform is only available when the final step is a transformer;
# the final step here is a classifier, so call the embedding step directly.
pipeline.named_steps["sentence_embedding"].transform([["hello", "world"]])