https://www.tensorflow.org/tutorials/load_data/text

In [None]:
!pip install tensorflow_text

In [52]:
import collections
import pathlib
import csv
import os
import tensorflow as tf
from tqdm import tqdm

from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras import utils
from tensorflow.keras.layers import TextVectorization

import tensorflow_datasets as tfds
# import tensorflow_text as tf_text

In [53]:
train_dir = "./train/"
names, tests = [], []
with open("coursedata.csv", "r") as infile:
    reader = csv.reader(infile)
    for i, row in enumerate(tqdm(reader)):
        names.append(row[0])
        tests.append(row[3])
        if not os.path.exists(train_dir+row[1]+"/"):
            os.makedirs(train_dir+row[1]+"/")
        with open(train_dir+row[1]+"/"+str(i)+".txt", "w") as outfile:
          outfile.write(row[3].lower().strip()) 

1905it [00:00, 8262.98it/s]


In [54]:
batch_size = 32
seed = 42

raw_train_ds = utils.text_dataset_from_directory(
    train_dir,
    batch_size=batch_size,
    validation_split=0.2,
    subset='training',
    seed=seed)

Found 1905 files belonging to 82 classes.
Using 1524 files for training.


In [55]:
for i, label in enumerate(raw_train_ds.class_names):
  print("Label", i, "corresponds to", label)

Label 0 corresponds to AAAS
Label 1 corresponds to ACC
Label 2 corresponds to AHST
Label 3 corresponds to AME
Label 4 corresponds to AMST
Label 5 corresponds to ANTH
Label 6 corresponds to ARBC
Label 7 corresponds to ASLA
Label 8 corresponds to ASTR
Label 9 corresponds to ATHS
Label 10 corresponds to BCSC
Label 11 corresponds to BIOL
Label 12 corresponds to BME
Label 13 corresponds to BUS
Label 14 corresponds to CASC
Label 15 corresponds to CGRK
Label 16 corresponds to CHE
Label 17 corresponds to CHEM
Label 18 corresponds to CHIN
Label 19 corresponds to CIS
Label 20 corresponds to CLST
Label 21 corresponds to CLTR
Label 22 corresponds to CSC
Label 23 corresponds to CVSC
Label 24 corresponds to DANC
Label 25 corresponds to DMST
Label 26 corresponds to DSCC
Label 27 corresponds to EAS
Label 28 corresponds to ECE
Label 29 corresponds to ECON
Label 30 corresponds to EESC
Label 31 corresponds to EHUM
Label 32 corresponds to ENGL
Label 33 corresponds to ENT
Label 34 corresponds to FIN
Label 

In [56]:
# Create a validation set.
raw_val_ds = utils.text_dataset_from_directory(
    train_dir,
    batch_size=batch_size,
    validation_split=0.2,
    subset='validation',
    seed=seed)

Found 1905 files belonging to 82 classes.
Using 381 files for validation.


In [57]:
VOCAB_SIZE = 10000

binary_vectorize_layer = TextVectorization(
    max_tokens=VOCAB_SIZE,
    output_mode='binary')

In [58]:
MAX_SEQUENCE_LENGTH = 1024

int_vectorize_layer = TextVectorization(
    max_tokens=VOCAB_SIZE,
    output_mode='int',
    output_sequence_length=MAX_SEQUENCE_LENGTH)

In [59]:
# Make a text-only dataset (without labels), then call `TextVectorization.adapt`.
train_text = raw_train_ds.map(lambda text, labels: text)
binary_vectorize_layer.adapt(train_text)
int_vectorize_layer.adapt(train_text)

In [60]:
def binary_vectorize_text(text, label):
  text = tf.expand_dims(text, -1)
  return binary_vectorize_layer(text), label

def int_vectorize_text(text, label):
  text = tf.expand_dims(text, -1)
  return int_vectorize_layer(text), label

In [61]:
# Retrieve a batch (of 32 reviews and labels) from the dataset.
text_batch, label_batch = next(iter(raw_train_ds))
first_question, first_label = text_batch[0], label_batch[0]
print("Question", first_question)
print("Label", first_label)

Question tf.Tensor(b'in this course, students with prior yoga and/or dance experience will learn how to refine their skills through a continued exploration of asanas, pranayama, philosophy, and meditation. we will explore a more rigorous vinyasa flow practice, resulting in students developing more clarity regarding alignment, breath support, core aliveness, and, ultimately, body/self-awareness. while this is an individualized practice, the importance of community will be emphasized throughout as students share aspects of their practice with each other. readings, discussion, and reflective writing are inherent to deepening ones practice.', shape=(), dtype=string)
Label tf.Tensor(24, shape=(), dtype=int32)


In [62]:
print("'binary' vectorized question:",
      binary_vectorize_text(first_question, first_label)[0])
print("'int' vectorized question:",
      int_vectorize_text(first_question, first_label)[0])

'binary' vectorized question: tf.Tensor([[0. 1. 1. ... 0. 0. 0.]], shape=(1, 10000), dtype=float32)
'int' vectorized question: tf.Tensor([[ 6 10  9 ...  0  0  0]], shape=(1, 1024), dtype=int64)


In [63]:
print("1289 ---> ", int_vectorize_layer.get_vocabulary()[1289])
print("313 ---> ", int_vectorize_layer.get_vocabulary()[313])
print("Vocabulary size: {}".format(len(int_vectorize_layer.get_vocabulary())))

1289 --->  scheduled
313 --->  sources
Vocabulary size: 10000


In [64]:
binary_train_ds = raw_train_ds.map(binary_vectorize_text)
binary_val_ds = raw_val_ds.map(binary_vectorize_text)

int_train_ds = raw_train_ds.map(int_vectorize_text)
int_val_ds = raw_val_ds.map(int_vectorize_text)

In [65]:
AUTOTUNE = tf.data.AUTOTUNE

def configure_dataset(dataset):
  return dataset.cache().prefetch(buffer_size=AUTOTUNE)

In [66]:
binary_train_ds = configure_dataset(binary_train_ds)
binary_val_ds = configure_dataset(binary_val_ds)

int_train_ds = configure_dataset(int_train_ds)
int_val_ds = configure_dataset(int_val_ds)

In [67]:
binary_model = tf.keras.Sequential([layers.Dense(82)])

binary_model.compile(
    loss=losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer='adam',
    metrics=['accuracy'])

history = binary_model.fit(
    binary_train_ds, validation_data=binary_val_ds, epochs=40)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


In [68]:
def create_model(vocab_size, num_labels):
  model = tf.keras.Sequential([
      layers.Embedding(vocab_size, 64, mask_zero=True),
      layers.Conv1D(64, 5, padding="valid", activation="relu", strides=2),
      layers.GlobalMaxPooling1D(),
      layers.Dense(num_labels)
  ])
  return model

In [69]:
# `vocab_size` is `VOCAB_SIZE + 1` since `0` is used additionally for padding.
int_model = create_model(vocab_size=VOCAB_SIZE + 1, num_labels=82)
int_model.compile(
    loss=losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer='adam',
    metrics=['accuracy'])
history = int_model.fit(int_train_ds, validation_data=int_val_ds, epochs=40)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


In [70]:
print("Linear model on binary vectorized data:")
print(binary_model.summary())

Linear model on binary vectorized data:
Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_2 (Dense)             (None, 82)                820082    
                                                                 
Total params: 820,082
Trainable params: 820,082
Non-trainable params: 0
_________________________________________________________________
None


In [71]:
print("ConvNet model on int vectorized data:")
print(int_model.summary())

ConvNet model on int vectorized data:
Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, None, 64)          640064    
                                                                 
 conv1d_1 (Conv1D)           (None, None, 64)          20544     
                                                                 
 global_max_pooling1d_1 (Glo  (None, 64)               0         
 balMaxPooling1D)                                                
                                                                 
 dense_3 (Dense)             (None, 82)                5330      
                                                                 
Total params: 665,938
Trainable params: 665,938
Non-trainable params: 0
_________________________________________________________________
None


In [72]:
export_model = tf.keras.Sequential(
    [binary_vectorize_layer, binary_model,
     layers.Activation('sigmoid')])

export_model.compile(
    loss=losses.SparseCategoricalCrossentropy(from_logits=False),
    optimizer='adam',
    metrics=['accuracy'])

In [73]:
def get_string_labels(predicted_scores_batch):
  #predicted_int_labels = tf.math.argmax(predicted_scores_batch, axis=1)
  #predicted_labels = tf.gather(raw_train_ds.class_names, predicted_int_labels)

  predicted_values, predicted_indices = tf.math.top_k(predicted_scores_batch, k=10)
  predicted_labels = tf.gather(raw_train_ds.class_names, predicted_indices)

  return (predicted_values, predicted_labels)

In [77]:
inputs = [x.lower() for x in tests]
predicted_scores = export_model.predict(inputs)
predicted_labels = get_string_labels(predicted_scores)
with open("coursepreds.csv", "w") as outputpredfile:
  writer = csv.writer(outputpredfile, delimiter=',')
  for i in range(0,len(inputs)):
    print("Question: ", inputs[i])
    print("Predicted label: ", predicted_labels[1][i].numpy())
    print("Predicted weights: ", predicted_labels[0][i].numpy())
    writer.writerow([names[i], str(predicted_labels[1][i].numpy()), str(predicted_labels[0][i].numpy())])

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Question:  as an introduction to the art of film, this course will present the concepts of film form, film aesthetics, and film style, while remaining attentive to the various ways in which cinema also involves an interaction with audiences and larger social structures.
Predicted label:  [b'FMST' b'ENGL' b'MUSC' b'AAAS' b'AHST' b'HIST' b'DANC' b'WRTG' b'CLTR'
 b'DMST']
Predicted weights:  [0.93933773 0.17573126 0.096711   0.08513809 0.08497202 0.08176523
 0.08022037 0.06423562 0.06182894 0.05597597]
Question:  this course merges contemporary art production with technologies and social interventions. students will combine historical, inter-media approaches with new, evolving trends in social practice. this course offering uses cyberpunk, a subgenre of science fiction, as a framework for examining contemporary art and media production in both theory and practice. students will deploy introductory level techniques to create 