In [None]:
pip install transformers

In [None]:
from transformers import BertTokenizer, TFBertForSequenceClassification, BertConfig, TFBertModel
from transformers import InputExample, InputFeatures
from keras.models import Sequential
from keras.layers import Dense

In [None]:
# Tokenizer and classifier are both loaded from the same "bert-base-uncased" checkpoint.
# The two loads are independent of each other.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# num_labels=9 matches the nine subreddit classes used further down in the notebook.
model = TFBertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=9,
)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
import tensorflow as tf
import pandas as pd

In [None]:
# Sample InputExample: a single text (text_b left as None) with an integer label.
InputExample(guid=None, text_a="Hello, world", text_b=None, label=1)

InputExample(guid=None, text_a='Hello, world', text_b=None, label=1)

In [None]:
# Load the dataset; later cells read its "post" and "subreddit" columns.
df = pd.read_csv(filepath_or_buffer="combined.csv")

In [None]:
#!/usr/bin/env python
import re

# get rid of emojis
def deEmojify(text):
    """Return ``text`` with emoji characters from four Unicode ranges removed.

    Every run of one or more characters that falls inside the ranges listed
    below is replaced by the empty string; all other characters are kept.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string.
    """
    emoji_ranges = (
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    )
    # A single character class followed by "+" so consecutive emojis go in one match.
    emoji_run = re.compile("[" + emoji_ranges + "]+", re.UNICODE)
    return emoji_run.sub("", text)


In [None]:
# Clean every post: delete emojis, the HTML-escaped zero-width-space code and punctuation.
# The backslash is written as "\\" so the literal holds the same characters as before
# without relying on the invalid "\," escape sequence.
marks = '''!()-[]{};?@#$%:'"\\,./^&*_\n'''

# The cleaned column is assigned back to `df` explicitly. Rows yielded by
# `DataFrame.iterrows()` may be copies, so writing to them is not guaranteed to
# change the frame itself.
df["post"] = (
    df["post"]
    .map(deEmojify)                                  # strip emoji characters
    .str.replace("&amp;#x200B;", "", regex=False)    # literal match, not a regex
    .str.translate(str.maketrans("", "", marks))     # delete every character in `marks`
)
  

In [None]:
# Remove, in place, every row whose "subreddit" value is the literal string 'subreddit'
# (presumably header lines repeated inside the combined CSV -- TODO confirm).
df.drop(index=df.index[df["subreddit"] == 'subreddit'], inplace=True)

In [None]:
# Lowercase every post with the vectorised string accessor rather than a
# per-element Python lambda.
df["post"] = df["post"].str.lower()

In [None]:
# Shuffle all rows (frac=1 keeps every row). A fixed random_state makes the shuffle,
# and therefore the positional train/validation split taken from `ds` below,
# reproducible across kernel restarts.
ds = df.sample(frac=1, random_state=42)

In [None]:
# Preview the first five shuffled rows.
ds.head(n=5)

Unnamed: 0,LABEL_COLUMN,DATA_COLUMN
6127,4,autism amp the ability to feel my contributio...
993,0,dreamed i went on a date in exchange for colon...
2832,2,easy snooze solution without upc scanning im f...
7720,6,why do i even exist currently i am now 14 year...
5660,4,is it an autistic thing to look younger than y...


In [None]:
# Rename the raw columns in place to the generic names that the conversion
# helpers are later called with.
ds.rename(
    {'subreddit': 'LABEL_COLUMN', 'post': 'DATA_COLUMN'},
    axis="columns",
    inplace=True,
)

In [None]:
# Computes the distinct subreddit names; the result is not stored.
df["subreddit"].unique()

# Integer class id assigned to each subreddit name:
# ED = 0, addiction = 1, adhd = 2, alcoholism = 3, autism = 4, bipolar = 5,
# depression = 6, ptsd = 7, schizophrenia = 8
dictionary = {
    name: class_id
    for class_id, name in enumerate(
        ['EDAnonymous', 'addiction', 'adhd', 'alcoholism', 'autism',
         'bipolarreddit', 'depression', 'ptsd', 'schizophrenia']
    )
}

In [None]:
# Replace each subreddit name in LABEL_COLUMN with its integer class id.
# Fail loudly on any value that is not a key of `dictionary`, as the direct
# dictionary lookup did before.
if not ds['LABEL_COLUMN'].isin(list(dictionary)).all():
    raise KeyError("LABEL_COLUMN contains values that are not keys of `dictionary`")

# The mapped column is assigned back to `ds` explicitly. Rows yielded by
# `DataFrame.iterrows()` may be copies, so writing to them is not guaranteed to
# change the frame. int64 matches the label dtype declared for the tf.data pipeline.
ds['LABEL_COLUMN'] = ds['LABEL_COLUMN'].map(dictionary).astype('int64')

In [None]:
# Positional split of the shuffled frame:
#   df2 -> the first 10000 rows (passed later as the validation/"test" frame)
#   df1 -> every row after those (passed later as the training frame)
df2, df1 = ds.iloc[:10000], ds.iloc[10000:]

In [None]:
def convert_data_to_examples(train, test, DATA_COLUMN, LABEL_COLUMN):
  """Wrap every row of two DataFrames in an ``InputExample``.

  Args:
    train: DataFrame holding the training rows.
    test: DataFrame holding the validation rows.
    DATA_COLUMN: name of the column containing the text.
    LABEL_COLUMN: name of the column containing the label.

  Returns:
    A ``(train_InputExamples, validation_InputExamples)`` tuple; each element is
    the result of a row-wise ``DataFrame.apply`` that builds one ``InputExample``
    per row.
  """
  def _row_to_example(row):
    # guid is a globally unique ID for bookkeeping and is unused in this case;
    # text_b stays None because every example is a single text.
    return InputExample(guid=None,
                        text_a=row[DATA_COLUMN],
                        text_b=None,
                        label=row[LABEL_COLUMN])

  train_InputExamples = train.apply(_row_to_example, axis=1)
  validation_InputExamples = test.apply(_row_to_example, axis=1)

  return train_InputExamples, validation_InputExamples
  
def convert_examples_to_tf_dataset(examples, tokenizer, max_length=128):
    """Tokenize ``examples`` and expose them as a ``tf.data.Dataset``.

    Args:
        examples: iterable of objects exposing ``text_a`` and ``label`` attributes.
        tokenizer: tokenizer providing ``encode_plus``.
        max_length: length every encoded sequence is truncated and padded to.

    Returns:
        A ``tf.data.Dataset`` yielding ``(features, label)`` pairs. ``features``
        is a dict with int32 ``input_ids``, ``attention_mask`` and
        ``token_type_ids`` vectors; ``label`` is an int64 scalar.
    """
    features = []  # InputFeatures collected here and replayed by the generator below

    for e in examples:
        input_dict = tokenizer.encode_plus(
            e.text_a,
            add_special_tokens=True,
            max_length=max_length,  # truncates if len(s) > max_length
            return_token_type_ids=True,
            return_attention_mask=True,
            # `padding="max_length"` is the current spelling of the deprecated
            # `pad_to_max_length=True`; padding is applied on the right by default.
            padding="max_length",
            truncation=True,
        )

        features.append(
            InputFeatures(
                input_ids=input_dict["input_ids"],
                attention_mask=input_dict["attention_mask"],
                token_type_ids=input_dict["token_type_ids"],
                label=e.label,
            )
        )

    def gen():
        # Yield one (inputs, label) pair per encoded example.
        for f in features:
            yield (
                {
                    "input_ids": f.input_ids,
                    "attention_mask": f.attention_mask,
                    "token_type_ids": f.token_type_ids,
                },
                f.label,
            )

    # Positional arguments after the generator: output types, then output shapes.
    return tf.data.Dataset.from_generator(
        gen,
        ({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64),
        (
            {
                "input_ids": tf.TensorShape([None]),
                "attention_mask": tf.TensorShape([None]),
                "token_type_ids": tf.TensorShape([None]),
            },
            tf.TensorShape([]),
        ),
    )


# Column names passed to convert_data_to_examples for the text and the label.
DATA_COLUMN, LABEL_COLUMN = 'DATA_COLUMN', 'LABEL_COLUMN'

In [None]:
# df1 supplies the training rows, df2 the validation rows.
train_InputExamples, validation_InputExamples = convert_data_to_examples(
    df1, df2, DATA_COLUMN, LABEL_COLUMN
)

# Training pipeline: shuffle with a 100-element buffer, batch by 32, then repeat twice.
# NOTE(review): model.fit is also called with epochs=2, so each epoch iterates this
# already-repeated dataset -- confirm that is intended.
train_data = convert_examples_to_tf_dataset(list(train_InputExamples), tokenizer)
train_data = train_data.shuffle(100)
train_data = train_data.batch(32)
train_data = train_data.repeat(2)

# Validation pipeline: batched only, no shuffling or repetition.
validation_data = convert_examples_to_tf_dataset(
    list(validation_InputExamples), tokenizer
).batch(32)



In [None]:
# from_logits=True: the loss is computed directly on the model's raw outputs;
# labels are integer class ids, hence the "sparse" loss and metric.
model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')],
    optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0),
)

# Train for two epochs, evaluating on the validation batches after each one.
model.fit(train_data, validation_data=validation_data, epochs=2)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f0796ccac10>

In [None]:
# Sentences to classify with the fine-tuned model.
pred_sentences = [
    'I am always zoning out in class, cant seem to focus on anything',
]

In [None]:
# Tokenize the sentences into one padded/truncated batch of TensorFlow tensors.
tf_batch = tokenizer(
    pred_sentences,
    max_length=128,
    padding=True,
    truncation=True,
    return_tensors='tf',
)
tf_outputs = model(tf_batch)

# Softmax over the last axis of the first model output gives per-class probabilities.
tf_predictions = tf.nn.softmax(tf_outputs[0], axis=-1)

# Class names in the same order as the integer ids used for training.
labels = ['EDAnonymous', 'addiction', 'adhd', 'alcoholism', 'autism',
       'bipolarreddit', 'depression', 'ptsd', 'schizophrenia']

# Index of the most probable class for each sentence, as a NumPy array.
label = tf.argmax(tf_predictions, axis=1).numpy()

for sentence, class_id in zip(pred_sentences, label):
  print(sentence, ": \n", labels[class_id])

I am always zoning out in class, cant seem to focus on anything : 
 adhd
