In [1]:
!pip install pandas
!pip install tensorflow
!pip install transformers
!pip install scikit-learn



In [2]:
import pandas as pd
import tensorflow as tf

from transformers import TFAutoModel, AutoTokenizer
from sklearn.model_selection import train_test_split


In [3]:
!wget https://raw.githubusercontent.com/vennietweek/aita-analysis-tool/main/submissions_part1.csv
!wget https://raw.githubusercontent.com/vennietweek/aita-analysis-tool/main/submissions_part2.csv

--2024-03-16 09:45:53--  https://raw.githubusercontent.com/vennietweek/aita-analysis-tool/main/submissions_part1.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 15093834 (14M) [text/plain]
Saving to: ‘submissions_part1.csv’


2024-03-16 09:45:54 (156 MB/s) - ‘submissions_part1.csv’ saved [15093834/15093834]

--2024-03-16 09:45:54--  https://raw.githubusercontent.com/vennietweek/aita-analysis-tool/main/submissions_part2.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 13782676 (13M) [text/plain]
Saving to: ‘submissions_part2

In [4]:

p1 = pd.read_csv('submissions_part1.csv')
p2 = pd.read_csv('submissions_part2.csv')
df = pd.concat([p1, p2])
df.head(), df.shape

(          created_time                                              title  \
 0  2021-07-03 12:36:22  AITA for telling my wife the lock on my daught...   
 1  2020-05-26 23:35:45  AITA For suing my girlfriend after she had my ...   
 2  2019-12-03 17:07:53  AITA for pretending to get fired when customer...   
 3  2022-12-26 12:44:43  AITA for bringing up my brother's "premature" ...   
 4  2022-08-26 20:25:55  AITA for not taking down my video that was a g...   
 
    score      id                                            content  \
 0  81014  ocx94s  My brother in-law (Sammy) lost his home shortl...   
 1  70809  gr8bp3  I'll try to keep this short. I had a [1967 Imp...   
 2  63520  e5k3z2  I am a high schooler with a weekend job at a c...   
 3  60429  zvmflw  I am a nurse practitioner and I am the primary...   
 4  55748  wyjbjs  I have a sister that’s 6 years older than me. ...   
 
              AITA  upvotes  
 0  Not the A-hole    81014  
 1  Not the A-hole    70809  
 2  No

In [5]:
df['label'] = df['AITA'].apply(lambda x : 1 if x == "Asshole" else 0)
df_cleaned = df.drop(['created_time','upvotes', 'score', 'id', 'upvotes', 'title', 'AITA'],axis=1)
df_cleaned

Unnamed: 0,content,label
0,My brother in-law (Sammy) lost his home shortl...,0
1,I'll try to keep this short. I had a [1967 Imp...,0
2,I am a high schooler with a weekend job at a c...,0
3,I am a nurse practitioner and I am the primary...,0
4,I have a sister that’s 6 years older than me. ...,0
...,...,...
5956,I'm the middle child of 3 siblings. There's my...,0
5957,"Please let me know if I’m in the wrong, becaus...",0
5958,My wife Dee (28F) and I (27M) recently had our...,1
5959,**UPDATE:** *I do believe because of my anger ...,0


In [6]:
df_cleaned = df.drop(['created_time','upvotes', 'score', 'id', 'upvotes', 'title', 'AITA'],axis=1)
df_cleaned

Unnamed: 0,content,label
0,My brother in-law (Sammy) lost his home shortl...,0
1,I'll try to keep this short. I had a [1967 Imp...,0
2,I am a high schooler with a weekend job at a c...,0
3,I am a nurse practitioner and I am the primary...,0
4,I have a sister that’s 6 years older than me. ...,0
...,...,...
5956,I'm the middle child of 3 siblings. There's my...,0
5957,"Please let me know if I’m in the wrong, becaus...",0
5958,My wife Dee (28F) and I (27M) recently had our...,1
5959,**UPDATE:** *I do believe because of my anger ...,0


In [7]:
train_df, test_df = train_test_split(df_cleaned, test_size=0.2, random_state=42)

In [9]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
def aita_tokenize(c):
    return tokenizer(c, padding='max_length', truncation=True, max_length=512) # longer text will be truncated, the max context length for this model is 512

train_encodings = [aita_tokenize(text) for text in train_df['content']]
test_encodings = [aita_tokenize(text) for text in test_df['content']]


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [16]:
def create_tf_dataset(encodings, labels):
    return tf.data.Dataset.from_tensor_slices((
        {
            'input_ids': [encoding['input_ids'] for encoding in encodings], # numerical rep of tokens
            'attention_mask': [encoding['attention_mask'] for encoding in encodings],
            'token_type_ids': [encoding['token_type_ids'] for encoding in encodings] if 'token_type_ids' in encodings[0] else None,
        },
        labels
    ))

train_labels = train_df['label'].values
test_labels = test_df['label'].values

train_dataset = create_tf_dataset(train_encodings, train_labels)
test_dataset = create_tf_dataset(test_encodings, test_labels)


In [17]:
inp, out = next(iter(train_dataset)) # a batch from train_dataset
print(inp, '\n\n', out)

{'input_ids': <tf.Tensor: shape=(512,), dtype=int32, numpy=
array([  101,  1045,  2109,  2000,  2022,  2428,  2204,  2814,  2007,
        3419,  1012,  3419, 10047,  2080,  2038,  2081,  1037,  2843,
        1997,  2919,  4319,  9804,  1999,  2166,  1012,  2002,  2356,
        2033,  1037,  2261,  2706,  2067,  2065,  2002,  2071, 17781,
        1019,  2243,  1012,  2002,  2056,  2002,  2001, 11573,  1998,
        2734,  2393,  1998,  2002,  2071,  3477,  2033,  2067,  2184,
        2243,  2306,  1020,  2706,  1012,  1045,  3530,  1998,  2435,
        2032,  1996,  1019,  2243,  1012,  1045,  2069,  2191,  2871,
        2243,  1037,  2095,  1012,  1020,  2706,  2101,  2002,  2001,
       12097,  2025,  2000,  3342,  1998,  1045,  2741,  2032,  1037,
       12117, 12326,  1997,  1996,  2310,  2078,  5302,  1998,  2256,
        6981,  2002,  3530,  2000,  3477,  2033,  2067,  1012,  2002,
        2056,  1045,  2196,  2435,  2032,  2769,  1998,  2000,  2681,
        2032,  2894,  1012,  1

In [18]:
BATCH_SIZE = 16

train_dataset = train_dataset.shuffle(len(train_dataset)).batch(BATCH_SIZE)
test_dataset = test_dataset.batch(BATCH_SIZE)


In [19]:
model = TFAutoModel.from_pretrained("bert-base-uncased")


class BERTForClassification(tf.keras.Model):

    def __init__(self, bert_model, num_classes):
        super().__init__()
        self.bert = bert_model
        self.fc = tf.keras.layers.Dense(num_classes, activation='softmax')

    def call(self, inputs):
        x = self.bert(inputs)[1]
        return self.fc(x)

classifier = BERTForClassification(model, num_classes=2)

classifier.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=['accuracy']
)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [20]:
history = classifier.fit(
    train_dataset,
    epochs=2,  # Number of epochs can be adjusted
    validation_data=test_dataset
)


Epoch 1/2
Epoch 2/2


In [21]:
classifier.evaluate(test_dataset)



[0.39542436599731445, 0.845283031463623]