In [1]:
# !kaggle competitions download -c artificial-text-detection-homework

In [7]:
# !mkdir atd_kaggle
# !mkdir atd_kaggle/data
# !mv artificial-text-detection-homework.zip atd_kaggle/data

mkdir: atd_kaggle: File exists
mkdir: atd_kaggle/data: File exists
mv: artificial-text-detection-homework.zip: No such file or directory


In [10]:
# !unzip atd_kaggle/data/artificial-text-detection-homework.zip -d atd_kaggle/data

Archive:  atd_kaggle/data/artificial-text-detection-homework.zip
  inflating: atd_kaggle/data/dev.csv  
  inflating: atd_kaggle/data/test.csv  


In [1]:
import pandas as pd

dev_df = pd.read_csv('atd_kaggle/data/dev.csv')
dev_df.rename(columns={'Text': 'text', 'Class': 'label'}, inplace=True)
dev_df.drop(columns=['ID'], inplace=True)
dev_df.head()

In [None]:
def remove_br_tags(text):
    return text.replace('<br />', '')

In [5]:
dev_df['text'] = dev_df['text'].apply(remove_br_tags)

In [6]:
from sklearn.model_selection import train_test_split

# split the data into train, validation and test sets
train, test = train_test_split(dev_df, test_size=0.4)
val, test = train_test_split(test, test_size=0.5)

# reset the index of the dataframes after the split
train.reset_index(inplace=True)
val.reset_index(inplace=True)
test.reset_index(inplace=True)

In [7]:
from datasets import Dataset, DatasetDict

# create an empty DatasetDict object
ds = DatasetDict()

# create a Dataset object from the train, validation and test dataframes
ds['train'] = Dataset.from_pandas(train)
ds['validation'] = Dataset.from_pandas(val)
ds['test'] = Dataset.from_pandas(test)

print(ds)

DatasetDict({
    train: Dataset({
        features: ['index', 'text', 'label'],
        num_rows: 1200
    })
    validation: Dataset({
        features: ['index', 'text', 'label'],
        num_rows: 400
    })
    test: Dataset({
        features: ['index', 'text', 'label'],
        num_rows: 400
    })
})


In [8]:
# save the dataset to disk
ds.save_to_disk('atd_kaggle/data/dataset')

Saving the dataset (0/1 shards):   0%|          | 0/1200 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/400 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/400 [00:00<?, ? examples/s]

In [9]:
# map class IDs to labels
id2label = {0: 'H', 1: 'M'}

# map labels to class IDs
label2id = {'H': 0, 'M': 1}

In [10]:
from transformers import AutoTokenizer

# load a pre-trained tokenizer for DistilBERT
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

In [11]:
# preprocess the data by tokenizing using the tokenizer dictionary and mapping the labels to class IDs
def preprocess(batch):
    tokenizer_batch = tokenizer(batch['text'], padding=True, truncation=True, max_length=128)
    tokenizer_batch['label'] = [label2id[label] for label in batch['label']]
    return tokenizer_batch

In [12]:
tokenized_ds = ds.map(preprocess, batched=True)
tokenized_ds

Map:   0%|          | 0/1200 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['index', 'text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 1200
    })
    validation: Dataset({
        features: ['index', 'text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 400
    })
    test: Dataset({
        features: ['index', 'text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 400
    })
})

In [13]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [14]:
import evaluate

accuracy = evaluate.load('accuracy')

In [15]:
import numpy as np

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [16]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2, id2label=id2label, label2id=label2id)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias', 'pre_classifier.weight', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
       

In [18]:
training_args = TrainingArguments(
    output_dir=f'tmp/',
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds['train'],
    eval_dataset=tokenized_ds['validation'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [19]:
# trainer.train()

In [23]:
model_ = AutoModelForSequenceClassification.from_pretrained('atd_kaggle/tmp/checkpoint-190')

trainer_ = Trainer(
    model=model_,
    compute_metrics=compute_metrics,
)

In [24]:
prediction = trainer_.predict(tokenized_ds['test'])
prediction.metrics

{'test_loss': 0.039804574102163315,
 'test_accuracy': 0.9925,
 'test_runtime': 7.8116,
 'test_samples_per_second': 51.206,
 'test_steps_per_second': 6.401}

In [1]:
import pandas as pd

test_df = pd.read_csv('atd_kaggle/data/test.csv')
test_df.rename(columns={'Text': 'text'}, inplace=True)
test_df.drop(columns=['ID'], inplace=True)
test_df.head()

Unnamed: 0,text
0,Although The Mole People isn't the best of Uni...
1,"This is a terrible, terrible film.<br /><br />..."
2,I had been warned about Mike Leigh's 'All or N...
3,"Burt Lancaster(who I just thought was great, t..."
4,"It's not plot driven, OK; !!!!!! ----------..."


In [5]:
def remove_br_tags(text):
    return text.replace('<br />', '')

In [6]:
test_df['text'] = test_df['text'].apply(remove_br_tags)

In [7]:
from datasets import Dataset, DatasetDict

# create an empty DatasetDict object
ds = DatasetDict()

ds['test'] = Dataset.from_pandas(test_df)

In [8]:
ds

DatasetDict({
    test: Dataset({
        features: ['text'],
        num_rows: 20000
    })
})

In [9]:
from transformers import AutoTokenizer

# load a pre-trained tokenizer for DistilBERT
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

In [10]:
def preprocess(batch):
    tokenizer_batch = tokenizer(batch['text'], padding=True, truncation=True, max_length=128)
    return tokenizer_batch

In [11]:
tokenized_ds = ds.map(preprocess, batched=True)
tokenized_ds

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

DatasetDict({
    test: Dataset({
        features: ['text', 'input_ids', 'attention_mask'],
        num_rows: 20000
    })
})

In [12]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained('atd_kaggle/tmp/checkpoint-190')

trainer = Trainer(
    model=model,
    # compute_metrics=compute_metrics,
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [13]:
prediction = trainer.predict(tokenized_ds['test'])

loc("mps_reshape_2"("(mpsFileLoc): /AppleInternal/Library/BuildRoots/0032d1ee-80fd-11ee-8227-6aecfccc70fe/Library/Caches/com.apple.xbs/Sources/MetalPerformanceShadersGraph/mpsgraph/MetalPerformanceShadersGraph/Core/Files/MPSGraphUtilities.mm":303:0)): error: 'anec.reshape' op failed: input tensor dimensions are not supported on ANEs.
loc("Cast back to bool"("(mpsFileLoc): /AppleInternal/Library/BuildRoots/0032d1ee-80fd-11ee-8227-6aecfccc70fe/Library/Caches/com.apple.xbs/Sources/MetalPerformanceShadersGraph/mpsgraph/MetalPerformanceShadersGraph/Core/Files/MPSGraphUtilities.mm":745:0)): error: 'anec.gain_offset_control' op result #0 must be 4D/5D memref of 16-bit float or 8-bit signed integer or 8-bit unsigned integer values, but got 'memref<8x12x128x128xi1>'


In [14]:
prediction.metrics

{'test_runtime': 290.8517,
 'test_samples_per_second': 68.764,
 'test_steps_per_second': 8.595}

In [15]:
import numpy as np

predicted_labels = np.argmax(prediction.predictions, axis=1)

id2label = {0: 'H', 1: 'M'}
predicted_labels_str = [id2label[label] for label in predicted_labels]

In [27]:
test_df['label'] = predicted_labels_str
test_df

Unnamed: 0,Text,Class,label
0,Although The Mole People isn't the best of Uni...,H,H
1,"This is a terrible, terrible film.The first tw...",H,H
2,I had been warned about Mike Leigh's 'All or N...,H,H
3,"Burt Lancaster(who I just thought was great, t...",H,H
4,"It's not plot driven, OK; !!!!!! ----------...",M,M
...,...,...,...
19995,People don't realize this is the first all dig...,H,H
19996,I wasn't going to write a review but I had to ...,H,H
19997,We're deep into student film urchins; as far a...,M,M
19998,Its about time for a Marvel movie to be great....,H,H


In [28]:
submission_df = pd.read_csv('atd_kaggle/data/test.csv')
submission_df = pd.DataFrame({
    'ID': submission_df['ID'],
    'Class': test_df['label']
})
submission_df.head()

Unnamed: 0,ID,Class
0,0,H
1,1,H
2,2,H
3,3,H
4,4,M


In [29]:
submission_df.to_csv('atd_kaggle/data/submission.csv', index=False)

In [30]:
!kaggle competitions submit -c artificial-text-detection-homework -f atd_kaggle/data/submission.csv -m "Submission"

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


100%|████████████████████████████████████████| 145k/145k [00:02<00:00, 71.5kB/s]
Successfully submitted to Artificial Text Detection Homework