In [2]:
import os
import shutil

import pandas as pd
import tensorflow as tf
import torch
import numpy as np
from transformers import BertTokenizer
import matplotlib.pyplot as plt

tf.get_logger().setLevel('ERROR')

In [3]:
import io
df = pd.read_csv("events.csv")
to_drop = ['sort_order', 'player_in', 'player_out', 'shot_place', 'shot_outcome', 'is_goal',
       'location', 'bodypart', 'assist_method', 'situation', 'fast_break']
df.drop(to_drop, axis=1)
df_train = df[["text", "event_type"]]
df_train.isna().sum()

text          0
event_type    0
dtype: int64

In [4]:
df_train = df_train[:1000]

In [5]:
df_train, df_val, df_test = np.split(df_train.sample(frac=1, random_state=42), [int(.8*len(df_train)), int(.9*len(df_train))])

In [6]:
df_train["event_type"].replace({10: 5, 9:6}, inplace=True)
df_train["event_type"].unique()

array([1, 3, 2, 8, 6, 4, 7, 5], dtype=int64)

In [7]:
df_train["event_type"] = df_train["event_type"] - 1

In [8]:
model_name = "bert-base-uncased"
max_length = 512

In [9]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=11)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [10]:
from datasets import Dataset

df_train["label"] = df_train["event_type"]
train_dataset = Dataset.from_pandas(df_train[["text","label"]])


In [11]:
train_dataset = train_dataset.remove_columns('__index_level_0__')

In [12]:
train_dataset = train_dataset.map(lambda e: tokenizer(e["text"], padding='max_length', truncation=True), batched=True)

100%|██████████| 1/1 [00:00<00:00,  7.09ba/s]


In [13]:
from datasets import load_metric

metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)

Downloading: 3.20kB [00:00, 1.57MB/s]                   


In [14]:
from transformers import Trainer, TrainingArguments
training_args = TrainingArguments(
    output_dir="./bert_results", 
    num_train_epochs =5, 
    per_device_eval_batch_size=16,
    evaluation_strategy='epoch',
    logging_dir='./bert_logs',
    logging_steps=10,

)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=train_dataset,
    compute_metrics=compute_metrics
)

In [15]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running training *****
  Num examples = 800
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 500
  0%|          | 1/500 [00:18<2:37:50, 18.98s/it]

KeyboardInterrupt: 

In [29]:
df_test["event_type"].replace({10: 5, 9:6}, inplace=True)
df_test["event_type"].unique()
df_test["event_type"] = df_test["event_type"] - 1

In [30]:
from datasets import Dataset

df_test["label"] = df_test["event_type"]
test_dataset = Dataset.from_pandas(df_test[["text","label"]])

In [31]:
test_dataset = test_dataset.remove_columns('__index_level_0__')

In [32]:
test_dataset = test_dataset.map(lambda e: tokenizer(e["text"], padding='max_length', truncation=True), batched=True)

  0%|          | 0/1 [00:00<?, ?ba/s]

In [33]:
pred = trainer.predict(test_dataset)

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Prediction *****
  Num examples = 100
  Batch size = 16


RuntimeError: ignored

In [24]:
pred

PredictionOutput(predictions=array([[-0.77190715, -0.38698167, -1.34062   , ..., -0.60797393,
        -0.6146431 , -0.22512552],
       [-1.376326  , -0.77220845, -0.8273568 , ..., -0.9733109 ,
        -1.1197319 , -1.0041003 ],
       [-1.372234  , -0.7297904 , -0.876247  , ..., -0.9916304 ,
        -1.1221824 , -1.0055766 ],
       ...,
       [-0.962296  , -0.9263065 , -1.0154383 , ..., -0.75616115,
        -0.35822767, -0.733696  ],
       [-0.62212867, -0.79449713,  8.304422  , ..., -0.68821806,
        -0.63883156, -0.5953833 ],
       [-0.9946683 , -0.97754455, -0.9638526 , ..., -0.7337582 ,
        -0.3462433 , -0.6984777 ]], dtype=float32), label_ids=array([7, 8, 8, 3, 2, 7, 1, 9, 1, 8, 8, 3, 8, 1, 8, 2, 8, 3, 8, 8, 3, 1,
       1, 9, 8, 8, 2, 1, 1, 1, 2, 1, 2, 3, 1, 3, 8, 3, 3, 8, 8, 1, 8, 2,
       1, 1, 1, 8, 8, 2, 1, 1, 1, 3, 7, 3, 3, 7, 8, 1, 4, 3, 2, 3, 9, 2,
       1, 1, 8, 8, 2, 3, 9, 2, 2, 3, 1, 9, 8, 1, 3, 3, 1, 3, 8, 8, 1, 4,
       7, 2, 3, 3, 1, 3, 1, 8, 3, 9, 3, 

In [25]:
df_test

Unnamed: 0,text,event_type,label
815,"Substitution, Evian Thonon Gaillard. Guillaume...",6,7
862,Antoine Devaux (Toulouse) wins a free kick in ...,7,8
269,Markus Rosenberg (SV Werder Bremen) wins a fre...,7,8
201,Foul by Daniel Caligiuri (SC Freiburg).,2,3
161,"Corner, FC Augsburg. Conceded by Yacine Abdes...",1,2
...,...,...,...
106,Ilkay Gundogan (Borussia Dortmund) wins a free...,7,8
270,Foul by Richard Sukuta-Pasu (Kaiserslautern).,2,3
860,"Offside, Ajaccio. Fabrice Begeorgi tries a thr...",5,9
435,Foul by Siaka TiA©nA© (Paris Saint Germain).,2,3


In [28]:
pred.logits

AttributeError: ignored