In [12]:
import os
import shutil

import pandas as pd
import tensorflow as tf
import torch
import numpy as np
from transformers import BertTokenizer
import matplotlib.pyplot as plt

tf.get_logger().setLevel('ERROR')

In [13]:
df = pd.read_csv("events.csv")
to_drop = ['sort_order', 'player_in', 'player_out', 'shot_place', 'shot_outcome', 'is_goal',
       'location', 'bodypart', 'assist_method', 'situation', 'fast_break']
df.drop(to_drop, axis=1)
df_train = df[["text", "event_type"]]

In [14]:
df_train["event_type"] = df_train["event_type"].replace(dict.fromkeys([2,3,4,5,6,7,8,9,10,11], 0))
df_train.head(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train["event_type"] = df_train["event_type"].replace(dict.fromkeys([2,3,4,5,6,7,8,9,10,11], 0))


Unnamed: 0,text,event_type
0,Attempt missed. Mladen Petric (Hamburg) left f...,1
1,"Corner, Borussia Dortmund. Conceded by Dennis...",0
2,"Corner, Borussia Dortmund. Conceded by Heiko ...",0
3,Foul by Sven Bender (Borussia Dortmund).,0
4,Gokhan Tore (Hamburg) wins a free kick in the ...,0


In [None]:
df_train = df_train[:1000]
df_train, df_val, df_test = np.split(df_train.sample(frac=1, random_state=42), [int(.8*len(df_train)), int(.9*len(df_train))])

In [None]:
model_name = "bert-base-uncased"
max_length = 512

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=11)

In [None]:
from datasets import Dataset

df_train["label"] = df_train["event_type"]
train_dataset = Dataset.from_pandas(df_train[["text","label"]])


In [None]:
train_dataset = train_dataset.remove_columns('__index_level_0__')

In [None]:
train_dataset = train_dataset.map(lambda e: tokenizer(e["text"], padding='max_length', truncation=True), batched=True)

In [None]:
from datasets import load_metric

metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)

In [None]:
from transformers import Trainer, TrainingArguments
training_args = TrainingArguments(
    output_dir="./bert_results", 
    num_train_epochs =5, 
    per_device_eval_batch_size=16,
    evaluation_strategy='epoch',
    logging_dir='./bert_logs',
    logging_steps=10,

)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=train_dataset,
    compute_metrics=compute_metrics
)

In [None]:
trainer.train()