# 导入训练集和测试集

In [1]:
import pandas as pd
# Read both competition splits with identical options; column 0 of each CSV
# (shown as `id` in the previews below) becomes the DataFrame index.
train, test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        "../input/nlp-getting-started/train.csv",
        "../input/nlp-getting-started/test.csv",
    )
)

In [2]:
# Preview the first five rows of the training frame.
train.head(n=5)

Unnamed: 0_level_0,keyword,location,text,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,,,Our Deeds are the Reason of this #earthquake M...,1
4,,,Forest fire near La Ronge Sask. Canada,1
5,,,All residents asked to 'shelter in place' are ...,1
6,,,"13,000 people receive #wildfires evacuation or...",1
7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [3]:
# Preview the first five rows of the test frame (it has no `target` column).
test.head(n=5)

Unnamed: 0_level_0,keyword,location,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,,,Just happened a terrible car crash
2,,,"Heard about #earthquake is different cities, s..."
3,,,"there is a forest fire at spot pond, geese are..."
9,,,Apocalypse lighting. #Spokane #wildfires
11,,,Typhoon Soudelor kills 28 in China and Taiwan


# 从训练集中拆分验证集

In [4]:
from sklearn.model_selection import train_test_split

# Separate the label column from the features WITHOUT mutating `train`.
# An in-place drop would make this cell fail on a second run (the `target`
# column would already be gone) and would silently change a frame that an
# earlier cell has displayed.
target = train['target']
features = train.drop(columns='target')

# Hold out 5% of the rows for validation; stratifying on the label keeps the
# class ratio the same in both parts, and the fixed seed makes the split
# reproducible.
df_train, df_val, y_train, y_val = train_test_split(
    features,
    target,
    test_size=0.05,
    stratify=target,
    random_state=42,
)

# Re-attach the labels under the column name `label` (presumably the name the
# Hugging Face Trainer expects -- confirm). `assign` aligns on the index, like
# the original column assignment did, but returns new frames instead of
# writing into the slices produced by `train_test_split`.
df_train = df_train.assign(label=y_train)
df_val = df_val.assign(label=y_val)

# 更改数据形状以适应 huggingface 框架

In [5]:
from datasets import Dataset, DatasetDict
# Convert both pandas frames to Hugging Face Datasets, then group them in one
# DatasetDict keyed by split name.
tds, vds = (Dataset.from_pandas(frame) for frame in (df_train, df_val))

disaster_tweets = DatasetDict({'train': tds, 'validation': vds})

In [6]:
# Wrap the test frame in a DatasetDict under the 'test' split.
# The Dataset gets its own name so that `test` stays a pandas DataFrame:
# rebinding `test` to a Dataset made this cell impossible to re-run, because
# the second run would pass a Dataset to `Dataset.from_pandas`.
test_ds = Dataset.from_pandas(test)
disaster_tweets_test = DatasetDict({'test': test_ds})

# 将数据编码为序列

In [7]:
from transformers import AutoTokenizer
# Checkpoint name; reused later to load the classification model and to build
# the training output directory name.
model_ckpt = 'distilbert-base-uncased'
# Load the tokenizer that belongs to the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

def tokenize(batch):
    """Tokenize the ``text`` field of a batch with the module-level tokenizer.

    Args:
        batch: Mapping with a ``'text'`` entry holding the texts to encode.

    Returns:
        Whatever the tokenizer returns for those texts, with padding and
        truncation both enabled.
    """
    texts = batch['text']
    encoded = tokenizer(texts, padding=True, truncation=True)
    return encoded

# Encode every split with the same settings. `batch_size=None` hands each split
# to `tokenize` as a single batch (the progress bars below show 1 batch per
# split), so `padding=True` pads to the longest text within that split.
disaster_tweets_encoded, disaster_tweets_test_encoded = (
    splits.map(tokenize, batched=True, batch_size=None)
    for splits in (disaster_tweets, disaster_tweets_test)
)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

# 模型训练

In [8]:
# Pick the compute device: the GPU when CUDA is available, otherwise the CPU
import torch
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Instantiate the model: the pretrained checkpoint with a sequence
# classification head sized for two labels
from transformers import AutoModelForSequenceClassification
num_labels = 2
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=num_labels,
)
# Move the weights to the selected device (`.to` returns the module itself)
model = model.to(device)

# Configure the training arguments
from transformers import TrainingArguments
batch_size = 64
# Log about once per epoch on a single device. The floor division yields 0 when
# the training split holds fewer rows than one batch, and step-based logging
# does not accept logging_steps=0, so clamp the value to at least 1.
logging_steps = max(1, len(disaster_tweets_encoded['train']) // batch_size)
# Directory where the Trainer writes its outputs
model_name = f"{model_ckpt}-finetuned-disaster"
training_args = TrainingArguments(
    report_to='none',                # no external experiment tracker
    output_dir=model_name,
    num_train_epochs=3,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=.01,
    evaluation_strategy='epoch',     # evaluate on the validation split after every epoch
    disable_tqdm=False,              # keep the progress bars
    logging_steps=logging_steps,
    log_level='error',
)

# Train and evaluate the model
from transformers import Trainer
from sklearn.metrics import accuracy_score, f1_score
def compute_metrics(pred) -> dict:
    """Score one evaluation pass with accuracy and weighted F1.

    Args:
        pred: Object exposing ``label_ids`` (the reference labels) and
            ``predictions`` (an array whose last axis is reduced with argmax
            to obtain the predicted class per example).

    Returns:
        A dict with the keys ``'accuracy'`` and ``'f1'``.
    """
    y_true = pred.label_ids
    # The index of the largest score along the last axis is the predicted class.
    y_pred = pred.predictions.argmax(axis=-1)
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': f1_score(y_true, y_pred, average='weighted'),
    }
# Wire the model, data splits, tokenizer and metric function into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=disaster_tweets_encoded['train'],
    eval_dataset=disaster_tweets_encoded['validation'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
# Kept as the last expression so the cell displays the returned TrainOutput.
trainer.train()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'pre_classifi

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4644,0.391632,0.826772,0.827009
2,0.3555,0.384677,0.824147,0.823625
3,0.314,0.388273,0.83727,0.836185


TrainOutput(global_step=339, training_loss=0.3779284496926277, metrics={'train_runtime': 87.7905, 'train_samples_per_second': 247.134, 'train_steps_per_second': 3.861, 'total_flos': 471517705516032.0, 'train_loss': 0.3779284496926277, 'epoch': 3.0})

In [9]:
# Run the fine-tuned model over the encoded test split. The next cell takes the
# argmax of `.predictions`, so despite the variable name these are presumably
# raw class scores rather than probabilities -- confirm.
proba_prediction_test = trainer.predict(
    disaster_tweets_test_encoded['test']
)

# 提交结果

In [10]:
import numpy as np
# Build the submission table: one row per test `id`, with the predicted class
# taken as the argmax over the last axis of the model outputs.
prediction_test = pd.DataFrame({
    'id': disaster_tweets_test['test']['id'],
    'target': proba_prediction_test.predictions.argmax(axis=-1),
})

prediction_test.head(n=5)

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1


In [11]:
# Write the submission file; the row index is omitted so only the `id` and
# `target` columns are written.
prediction_test.to_csv(path_or_buf='submission.csv', index=False)