In [None]:
!pip install datasets transformers

In [None]:
import pandas as pd
import numpy as np
from transformers import pipeline


# Load dataset

In [None]:
# Project root (looks like a Colab Google Drive mount); later cells build
# every data and submission path by appending to `path`.
path = '/content/drive/MyDrive/NLP with Disaster Tweets/'
train = pd.read_csv(path + 'data/train.csv')
test = pd.read_csv(path + 'data/test.csv')

# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the cell output.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB
None


# Mislabeled Samples
There are 18 unique tweets in the training set whose duplicates carry different labels. These tweets were probably labeled by different people who interpreted them differently, because some of them are not very clear. Tweets with two distinct target values are relabeled below, since conflicting labels can affect the training score.

In [None]:
# Count the distinct values of every column for each unique tweet text, then
# sort so the texts with the most distinct `target` values come first.
train_mislabeled = train.groupby(['text']).nunique().sort_values(by='target', ascending=False)
# Keep only the texts whose duplicates carry more than one distinct label.
texts_mislabeled = train_mislabeled[train_mislabeled['target'] >1]['target'].index.tolist()
texts_mislabeled

['like for the music video I want some real action shit like burning buildings and police chases not some weak ben winston shit',
 'Hellfire! We don\x89Ûªt even want to think about it or mention it so let\x89Ûªs not do anything that leads to it #islam!',
 "The Prophet (peace be upon him) said 'Save yourself from Hellfire even if it is by giving half a date in charity.'",
 'In #islam saving a person is equal in reward to saving all humans! Islam is the opposite of terrorism!',
 'To fight bioterrorism sir.',
 'Who is bringing the tornadoes and floods. Who is bringing the climate change. God is after America He is plaguing her\n \n#FARRAKHAN #QUOTE',
 '#foodscare #offers2go #NestleIndia slips into loss after #Magginoodle #ban unsafe and hazardous for #humanconsumption',
 '#Allah describes piling up #wealth thinking it would last #forever as the description of the people of #Hellfire in Surah Humaza. #Reflect',
 'He came to a land which was engulfed in tribal war and turned it into a land 

In [None]:
# Hand-picked label for each tweet text whose duplicates disagree on `target`.
# Keys must match the raw tweet text exactly (including the mis-encoded
# "\x89Ûª" sequences and embedded newlines).
relabeled_targets = {
    'like for the music video I want some real action shit like burning buildings and police chases not some weak ben winston shit': 0,
    'Hellfire! We don\x89Ûªt even want to think about it or mention it so let\x89Ûªs not do anything that leads to it #islam!': 0,
    "The Prophet (peace be upon him) said 'Save yourself from Hellfire even if it is by giving half a date in charity.'": 0,
    'In #islam saving a person is equal in reward to saving all humans! Islam is the opposite of terrorism!': 0,
    'To fight bioterrorism sir.': 0,
    'Who is bringing the tornadoes and floods. Who is bringing the climate change. God is after America He is plaguing her\n \n#FARRAKHAN #QUOTE': 1,
    '#foodscare #offers2go #NestleIndia slips into loss after #Magginoodle #ban unsafe and hazardous for #humanconsumption': 0,
    '#Allah describes piling up #wealth thinking it would last #forever as the description of the people of #Hellfire in Surah Humaza. #Reflect': 0,
    'He came to a land which was engulfed in tribal war and turned it into a land of peace i.e. Madinah. #ProphetMuhammad #islam': 0,
    'RT NotExplained: The only known image of infamous hijacker D.B. Cooper. http://t.co/JlzK2HdeTG': 1,
    'Hellfire is surrounded by desires so be careful and don\x89Ûªt let your desires control you! #Afterlife': 0,
    'CLEARED:incident with injury:I-495  inner loop Exit 31 - MD 97/Georgia Ave Silver Spring': 1,
    "Mmmmmm I'm burning.... I'm burning buildings I'm building.... Oooooohhhh oooh ooh...": 0,
    'wowo--=== 12000 Nigerian refugees repatriated from Cameroon': 0,
    '.POTUS #StrategicPatience is a strategy for #Genocide; refugees; IDP Internally displaced people; horror; etc. https://t.co/rqWuoy1fm4': 1,
    'Caution: breathing may be hazardous to your health.': 1,
    'I Pledge Allegiance To The P.O.P.E. And The Burning Buildings of Epic City. ??????': 0,
    'that horrible sinking feeling when you\x89Ûªve been at home on your phone for a while and you realise its been on 3G this whole time': 0,
}

# Start from the original labels, then overwrite every copy of each listed tweet.
train['target_relabeled'] = train['target'].copy()
for tweet_text, corrected_label in relabeled_targets.items():
    train.loc[train['text'] == tweet_text, 'target_relabeled'] = corrected_label

# Data Cleaning


In [None]:
# Stack train on top of test so both receive exactly the same cleaning.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# the test rows keep their own 0-based labels, so index labels repeat and any
# label-based lookup or alignment on `df` becomes ambiguous.
df = pd.concat([train, test], ignore_index=True)
df.shape

(10876, 6)

In [None]:
from transformers import AdamW, AutoTokenizer, AutoModelForSequenceClassification
checkpoint = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
print(tokenizer.tokenize('Hellfire! We don\x89Ûªt even want to think about it or mention it so let\x89Ûªs not do anything that leads to it #islam!'))

['hell', '##fire', '!', 'we', 'don', '##u', '##ª', '##t', 'even', 'want', 'to', 'think', 'about', 'it', 'or', 'mention', 'it', 'so', 'let', '##u', '##ª', '##s', 'not', 'do', 'anything', 'that', 'leads', 'to', 'it', '#', 'islam', '!']


## Removing urls

In [None]:
import re

In [None]:
example='RT NotExplained: The only known image of infamous hijacker D.B. Cooper. http://t.co/JlzK2HdeTG'
print(tokenizer.tokenize(example))

['rt', 'note', '##x', '##pl', '##aine', '##d', ':', 'the', 'only', 'known', 'image', 'of', 'infamous', 'hi', '##jack', '##er', 'd', '.', 'b', '.', 'cooper', '.', 'http', ':', '/', '/', 't', '.', 'co', '/', 'j', '##lz', '##k', '##2', '##hd', '##et', '##g']


In [None]:
def remove_URL(text):
    """Return `text` with every URL deleted.

    A URL here is any run of non-whitespace characters that starts with
    ``http://``, ``https://`` or ``www.``. Surrounding whitespace is kept.
    """
    return re.sub(r'https?://\S+|www\.\S+', '', text)

print(tokenizer.tokenize( remove_URL(example)))

['rt', 'note', '##x', '##pl', '##aine', '##d', ':', 'the', 'only', 'known', 'image', 'of', 'infamous', 'hi', '##jack', '##er', 'd', '.', 'b', '.', 'cooper', '.']


In [None]:
# Strip URLs from every tweet (train and test rows alike).
df['text'] = df['text'].apply(remove_URL)

## Removing HTML tags

In [None]:
example = """<div>
<h1>Real or Fake</h1>
<p>Kaggle </p>
<a href="https://www.kaggle.com/c/nlp-getting-started">getting started</a>
</div>"""
print(tokenizer.tokenize(example))

['<', 'di', '##v', '>', '<', 'h', '##1', '>', 'real', 'or', 'fake', '<', '/', 'h', '##1', '>', '<', 'p', '>', 'ka', '##ggle', '<', '/', 'p', '>', '<', 'a', 'hr', '##ef', '=', '"', 'https', ':', '/', '/', 'www', '.', 'ka', '##ggle', '.', 'com', '/', 'c', '/', 'nl', '##p', '-', 'getting', '-', 'started', '"', '>', 'getting', 'started', '<', '/', 'a', '>', '<', '/', 'di', '##v', '>']


In [None]:
def remove_html(text):
    """Return `text` with every ``<...>`` tag deleted.

    The match is non-greedy, so each tag is removed on its own and the text
    between tags is kept. A ``<`` with no closing ``>`` on the same line is
    left alone.
    """
    return re.sub(r'<.*?>', '', text)

print(tokenizer.tokenize( remove_html(example)))

['real', 'or', 'fake', 'ka', '##ggle', 'getting', 'started']


In [None]:
# Strip HTML tags from every tweet.
df['text'] = df['text'].apply(remove_html)

## Removing \x89Ûª

In [None]:
example = 'Hellfire! We don\x89Ûªt even want to think about it or mention it so let\x89Ûªs not do anything that leads to it #islam!'
#print(tokenizer.tokenize(example))

In [None]:
def remove_ua(text):
    """Replace each mis-encoded ``\\x89Ûª`` sequence in `text` with an apostrophe.

    In this data set the sequence appears where an apostrophe belongs
    (e.g. ``don\\x89Ûªt``), so it is mapped back to ``'``.
    """
    return re.sub(r'\x89Ûª', "'", text)

print(tokenizer.tokenize( remove_ua(example)))

['hell', '##fire', '!', 'we', 'don', "'", 't', 'even', 'want', 'to', 'think', 'about', 'it', 'or', 'mention', 'it', 'so', 'let', "'", 's', 'not', 'do', 'anything', 'that', 'leads', 'to', 'it', '#', 'islam', '!']


In [None]:
# Restore apostrophes in every tweet.
df['text'] = df['text'].apply(remove_ua)

## Removing Mentions

In [None]:
example = df.iloc[97].text
print(example)
print(tokenizer.tokenize(example))

#BREAKING: there was a deadly motorcycle car accident that happened to #Hagerstown today. I'll have more details at 5 @Your4State. #WHAG
['#', 'breaking', ':', 'there', 'was', 'a', 'deadly', 'motorcycle', 'car', 'accident', 'that', 'happened', 'to', '#', 'ha', '##gers', '##town', 'today', '.', 'i', "'", 'll', 'have', 'more', 'details', 'at', '5', '@', 'your', '##4', '##sta', '##te', '.', '#', 'w', '##ha', '##g']


In [None]:
def remove_mention(text):
    """Return `text` with every ``@handle`` deleted.

    A handle is ``@`` followed by one or more ASCII letters, digits or
    underscores; whatever follows the handle is kept.
    """
    return re.sub("@[A-Za-z0-9_]+", '', text)
print(tokenizer.tokenize(remove_mention(example)))

['#', 'breaking', ':', 'there', 'was', 'a', 'deadly', 'motorcycle', 'car', 'accident', 'that', 'happened', 'to', '#', 'ha', '##gers', '##town', 'today', '.', 'i', "'", 'll', 'have', 'more', 'details', 'at', '5', '.', '#', 'w', '##ha', '##g']


In [None]:
# Strip @mentions from every tweet.
df['text'] = df['text'].apply(remove_mention)

## Removing Hashtags

In [None]:
#example = "#BREAKING: there was a deadly motorcycle car accident that happened to #Hagerstown today. I'll have more details at 5 @Your4State. #WHAG"
example = df.iloc[97].text
print(example)
print(tokenizer.tokenize(example))

#BREAKING: there was a deadly motorcycle car accident that happened to #Hagerstown today. I'll have more details at 5 . #WHAG
['#', 'breaking', ':', 'there', 'was', 'a', 'deadly', 'motorcycle', 'car', 'accident', 'that', 'happened', 'to', '#', 'ha', '##gers', '##town', 'today', '.', 'i', "'", 'll', 'have', 'more', 'details', 'at', '5', '.', '#', 'w', '##ha', '##g']


In [None]:
import string
def remove_hashtag(text):
    """Return `text` with every ``#`` character deleted.

    Only the hash sign is dropped; the tag word itself stays in the text.
    """
    return text.replace('#', '')

print(tokenizer.tokenize(remove_hashtag(example)))

['breaking', ':', 'there', 'was', 'a', 'deadly', 'motorcycle', 'car', 'accident', 'that', 'happened', 'to', 'ha', '##gers', '##town', 'today', '.', 'i', "'", 'll', 'have', 'more', 'details', 'at', '5', '.', 'w', '##ha', '##g']


In [None]:
# Drop the '#' sign (but keep the tag word) in every tweet.
df['text'] = df['text'].apply(remove_hashtag)

## Removing Emojis

In [None]:
# Reference : https://gist.github.com/slowkow/7a7f61f495e3dbb7e3d767f97bd7304b
def remove_emoji(text):
    """Return `text` with emoji and pictographic symbols deleted.

    The previous pattern contained the single range U+24C2..U+1F251, which
    spans most of the Basic Multilingual Plane and therefore also deleted
    ordinary text such as CJK and Hangul characters. That range is replaced
    here by the individual symbol blocks, so only pictographs are removed.
    """
    emoji_pattern = re.compile("["
                           "\U0001F600-\U0001F64F"  # emoticons
                           "\U0001F300-\U0001F5FF"  # symbols & pictographs
                           "\U0001F680-\U0001F6FF"  # transport & map symbols
                           "\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "\U00002702-\U000027B0"  # dingbats
                           "\U000024C2"             # circled letter M
                           "\U00002600-\U000026FF"  # miscellaneous symbols
                           "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
                           "\U0000FE00-\U0000FE0F"  # variation selectors attached to emoji
                           "\U0001F000-\U0001F251"  # tiles, cards, enclosed alphanumerics/ideographs
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

remove_emoji("Omg another Earthquake 😔😔")

'Omg another Earthquake '

In [None]:
# Strip emoji from every tweet.
df['text'] = df['text'].apply(remove_emoji)

In [None]:
df.iloc[8485].text

" \x89Û¢This Is Called DAMAGE CONTROL. Don't Be Fooled Folks. The Perps Will Just Come Up With Plan B..."

## Removing \x89Û

In [None]:
example = df.iloc[8485].text
print(example)
print(tokenizer.tokenize(example))

 Û¢This Is Called DAMAGE CONTROL. Don't Be Fooled Folks. The Perps Will Just Come Up With Plan B...
['u', '##¢', '##thi', '##s', 'is', 'called', 'damage', 'control', '.', 'don', "'", 't', 'be', 'fooled', 'folks', '.', 'the', 'per', '##ps', 'will', 'just', 'come', 'up', 'with', 'plan', 'b', '.', '.', '.']


In [None]:
def remove_x89(text):
    """Delete each ``\\x89Û`` marker in `text` together with the junk that follows it.

    With ``re.ASCII``, ``\\W`` matches every character that is not an ASCII
    letter, digit or underscore. The run removed after the marker therefore
    includes non-ASCII residue and also any spaces or punctuation directly
    behind it, up to the next ASCII word character.
    """
    return re.sub(r'\x89Û[\W]*', '', text, flags=re.ASCII)

remove_x89("\x89ÛÓ\x89Û¢$¢¢åÊThis Is Called DAMAGE CONTROL. Don't Be Fooled Folks. The Perps Will Just Come Up With Plan")
#remove_x89("\x89ÛÏWhen")
#remove_x89("fromåÊwounds")

"This Is Called DAMAGE CONTROL. Don't Be Fooled Folks. The Perps Will Just Come Up With Plan"

In [None]:
# Strip the remaining \x89Û artefacts from every tweet.
df['text'] = df['text'].apply(remove_x89)

## to be determined

In [None]:
# Spot-check 10 randomly chosen cleaned tweets. Positions are drawn from the
# frame's actual length rather than a hard-coded row count, so the cell keeps
# working if the number of rows changes.
for i in np.random.randint(0, len(df), 10):
    print(i)
    print(df.iloc[i].text)

6282
Finna storm. Fuck my back boutta start hurting like a mf ??????
5588
The Latest: More Homes Razed by Northern California Wildfire - ABC News 
141
family members of osama bin laden have died in an airplane accident how ironic ?????? mhmmm gov shit i suspect
96
Has an accident changed your life? We will help you determine options that can financially support life care plans and on-going treatment.
9988
'I shut my eyes and the music broke over me like a rainstorm.' - Sylvia Plath (via petrichour) 
7686
BreakingNews Experts in France begin examining airplane debris found on Reunion Island: French air accident e...  
10626
I had trouble breathing while listening to kian singing omg
9806


SAN ONOFRE NUCLEAR REACTOR WASTE TO BE BURIED UNDER SHORELINE USING 3 BILLION IN TAX PAYER DOLLARS
4321
Tension In Bayelsa As Patience Jonathan Plans To Hijack APC PDP - 
1909
Disillusioned lead character 
Check
Happy go lucky free spirit girl
Check
Dream life crushed
Check
Great music
Check
All Crowe

In [None]:
pd.concat([train, test], axis = 0).iloc[8485].text

"@beforeitsnews \x89Û¢This Is Called DAMAGE CONTROL. Don't Be Fooled Folks. The Perps Will Just Come Up With Plan B..."

# Correct Spelling

## Unknown Words

In [None]:
!pip install pyspellchecker

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspellchecker
  Downloading pyspellchecker-0.7.0-py3-none-any.whl (2.5 MB)
[K     |████████████████████████████████| 2.5 MB 12.5 MB/s 
[?25hInstalling collected packages: pyspellchecker
Successfully installed pyspellchecker-0.7.0


In [None]:
from spellchecker import SpellChecker
import string

spell = SpellChecker()

def remove_punct(text):
    """Return `text` with every character listed in ``string.punctuation`` deleted."""
    # Mapping a code point to None tells str.translate to drop that character.
    deletions = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(deletions)

def unknown_words(text):
    """Return a list of the words in `text` that the spell checker does not know.

    Punctuation is removed first, the text is split on whitespace, and the
    tokens are passed to ``spell.unknown``; its result is returned as a list.
    """
    tokens = remove_punct(text).split()
    return list(spell.unknown(tokens))

unknown_words(df.text.iloc[1])

['sask', 'ronge']

In [None]:
# One list of unrecognised words per tweet. The column name's spelling
# ('unkown_words') is kept because later cells read it under that name.
df['unkown_words'] = df['text'].apply(unknown_words)

## Correct

In [None]:
df['unkown_words'] 

0                              []
1                   [sask, ronge]
2                              []
3                              []
4                              []
                  ...            
3258            [xrwn, fasteners]
3259     [20000k, cityamp3others]
3260                           []
3261                        [hwo]
3262    [cityofcalgary, yycstorm]
Name: unkown_words, Length: 10876, dtype: object

In [None]:
def auto_correct(unknown_words):
    """Ask the spell checker for a correction of every word in `unknown_words`.

    Returns a list of three lists:
      [0] the words for which ``spell.correction`` returned a suggestion,
      [1] those suggestions, in the same order as [0],
      [2] the words for which ``spell.correction`` returned None.
    """
    corrected, suggestions, uncorrected = [], [], []
    if unknown_words == []:
        return [corrected, suggestions, uncorrected]

    for token in unknown_words:
        suggestion = spell.correction(token)
        if suggestion is not None:
            corrected.append(token)
            suggestions.append(suggestion)
        else:
            uncorrected.append(token)
    return [corrected, suggestions, uncorrected]

auto_correct(df['unkown_words'].iloc[1])

[['sask', 'ronge'], ['ask', 'range'], []]

In [None]:
# auto_correct returns three lists per row; wrapping them in a pd.Series makes
# apply() expand them into three columns, which are assigned to the three new
# columns in the order listed on the left.
df[['words_w_corr', 'auto_corrections', 'words_wo_corr']] = df['unkown_words'].apply(lambda x: pd.Series(auto_correct(x)))

In [None]:
# Plain assignment: df_auto_corr is another name for the same frame as df, not a copy.
df_auto_corr = df
# Save the frame (without its index) so the hand-correction section can reload
# it with pd.read_parquet.
df_auto_corr.to_parquet(path + 'data/data_w_spell_correction', index = False)

In [None]:
df[['text', 'words_w_corr', 'auto_corrections', 'words_wo_corr']]

Unnamed: 0,text,words_w_corr,auto_corrections,words_wo_corr
0,Our Deeds are the Reason of this earthquake Ma...,[],[],[]
1,Forest fire near La Ronge Sask. Canada,"[sask, ronge]","[ask, range]",[]
2,All residents asked to 'shelter in place' are ...,[],[],[]
3,"13,000 people receive wildfires evacuation ord...",[],[],[]
4,Just got sent this photo from Ruby Alaska as s...,[],[],[]
...,...,...,...,...
3258,EARTHQUAKE SAFETY LOS ANGELES SAFETY FASTENERS...,"[xrwn, fasteners]","[down, fastener]",[]
3259,Storm in RI worse than last hurricane. My city...,[],[],"[20000k, cityamp3others]"
3260,Green Line derailment in Chicago,[],[],[]
3261,MEG issues Hazardous Weather Outlook (HWO),[hwo],[how],[]


## Hand Correction

In [None]:
df = pd.read_parquet(path + 'data/data_w_spell_correction')

In [None]:
df[['text', 'words_w_corr', 'auto_corrections', 'words_wo_corr']]

Unnamed: 0,text,words_w_corr,auto_corrections,words_wo_corr
0,Our Deeds are the Reason of this earthquake Ma...,[],[],[]
1,Forest fire near La Ronge Sask. Canada,"[sask, ronge]","[ask, range]",[]
2,All residents asked to 'shelter in place' are ...,[],[],[]
3,"13,000 people receive wildfires evacuation ord...",[],[],[]
4,Just got sent this photo from Ruby Alaska as s...,[],[],[]
...,...,...,...,...
10871,EARTHQUAKE SAFETY LOS ANGELES SAFETY FASTENERS...,"[xrwn, fasteners]","[down, fastener]",[]
10872,Storm in RI worse than last hurricane. My city...,[],[],"[20000k, cityamp3others]"
10873,Green Line derailment in Chicago,[],[],[]
10874,MEG issues Hazardous Weather Outlook (HWO),[hwo],[how],[]


In [None]:
type(df.unkown_words.iloc[1])

str

# train_val_split


In [None]:
# df was built by stacking train on top of test, so the first len(train) rows
# are the training tweets and the remainder are the test tweets.
n_train = train.shape[0]
train_cleaned = df.iloc[:n_train].copy()
test_cleaned = df.iloc[n_train:].copy()

# The unlabeled test rows put NaN into 'target_relabeled', which turns the whole
# column into floats (the TF dataset below reported float32 labels). Restore
# integer class ids on the training rows before they become model labels.
train_cleaned['target_relabeled'] = train_cleaned['target_relabeled'].astype('int64')

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labeled tweets for validation.
# random_state makes the split (and every score derived from it) reproducible;
# stratify keeps the class ratio the same in both parts.
train_cleaned, val_cleaned = train_test_split(
    train_cleaned,
    test_size=0.2,
    random_state=42,
    stratify=train_cleaned['target_relabeled'],
)

In [None]:
from datasets import Dataset
# Convert each pandas split into a Hugging Face Dataset; the pandas index is
# not carried over as a column.
train_hf = Dataset.from_pandas(train_cleaned, preserve_index = False)
val_hf = Dataset.from_pandas(val_cleaned, preserve_index = False)
test_hf = Dataset.from_pandas(test_cleaned, preserve_index = False)

# Expose the relabeled target under the name 'label' for the labeled splits.
# test_hf is left as is (it keeps its 'target_relabeled' column).
train_hf = train_hf.rename_column('target_relabeled', 'label')
val_hf = val_hf.rename_column('target_relabeled', 'label')

train_hf

Dataset({
    features: ['id', 'keyword', 'location', 'text', 'target', 'label'],
    num_rows: 6090
})

# Preprocess

In [None]:
from transformers import AdamW, AutoTokenizer, AutoModelForSequenceClassification
checkpoint = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
for i in range(5,10):
    print(train_hf[i]['text'])
    print(tokenizer.tokenize(train_hf[i]['text']))

FedEx no longer will ship potential bioterror pathogens - Atlanta Business Chronicle  - keyword: bioterror
['fed', '##ex', 'no', 'longer', 'will', 'ship', 'potential', 'bio', '##ter', '##ror', 'pathogen', '##s', '-', 'atlanta', 'business', 'chronicle', '-', 'key', '##word', ':', 'bio', '##ter', '##ror']
Remove the  and Linkury Browser Hijacker   - keyword: hijacker
['remove', 'the', 'and', 'link', '##ury', 'browser', 'hi', '##jack', '##er', '-', 'key', '##word', ':', 'hi', '##jack', '##er']
Beach did damage to my shit - keyword: damage
['beach', 'did', 'damage', 'to', 'my', 'shit', '-', 'key', '##word', ':', 'damage']
Help me win $$$$ by having the most shares on my article! A Lifetime Of Fear   Thanks! BlackInAmerica GrowingUpBlack - keyword: fear
['help', 'me', 'win', '$', '$', '$', '$', 'by', 'having', 'the', 'most', 'shares', 'on', 'my', 'article', '!', 'a', 'lifetime', 'of', 'fear', 'thanks', '!', 'black', '##ina', '##meric', '##a', 'growing', '##up', '##bla', '##ck', '-', 'key', 

In [None]:
def tokenize_function(examples):
    """Tokenize the 'text' field of `examples` with the notebook-level tokenizer.

    Truncation is enabled. No padding is requested here; the data collator
    pads each batch later.
    """
    texts = examples['text']
    return tokenizer(texts, truncation=True)

In [None]:
# Tokenize every split; batched=True hands tokenize_function a batch of rows
# per call instead of a single row.
tokenized_train = train_hf.map(tokenize_function, batched = True)
tokenized_val = val_hf.map(tokenize_function, batched=True)
tokenized_test = test_hf.map(tokenize_function, batched = True)

  0%|          | 0/7 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

In [None]:
print(tokenized_train)
print(tokenized_test)

Dataset({
    features: ['id', 'keyword', 'location', 'text', 'target', 'label', 'input_ids', 'attention_mask'],
    num_rows: 6090
})
Dataset({
    features: ['id', 'keyword', 'location', 'text', 'target', 'target_relabeled', 'input_ids', 'attention_mask'],
    num_rows: 3263
})


In [None]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer = tokenizer, return_tensors='tf')

# Train

In [None]:
# Training batches: shuffled, padded per batch by the data collator.
tf_train_set = tokenized_train.to_tf_dataset(
    batch_size = 16,
    columns=['input_ids', 'attention_mask', 'label'],
    shuffle = True,
    collate_fn = data_collator
)

# Validation batches: order does not affect the metrics, so do not shuffle.
# NOTE(review): in older `datasets` releases drop_remainder defaulted to the
# shuffle setting, so shuffle=True would also have dropped the last partial
# validation batch - confirm against the installed version.
tf_val_set = tokenized_val.to_tf_dataset(
    batch_size = 16,
    columns=['input_ids', 'attention_mask', 'label'],
    shuffle = False,
    collate_fn = data_collator
)

# Test batches: no labels, and the order must be preserved because the
# predictions are later written to the submission file row by row.
tf_test_set = tokenized_test.to_tf_dataset(
    batch_size = 16,
    columns=['input_ids', 'attention_mask'],
    shuffle = False,
    collate_fn = data_collator
)

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [None]:
print(tf_train_set)
print(tf_test_set)

<PrefetchDataset element_spec={'input_ids': TensorSpec(shape=(None, None), dtype=tf.int64, name=None), 'attention_mask': TensorSpec(shape=(None, None), dtype=tf.int64, name=None), 'labels': TensorSpec(shape=(None,), dtype=tf.float32, name=None)}>
<PrefetchDataset element_spec={'input_ids': TensorSpec(shape=(None, None), dtype=tf.int64, name=None), 'attention_mask': TensorSpec(shape=(None, None), dtype=tf.int64, name=None)}>


In [None]:
from transformers import create_optimizer
import tensorflow as tf

batch_size = 16
num_epochs = 3

# Total number of optimizer steps, which create_optimizer needs to build its
# learning-rate schedule: full batches per epoch times the number of epochs.
batches_per_epoch = len(tokenized_train) // batch_size
total_train_steps = int(batches_per_epoch * num_epochs)
# No warm-up steps; the schedule starts at init_lr.
optimizer, schedule = create_optimizer(
    init_lr=2e-5, 
    num_train_steps=total_train_steps,
    num_warmup_steps=0
)

In [None]:
from transformers import TFAutoModelForSequenceClassification

model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Downloading:   0%|          | 0.00/363M [00:00<?, ?B/s]

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_projector', 'vocab_layer_norm', 'activation_13', 'vocab_transform']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['dropout_19', 'pre_classifier', 'classifier']
You should probably TRAIN this model on a down-stream task to be able to use i

In [None]:
#!pip install tensorflow_addons
#import tensorflow_addons as tfa

In [None]:
# No `loss` argument is passed: as the message printed below explains, the
# model then uses its own internal loss computation.
model.compile(
    optimizer=optimizer,
    metrics=["accuracy"]
)

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


In [None]:
# Train for up to num_epochs epochs. EarlyStopping stops as soon as
# val_accuracy fails to improve for one epoch (patience = 1) and restores the
# weights from the best epoch.
model.fit(x = tf_train_set,
          validation_data = tf_val_set,
          epochs = num_epochs,
          callbacks = [tf.keras.callbacks.EarlyStopping(monitor = 'val_accuracy', patience = 1, restore_best_weights=True)]

          )

Epoch 1/3
Epoch 2/3


<keras.callbacks.History at 0x7f4298072450>

# Retrain using the whole training set

In [None]:
# Rebuild the training data from ALL labeled rows (the earlier split held 20%
# of them back for validation).
train_cleaned = df.iloc[:train.shape[0]].copy()
# The unlabeled test rows make 'target_relabeled' a float column in df; cast it
# back to integer class ids before it becomes the model label.
train_cleaned['target_relabeled'] = train_cleaned['target_relabeled'].astype('int64')
train_hf = Dataset.from_pandas(train_cleaned, preserve_index = False)
train_hf = train_hf.rename_column('target_relabeled', 'label')
tokenized_train = train_hf.map(tokenize_function, batched = True)
tf_train_set = tokenized_train.to_tf_dataset(
    batch_size = 16,
    columns=['input_ids', 'attention_mask', 'label'],
    shuffle = True,
    collate_fn = data_collator
)

  0%|          | 0/8 [00:00<?, ?ba/s]

In [None]:
model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_projector', 'vocab_layer_norm', 'activation_13', 'vocab_transform']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier', 'dropout_39', 'classifier']
You should probably TRAIN this model on a down-stream task to be able to use i

In [None]:
batch_size = 16
# Single epoch for the final fit on the full training set.
num_epochs = 1

# Recompute the step count, because tokenized_train now holds all labeled rows.
batches_per_epoch = len(tokenized_train) // batch_size
total_train_steps = int(batches_per_epoch * num_epochs)
# Fresh optimizer and schedule for the freshly re-initialized model.
optimizer, schedule = create_optimizer(
    init_lr=2e-5, 
    num_train_steps=total_train_steps,
    num_warmup_steps=0
)

In [None]:
# No `loss` argument: the model falls back to its internal loss computation.
model.compile(
    optimizer=optimizer,
    metrics=["accuracy"]
)

# Final fit on the full training set. No validation_data is passed:
# tf_val_set was split off from these same rows, so scoring on it would
# evaluate on data the model is being trained on and report meaningless numbers.
model.fit(x = tf_train_set,
          epochs = num_epochs
          )

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.




<keras.callbacks.History at 0x7f41d0e2f8d0>

# Predict

In [None]:
# tf_test_set is a tf.data pipeline. Keras documents use_multiprocessing as
# applying to generator / keras.utils.Sequence inputs only, so the flag is
# dropped here.
results = model.predict(x = tf_test_set)

In [None]:
# Turn the logits into class probabilities, then take the most probable class
# per row as the predicted label.
probs = tf.nn.softmax(results.logits)
preds = tf.argmax(probs, axis = -1)
preds

<tf.Tensor: shape=(3263,), dtype=int64, numpy=array([1, 1, 1, ..., 1, 1, 0])>

In [None]:
print(np.sum(preds)/ len(preds))
print(np.sum(train['target'])/ len(train))

0.37205026049647566
0.4296597924602653


In [None]:
sample_submission = pd.read_csv(path + 'data/sample_submission.csv')

# Convert the TF tensor to a plain NumPy array before handing it to pandas, and
# fail loudly if the number of predictions does not match the template.
# Predictions are in test-set order (the test pipeline is built with shuffle = False).
pred_labels = np.asarray(preds)
assert len(pred_labels) == len(sample_submission), (
    f"expected {len(sample_submission)} predictions, got {len(pred_labels)}"
)
sample_submission['target'] = pred_labels

In [None]:
sample_submission.to_csv(path +'submission/V5_attach_keyword_to_text.csv', index=False)