# Solution of kaggle competition for Telia practice

Competition link: https://www.kaggle.com/competitions/nlp-getting-started

Author: Valerija Jerina

## Imports

In [1]:
#pip install transformers

In [2]:
import gc
import re
import string
import operator
from collections import defaultdict

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import tokenization
from wordcloud import STOPWORDS
import transformers


from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
from sklearn.metrics import precision_score, recall_score, f1_score

import tensorflow as tf
import tensorflow_hub as hub
from tensorflow import keras
from tensorflow.keras.optimizers import SGD, Adam
from tensorflow.keras.layers import Dense, Input, Dropout, GlobalAveragePooling1D
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, Callback
import random

random.seed(1201)
df_train = pd.read_csv('kaggle/input/train.csv', dtype={'id': np.int16, 'target': np.int8})
df_test = pd.read_csv('kaggle/input/test.csv', dtype={'id': np.int16})

In [3]:
df_train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


## Handling missing values

First thing in machine learning that we have to consider from the benning is how we treat missing values. 

Thus, i suggest that we take a look at the missing values ratio in training and test set. 


In [4]:

values_train = df_train[['keyword', 'location']].isnull().sum()
values_test = df_test[['keyword', 'location']].isnull().sum()

print("Ratio of the mussed keywords in location in training set " + str(values_train[0]/values_train[1]))
print("Ratio of the mussed keywords in location in test set " + str(values_test[0]/values_test[1]))


Ratio of the mussed keywords in location in training set 0.024082116067903673
Ratio of the mussed keywords in location in test set 0.023529411764705882


As we can see, those ratios are very similar. We can assume that both sets has been taken from the same sample. 

Now, let's fill those values since we don't want to get rid of the data this easily. 
 

In [5]:
df_train['keyword'] = df_train['keyword'].fillna(f'no_keyword')
df_train['location'] = df_train['location'].fillna(f'no_location')
df_test['keyword'] = df_test['keyword'].fillna(f'no_keyword')
df_test['location'] = df_test['location'].fillna(f'no_location')

## Meta features

Sometimes, we have to create some extra features to as much as we can from the data we use.

In this case, as in the most language models, following things might matter:
1. text length(number of words and characters in it)
2. Number of unique words in text(debatable, but can affect the model)
3. Hashtags(or their count, since we are watching tweets)
4. Mentions in the text(as we have tweets where people can be taged using @)
5. Stop words, is it a stop word and it's count. 
6. Average word length
7. there is a possibility, that punctuation count is also impotant.
8. Are there any links(urls)?

Let's implement some of those.

In [6]:
# word_count
df_train['word_count'] = df_train['text'].apply(lambda x: len(str(x).split()))
df_test['word_count'] = df_test['text'].apply(lambda x: len(str(x).split()))

In [7]:
# character_count
df_train['character_count'] = df_train['text'].apply(lambda x: len(str(x)))
df_test['character_count'] = df_test['text'].apply(lambda x: len(str(x)))

In [8]:
#amount of unique_words
df_train['unique_words'] = df_train['text'].apply(lambda x: len(set(str(x).split())))
df_test['unique_words'] = df_test['text'].apply(lambda x: len(set(str(x).split())))

In [9]:
# stop_word_count
df_train['stop_words'] = df_train['text'].apply(lambda x: len([w for w in str(x).lower().split() if w in STOPWORDS]))
df_test['stop_words'] = df_test['text'].apply(lambda x: len([w for w in str(x).lower().split() if w in STOPWORDS]))

In [10]:
df_train['urls'] = df_train['text'].apply(lambda x: len([w for w in str(x).lower().split() if 'http' in w or 'https' in w]))
df_test['urls'] = df_test['text'].apply(lambda x: len([w for w in str(x).lower().split() if 'http' in w or 'https' in w]))

df_train['hashtags'] = df_train['text'].apply(lambda x: len([c for c in str(x) if c == '#']))
df_test['hashtags'] = df_test['text'].apply(lambda x: len([c for c in str(x) if c == '#']))

df_train['mentions'] = df_train['text'].apply(lambda x: len([c for c in str(x) if c == '@']))
df_test['mentions'] = df_test['text'].apply(lambda x: len([c for c in str(x) if c == '@']))

Let's stop at this point. Many more meta features can be added, however, if result is not satisfyiong, we can always add mor later

## Target

Now we have to check if the model will require extra targeting.

In [11]:
not_disaster = df_train.groupby('target').count()['id'][0]
disaster = df_train.groupby('target').count()['id'][1]
print(not_disaster/(disaster+not_disaster))

0.5703402075397347


As we can see, a little above 57% are not disasters, that means that a little below 43% will be not disasters. The difference is not big enough for us to worry about stratification by target.

## Potential steps:

So far, I am not completely sure whether i'll be able to complete all the ideas that i have for this dataset. Thus, i am suggesting a list of potential steps that could be done.

* N-grams -- N-grams are continuous sequences of words or symbols or tokens in a document. We want to see whether any of those appear too often. 
* Embeddings coverage -- We want to get vocabulary as close to the embeddings as possible in order to not lose any important information
* Text cleaning -- good models are trained on the clean data! First, we would want to separate words with punctuation, then we would consider removing special characters that appear in the words. Then, expanding contractions, removing urls, correcting slang and typos, changing informal words to full forms. Check for arconyms, replace those that can be replacesd, if possible, then expanding hashtags and usernames. Possible step is changing everything to lowercase, lemmatization.
* Handling mislabeld samples - data contains duplicates and some of those can be interpreted differently. Thus, those have to ba handled manually.
* Cross-Validation - stratification of the data by 'keyword'
* Model training - getting accuracy, precision, trecall, since F1 by itself is not very informative. applying BERT layer if needed.
* Train, Evaluate, Predict. 
* Test!

### N-Grams

As i do not have that much time, I will not go into detail and explore N-Grams, however it would be beneficial. Thus, I will leave a code piece that would potentially help with it.

In [12]:
def generate_ngrams(text, n_gram=1):
    token = [token for token in text.lower().split(' ') if token != '' if token not in STOPWORDS]
    ngrams = zip(*[token[i:] for i in range(n_gram)])
    return [' '.join(ngram) for ngram in ngrams]


## Embeddings

In order to save time i went through embeddings that have been used by other submitters and chose one that i believe would suit the most - GloVe.

In [13]:
glove_embeddings = np.load('kaggle/embeddings/glove.840B.300d.pkl', allow_pickle=True)

We can also check for what is the coverage of vocabulary by this model

In [14]:
import numpy as np

# Calculate vocabulary coverage for a given set of text data
def calculate_coverage(text, embeddings):
    vocab = {}
    for t in text:
        for w in t:
            try:
                vocab[w] += 1
            except KeyError:
                vocab[w] = 1 

    covered = {}
    oov = {}    
    n_covered = 0
    n_oov = 0
    
    for word in vocab:
        try:
            covered[word] = embeddings[word]
            n_covered += vocab[word]
        except:
            oov[word] = vocab[word]
            n_oov += vocab[word]
            
    vocab_coverage = len(covered) / len(vocab)
    text_coverage = (n_covered / (n_covered + n_oov))
    
    return vocab_coverage, text_coverage

# Example usage
train_text = df_train['text'].apply(lambda s: s.split()).values
test_text = df_test['text'].apply(lambda s: s.split()).values

train_vocab_coverage, train_text_coverage = calculate_coverage(train_text, glove_embeddings)
test_vocab_coverage, test_text_coverage = calculate_coverage(test_text, glove_embeddings)

print(f"Training data: Vocabulary coverage = {train_vocab_coverage:.4f}, Text coverage = {train_text_coverage:.4f}")
print(f"Testing data: Vocabulary coverage = {test_vocab_coverage:.4f}, Text coverage = {test_text_coverage:.4f}")



Training data: Vocabulary coverage = 0.5206, Text coverage = 0.8268
Testing data: Vocabulary coverage = 0.5721, Text coverage = 0.8185


## Text cleaning


Here i've realised that i won't manage to insert most of the special characters manually and take into consideration all the `\x89Û_`, `\x89Ûª`, `JapÌ_n`, etc.

Same with hashtags, it'll take a very long time to clean all of it, so i'll just do some.

In [15]:
def clean(text): 
    
    # Contractions
    text = re.sub(r"he's", "he is", text)
    text = re.sub(r"there's", "there is", text)
    text = re.sub(r"We're", "We are", text)
    text = re.sub(r"That's", "That is", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"they're", "they are", text)
    text = re.sub(r"Can't", "Cannot", text)
    text = re.sub(r"wasn't", "was not", text)
    text = re.sub(r"don't", "do not", text)
    text = re.sub(r"aren't", "are not", text)
    text = re.sub(r"isn't", "is not", text)
    text = re.sub(r"What's", "What is", text)
    text = re.sub(r"haven't", "have not", text)
    text = re.sub(r"hasn't", "has not", text)
    text = re.sub(r"There's", "There is", text)
    text = re.sub(r"He's", "He is", text)
    text = re.sub(r"It's", "It is", text)
    text = re.sub(r"You're", "You are", text)
    text = re.sub(r"I'M", "I am", text)
    text = re.sub(r"shouldn't", "should not", text)
    text = re.sub(r"wouldn't", "would not", text)
    text = re.sub(r"i'm", "I am", text)
    text = re.sub(r"I'm", "I am", text)
    text = re.sub(r"Isn't", "is not", text)
    text = re.sub(r"Here's", "Here is", text)
    text = re.sub(r"you've", "you have", text)
    text = re.sub(r"we're", "we are", text)
    text = re.sub(r"what's", "what is", text)
    text = re.sub(r"couldn't", "could not", text)
    text = re.sub(r"we've", "we have", text)
    text = re.sub(r"who's", "who is", text)
    text = re.sub(r"y'all", "you all", text)
    text = re.sub(r"would've", "would have", text)
    text = re.sub(r"it'll", "it will", text)
    text = re.sub(r"we'll", "we will", text)
    text = re.sub(r"We've", "We have", text)
    text = re.sub(r"he'll", "he will", text)
    text = re.sub(r"Y'all", "You all", text)
    text = re.sub(r"Weren't", "Were not", text)
    text = re.sub(r"Didn't", "Did not", text)
    text = re.sub(r"they'll", "they will", text)
    text = re.sub(r"they'd", "they would", text)
    text = re.sub(r"DON'T", "DO NOT", text)
    text = re.sub(r"they've", "they have", text)
    text = re.sub(r"i'd", "I would", text)
    text = re.sub(r"should've", "should have", text)
    text = re.sub(r"where's", "where is", text)
    text = re.sub(r"we'd", "we would", text)
    text = re.sub(r"i'll", "I will", text)
    text = re.sub(r"weren't", "were not", text)
    text = re.sub(r"They're", "They are", text)
    text = re.sub(r"let's", "let us", text)
    text = re.sub(r"it's", "it is", text)
    text = re.sub(r"can't", "cannot", text)
    text = re.sub(r"don't", "do not", text)
    text = re.sub(r"you're", "you are", text)
    text = re.sub(r"i've", "I have", text)
    text = re.sub(r"that's", "that is", text)
    text = re.sub(r"i'll", "I will", text)
    text = re.sub(r"doesn't", "does not", text)
    text = re.sub(r"i'd", "I would", text)
    text = re.sub(r"didn't", "did not", text)
    text = re.sub(r"ain't", "am not", text)
    text = re.sub(r"you'll", "you will", text)
    text = re.sub(r"I've", "I have", text)
    text = re.sub(r"Don't", "do not", text)
    text = re.sub(r"I'll", "I will", text)
    text = re.sub(r"I'd", "I would", text)
    text = re.sub(r"Let's", "Let us", text)
    text = re.sub(r"you'd", "You would", text)
    text = re.sub(r"It's", "It is", text)
    text = re.sub(r"Ain't", "am not", text)
    text = re.sub(r"Haven't", "Have not", text)
    text = re.sub(r"Could've", "Could have", text)
    text = re.sub(r"youve", "you have", text)  
            
    # Character entity references
    text = re.sub(r"&gt;", ">", text)
    text = re.sub(r"&lt;", "<", text)
    text = re.sub(r"&amp;", "&", text)
    
    # Typos, slang and informal abbreviations
    text = re.sub(r"16yr", "16 year", text)
    text = re.sub(r"w/e", "whatever", text)
    text = re.sub(r"w/", "with", text)
    text = re.sub(r"USAgov", "USA government", text)
    text = re.sub(r"recentlu", "recently", text)
    text = re.sub(r"TRAUMATISED", "traumatized", text)
    text = re.sub(r"<3", "love", text)
    text = re.sub(r"8/5/2015", "2015-08-05", text)
    text = re.sub(r"8/6/2015", "2015-08-06", text)
    text = re.sub(r"10:38PM", "10:38 PM", text)
    text = re.sub(r"10:30pm", "10:30 PM", text)
    text = re.sub(r"lmao", "laughing my ass off", text)   
    
    # Hashtags and usernames
    text = re.sub(r"IranDeal", "Iran Deal", text)
    text = re.sub(r"ArianaGrande", "Ariana Grande", text)
    text = re.sub(r"camilacabello97", "camila cabello", text) 
    text = re.sub(r"RondaRousey", "Ronda Rousey", text)     
    text = re.sub(r"ProphetMuhammad", "Prophet Muhammad", text)
    text = re.sub(r"PantherAttack", "Panther Attack", text)
    text = re.sub(r"StrategicPatience", "Strategic Patience", text)
    text = re.sub(r"socialnews", "social news", text)
    text = re.sub(r"NASAHurricane", "NASA Hurricane", text)
    text = re.sub(r"humanconsumption", "human consumption", text)
    text = re.sub(r"BeingAuthor", "Being Author", text)
    text = re.sub(r"OffensiveContent", "Offensive Content", text)
    text = re.sub(r"WorstSummerJob", "Worst Summer Job", text)
    text = re.sub(r"HarryBeCareful", "Harry Be Careful", text)
    text = re.sub(r"NASASolarSystem", "NASA Solar System", text)
    text = re.sub(r"NewsInTweets", "News In Tweets", text)
    text = re.sub(r"abstorm", "Alberta Storm", text)
    text = re.sub(r"Time2015", "Time 2015", text)
    text = re.sub(r"djicemoon", "dj icemoon", text)
    text = re.sub(r"greatbritishbakeoff", "great british bake off", text)
    text = re.sub(r"ENGvAUS", "England vs Australia", text)
    text = re.sub(r"ScottWalker", "Scott Walker", text)
    text = re.sub(r"saddlebrooke", "Saddlebrooke", text)
    text = re.sub(r"RAmag", "Royal Academy Magazine", text)
    text = re.sub(r"MNPDNashville", "Metropolitan Nashville Police Department", text)
    text = re.sub(r"TfLBusAlerts", "TfL Bus Alerts", text)
    text = re.sub(r"GamerGate", "Gamer Gate", text)
    text = re.sub(r"alexbelloli", "Alex Belloli", text)
    text = re.sub(r"Japton", "Arkansas", text)
    text = re.sub(r"timkaine", "Tim Kaine", text)
    text = re.sub(r"IdentityTheft", "Identity Theft", text)
    text = re.sub(r"AllLivesMatter", "All Lives Matter", text)
    text = re.sub(r"mishacollins", "Misha Collins", text)
    text = re.sub(r"BillNeelyNBC", "Bill Neely", text)
    text = re.sub(r"BeClearOnCancer", "be clear on cancer", text)
    text = re.sub(r"Kowing", "Knowing", text)
    text = re.sub(r"ScreamQueens", "Scream Queens", text)
    text = re.sub(r"AskCharley", "Ask Charley", text)
    text = re.sub(r"BlizzHeroes", "Heroes of the Storm", text)
    text = re.sub(r"BradleyBrad47", "Bradley Brad", text)
    text = re.sub(r"HannaPH", "Typhoon Hanna", text)
    text = re.sub(r"meinlcymbals", "MEINL Cymbals", text)
    text = re.sub(r"RohnertParkDPS", "Rohnert Park Police Department", text)
    text = re.sub(r"THISIZBWRIGHT", "Bonnie Wright", text)
    text = re.sub(r"Popularmmos", "Popular MMOs", text)
    text = re.sub(r"WildHorses", "Wild Horses", text)
    text = re.sub(r"FantasticFour", "Fantastic Four", text)
    text = re.sub(r"BathAndNorthEastSomerset", "Bath and North East Somerset", text)
    text = re.sub(r"thatswhatfriendsarefor", "that is what friends are for", text)
    text = re.sub(r"residualincome", "residual income", text)
    text = re.sub(r"YahooNewsDigest", "Yahoo News Digest", text)
    text = re.sub(r"MalaysiaAirlines", "Malaysia Airlines", text)
    text = re.sub(r"AmazonDeals", "Amazon Deals", text)
    text = re.sub(r"charlesadler", "Charles Adler", text)
    text = re.sub(r"twia", "Texas Windstorm Insurance Association", text)
    text = re.sub(r"txlege", "Texas Legislature", text)
    text = re.sub(r"WindstormInsurer", "Windstorm Insurer", text)
    text = re.sub(r"Newss", "News", text)
    text = re.sub(r"hempoil", "hemp oil", text)
    text = re.sub(r"57am", "57 am", text)
    text = re.sub(r"Bokoharm", "Boko Haram", text)
    text = re.sub(r"BombEffects", "Bomb Effects", text)
    text = re.sub(r"win10", "Windows 10", text)
    text = re.sub(r"JimmieJohnson", "Jimmie Johnson", text)
    text = re.sub(r"pctool", "pc tool", text)
    text = re.sub(r"DoingHashtagsRight", "Doing Hashtags Right", text)
    text = re.sub(r"ThrowbackThursday", "Throwback Thursday", text)
    text = re.sub(r"SnowBackSunday", "Snowback Sunday", text)
    text = re.sub(r"LakeEffect", "Lake Effect", text)
    text = re.sub(r"RTphotographyUK", "Richard Thomas Photography UK", text)
    text = re.sub(r"BigBang_CBS", "Big Bang CBS", text)
    text = re.sub(r"writerslife", "writers life", text)
    text = re.sub(r"NaturalBirth", "Natural Birth", text)
    text = re.sub(r"UnusualWords", "Unusual Words", text)
    text = re.sub(r"TheaterTrial", "Theater Trial", text)
    text = re.sub(r"CatoInstitute", "Cato Institute", text)
    text = re.sub(r"nflweek1picks", "NFL week 1 picks", text)
    text = re.sub(r"uiseful", "useful", text)
    text = re.sub(r"JusticeDotOrg", "The American Association for Justice", text)
    text = re.sub(r"kindlng", "kindling", text)
    text = re.sub(r"riggd", "rigged", text)
    text = re.sub(r"slownewsday", "slow news day", text)
    text = re.sub(r"mortalkombat", "Mortal Kombat", text)
    text = re.sub(r"FilipeCoelho92", "Filipe Coelho", text)
    text = re.sub(r"OnlyQuakeNews", "Only Quake News", text)
    text = re.sub(r"kostumes", "costumes", text)
    text = re.sub(r"YEEESSSS", "yes", text)
    text = re.sub(r"ToshikazuKatayama", "Toshikazu Katayama", text)
    text = re.sub(r"IntlDevelopment", "Intl Development", text)
    text = re.sub(r"ExtremeWeather", "Extreme Weather", text)
    text = re.sub(r"NewsThousands", "News Thousands", text)
    text = re.sub(r"EyewitnessWV", "Eye witness WV", text)
    text = re.sub(r"PhiladelphiaMuseu", "Philadelphia Museum", text)
    text = re.sub(r"FromTheField", "From the field", text)
    text = re.sub(r"NorthIowa", "North Iowa", text)
    text = re.sub(r"WillowFire", "Willow Fire", text)
    text = re.sub(r"MadRiverComplex", "Mad River Complex", text)
    text = re.sub(r"viaYouTube", "via YouTube", text)
           
    # Urls
    text = re.sub(r"https?:\/\/t.co\/[A-Za-z0-9]+", "", text)

    #punctuations and special char
    punctuation = '@#!?+&*[]-%.:/();$=><|{}^' + "'`"
    for p in punctuation:
        text = text.replace(p, f' {p} ')
    text = text.replace('...', ' ... ')
    if '...' not in text:
        text = text.replace('..', ' ... ') 

    #One acronym that i've found
    text = re.sub(r"usNWSgov", "United States National Weather Service", text)

    return text

In [16]:
df_train['text_cleaned'] = df_train['text'].apply(lambda s : clean(s))
df_test['text_cleaned'] = df_test['text'].apply(lambda s : clean(s))

Now we shall see if we have improved :)

In [17]:
import numpy as np

# Calculate vocabulary coverage for a given set of text data
def calculate_coverage(text, embeddings):
    vocab = {}
    for t in text:
        for w in t:
            try:
                vocab[w] += 1
            except KeyError:
                vocab[w] = 1 

    covered = {}
    oov = {}    
    n_covered = 0
    n_oov = 0
    
    for word in vocab:
        try:
            covered[word] = embeddings[word]
            n_covered += vocab[word]
        except:
            oov[word] = vocab[word]
            n_oov += vocab[word]
            
    vocab_coverage = len(covered) / len(vocab)
    text_coverage = (n_covered / (n_covered + n_oov))
    
    return vocab_coverage, text_coverage

# Example usage
train_text = df_train['text_cleaned'].apply(lambda s: s.split()).values
test_text = df_test['text_cleaned'].apply(lambda s: s.split()).values

train_vocab_coverage, train_text_coverage = calculate_coverage(train_text, glove_embeddings)
test_vocab_coverage, test_text_coverage = calculate_coverage(test_text, glove_embeddings)

print(f"Training data: Vocabulary coverage = {train_vocab_coverage:.4f}, Text coverage = {train_text_coverage:.4f}")
print(f"Testing data: Vocabulary coverage = {test_vocab_coverage:.4f}, Text coverage = {test_text_coverage:.4f}")


Training data: Vocabulary coverage = 0.8033, Text coverage = 0.9613
Testing data: Vocabulary coverage = 0.8397, Text coverage = 0.9590


### Handling mislabeled samples
first, let's take a look at them

In [18]:
df_mislabeled = df_train.groupby(['text']).nunique().sort_values(by='target', ascending=False)
df_mislabeled = df_mislabeled[df_mislabeled['target'] > 1]['target']
df_mislabeled.index.tolist()

['like for the music video I want some real action shit like burning buildings and police chases not some weak ben winston shit',
 'Hellfire! We don\x89Ûªt even want to think about it or mention it so let\x89Ûªs not do anything that leads to it #islam!',
 "The Prophet (peace be upon him) said 'Save yourself from Hellfire even if it is by giving half a date in charity.'",
 'In #islam saving a person is equal in reward to saving all humans! Islam is the opposite of terrorism!',
 'To fight bioterrorism sir.',
 'Who is bringing the tornadoes and floods. Who is bringing the climate change. God is after America He is plaguing her\n \n#FARRAKHAN #QUOTE',
 '#foodscare #offers2go #NestleIndia slips into loss after #Magginoodle #ban unsafe and hazardous for #humanconsumption',
 '#Allah describes piling up #wealth thinking it would last #forever as the description of the people of #Hellfire in Surah Humaza. #Reflect',
 'He came to a land which was engulfed in tribal war and turned it into a land 

This is a list of samples that are labeled twice. Let's create a new column where we will put new labels:

In [19]:
df_mislabeled = df_train.groupby(['text']).nunique().sort_values(by='target', ascending=False)
df_mislabeled = df_mislabeled[df_mislabeled['target'] > 1]['target']
df_mislabeled.index.tolist()

['like for the music video I want some real action shit like burning buildings and police chases not some weak ben winston shit',
 'Hellfire! We don\x89Ûªt even want to think about it or mention it so let\x89Ûªs not do anything that leads to it #islam!',
 "The Prophet (peace be upon him) said 'Save yourself from Hellfire even if it is by giving half a date in charity.'",
 'In #islam saving a person is equal in reward to saving all humans! Islam is the opposite of terrorism!',
 'To fight bioterrorism sir.',
 'Who is bringing the tornadoes and floods. Who is bringing the climate change. God is after America He is plaguing her\n \n#FARRAKHAN #QUOTE',
 '#foodscare #offers2go #NestleIndia slips into loss after #Magginoodle #ban unsafe and hazardous for #humanconsumption',
 '#Allah describes piling up #wealth thinking it would last #forever as the description of the people of #Hellfire in Surah Humaza. #Reflect',
 'He came to a land which was engulfed in tribal war and turned it into a land 

In [20]:
df_train['target_relabeled'] = df_train['target'].copy() 
df_train.loc[df_train['text'] == 'like for the music video I want some real action shit like burning buildings and police chases not some weak ben winston shit', 'target_relabeled'] = 0
df_train.loc[df_train['text'] == "'Hellfire! We don\x89Ûªt even want to think about it or mention it so let\x89Ûªs not do anything that leads to it #islam!'", 'target_relabeled'] = 0
df_train.loc[df_train['text'] == "The Prophet (peace be upon him) said 'Save yourself from Hellfire even if it is by giving half a date in charity.'", 'target_relabeled'] = 0
df_train.loc[df_train['text'] == 'In #islam saving a person is equal in reward to saving all humans! Islam is the opposite of terrorism!', 'target_relabeled'] = 0
df_train.loc[df_train['text'] == 'To fight bioterrorism sir.', 'target_relabeled'] = 0
df_train.loc[df_train['text'] == 'Who is bringing the tornadoes and floods. Who is bringing the climate change. God is after America He is plaguing her\n \n#FARRAKHAN #QUOTE', 'target_relabeled'] = 0
df_train.loc[df_train['text'] == '#foodscare #offers2go #NestleIndia slips into loss after #Magginoodle #ban unsafe and hazardous for #humanconsumption', 'target_relabeled'] = 0
df_train.loc[df_train['text'] == '#Allah describes piling up #wealth thinking it would last #forever as the description of the people of #Hellfire in Surah Humaza. #Reflect', 'target_relabeled'] = 0
df_train.loc[df_train['text'] == 'He came to a land which was engulfed in tribal war and turned it into a land of peace i.e. Madinah. #ProphetMuhammad #islam', 'target_relabeled'] = 0
df_train.loc[df_train['text'] == 'RT NotExplained: The only known image of infamous hijacker D.B. Cooper. http://t.co/JlzK2HdeTG', 'target_relabeled'] = 0
df_train.loc[df_train['text'] == 'Hellfire is surrounded by desires so be careful and don\x89Ûªt let your desires control you! #Afterlife', 'target_relabeled'] = 0
df_train.loc[df_train['text'] == 'CLEARED:incident with injury:I-495  inner loop Exit 31 - MD 97/Georgia Ave Silver Spring', 'target_relabeled'] = 0
df_train.loc[df_train['text'] == "Mmmmmm I'm burning.... I'm burning buildings I'm building.... Oooooohhhh oooh ooh...", 'target_relabeled'] = 0
df_train.loc[df_train['text'] == 'wowo--=== 12000 Nigerian refugees repatriated from Cameroon', 'target_relabeled'] = 0
df_train.loc[df_train['text'] == '.POTUS #StrategicPatience is a strategy for #Genocide; refugees; IDP Internally displaced people; horror; etc. https://t.co/rqWuoy1fm4', 'target_relabeled'] = 1
df_train.loc[df_train['text'] == 'Caution: breathing may be hazardous to your health.', 'target_relabeled'] = 10
df_train.loc[df_train['text'] == 'I Pledge Allegiance To The P.O.P.E. And The Burning Buildings of Epic City. ??????', 'target_relabeled'] = 1
df_train.loc[df_train['text'] == 'that horrible sinking feeling when you\x89Ûªve been at home on your phone for a while and you realise its been on 3G this whole time', 'target_relabeled'] = 1

In [21]:
K = 2
skf = StratifiedKFold(n_splits=K, shuffle=True)

is_disaster = df_train['target'] == 1
disaster_count = df_train[is_disaster]['target_relabeled'].count()
not_disaster_count = df_train[~is_disaster]['target_relabeled'].count()


for fold, (train_idx, val_idx) in enumerate(skf.split(df_train['text_cleaned'], df_train['target']), 1):
    train_shape = df_train.loc[train_idx, 'text_cleaned'].shape
    val_shape = df_train.loc[val_idx, 'text_cleaned'].shape
    train_unique_keywords = df_train.loc[train_idx, 'keyword'].nunique()
    val_unique_keywords = df_train.loc[val_idx, 'keyword'].nunique()

### BERT Layer

In [22]:
bert_layer = hub.KerasLayer('https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1', trainable=True)

### Train and predict 

In [23]:
# Define the encoding function
def encode(texts):
                
    all_tokens = []
    all_masks = []
    all_segments = []

    for text in texts:
        text = tokenizer.tokenize(text)
        text = text[:max_seq_length - 2]
        input_sequence = ['[CLS]'] + text + ['[SEP]']
        pad_len = max_seq_length - len(input_sequence)

        tokens = tokenizer.convert_tokens_to_ids(input_sequence)
        tokens += [0] * pad_len
        pad_masks = [1] * len(input_sequence) + [0] * pad_len
        segment_ids = [0] * max_seq_length

        all_tokens.append(tokens)
        all_masks.append(pad_masks)
        all_segments.append(segment_ids)

    return np.array(all_tokens), np.array(all_masks), np.array(all_segments)

# Define the model
def build_model():
    
    input_word_ids = Input(shape=(max_seq_length,), dtype=tf.int32, name='input_word_ids')
    input_mask = Input(shape=(max_seq_length,), dtype=tf.int32, name='input_mask')
    segment_ids = Input(shape=(max_seq_length,), dtype=tf.int32, name='segment_ids')    
    
    pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])   
    clf_output = sequence_output[:, 0, :]
    out = Dense(1, activation='sigmoid')(clf_output)
    
    model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    optimizer = SGD(learning_rate=lr, momentum=0.8)
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    
    return model

# Define the training function
def train_model(X, tokenizer, max_seq_length, lr, epochs, batch_size):
    
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    
    models = []
    scores = {}
    
    for fold, (trn_idx, val_idx) in enumerate(skf.split(X['text_cleaned'], X['target'])):
        
        print('\nFold {}\n'.format(fold))
    
        X_trn_encoded = encode(X.loc[trn_idx, 'text_cleaned'])
        y_trn = X.loc[trn_idx, 'target']
        X_val_encoded = encode(X.loc[val_idx, 'text_cleaned'])
        y_val = X.loc[val_idx, 'target']
    
        # Callbacks
        class ClassificationReport(Callback):

            def __init__(self, train_data, validation_data):
                super().__init__()

                self.X_train, self.y_train = train_data
                self.X_val, self.y_val = validation_data

                self.train


In [24]:
# Set hyperparameters
max_seq_length = 128
lr = 0.001
epochs = 5
batch_size = 32

# Tokenize the data
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')

# Train the model
model = build_model()

for epoch in range(epochs):
    print('\nEpoch {}/{}'.format(epoch+1, epochs))
    print('----------------------------------')
    
    for fold, (trn_idx, val_idx) in enumerate(skf.split(df_train['text_cleaned'], df_train['target'])):
        
        print('\nFold {}\n'.format(fold))
        
        X_trn_encoded = encode(df_train.loc[trn_idx, 'text_cleaned'])
        y_trn = df_train.loc[trn_idx, 'target']
        X_val_encoded = encode(df_train.loc[val_idx, 'text_cleaned'])
        y_val = df_train.loc[val_idx, 'target']
        
        history = model.fit(
            [X_trn_encoded[0], X_trn_encoded[1], X_trn_encoded[2]], y_trn,
            validation_data=([X_val_encoded[0], X_val_encoded[1], X_val_encoded[2]], y_val),
            epochs=1,
            batch_size=batch_size,
            verbose=1
        )
        
        # Evaluate on the validation set
        y_val_pred = model.predict([X_val_encoded[0], X_val_encoded[1], X_val_encoded[2]])
        y_val_pred = (y_val_pred > 0.5).astype(int)
        
        val_precision = precision_score(y_val, y_val_pred)
        val_recall = recall_score(y_val, y_val_pred)
        val_f1 = f1_score(y_val, y_val_pred)
        
        print('\nValidation Precision: {:.4f}, Recall: {:.4f}, F1: {:.4f}\n'.format(val_precision, val_recall, val_f1))


Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089



Epoch 1/5
----------------------------------

Fold 0
