In [None]:
!pip install clean-text[gpl]

In [None]:
##loading required libraries
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tqdm.notebook import tqdm
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras import backend as K
import transformers
import matplotlib.pyplot as plt
from cleantext import clean
from keras.preprocessing.text import Tokenizer

In [None]:
## Read the three competition CSVs: the train split, the test split and the
## submission template.
train, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/nlp-getting-started/train.csv',
        '/kaggle/input/nlp-getting-started/test.csv',
        '/kaggle/input/nlp-getting-started/sample_submission.csv',
    )
)

In [None]:
## Choose the tf.distribute strategy used to build the model: a TPU strategy
## when a Kaggle TPU cluster can be resolved, otherwise whatever strategy
## TensorFlow currently reports via get_strategy().
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    # The resolver could not be set up (presumably no TPU is attached).
    tpu = None

if not tpu:
    strategy = tf.distribute.get_strategy()
else:
    # Connect to the cluster and initialise the TPU system before creating
    # the strategy object.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

In [None]:
# Drop the columns that are not fed to the model: `id`, `location` and
# `keyword` (original note: dropped because of the large number of NaN values).
#
# The frames are re-bound instead of mutated with inplace=True, and
# errors='ignore' skips columns that are already gone, so re-running this cell
# no longer raises KeyError.
UNUSED_COLUMNS = ['id', 'location', 'keyword']

train = train.drop(columns=UNUSED_COLUMNS, errors='ignore')
test = test.drop(columns=UNUSED_COLUMNS, errors='ignore')

train.head()

In [None]:
## Text normalisation built on the clean-text package

def text_cleaning(text):
    """Return `text` after normalising it with `cleantext.clean`.

    The options below request unicode fixing, ASCII transliteration,
    lowercasing, line-break stripping and punctuation removal. URLs, e-mail
    addresses, phone numbers, numbers, digits and currency symbols are each
    replaced with the empty string.
    """
    cleaning_options = dict(
        fix_unicode=True,               # fix various unicode errors
        to_ascii=True,                  # transliterate to closest ASCII representation
        lower=True,                     # lowercase text
        no_line_breaks=True,            # fully strip line breaks
        no_urls=True,
        no_emails=True,
        no_phone_numbers=True,
        no_numbers=True,
        no_digits=True,
        no_currency_symbols=True,
        no_punct=True,                  # fully remove punctuation
        # Every "special token" is the empty string, i.e. matches are removed.
        replace_with_url="",
        replace_with_email="",
        replace_with_phone_number="",
        replace_with_number="",
        replace_with_digit="",
        replace_with_currency_symbol="",
        lang="en",                      # set to 'de' for German special handling
    )
    return clean(text, **cleaning_options)

#train["text"].map(lambda x: text_cleaning(x))
#train["text"][7610]

# Clean the tweet text of both splits; the "text" column is overwritten with
# the cleaned version.
train["text"] = train["text"].map(text_cleaning)
test["text"] = test["text"].map(text_cleaning)

In [None]:
## Fit the word tokenizer on the training text only, then encode both splits
## as lists of integer word ids with that same tokenizer.
word_tokenizer = Tokenizer()
word_tokenizer.fit_on_texts(train["text"].values)

seq_train, seq_test = (
    word_tokenizer.texts_to_sequences(frame["text"].values)
    for frame in (train, test)
)

#seq_train[7610]
#word_tokenizer.index_word[2280]

In [None]:
## Keep a handle on the tokenizer's word index (word -> integer id); it is
## used to size and fill the embedding matrix.
word_index = word_tokenizer.word_index
print('Found %s unique tokens.' % (len(word_index),))

In [None]:
## Pad/truncate every encoded tweet to a fixed length of 28 ids so the
## sequences form a rectangular array matching the model's input shape.
seq_pad_train, seq_pad_test = (
    tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen=28)
    for sequences in (seq_train, seq_test)
)

seq_pad_train[:5]

In [None]:
## Downloading the GloVe embeddings (commands kept for reference and left commented out)
#!wget http://nlp.stanford.edu/data/glove.6B.zip
#!unzip -q glove.6B.zip

In [None]:
# Parse the GloVe text file into a {word: vector} dictionary.
# Each line is "<word> <coef_1> ... <coef_n>" separated by whitespace.

embeddings_index = {}
#f = open('/kaggle/input/glove-em/glove.6B.100d.txt')
#f = open('./glove.6B.300d.txt')
# `with` closes the file even if a line fails to parse, and the explicit
# encoding keeps the read independent of the platform's default locale.
with open("/kaggle/input/glove-300/glove.6B.300d.txt", encoding="utf-8") as f:
    for line in f:
        values = line.split()
        if not values:
            # Blank line (e.g. a trailing newline): nothing to index.
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

In [None]:
# Build the embedding matrix: row i holds the GloVe vector of the word whose
# tokenizer id is i. Row 0 and the rows of words missing from the GloVe index
# stay all-zeros.

EMBEDDING_DIM = 300
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # No pretrained vector for this word: leave its row as zeros.
        continue
    embedding_matrix[i] = embedding_vector

In [None]:
# Create the embedding layer, initialised from the GloVe embedding matrix.
#
# The layer class is taken from tf.keras rather than the standalone `keras`
# package: the rest of the model (Input, LSTM, Model, optimizer) is built with
# tf.keras, and mixing the two packages in one model is a known source of
# incompatibility errors. The `Embedding` name stays bound for any later cell
# that refers to it.
Embedding = tf.keras.layers.Embedding

embedding_layer = Embedding(len(word_index) + 1,      # vocabulary size (+1 for id 0)
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=28,          # padded sequence length
                            trainable=True)           # embeddings are fine-tuned during training

In [None]:
# Creation of the network architecture: embedding -> 4 stacked bidirectional
# LSTMs (each followed by dropout) -> single sigmoid unit.
with strategy.scope():
    # An integer input for vocab indices (one padded sequence of 28 ids).
    inputs = tf.keras.Input(shape=(28,), dtype="int32")

    # Map the vocab indices into the embedding space.
    x = embedding_layer(inputs)

    # The first three BiLSTM layers return the full sequence so that the next
    # recurrent layer receives one vector per time step.
    for _block in range(3):
        x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(100, return_sequences=True))(x)
        x = tf.keras.layers.Dropout(0.3)(x)

    # The last BiLSTM returns only its final output, which feeds the classifier.
    x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(100))(x)
    x = tf.keras.layers.Dropout(0.3)(x)
    predictions = tf.keras.layers.Dense(1, activation="sigmoid", name="predictions")(x)

    model = tf.keras.Model(inputs, predictions)
    model.summary()

    # `learning_rate` is the supported keyword; the old `lr` alias is deprecated
    # and rejected by newer Keras optimizers.
    model.compile(Adam(learning_rate=1e-5), loss='binary_crossentropy', metrics=[tf.keras.metrics.AUC()])

In [None]:
# Plot the model architecture (the layer graph of `model`).
tf.keras.utils.plot_model(model)

In [None]:
# Fit the model on the padded training sequences against the `target` column.
model.fit(
    seq_pad_train,
    train["target"].values,
    batch_size=16,
    epochs=20,
    verbose=1,
)

In [None]:
# Prediction on test data and submission frame creation.
# .copy() gives `final` its own data, so adding a column neither triggers
# pandas' SettingWithCopyWarning nor depends on how the slice was taken.
final = sample_submission[['id']].copy()
# The model ends in a single-unit Dense layer, so predict() yields one value per
# row in a 2-D array; flatten it to 1-D before storing it as a single column.
final['target'] = model.predict(seq_pad_test).ravel()


def thres(x):
    """Binarise a score: return 1 when `x` is at least 0.5, otherwise 0."""
    return 1 if x >= 0.5 else 0
    
# Turn the predicted scores into 0/1 labels with the 0.5 cut-off in `thres`.
final['target'] = final['target'].map(thres)

In [None]:
# Write the submission frame to CSV in the working directory; index=False keeps
# the DataFrame's row index out of the file.
final.to_csv("basic_transfer_14.csv",index=False)