<a href="https://colab.research.google.com/github/yan-69/My-projects/blob/master/KaggleChallange1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Importing the training data and loading it into a pandas DataFrame**

In [None]:
import tensorflow as tf
import pandas as pd

# Columns read from the training CSV; "text" and "target" are the ones used below.
columns = ["id", "keyword", "location", "text", "target"]
train_csv_path = '/content/drive/MyDrive/Colab Notebooks/train.csv'
df_train = pd.read_csv(train_csv_path, usecols=columns)

# Show ten raw texts starting at position 11, then the length of the
# text and target columns (one number per line).
print(df_train['text'][11:21])
for column_name in ("text", "target"):
    print(len(df_train[column_name]))

### ***Plotting the training data***

In [None]:
import seaborn as sns

# Class balance of the label column. `x` is passed by keyword because recent
# seaborn releases accept only `data` positionally; a positional "target"
# would collide with data=df_train.
sns.countplot(x="target", data=df_train)

### **Cleaning the data**

In [None]:
import string
import re
import random 

def text_cleaner(data):
  """Normalise one piece of text for tokenisation.

  Steps, in order:
    1. drop every character that is not in ``string.printable``
       (i.e. anything non-ASCII such as accented letters or emoji),
    2. drop ASCII punctuation and lower-case what remains,
    3. delete all runs of digits.

  Whitespace is left untouched, so removed tokens can leave double spaces.

  Args:
    data: the raw text. Non-string values (e.g. ``None`` or a NaN from a
      pandas column) are treated as empty text.

  Returns:
    The cleaned string.
  """
  if not isinstance(data, str):
    return ''
  printable = set(string.printable)
  punctuation = set(string.punctuation)
  # Filter and lower-case in a single pass so the printable-only filter is
  # actually applied to the text that gets returned.
  cleaned_data = ''.join(
      char.lower() for char in data
      if char in printable and char not in punctuation
  )
  cleaned_data = re.sub('[0-9]+', '', cleaned_data)
  return cleaned_data



# Add a cleaned copy of every training text next to the raw column.
df_train["cleaned_text"] = df_train["text"].apply(text_cleaner)
print(df_train["cleaned_text"][:6])

# Shuffle the rows and renumber the index. The seed is fixed so that a
# Restart & Run All reproduces the same order, and therefore the same
# positional train/dev split further down.
df_train = df_train.sample(frac=1, random_state=42).reset_index(drop=True)
print(df_train["cleaned_text"][:6])

### **Importing the test data and loading it into a pandas DataFrame**


In [None]:
# The test CSV is read with the same columns as the training CSV minus "target".
test_columns = ["id", "keyword", "location", "text"]
test_csv_path = '/content/drive/MyDrive/Colab Notebooks/test.csv'

df_test = pd.read_csv(test_csv_path, usecols=test_columns)

# Number of test texts loaded.
print(len(df_test["text"]))


### **Cleaning the test data**

In [None]:
# Run the same cleaner over the test texts.
# NOTE(review): this column is named "cleaned_data" while the training frame
# uses "cleaned_text" - confirm which name downstream code expects.
df_test["cleaned_data"] = df_test["text"].apply(text_cleaner)

# Shuffle the rows, then renumber the index from 0.
df_test = df_test.sample(frac=1)
df_test = df_test.reset_index(drop=True)


### **Preparing the data for the training process**

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import regularizers
import numpy as np

# Every sequence is padded/truncated to this many tokens; the first
# TRAIN_SIZE shuffled rows are used for training and the rest for dev.
MAX_SEQUENCE_LENGTH = 32
TRAIN_SIZE = 7500

# Use the cleaned text produced by text_cleaner (not the raw "text" column),
# so the cleaning step actually feeds the model.
sentences = df_train["cleaned_text"].tolist()
labels = df_train["target"].tolist()

# Build the vocabulary from the training sentences.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index
vocab_size = len(word_index)

# Convert each sentence to a list of word ids, then pad/truncate at the end
# so all rows have the same length.
sequences = tokenizer.texts_to_sequences(sentences)
padded_data = pad_sequences(
    sequences,
    maxlen=MAX_SEQUENCE_LENGTH,
    padding='post',
    truncating='post',
)

# Positional split: df_train was shuffled earlier, so this is a random split.
train_sentences = padded_data[:TRAIN_SIZE]
train_labels = labels[:TRAIN_SIZE]
dev_sentences = padded_data[TRAIN_SIZE:]
dev_labels = labels[TRAIN_SIZE:]

print(len(train_sentences))
print(len(train_labels))
print(len(dev_sentences))
print(len(dev_labels))



In [None]:
embedding_dim = 100

!wget --no-check-certificate \
    https://storage.googleapis.com/laurencemoroney-blog.appspot.com/glove.6B.100d.txt \
    -O /tmp/glove.6B.100d.txt
embeddings_index = {};
with open('/tmp/glove.6B.100d.txt') as f:
    for line in f:
        values = line.split();
        word = values[0];
        coefs = np.asarray(values[1:], dtype='float32');
        embeddings_index[word] = coefs;

embeddings_matrix = np.zeros((vocab_size+1, embedding_dim));
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word);
    if embedding_vector is not None:
        embeddings_matrix[i] = embedding_vector;

In [None]:
# Number of rows in the embedding matrix (vocab_size + 1).
print(embeddings_matrix.shape[0])

### **Training process of the model**

In [None]:
# Binary text classifier: frozen GloVe embeddings -> dropout -> three 1-D
# convolutions -> bidirectional LSTM -> dense head with a sigmoid output.
model = tf.keras.Sequential()

# Embedding weights come from the pre-built GloVe matrix and are not trained.
model.add(tf.keras.layers.Embedding(vocab_size+1, embedding_dim, input_length=32, weights=[embeddings_matrix], trainable=False))
model.add(tf.keras.layers.Dropout(0.3))
model.add(tf.keras.layers.Conv1D(16, 6, activation='relu'))
model.add(tf.keras.layers.Conv1D(32, 6, activation='relu'))
model.add(tf.keras.layers.Conv1D(64, 6, activation='relu'))

# Explicit forward/backward LSTMs for the Bidirectional wrapper.
# NOTE(review): the backward LSTM uses activation='relu' while the forward one
# keeps the default activation - confirm this asymmetry is intentional.
forward_layer = tf.keras.layers.LSTM(128, return_sequences=False)
backward_layer = tf.keras.layers.LSTM(128, activation='relu', return_sequences=False,
                       go_backwards=True)
# No input_shape here: this is not the first layer, so its input shape is
# inferred from the preceding Conv1D output.
model.add(tf.keras.layers.Bidirectional(forward_layer, backward_layer=backward_layer))
model.add(tf.keras.layers.Dense(256, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

# `learning_rate` is the supported argument name; the old `lr` alias is
# deprecated and rejected by newer Keras versions.
model.compile(loss='binary_crossentropy', optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001), metrics=['accuracy'])
model.summary()

In [None]:
# Convert the inputs and labels to NumPy arrays before fitting.
train_sentences, dev_sentences = np.array(train_sentences), np.array(dev_sentences)
train_labels, dev_labels = np.array(train_labels), np.array(dev_labels)

# Train for 20 epochs, evaluating on the dev split after every epoch.
# `history` keeps the per-epoch metrics for the plots below.
history = model.fit(
    train_sentences,
    train_labels,
    epochs=20,
    validation_data=(dev_sentences, dev_labels),
    verbose=1,
)

In [None]:
# Round the sigmoid outputs to the nearest integer to get hard 0/1 labels.
# np.round replaces np.round_, which is deprecated and removed in NumPy 2.0.
predicted_labels = np.round(model.predict(dev_sentences))
print(len(predicted_labels))



In [None]:

# Compare predictions with the true dev labels element by element.
# Both are flattened first so a (n, 1) prediction array lines up with the
# 1-D label array.
predicted_flat = np.asarray(predicted_labels).ravel()
actual_flat = np.asarray(dev_labels).ravel()

num_total = len(predicted_flat)
num_correct = int((predicted_flat == actual_flat).sum())

# Accuracy is derived from the counts above (not a hard-coded ratio); the
# guard avoids a ZeroDivisionError when the dev split is empty.
accuracy_pct = round(100 * num_correct / num_total) if num_total else 0

print(F"You have {num_correct} correct predictions out of {num_total}")
print(F"Your model has {accuracy_pct}% accuracy on unseen data")

In [None]:
import matplotlib.pyplot as plt

# Plot the true dev labels against sample position using triangle markers.
fig, ax = plt.subplots()
ax.plot(dev_labels, 'v')

In [None]:
# Plot the predicted dev labels against sample position using triangle markers.
fig, ax = plt.subplots()
ax.plot(predicted_labels, 'v')

In [None]:
# Per-epoch metrics recorded by model.fit. The loss series are extracted here
# as well because the loss plot in the next cell reads them.
train_acc = history.history['accuracy']
dev_acc = history.history['val_accuracy']
train_loss = history.history['loss']
dev_loss = history.history['val_loss']
epochs = range(len(train_acc))

# Training (red) vs dev (green) accuracy per epoch.
plt.plot(epochs, train_acc, 'r')
plt.plot(epochs, dev_acc, 'g')
plt.title("Training accuracy vs dev accuracy")
plt.xlabel("Epochs")
plt.ylabel("Accuracy")
plt.legend(["Train_acc","Dev_acc"])
# Render this figure; a trailing plt.figure() would only add an empty one.
plt.show()

In [None]:
# Training (blue) vs dev (green) loss per epoch.
plt.plot(epochs, train_loss, 'b')
plt.plot(epochs, dev_loss, 'g')
plt.title("Training vs Dev loss")
# The x axis is the epoch index and the y axis is the loss value for both curves.
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.legend(["Training_loss","Dev_loss"])
# Render this figure; a trailing plt.figure() would only add an empty one.
plt.show()