In [None]:
import pandas as pd
#pandas
import numpy as np
#numpy
import matplotlib.pyplot as plt
#matplotlib
import seaborn as sns
#seaborn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
#sklearn
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Conv1D, MaxPooling1D, LSTM
from keras import utils
from keras.callbacks import ReduceLROnPlateau, EarlyStopping
from tensorflow.keras.optimizers import Adam
#keras
import tensorflow as tf
#tensorflow
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
#nltk
import re
import os
#other useful stuff
from wordcloud import WordCloud, STOPWORDS
#wordclouds and cloud stopwords
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the raw tweet dump. header=None keeps the first record as data: the
# following cell assigns all six column names by hand, so letting pandas infer
# a header would silently consume (and lose) the first tweet.
# NOTE(review): assumes the CSV ships without a header row — confirm against the file.
df = pd.read_csv(
    '../input/training1600000processednoemoticoncsv/training.1600000.processed.noemoticon.csv',
    encoding='latin',
    header=None,
)

In [None]:
# Column labels for the six fields of the raw file, in file order.
head = [
    'target',
    'ID',
    'Date',
    'Flag',
    'User',
    'Message',
]
df.columns = head

In [None]:
# Class balance check: pull out the label column and count each distinct value.
target = df.target
target.value_counts()

In [None]:
# Discard the 'Flag' column, then echo the remaining column labels.
df = df.drop(columns=['Flag'])
df.columns

In [None]:
# Display the current state of df as the cell output.
df

In [None]:
# Recode the labels to {0, 1}: raw value 4 becomes 1, raw value 0 stays 0.
df['target'] = df['target'].replace({0: 0, 4: 1})

In [None]:
# Number of whitespace-delimited tokens in each message.
df['length'] = df['Message'].str.split().map(len)
df

In [None]:
# plt.figure(figsize=(14,7))
# word_cloud = WordCloud(stopwords = STOPWORDS, max_words = 200, width=1366, height=768, background_color="white").generate(" ".join(df[df.target==1].Message))
# plt.imshow(word_cloud,interpolation='bilinear')
# plt.axis('off')
# plt.title('Most common words in positive sentiment tweets.',fontsize=20)
# plt.show()

In [None]:
# plt.figure(figsize=(14,7))
# word_cloud = WordCloud(stopwords = STOPWORDS, max_words = 200, width=1366, height=768, background_color="white").generate(" ".join(df[df.target==0].Message))
# plt.imshow(word_cloud,interpolation='bilinear')
# plt.axis('off')
# plt.title('Most common words in negative sentiment tweets.',fontsize=20)
# plt.show()

In [None]:
# Only the label and the message text are needed from here on.
df = df.drop(columns=['ID', 'Date', 'User', 'length'])
df

In [None]:
# Stored as a set so the per-token `in` check in preprocess() is a hash lookup
# instead of a linear scan of the stopword list.
english_stopwords = set(stopwords.words('english'))
stemmer = SnowballStemmer('english')
# Raw string so the \S escapes reach the regex engine untouched (a plain string
# literal triggers Python's invalid-escape-sequence warning).
# Alternatives, in order: @mentions, http(s) links, any run of non-alphanumerics.
regex = r"@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"

In [None]:
def preprocess(message, stem=True):
  """Clean one message and return it as a space-separated token string.

  The input is converted to ``str`` and lower-cased, every match of the
  module-level ``regex`` is replaced by a single space, the result is split on
  whitespace, and tokens found in ``english_stopwords`` are dropped.

  Args:
    message: The raw text; any object is accepted and passed through ``str()``.
    stem: When true, each surviving token is reduced with the module-level
      ``stemmer``. The flag used to be ignored (tokens were always stemmed);
      it is now honoured, and it defaults to True so that existing calls that
      omit it keep producing the same output.

  Returns:
    The cleaned tokens joined by single spaces (an empty string when nothing
    survives the filtering).
  """
  cleaned = re.sub(regex, ' ', str(message).lower()).strip()
  kept = (token for token in cleaned.split() if token not in english_stopwords)
  if stem:
    kept = (stemmer.stem(token) for token in kept)
  return " ".join(kept)

In [None]:
# Replace every raw message with its cleaned form.
df['Message'] = df['Message'].apply(preprocess)
df

In [None]:
# Per-column NaN audit: absolute counts next to the percentage of rows affected.
null_mask = df.isna()
missing_data = null_mask.sum().sort_values(ascending=False)
percentage_missing = (
    (null_mask.sum() / null_mask.count())
    .sort_values(ascending=False)
    .mul(100)
    .round(2)
)
missing_info = pd.concat(
    [missing_data, percentage_missing],
    axis=1,
    keys=['Missing values', 'Percentage'],
)
missing_info.style.background_gradient()

In [None]:
# Persist the cleaned frame; NaN cells are written as the literal placeholder
# passed to na_rep (spelling kept exactly as-is).
df.to_csv(path_or_buf='processedf.csv', na_rep='Unkown')
# df.to_csv('processed.csv')

In [None]:
# df=pd.read_csv('../input/processedf/processedf.csv', encoding='latin')
# df['target'] = df['target'].replace([0, 4],[0,1])
# df

In [None]:
# missing_data = df.isna().sum().sort_values(ascending=False)
# percentage_missing = round((df.isnull().sum()/df.isnull().count()).sort_values(ascending=False)*100,2)
# missing_info = pd.concat([missing_data,percentage_missing],keys=['Missing values','Percentage'],axis=1)
# missing_info.style.background_gradient()

In [None]:
# Hold out 10% of the rows for testing; the fixed seed makes the split repeatable.
train, test = train_test_split(
    df,
    test_size=0.1,
    random_state=44,
)

In [None]:
# Write both partitions to the working directory.
train.to_csv(path_or_buf='train.csv')
test.to_csv(path_or_buf='test.csv')

In [None]:
# Length every sequence is padded to in the padding cell.
max_length = 50

# The vocabulary is built from the training messages only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train.Message)

# One extra slot on top of the indexed words; the embedding matrix built later
# is addressed directly by these word indices.
vocab_size = 1 + len(tokenizer.word_index)

In [None]:
# Messages -> integer id sequences, using the vocabulary fitted on the training split.
sequences_train, sequences_test = (
    tokenizer.texts_to_sequences(split.Message) for split in (train, test)
)

# Pad at the end of each sequence up to max_length.
X_train, X_test = (
    pad_sequences(seq, maxlen=max_length, padding='post')
    for seq in (sequences_train, sequences_test)
)

# Labels as plain arrays.
y_train, y_test = train.target.values, test.target.values

In [None]:
embedding_dim = 100
embeddings_dictionary = {}

# Each line holds a word followed by its vector components. The context manager
# guarantees the handle is closed even if parsing raises, and the explicit
# encoding keeps decoding independent of the platform default.
with open('/kaggle/input/glove6b100dtxt/glove.6B.100d.txt', encoding='utf-8') as glove_file:
    for line in glove_file:
        records = line.split()
        word = records[0]
        vector_dimensions = np.asarray(records[1:], dtype='float32')
        embeddings_dictionary[word] = vector_dimensions

# Row i holds the pretrained vector of the word whose tokenizer index is i;
# words without a pretrained vector keep an all-zero row.
embeddings_matrix = np.zeros((vocab_size, embedding_dim))
for word, index in tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embeddings_matrix[index] = embedding_vector

In [None]:
# Embedding layer initialised from the pretrained matrix and excluded from training.
embedding_layer = tf.keras.layers.Embedding(
    input_dim=vocab_size,
    output_dim=embedding_dim,
    input_length=max_length,
    weights=[embeddings_matrix],
    trainable=False,
)

In [None]:
# Training schedule: number of passes over the data and samples per gradient step.
num_epochs, batch_size = 10, 1000

In [None]:
# Two stacked bidirectional LSTMs over the pretrained embeddings, followed by a
# small dense head with a single sigmoid output for the binary label.
# Every component comes from the tf.keras namespace: building a model from
# `keras.*` and `tf.keras.*` objects together can fail when the two packages
# are not the same implementation.
model = tf.keras.Sequential([
        embedding_layer,
        # return_sequences=True so the second recurrent layer receives the full sequence.
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences=True)),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128)),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid'),
    ])

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

In [None]:
# Render the model graph with shape information shown (show_shapes=True).
tf.keras.utils.plot_model(model, show_shapes=True)

In [None]:
# Binary cross-entropy with Adam; accuracy is tracked during training.
model.compile(
    optimizer=Adam(learning_rate=0.001, epsilon=1e-07),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# The held-out split is passed as validation data, so the val_* curves in
# `history` are measured on it after every epoch.
history = model.fit(
    X_train,
    y_train,
    epochs=num_epochs,
    batch_size=batch_size,
    validation_data=(X_test, y_test),
    verbose=1,
)

In [None]:
# Model outputs -> hard labels: strictly above 0.5 is class 1, otherwise class 0.
y_pred = (model.predict(X_test) > 0.5).astype(int)

In [None]:
# Per-class precision / recall / F1 on the held-out split.
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
# Training curves. Both panels live on one figure so the suptitle, which names
# accuracy AND loss, labels both of them; with two separate figures it was
# attached to the loss figure only.
fig, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(20, 5))

# History for accuracy
ax_acc.plot(history.history['accuracy'])
ax_acc.plot(history.history['val_accuracy'])
ax_acc.set_title('model accuracy')
ax_acc.set_ylabel('accuracy')
ax_acc.set_xlabel('epoch')
ax_acc.legend(['Train accuracy', 'Test accuracy'], loc='lower right')

# History for loss
ax_loss.plot(history.history['loss'])
ax_loss.plot(history.history['val_loss'])
ax_loss.set_title('model loss')
ax_loss.set_ylabel('loss')
ax_loss.set_xlabel('epoch')
ax_loss.legend(['Train loss', 'Test loss'], loc='upper right')

fig.suptitle('Accuracy and loss for second model')
plt.show()