In [None]:
#Importing the required packages
#Ignoring warnings
import warnings
warnings.filterwarnings('ignore') 
import numpy as np
import pandas as pd
from time import time
import operator
import string
import re
import os

from wordcloud import WordCloud,STOPWORDS
import matplotlib.pyplot as plt

import sklearn
from sklearn import utils
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics
from sklearn.metrics import f1_score

import nltk
from nltk import sent_tokenize
from nltk import word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords

import tqdm
from tqdm import tqdm, tqdm_notebook
tqdm_notebook().pandas()

import tensorflow as tf
import keras.preprocessing
import keras.layers
import keras.models
from keras import backend as K
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential,Model
from keras.engine.topology import Layer
from keras.layers import Activation,  Wrapper
from keras.layers import Dense, Input, CuDNNLSTM, Embedding, Dropout, Bidirectional, Flatten, SpatialDropout1D, LSTM
from keras.layers import BatchNormalization
from keras.layers import Concatenate
from keras import initializers, regularizers, constraints
from keras.callbacks import (EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, TensorBoard)

In [None]:
# Load the preprocessed training data into a DataFrame.
# The path is relative, so the CSV has to be in the notebook's working directory.
train_df = pd.read_csv('preprocessed_train_df.csv')

In [None]:
# Drop the rows that contain null values.
# train_df is rebound to the filtered frame, so the unfiltered data is no
# longer reachable under this name after the cell has run.
train_df = train_df.dropna()

In [None]:
# Move the 'target' column into its own single-column DataFrame so it can be
# used as the stratification key for the split.
# pop() removes the column from train_df in place, so this cell cannot be
# re-run without first reloading train_df.
target_df = pd.DataFrame(train_df.pop('target'))

In [None]:
# Stratified 75/25 split of features and labels. Stratifying on target_df
# keeps the class proportions the same in both partitions; the fixed
# random_state makes the split repeatable.
train_data, test_data, train_target, test_target = train_test_split(
    train_df,
    target_df,
    test_size=0.25,
    random_state=38,
    stratify=target_df,
)

In [None]:
# Print the per-class counts of the target before the split and in each of
# the two partitions, to confirm the stratification kept the class balance.
from collections import Counter

print(f"Original train dataset shape: {Counter(target_df['target'])}")
print(f"Resampled train dataset shape: {Counter(train_target['target'])}")
print(f"Resampled test dataset shape: {Counter(test_target['target'])}")

In [None]:
# Re-attach the labels to the training features. join() aligns train_target
# to train_data on the row index, producing one frame (train_df1) that holds
# both the feature columns and the 'target' column.
train_df1 = train_data.join(train_target)

In [None]:
# Re-attach the labels to the held-out features. join() aligns test_target
# to test_data on the row index, producing one frame (test_df) that holds
# both the feature columns and the 'target' column.
test_df = test_data.join(test_target)

In [None]:
# Pick out the question_text column of each partition (as pandas Series)
words_train, words_test = (
    frame['question_text'] for frame in (train_df1, test_df)
)

In [None]:
# Flatten each question_text Series into one whitespace-separated string.
# Series.to_string() is a display helper: it prepends the index label to every
# row and cuts each value to pandas' display.max_colwidth (50 characters by
# default, ending in "..."), which distorts any vocabulary count computed from
# the result. Joining the raw values keeps every question intact.
words_train = ' '.join(words_train.astype(str))
words_test = ' '.join(words_test.astype(str))

In [None]:
# Strip every digit character, then every newline, from the flattened train text
words_train = ''.join(ch for ch in words_train if not ch.isdigit()).replace('\n', '')

In [None]:
# Strip every digit character, then every newline, from the flattened test text
words_test = ''.join(ch for ch in words_test if not ch.isdigit()).replace('\n', '')

In [None]:
# Count the distinct whitespace-separated tokens in the flattened train text.
# The comparison is case-sensitive and punctuation attached to a word is part
# of the token, so "Why" / "why" / "why?" count as three different tokens.
len(set(words_train.split()))

In [None]:
# Count the distinct whitespace-separated tokens in the flattened test text.
# The comparison is case-sensitive and punctuation attached to a word is part
# of the token, so "Why" / "why" / "why?" count as three different tokens.
len(set(words_test.split()))

In [None]:
# Report the largest number of whitespace-separated words found in a single
# question, separately for the train and the test partition.
print('Max word length of questions in train is {0:.0f}.'.format(
    train_df1['question_text'].map(lambda question: len(question.split())).max()))
print('Max word length of questions in test is {0:.0f}.'.format(
    test_df['question_text'].map(lambda question: len(question.split())).max()))

In [None]:
# Tokenization hyper-parameters.
# Vocabulary cap: total number of unique words counted across the train and
# test datasets.
max_features = 188_991
# Fixed number of words kept per question; the longest question in the train
# dataset is around 100 words.
max_len = 100

In [None]:
# Replace any missing question text with the placeholder "_na_" and take the
# underlying value arrays for the tokenizer.
train_X, test_X = (
    frame["question_text"].fillna("_na_").values
    for frame in (train_df1, test_df)
)

In [None]:
# Build the tokenizer and fit its vocabulary on the training questions only.
#   lower=True   - text is lowercased before tokens are counted
#   filters=''   - disables Keras' default punctuation stripping
#   num_words    - caps the vocabulary used when texts are encoded
#   oov_token    - must be a string: it is inserted into word_index as the key
#                  that stands in for out-of-vocabulary words. A non-string
#                  value such as True would end up as a boolean key in
#                  word_index and break string handling of its keys.
tk = Tokenizer(lower=True, filters='', num_words=max_features, oov_token='<OOV>')
tk.fit_on_texts(list(train_X))

In [None]:
# Keep a handle on the tokenizer's word -> integer index mapping, which was
# built from the training questions when the tokenizer was fitted.
word_index = tk.word_index

In [None]:
# Encode the train and test questions as lists of integer word ids using the
# fitted tokenizer. train_X / test_X are rebound from raw text to sequences,
# so this cell must run exactly once after the text arrays are created.
train_X, test_X = (
    tk.texts_to_sequences(texts) for texts in (train_X, test_X)
)

In [None]:
# Bring every train sequence to exactly max_len ids: shorter ones are padded
# at the front, longer ones lose their tail. The bare expression on the last
# line displays the padded array.
train_X = pad_sequences(
    train_X,
    maxlen=max_len,
    padding='pre',
    truncating='post',
)
train_X

In [None]:
# Bring every test sequence to exactly max_len ids: shorter ones are padded
# at the front, longer ones lose their tail. The bare expression on the last
# line displays the padded array.
test_X = pad_sequences(
    test_X,
    maxlen=max_len,
    padding='pre',
    truncating='post',
)
test_X

In [None]:
# Extract the label column of each partition as a plain value array
train_y, test_y = (
    frame['target'].values for frame in (train_df1, test_df)
)

In [None]:
# Persist the prepared objects with IPython's %store magic so that another
# notebook can restore them (via %store -r) instead of recomputing them.
# NOTE(review): test_y is not stored in the lines visible here - confirm it is
# persisted as well if a downstream notebook needs the test labels.
%store train_df1
%store test_df
%store tk
%store max_features
%store max_len
%store word_index
%store train_X
%store test_X
%store train_y