In [None]:
#Importing the required packages
#Ignoring warnings
import warnings
warnings.filterwarnings('ignore') 
import numpy as np
import pandas as pd
from time import time
import operator
import string
import re
import os

from wordcloud import WordCloud,STOPWORDS
import matplotlib.pyplot as plt

import sklearn
from sklearn import utils
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics
from sklearn.metrics import f1_score

import nltk
from nltk import sent_tokenize
from nltk import word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords

import tqdm
from tqdm import tqdm, tqdm_notebook
tqdm_notebook().pandas()

import tensorflow as tf
import keras.preprocessing
import keras.layers
import keras.models
from keras import backend as K
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential,Model
from keras.engine.topology import Layer
from keras.layers import Activation,  Wrapper
from keras.layers import Dense, Input, CuDNNLSTM, Embedding, Dropout, Bidirectional, Flatten, SpatialDropout1D, LSTM
from keras.layers import BatchNormalization
from keras.layers import Concatenate
from keras import initializers, regularizers, constraints
from keras.callbacks import (EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, TensorBoard)

In [None]:
# Read the training set from train.csv into the train_df DataFrame
TRAIN_CSV_PATH = "train.csv"
train_df = pd.read_csv(TRAIN_CSV_PATH)

In [None]:
# Build the contraction expander (api_key "glove-twitter-100") and load its models
from pycontractions import Contractions
CONTRACTIONS_API_KEY = "glove-twitter-100"
cont = Contractions(api_key=CONTRACTIONS_API_KEY)
cont.load_models()

In [None]:
#Defining Function for Expanding contractions
def contraction_removal(record):
    """Expand contractions in one record's ``question_text``.

    The text is handed to ``cont.expand_texts`` as a one-item batch with
    ``precise=True``; the first expanded result is written back into
    ``record["question_text"]``.

    Args:
        record: Row-like object (e.g. a DataFrame row) with a
            ``"question_text"`` entry. It is modified in place.

    Returns:
        The same ``record`` object, now holding the expanded text.
    """
    single_item_batch = [record["question_text"]]
    expanded_texts = list(cont.expand_texts(single_item_batch, precise=True))
    record["question_text"] = expanded_texts[0]
    return record

In [None]:
# Run contraction_removal on every row of train_df (progress_apply shows a progress bar)
train_df = train_df.progress_apply(
    contraction_removal,
    axis=1,
)

In [None]:
#Defining function to correct misspelled words
from autocorrect import Speller
# Shared English Speller; created on the first call and reused afterwards so
# that it is not rebuilt for every row passed through progress_apply.
_speller_eng = None


def spell_check_eng(record):
    """Spell-correct one record's ``question_text`` with an English Speller.

    Args:
        record: Row-like object (e.g. a DataFrame row) with a
            ``"question_text"`` entry. It is modified in place.

    Returns:
        The same ``record`` object, with ``question_text`` replaced by the
        Speller's output for the original text.
    """
    global _speller_eng
    if _speller_eng is None:
        # Lazy construction: defining this function stays cheap, and the
        # Speller setup is paid once instead of once per record.
        _speller_eng = Speller(lang='en')
    record["question_text"] = _speller_eng(record["question_text"])
    return record

In [None]:
# Run spell_check_eng on every row of train_df (progress_apply shows a progress bar)
train_df = train_df.progress_apply(
    spell_check_eng,
    axis=1,
)

In [None]:
# Build the ekphrasis word segmenter that word_segmentation relies on
from ekphrasis.classes.segmenter import Segmenter
# corpus="english": segmenter using the word statistics from english Wikipedia
SEGMENTER_CORPUS = "english"
seg_eng = Segmenter(corpus=SEGMENTER_CORPUS)

In [None]:
#Defining word segmentation function
def word_segmentation(record):
    """Pass one record's ``question_text`` through ``seg_eng.segment``.

    Args:
        record: Row-like object (e.g. a DataFrame row) with a
            ``"question_text"`` entry. It is modified in place.

    Returns:
        The same ``record`` object, with ``question_text`` replaced by the
        segmenter's output.
    """
    segmented_text = seg_eng.segment(record["question_text"])
    record["question_text"] = segmented_text
    return record

In [None]:
# Run word_segmentation on every row of train_df (progress_apply shows a progress bar)
train_df = train_df.progress_apply(
    word_segmentation,
    axis=1,
)

In [None]:
#Defining function to remove punctuations
# Characters deleted by punc_remove_wrapper. A few entries repeat, which is
# harmless. NOTE: besides punctuation and symbols, the list also contains some
# accented letters (e.g. 'é', 'à', 'Ã'), so those are deleted as well.
_PUNCTS = [
     ',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&',
    '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£',
    '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',
    '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', '“', '★', '”',
    '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾',
    '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', '▒', '：', '¼', '⊕', '▼',
    '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲',
    'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', '∙', '）', '↓', '、', '│', '（', '»',
    '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø',
    '¹', '≤', '‡', '√', '«', '»', '´', 'º', '¾', '¡', '§', '£', '₤']

# Deletion table built once when the cell runs, so each record costs a single
# str.translate pass instead of rebuilding the list and scanning the text once
# (or twice) per character.
_PUNCT_DELETE_TABLE = str.maketrans('', '', ''.join(_PUNCTS))


def punc_remove_wrapper(record, maxlen=None):
    """Return a record's ``question_text`` with every listed character deleted.

    Args:
        record: Row-like object (e.g. a DataFrame row) with a string
            ``"question_text"`` entry. It is not modified.
        maxlen: Unused; kept so existing callers that pass it keep working.

    Returns:
        A new string equal to ``record["question_text"]`` with all characters
        from ``_PUNCTS`` removed (nothing is inserted in their place).
    """
    return record["question_text"].translate(_PUNCT_DELETE_TABLE)

In [None]:
# Replace question_text with the punctuation-free string returned for each row
train_df["question_text"] = train_df.progress_apply(
    punc_remove_wrapper,
    axis=1,
)

In [None]:
#Storing english stop words into stop_words
# Fetch the NLTK stop-word corpus first if it is not installed, so this cell
# also runs on a fresh environment instead of failing with LookupError.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stop_words

In [None]:
# Lower-case each question and drop every whitespace-separated token found in stop_words
train_df["question_text"] = train_df["question_text"].progress_apply(
    lambda text: " ".join(
        token for token in str(text).lower().split() if token not in stop_words
    )
)

In [None]:
#Removing numbers from train_df
# Raw string so that \d reaches the regex engine without relying on Python
# passing through an unrecognised string escape.
train_df["question_text"] = train_df["question_text"].progress_apply(lambda x: re.sub(r"\d+", "", x))
print(train_df["question_text"].head(10))