# Imports

In [11]:
import pandas as pd
import string
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import recall_score
from sklearn.model_selection import GridSearchCV
from lazypredict.Supervised import LazyClassifier


%load_ext autoreload
%autoreload 2

#  Read Data

In [12]:
train = pd.read_csv("raw_data/train.csv")
test = pd.read_csv("raw_data/test.csv")


In [13]:
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
# check for empty rows
train[pd.isna(train.text)]

Unnamed: 0,id,keyword,location,text,target


In [5]:
test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


# Preprocess data

In [41]:
def preprocessing(sentence):
    # remove whitespace
    [text.strip() for text in sentence]
    
    # lowercase characters
    sentence = sentence.lower()
    
    # remove numbers
    sentence = ''.join(char for char in sentence if not char.isdigit())
    
    # remove punctuation
    for punctuation in string.punctuation:
        sentence = sentence.replace(punctuation, '') 
    
    #tokenize
    sentence_tokens = word_tokenize(sentence)
    
    # remove stopwords
    #stop_words = set(stopwords.words('english'))
    #sentence_tokens = [w for w in sentence_tokens if not w in stop_words] 
    
    # Lemmatizing the verbs
    verb_lemmatized = [WordNetLemmatizer().lemmatize(word, pos = "v")  # v --> verbs
              for word in sentence_tokens]

    # 2 - Lemmatizing the nouns
    sentence_preprocessed = [WordNetLemmatizer().lemmatize(word, pos = "n")  # n --> nouns
              for word in verb_lemmatized]
    
    return ' '.join(word for word in sentence_preprocessed)

In [42]:
train["clean_text"] = train.text.apply(preprocessing)

In [43]:
train.clean_text.head()

0    our deed be the reason of this earthquake may ...
1                forest fire near la ronge sask canada
2    all resident ask to shelter in place be be not...
3    people receive wildfire evacuation order in ca...
4    just get send this photo from ruby alaska a sm...
Name: clean_text, dtype: object

In [44]:
# Feature/Target
X = train["clean_text"]
y = train["target"]

# Baseline Model

In [45]:
# Train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=80)

In [46]:
# Pipeline vectorizer + Naive Bayes
pipeline_nb = make_pipeline(
    TfidfVectorizer(), 
    MultinomialNB()
)

# Cross-validation
cv_results = cross_validate(pipeline_nb, X_train, y_train, cv = 5, scoring = ["recall"])
average_recall = cv_results["test_recall"].mean()
np.round(average_recall,2)

0.6

The result of the baseline model - a simple multinomial naiva bayes algorithm based on tfidf vectorizer - returns a result of 0.64

In [47]:
pipeline_nb.fit(X_train, y_train)

In [48]:
pipeline_nb.score(X_test,y_test)

0.7981611208406305

In [49]:
pipeline_nb.get_params()

{'memory': None,
 'steps': [('tfidfvectorizer', TfidfVectorizer()),
  ('multinomialnb', MultinomialNB())],
 'verbose': False,
 'tfidfvectorizer': TfidfVectorizer(),
 'multinomialnb': MultinomialNB(),
 'tfidfvectorizer__analyzer': 'word',
 'tfidfvectorizer__binary': False,
 'tfidfvectorizer__decode_error': 'strict',
 'tfidfvectorizer__dtype': numpy.float64,
 'tfidfvectorizer__encoding': 'utf-8',
 'tfidfvectorizer__input': 'content',
 'tfidfvectorizer__lowercase': True,
 'tfidfvectorizer__max_df': 1.0,
 'tfidfvectorizer__max_features': None,
 'tfidfvectorizer__min_df': 1,
 'tfidfvectorizer__ngram_range': (1, 1),
 'tfidfvectorizer__norm': 'l2',
 'tfidfvectorizer__preprocessor': None,
 'tfidfvectorizer__smooth_idf': True,
 'tfidfvectorizer__stop_words': None,
 'tfidfvectorizer__strip_accents': None,
 'tfidfvectorizer__sublinear_tf': False,
 'tfidfvectorizer__token_pattern': '(?u)\\b\\w\\w+\\b',
 'tfidfvectorizer__tokenizer': None,
 'tfidfvectorizer__use_idf': True,
 'tfidfvectorizer__vocab

# Hyperparameter Tuning

In [54]:
# Set parameters to search
parameters = {
    'tfidfvectorizer__min_df':(0, 0.5, 1.0, 1.5, 2.0, 2.5),
    'tfidfvectorizer__max_df':(0, 0.5, 1.0, 1.5, 2.0, 2.5),
    'tfidfvectorizer__ngram_range': ((1,1), (1,2), (1, 3), (2, 4)),
    'multinomialnb__alpha': (0.001, 0.01, 0.05, 0.1, 0.5, 1, 2),
    'tfidfvectorizer__max_features': (100, 500, 1000, 2500, 5000)}

# Perform grid search on pipeline
grid_search = GridSearchCV(pipeline_nb, parameters,
                           cv = 5, n_jobs=-1, verbose=1, scoring = "f1")

grid_search.fit(X_train, y_train)

# Best score
print(f"Best Score = {grid_search.best_score_}")

# Best params
print(f"Best params = {grid_search.best_params_}")

Fitting 5 folds for each of 5040 candidates, totalling 25200 fits
Best Score = 0.7355777260825367
Best params = {'multinomialnb__alpha': 0.1, 'tfidfvectorizer__max_df': 0.5, 'tfidfvectorizer__max_features': 2500, 'tfidfvectorizer__min_df': 0, 'tfidfvectorizer__ngram_range': (1, 1)}


In [52]:
grid_search.score(X_test, y_test)

0.7438202247191013

In [38]:
test["clean_text"] = test.text.apply(preprocessing)

In [39]:
test["target"] = grid_search.predict(test["clean_text"])

In [58]:
res = pd.DataFrame(test[["id", "target"]])
res = res.set_index("id", drop=True)

In [59]:
print(res)

       target
id           
0           1
2           0
3           1
9           1
11          1
...       ...
10861       1
10865       1
10868       1
10874       1
10875       1

[3263 rows x 1 columns]


In [60]:
res.to_csv("result.csv")

# lazypredict

In [21]:
# Instantiating the TfidfVectorizer
tf_idf_vectorizer = TfidfVectorizer(min_df = 0.005)

# Training it on the texts
weighted_words = pd.DataFrame(tf_idf_vectorizer.fit_transform(train.clean_text).toarray(),
                 columns = tf_idf_vectorizer.get_feature_names_out())

weighted_words

Unnamed: 0,accident,affect,air,also,always,ambulance,amp,another,area,army,...,wound,wreck,wreckage,year,youre,youtube,yr,ûª,ûªs,ûò
0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
1,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
2,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
3,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
4,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7608,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
7609,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
7610,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
7611,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00


In [16]:
X_train, X_test, y_train, y_test = train_test_split(weighted_words, y, test_size=.5,random_state =123)

clf = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)

models,predictions = clf.fit(X_train, X_test, y_train, y_test)

print(models)

100%|███████████████████████████████████████████| 29/29 [00:37<00:00,  1.31s/it]

                               Accuracy  Balanced Accuracy  ROC AUC  F1 Score  \
Model                                                                           
NuSVC                              0.77               0.75     0.75      0.76   
SVC                                0.77               0.75     0.75      0.76   
BernoulliNB                        0.76               0.74     0.74      0.75   
ExtraTreesClassifier               0.75               0.74     0.74      0.75   
LogisticRegression                 0.75               0.74     0.74      0.75   
LinearDiscriminantAnalysis         0.75               0.74     0.74      0.75   
RidgeClassifierCV                  0.75               0.74     0.74      0.75   
RidgeClassifier                    0.75               0.74     0.74      0.75   
CalibratedClassifierCV             0.76               0.74     0.74      0.75   
NearestCentroid                    0.75               0.74     0.74      0.75   
XGBClassifier               




# Word Embedding

In [66]:
X_train_np = X_train.to_numpy()
X_test_np = X_test.to_numpy()

In [71]:
X_train_np

array(['debris confirm mh relative hope discovery crash site httptcorlftjmhhvt via reuters video',
       'haunt memory draw survivor httptcowjujfsfd',
       'england east coast dogger bank westward seismic survey progress mv western regent tow metre long cable within area\x89û',
       ...,
       'islamic state issue new holiday brochure lovely swim pool drown shoot range downside cost bomb',
       'th person die ny legionnaire disease outbreak httptcofjdmqhyai sebee',
       'themagickidraps upset rally upset burn build business execute cop nothing etc'],
      dtype=object)

In [68]:
from tensorflow.keras.preprocessing.text import text_to_word_sequence

2023-02-05 19:41:48.647782: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [76]:
X_train_prep = [text_to_word_sequence(i) for i in X_train_np]
X_test_prep = [text_to_word_sequence(i) for i in X_test_np]

In [77]:
X_train_prep

[['debris',
  'confirm',
  'mh',
  'relative',
  'hope',
  'discovery',
  'crash',
  'site',
  'httptcorlftjmhhvt',
  'via',
  'reuters',
  'video'],
 ['haunt', 'memory', 'draw', 'survivor', 'httptcowjujfsfd'],
 ['england',
  'east',
  'coast',
  'dogger',
  'bank',
  'westward',
  'seismic',
  'survey',
  'progress',
  'mv',
  'western',
  'regent',
  'tow',
  'metre',
  'long',
  'cable',
  'within',
  'area\x89û'],
 ['role',
  'usg',
  'paeds',
  'major',
  'trauma',
  'image',
  'decision',
  'tool',
  'foam',
  'foamcc'],
 ['annmarieronan',
  'niamhosullivanx',
  'cant',
  'watch',
  'tat',
  'show',
  'like',
  'horror',
  'movie',
  'get',
  'flashback',
  'everything',
  'traumatise'],
 ['last',
  'chance',
  'animal',
  'rescue',
  'new',
  'post',
  'httptcokiildugpo',
  'animalrescue',
  'httpstcowudlkqncx'],
 ['message',
  'send',
  'dont',
  'reply',
  'see',
  'saw',
  'message',
  'least',
  'tell',
  'fuck',
  'something'],
 ['worst', 'part', 'see', 'lightning', 'try', 

In [79]:
from gensim.models import Word2Vec

# This line trains an entire embedding for the words in your train set
word2vec = Word2Vec(sentences=X_train_prep, vector_size=50, min_count = 5)

In [55]:
word2vec.wv['hiroshima']

NameError: name 'word2vec' is not defined

In [84]:
from tensorflow.keras import Sequential
from tensorflow.keras import Sequential, layers, models

In [87]:
cnn = Sequential([
    layers.Embedding(input_dim=5000, input_length=20, output_dim=30, mask_zero=True),
    layers.Conv1D(20, kernel_size=3),
    layers.Flatten(),
    layers.Dense(1, activation="sigmoid"),
])

cnn.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

cnn.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 20, 30)            150000    
                                                                 
 conv1d_2 (Conv1D)           (None, 18, 20)            1820      
                                                                 
 flatten_2 (Flatten)         (None, 360)               0         
                                                                 
 dense_2 (Dense)             (None, 1)                 361       
                                                                 
Total params: 152,181
Trainable params: 152,181
Non-trainable params: 0
_________________________________________________________________


In [None]:
from tensorflow.keras.callbacks import EarlyStopping

es = EarlyStopping(patience = 15, restore_best_weights = True)

history = cnn.fit(X_train_prep, y_train,
          epochs=15,
          validation_split=0.2, 
          callbacks = [es],
          batch_size=32, 
          verbose=1)

In [99]:
# Feature/Target
X = train["text"]
y = train["target"]

In [100]:
# Train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=80)

In [101]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
    
# We apply the tokenization to the train and test set
X_train_token = tokenizer.texts_to_sequences(X_train)
X_test_token = tokenizer.texts_to_sequences(X_test)


X_train_pad = pad_sequences(X_train_token, maxlen=150, dtype='float32')
X_test_pad = pad_sequences(X_test_token, maxlen=150, dtype='float32')

vocab_size = len(tokenizer.word_index)
vocab_size

17658

In [126]:
from tensorflow.keras import Sequential, layers

def init_cnn_model(vocab_size):
    model = Sequential()
    model.add(layers.Embedding(input_dim=vocab_size + 1, output_dim=10, mask_zero=True, input_length=150))
    model.add(layers.Conv1D(128, 3))
    model.add(layers.Conv1D(128, 4))
    model.add(layers.Conv1D(128, 5))
    model.add(layers.Flatten())
    model.add(layers.Dropout(0.6))
    model.add(layers.Dense(5,))
    model.add(layers.Dropout(0.6))
    model.add(layers.Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

model_cnn = init_cnn_model(vocab_size)

In [127]:
model_cnn.summary()

Model: "sequential_11"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_11 (Embedding)    (None, 150, 10)           176590    
                                                                 
 conv1d_16 (Conv1D)          (None, 148, 128)          3968      
                                                                 
 conv1d_17 (Conv1D)          (None, 145, 128)          65664     
                                                                 
 conv1d_18 (Conv1D)          (None, 141, 128)          82048     
                                                                 
 flatten_11 (Flatten)        (None, 18048)             0         
                                                                 
 dropout_13 (Dropout)        (None, 18048)             0         
                                                                 
 dense_19 (Dense)            (None, 5)               

In [128]:
from tensorflow.keras.callbacks import EarlyStopping

es = EarlyStopping(patience=5, restore_best_weights=True)

model_cnn.fit(X_train_pad, y_train, 
          epochs=20, 
          batch_size=32,
          validation_split=0.3,
          callbacks=[es]
         )


res = model_cnn.evaluate(X_test_pad, y_test, verbose=0)

print(f'The accuracy evaluated on the test set is of {res[1]*100:.3f}%')

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
The accuracy evaluated on the test set is of 78.327%


# Word2vec embedding

In [133]:
from tensorflow import keras

y_train_cat = keras.utils.to_categorical(y_train)
y_test_cat = keras.utils.to_categorical(y_test)

In [134]:
import gensim.downloader as api
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Load a Word2Vec embedding
word2vec_transfer = api.load("glove-wiki-gigaword-50")

# Function to convert a sentence (list of words) into a matrix representing the words in the embedding space
def embed_sentence_with_TF(word2vec, sentence):
    embedded_sentence = []
    for word in sentence:
        if word in word2vec:
            embedded_sentence.append(word2vec[word])
        
    return np.array(embedded_sentence)

# Function that converts a list of sentences into a list of matrices
def embedding(word2vec, sentences):
    embed = []
    
    for sentence in sentences:
        embedded_sentence = embed_sentence_with_TF(word2vec, sentence)
        embed.append(embedded_sentence)
        
    return embed

# Embed the training and test sentences
X_train_embed_2 = embedding(word2vec_transfer, X_train)
X_test_embed_2 = embedding(word2vec_transfer, X_test)

# Pad the training and test embedded sentences
X_train_pad_2 = pad_sequences(X_train_embed_2, dtype='float32', padding='post', maxlen=200)
X_test_pad_2 = pad_sequences(X_test_embed_2, dtype='float32', padding='post', maxlen=200)

In [136]:
def init_cnn_model_2():
    model = Sequential()
    model.add(layers.Conv1D(16, 3))
    model.add(layers.Conv1D(16, 4))
    model.add(layers.Conv1D(16, 5))
    model.add(layers.Flatten())
    model.add(layers.Dense(5,))
    model.add(layers.Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

model_cnn_2 = init_cnn_model_2()


es_2 = EarlyStopping(patience=5, restore_best_weights=True)

model_cnn_2.fit(X_train_pad_2, y_train, 
          epochs=20, 
          batch_size=32,
          validation_split=0.3,
          callbacks=[es_2]
         )


res = model_cnn_2.evaluate(X_test_pad_2, y_test, verbose=0)

print(f'The accuracy evaluated on the test set is of {res[1]*100:.3f}%')

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
The accuracy evaluated on the test set is of 62.653%


# AutoML with hyperopt

In [25]:
X_train, X_test, y_train, y_test = train_test_split(weighted_words, y, test_size=.7,random_state =123)


In [32]:
# Create the estimator object
estim = HyperoptEstimator()

# Search the space of classifiers and preprocessing steps and their
# respective hyperparameters in sklearn to fit a model to the data
estim.fit(X_train, y_train)

100%|██████████| 1/1 [00:03<00:00,  3.54s/trial, best loss: 0.24726477024070026]
100%|██████████| 2/2 [00:02<00:00,  2.63s/trial, best loss: 0.24507658643326036]
100%|██████████| 3/3 [00:02<00:00,  2.79s/trial, best loss: 0.24507658643326036]
100%|██████████| 4/4 [00:02<00:00,  2.77s/trial, best loss: 0.24507658643326036]
100%|██████████| 5/5 [00:02<00:00,  2.92s/trial, best loss: 0.24507658643326036]
100%|██████████| 6/6 [00:02<00:00,  2.89s/trial, best loss: 0.24507658643326036]
100%|██████████| 7/7 [00:02<00:00,  2.88s/trial, best loss: 0.24507658643326036]
100%|██████████| 8/8 [00:02<00:00,  2.81s/trial, best loss: 0.24507658643326036]
100%|██████████| 9/9 [00:02<00:00,  2.83s/trial, best loss: 0.24507658643326036]
100%|████████| 10/10 [00:02<00:00,  2.58s/trial, best loss: 0.24507658643326036]


In [34]:

# Report the accuracy of the classifier on a given set of data
score = estim.score(X_test, y_test)

# Return instances of the classifier and preprocessing steps
model = estim.best_model()

In [40]:
score

0.749718574108818

In [39]:
test["clean_text"] = test.text.apply(preprocessing)
test["target"] = estim.predict(test["clean_text"])

res = pd.DataFrame(test[["id", "target"]])
res = res.set_index("id", drop=True)


ValueError: could not convert string to float: 'happen terrible car crash'

# TODOs
Data Preprocessing:
- remove hashttags, remove unnecessary words .....
- descriptive analysis about dataset

In [None]:
https://www.kaggle.com/code/kushal1506/bullying-tweet-classification
##CUSTOM DEFINED FUNCTIONS TO CLEAN THE TWEETS

#Remove punctuations, links, stopwords, mentions and \r\n new line characters
def strip_all_entities(text):
    text = text.replace('\r',' ').replace('\n',' ').lower() #remove \n and \r and lowercase
    text = re.sub(r"(?:\@|https?\://)\S+","",text) #remove links and mentions
    text = re.sub(r'[^\x00-\x7f]',r'',text) #remove non utf8/ascii characters such as '\x9a\x91\x97\x9a\x97'
    banned_list = string.punctuation
    table = str.maketrans('','',banned_list)
    text = text.translate(table)
    text = [word for word in text.split() if word not in stop_words]
    text = ' '.join(text)
    text = ' '.join(word for word in text.split() if len(word)<14) # remove words longer than 14 characters
    return text

#remove contractions
def decontract(text):
    text = re.sub(r"can\'t","can not",text)
    text = re.sub(r"n\'t"," not",text)
    text = re.sub(r"\'re"," are",text)
    text = re.sub(r"\'s", " is", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'t", " not", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'m", " am", text)
    return text

#clean hashtags at the end of the sentence, and keep those in the middle of the sentence by removing just the "#" symbol
def clean_hashtags(tweet):
    new_tweet = " ".join(word.strip() for word in re.split('#(?!(?:hashtag)\b)[\w-]+(?=(?:\s+#[\w-]+)*\s*$)', tweet)) #remove last hashtags
    new_tweet2 = " ".join(word.strip() for word in re.split('#|_', new_tweet)) #remove hashtags symbol from words in the middle of the sentence
    return new_tweet2


#Filter special characters such as "&" and "$" present in some words
def filter_chars(a):
    sent=[]
    for word in a.split(' '):
        if ('$' in word) | ('&' in word):
            sent.append('')
        else:
            sent.append(word)
    return ' '.join(sent)

#Remove multiple sequential spaces
def remove_mult_spaces(text):
    return re.sub("\s\s+"," ",text)

#Stemming
def stemmer(text):
    tokenized = nltk.word_tokenize(text)
    ps = PorterStemmer()
    return ' '.join([ps.stem(words) for words in tokenized])

#Lemmatization 
#NOTE:Stemming seems to work better for this dataset
def lemmatize(text):
    tokenized = nltk.word_tokenize(text)
    lm = WordNetLemmatizer()
    return ' '.join([lm.lemmatize(words) for words in tokenized])

#Then we apply all the defined functions in the following order
def deep_clean(text):
    text = decontract(text)
    text = strip_all_entities(text)
    text = clean_hashtags(text)
    text = filter_chars(text)
    text = remove_mult_spaces(text)
    text = stemmer(text)
    return text