In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

from pandas.io.json import json_normalize
import pandas as pd
import numpy as np

import re 

from random import seed
from tqdm import tqdm

import tensorflow as tf    
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, GRU
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras import backend as K

import sympound 
from autocorrect import Speller
import splitter

from spellchecker import SpellChecker

import nltk
from nltk.corpus import stopwords
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')


Using TensorFlow backend.


In [2]:
# Show the kernel's working directory; the relative '../data' and '../output'
# paths used by later cells are resolved against it.
import os 
os.getcwd()

'C:\\Users\\thanisb\\Documents\\Competition\\Zindi\\Tech4MentalHealth\\Notebook'

In [8]:
# Load the train and test CSVs (paths are relative to the working directory).
# Later cells read `ID`, `text` and `label` from train_DF and `ID`, `text`
# from test_DF.
train_DF = pd.read_csv('../data/train_corrected.csv')
test_DF = pd.read_csv('../data/test_corrected.csv')

In [9]:
# One-hot encode the single `label` column into one indicator column per class.
# `labels` fixes the class order used for the model targets and for naming the
# prediction columns later in the notebook.
labels = ['Alcohol', 'Depression', 'Drugs', 'Suicide']

# Guard on the presence of `label` so that re-running this cell is a no-op
# instead of an error: the first run replaces train_DF with a frame that no
# longer contains that column.
if 'label' in train_DF.columns:
    unexpected = set(train_DF['label'].dropna().unique()) - set(labels)
    if unexpected:
        raise ValueError(
            f"Unexpected label values: {sorted(map(str, unexpected))}"
        )
    # dtype is pinned because the get_dummies default differs between pandas
    # versions; reindex guarantees every class column exists, in `labels` order.
    label_OHE = (
        pd.get_dummies(train_DF['label'], dtype='uint8')
        .reindex(columns=labels, fill_value=0)
    )
    train_DF = pd.concat([train_DF[['ID', 'text']], label_OHE], axis = 1)
train_DF

Unnamed: 0,ID,text,Alcohol,Depression,Drugs,Suicide
0,SUAVK39Z,i feel that it was better i dream happy,0,1,0,0
1,9JDAGUV3,why do i get hallucinations,0,0,1,0
2,419WR1LQ,i am stressed due to lack of financial support...,0,1,0,0
3,6UY7DX6Q,why is life important,0,0,0,1
4,FYC0FTFB,how could i be helped to go through the depres...,0,1,0,0
...,...,...,...,...,...,...
611,BOHSNXCN,what should i do to stop alcoholism,1,0,0,0
612,GVDXRQPY,how to become my oneself again,0,0,0,1
613,IO4JHIQS,how can someone stop it,1,0,0,0
614,1DS3P1XO,i feel unworthy,0,1,0,0


In [10]:
def preprocessing(text, remove_stopwords = True):
    """Lower-case ``text``, drop every non-letter character and split into words.

    Parameters
    ----------
    text : str
        Raw text. A value without a ``lower`` method (for example ``None`` or
        a float NaN coming from pandas) is treated as empty.
    remove_stopwords : bool, default True
        When true, words contained in NLTK's English stop-word list are
        removed from the result.

    Returns
    -------
    list of str
        The cleaned words in their original order; an empty list when
        ``text`` is not string-like.
    """
    try:
        lowered = text.lower()
    except AttributeError:  # non-string input such as None / NaN
        # Return an empty *list* (not '') so callers can always treat the
        # result as a token list, e.g. concatenate padding tokens onto it.
        return []

    # Every character that is not an ASCII letter becomes a separator.
    words = re.sub("[^a-zA-Z]", " ", lowered).split()

    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]

    return words

def review_sentences(review, tokenizer, remove_stopwords=True):
    """Tokenise one text into a list of word lists.

    The text is first split into sentences with ``tokenizer.tokenize``; when
    more than one sentence is found they are joined back into a single
    string, so the result holds at most one word list. Each non-empty
    sentence is then cleaned with ``preprocessing``.

    Parameters
    ----------
    review : str
        Raw text. If ``review.strip()`` / ``tokenizer.tokenize`` raises
        ``AttributeError`` (e.g. ``review`` is not a string) the text is
        treated as empty.
    tokenizer : object
        Anything exposing ``tokenize(str) -> list of str``.
    remove_stopwords : bool, default True
        Forwarded to ``preprocessing``.

    Returns
    -------
    list of list of str
        Empty when no sentence was found.
    """
    try:
        pieces = tokenizer.tokenize(review.strip())
    except AttributeError:  # the text is missing / not a string
        pieces = ''

    # Collapse multi-sentence texts into one "sentence".
    if len(pieces) > 1:
        pieces = [" ".join(pieces)]

    return [
        preprocessing(piece, remove_stopwords)
        for piece in pieces
        if len(piece) > 0
    ]



In [11]:
def Glove_embedding(Glove_path):
    """Load GloVe word vectors from a text file into a dictionary.

    Every non-blank line is expected to hold a word followed by its
    whitespace-separated vector components.

    Parameters
    ----------
    Glove_path : str or path-like
        Path to the GloVe text file; it is read as UTF-8.

    Returns
    -------
    dict
        Maps each word to a 1-D ``float32`` numpy array whose length is
        whatever the file provides.
    """
    print("Getting the GLove embedding...")
    embeddings = {}
    with open(Glove_path, 'r', encoding="utf8") as f:
        for line in f:
            values = line.split()
            if not values:  # blank line, e.g. an empty line at the end of the file
                continue
            embeddings[values[0]] = np.asarray(values[1:], "float32")
    print("Done")
    return embeddings

# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
# The file name indicates 300-dimensional vectors, so `embedding_50d` is a
# misnomer; the name is kept because later cells reference it.
Glove_path = "C:/Users/thanisb/Documents/Competition/Glove/glove.6B.300d.txt"
embedding_50d = Glove_embedding(Glove_path) 

Getting the GLove embedding...
Done


In [12]:
def featureVecMethod_v2(words, model, features_dim):
    """Stack the embedding vectors of the in-vocabulary words of one text.

    Parameters
    ----------
    words : iterable of str
        Tokens of a single text.
    model : mapping
        Word -> numpy vector lookup; must support ``in`` and ``[]``.
    features_dim : int
        Length of each embedding vector.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(k, features_dim)`` with one row per word found in
        ``model``, in input order. Words missing from ``model`` are skipped,
        so ``k`` can be smaller than the number of words. When nothing is
        found the result is an empty ``(0, features_dim)`` float32 array.
    """
    # Collect the rows first and concatenate once: re-concatenating inside the
    # loop copies the whole accumulated array for every word. The empty float32
    # seed keeps the result shape/dtype well defined when no word is found.
    rows = [np.empty(shape=[0, features_dim], dtype="float32")]
    for word in words:
        if word in model:
            rows.append(model[word].reshape(-1, features_dim))
    return np.concatenate(rows, axis=0)

def getAvgFeatureVecs_v2(reviews, model, sent_len, features_dim):
    """Embed tokenised texts into one fixed-size 3-D array.

    Despite the name, no averaging takes place: every text becomes a
    ``(sent_len, features_dim)`` matrix holding one embedding vector per
    in-vocabulary token.

    Parameters
    ----------
    reviews : sequence of iterable of str
        One token list per text.
    model : mapping
        Word -> numpy vector lookup, forwarded to ``featureVecMethod_v2``.
    sent_len : int
        Number of time steps reserved per text.
    features_dim : int
        Length of each embedding vector.

    Returns
    -------
    numpy.ndarray
        ``float32`` array of shape ``(len(reviews), sent_len, features_dim)``.
        Texts yielding fewer than ``sent_len`` vectors are zero-padded at the
        end; texts yielding more are truncated to the first ``sent_len``.
    """
    reviewFeatureVecs = np.zeros((len(reviews), sent_len, features_dim),dtype="float32")
    for i, review in enumerate(reviews):
        vectors = featureVecMethod_v2(review, model, features_dim)
        # Out-of-vocabulary tokens are dropped by featureVecMethod_v2, so the
        # number of rows may differ from sent_len. Copy only what fits rather
        # than assigning the whole slot, which would raise on a length
        # mismatch (or silently broadcast a single vector to every step).
        n_rows = min(len(vectors), sent_len)
        reviewFeatureVecs[i, :n_rows] = vectors[:n_rows]

    return reviewFeatureVecs

In [13]:
Feature_dimension = 300   # length of each GloVe vector
Max_Len = 196             # number of tokens every text is padded / truncated to


def _tokenize_and_pad(texts, max_len, pad_token = 'pad'):
    """Tokenise each text and right-pad (or truncate) it to ``max_len`` tokens.

    Parameters
    ----------
    texts : iterable of str
        Raw texts; empty or non-string entries yield a row made only of
        padding tokens.
    max_len : int
        Exact number of tokens in every returned row.
    pad_token : str, default 'pad'
        Token appended to short texts. It is an ordinary word, so the
        embedding step looks it up like any other token.

    Returns
    -------
    list of list of str
        One list of exactly ``max_len`` tokens per input text.
    """
    padded = []
    for sent in texts:
        content = review_sentences(sent, tokenizer, remove_stopwords=False)
        # review_sentences returns [] when no sentence was found; treat that
        # as "no tokens" instead of failing on content[0].
        tokens = content[0] if content else []
        tokens = tokens[:max_len]
        padded.append(tokens + [pad_token] * (max_len - len(tokens)))
    return padded


# Tokenise and pad the train and test texts with the same routine.
train_content_sentence = _tokenize_and_pad(train_DF.text, Max_Len)
print(len(train_content_sentence))

test_content_sentence = _tokenize_and_pad(test_DF.text, Max_Len)
print(len(test_content_sentence))

# Overall_content_sentence = train_content_sentence + test_content_sentence
# print(len(Overall_content_sentence))

# model = w2vec_model(Overall_content_sentence, feature_embed = Feature_dimension)


616
309


In [14]:
# Convert the padded token lists into dense arrays using the loaded GloVe
# vectors; train and test share the same settings.
embed_kwargs = dict(model=embedding_50d,
                    sent_len=Max_Len,
                    features_dim=Feature_dimension)

train_content_embed = getAvgFeatureVecs_v2(train_content_sentence, **embed_kwargs)
print(train_content_embed.shape)

test_content_embed = getAvgFeatureVecs_v2(test_content_sentence, **embed_kwargs)
print(test_content_embed.shape)

(616, 196, 300)
(309, 196, 300)


In [15]:
# Target columns are the one-hot class indicators; `indep` lists the remaining
# non-ID columns of train_DF.
dep = labels
indep = train_DF.columns.difference(labels + ['ID'])

# Hold out 20% of the embedded training texts for local validation.
np.random.seed(100)
local_split = train_test_split(
    train_content_embed,
    train_DF[dep].values,
    test_size=0.2,
    random_state=100,
)
train_local_X, valid_local_X, train_local_Y, valid_local_Y = local_split
print(*(part.shape for part in local_split))

(492, 196, 300) (124, 196, 300) (492, 4) (124, 4)


In [16]:
# Reset the Keras backend state and seed TensorFlow, NumPy and Python's
# `random`, with the intent of making weight initialisation repeatable.
K.clear_session()

tf.random.set_seed(100)
np.random.seed(100)
seed(100)

# NOTE(review): this flag tells Keras that variables will be initialised
# manually — confirm it is still required with this backend version.
K.manual_variable_initialization(True)

# Three stacked GRU layers with 5 units each. The first two return the full
# sequence so the next GRU can consume it; the last returns only its final
# state. The input shape is taken from the training array (time steps, features).
model = Sequential()
model.add(GRU(5, return_sequences = True, input_shape = (train_local_X.shape[1], train_local_X.shape[2]), activation = 'relu'))
model.add(GRU(5, return_sequences = True, activation = 'relu'))
model.add(GRU(5, return_sequences = False, activation = 'relu'))
# model.add(Dense(10, activation = 'relu'))
# One sigmoid output per class in `dep`.
model.add(Dense(len(dep), activation = 'sigmoid'))

# NOTE(review): the targets are one-hot (one class per row, built with
# get_dummies), so softmax + categorical_crossentropy may be a better fit than
# independent sigmoids with binary_crossentropy — TODO confirm against the
# competition metric. The 'accuracy' reported under a binary loss is presumably
# per-output accuracy, not the argmax accuracy computed in the prediction cells.
model.compile(loss= 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy', 'binary_crossentropy'])


In [49]:
# Local run: fit on the training split and validate on the held-out split.
epochs = 100
batch_size = 64

# Stop once val_loss has not improved for 15 consecutive epochs.
early_stop = EarlyStopping(monitor='val_loss', mode='min', patience=15, verbose=True)

# Keep only the weights of the epoch with the lowest val_loss.
checkpoint = ModelCheckpoint(
    filepath='../output/best_local_model.hdf5',
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)

history = model.fit(
    train_local_X,
    train_local_Y,
    validation_data=(valid_local_X, valid_local_Y),
    epochs=epochs,
    batch_size=batch_size,
    callbacks=[early_stop, checkpoint],
)

Train on 492 samples, validate on 124 samples
Epoch 1/100

Epoch 00001: val_loss improved from inf to 0.68798, saving model to ../output/best_local_model.hdf5
Epoch 2/100

Epoch 00002: val_loss improved from 0.68798 to 0.68143, saving model to ../output/best_local_model.hdf5
Epoch 3/100

Epoch 00003: val_loss improved from 0.68143 to 0.67227, saving model to ../output/best_local_model.hdf5
Epoch 4/100

Epoch 00004: val_loss improved from 0.67227 to 0.66015, saving model to ../output/best_local_model.hdf5
Epoch 5/100

Epoch 00005: val_loss improved from 0.66015 to 0.64510, saving model to ../output/best_local_model.hdf5
Epoch 6/100

Epoch 00006: val_loss improved from 0.64510 to 0.62740, saving model to ../output/best_local_model.hdf5
Epoch 7/100

Epoch 00007: val_loss improved from 0.62740 to 0.60893, saving model to ../output/best_local_model.hdf5
Epoch 8/100

Epoch 00008: val_loss improved from 0.60893 to 0.59257, saving model to ../output/best_local_model.hdf5
Epoch 9/100

Epoch 000


Epoch 00053: val_loss improved from 0.47338 to 0.47322, saving model to ../output/best_local_model.hdf5
Epoch 54/100

Epoch 00054: val_loss improved from 0.47322 to 0.47319, saving model to ../output/best_local_model.hdf5
Epoch 55/100

Epoch 00055: val_loss improved from 0.47319 to 0.47316, saving model to ../output/best_local_model.hdf5
Epoch 56/100

Epoch 00056: val_loss improved from 0.47316 to 0.47309, saving model to ../output/best_local_model.hdf5
Epoch 57/100

Epoch 00057: val_loss improved from 0.47309 to 0.47294, saving model to ../output/best_local_model.hdf5
Epoch 58/100

Epoch 00058: val_loss did not improve from 0.47294
Epoch 59/100

Epoch 00059: val_loss did not improve from 0.47294
Epoch 60/100

Epoch 00060: val_loss improved from 0.47294 to 0.47284, saving model to ../output/best_local_model.hdf5
Epoch 61/100

Epoch 00061: val_loss improved from 0.47284 to 0.47276, saving model to ../output/best_local_model.hdf5
Epoch 62/100

Epoch 00062: val_loss did not improve from 


Epoch 00082: val_loss improved from 0.47237 to 0.47233, saving model to ../output/best_local_model.hdf5
Epoch 83/100

Epoch 00083: val_loss did not improve from 0.47233
Epoch 84/100

Epoch 00084: val_loss did not improve from 0.47233
Epoch 85/100

Epoch 00085: val_loss improved from 0.47233 to 0.47232, saving model to ../output/best_local_model.hdf5
Epoch 86/100

Epoch 00086: val_loss improved from 0.47232 to 0.47230, saving model to ../output/best_local_model.hdf5
Epoch 87/100

Epoch 00087: val_loss improved from 0.47230 to 0.47226, saving model to ../output/best_local_model.hdf5
Epoch 88/100

Epoch 00088: val_loss did not improve from 0.47226
Epoch 89/100

Epoch 00089: val_loss did not improve from 0.47226
Epoch 90/100

Epoch 00090: val_loss did not improve from 0.47226
Epoch 91/100

Epoch 00091: val_loss did not improve from 0.47226
Epoch 92/100

Epoch 00092: val_loss did not improve from 0.47226
Epoch 93/100

Epoch 00093: val_loss did not improve from 0.47226
Epoch 94/100

Epoch 0

In [50]:
# Restore the best local checkpoint, then report the argmax accuracy on the
# validation split together with the predicted class index of every test row.
model.load_weights("../output/best_local_model.hdf5")

valid_pred = model.predict(valid_local_X).argmax(axis=1)
valid_true = valid_local_Y.argmax(axis=1)
test_pred = model.predict(test_content_embed).argmax(axis=1)

accuracy_score(valid_pred, valid_true), test_pred

(0.5645161290322581,
 array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 

In [17]:
# "Production" run: fit on the full embedded training set and checkpoint the
# weights with the lowest val_loss to best_prod_model.hdf5.
#
# NOTE(review): valid_local_X / valid_local_Y were split out of
# train_content_embed, so they are part of the data being fit here; val_loss
# (and therefore early stopping and checkpoint selection) is not a held-out
# estimate in this cell.
# NOTE(review): `model` is the same object used for the local fit; unless the
# model-definition cell is re-run first, training continues from whatever
# weights it currently holds rather than from a fresh initialisation.
epochs = 100
batch_size = 64

# Stop once val_loss has not improved for 25 consecutive epochs.
early_stop = EarlyStopping(monitor='val_loss'
                           ,verbose = True
                           ,mode = 'min'
                           ,patience = 25
                           #,min_delta=0.0001
                          )
# Overwrite the checkpoint file only when val_loss improves.
checkpoint = ModelCheckpoint(monitor = 'val_loss',
                             mode = 'min',
                             filepath = '../output/best_prod_model.hdf5', 
                             verbose = 1, 
                             save_best_only = True)

history = model.fit(train_content_embed, train_DF[dep].values, 
                    epochs=epochs, 
                    batch_size= batch_size,
                    #validation_split= 0.2,
                    validation_data= (valid_local_X, valid_local_Y),
                    callbacks=[early_stop, checkpoint])

Train on 616 samples, validate on 124 samples
Epoch 1/100

Epoch 00001: val_loss improved from inf to 0.68644, saving model to ../output/best_prod_model.hdf5
Epoch 2/100

Epoch 00002: val_loss improved from 0.68644 to 0.67812, saving model to ../output/best_prod_model.hdf5
Epoch 3/100

Epoch 00003: val_loss improved from 0.67812 to 0.66474, saving model to ../output/best_prod_model.hdf5
Epoch 4/100

Epoch 00004: val_loss improved from 0.66474 to 0.64714, saving model to ../output/best_prod_model.hdf5
Epoch 5/100

Epoch 00005: val_loss improved from 0.64714 to 0.62530, saving model to ../output/best_prod_model.hdf5
Epoch 6/100

Epoch 00006: val_loss improved from 0.62530 to 0.60255, saving model to ../output/best_prod_model.hdf5
Epoch 7/100

Epoch 00007: val_loss improved from 0.60255 to 0.58242, saving model to ../output/best_prod_model.hdf5
Epoch 8/100

Epoch 00008: val_loss improved from 0.58242 to 0.56508, saving model to ../output/best_prod_model.hdf5
Epoch 9/100

Epoch 00009: val_


Epoch 00054: val_loss improved from 0.47232 to 0.47230, saving model to ../output/best_prod_model.hdf5
Epoch 55/100

Epoch 00055: val_loss did not improve from 0.47230
Epoch 56/100

Epoch 00056: val_loss did not improve from 0.47230
Epoch 57/100

Epoch 00057: val_loss improved from 0.47230 to 0.47221, saving model to ../output/best_prod_model.hdf5
Epoch 58/100

Epoch 00058: val_loss did not improve from 0.47221
Epoch 59/100

Epoch 00059: val_loss did not improve from 0.47221
Epoch 60/100

Epoch 00060: val_loss did not improve from 0.47221
Epoch 61/100

Epoch 00061: val_loss did not improve from 0.47221
Epoch 62/100

Epoch 00062: val_loss improved from 0.47221 to 0.47221, saving model to ../output/best_prod_model.hdf5
Epoch 63/100

Epoch 00063: val_loss did not improve from 0.47221
Epoch 64/100

Epoch 00064: val_loss did not improve from 0.47221
Epoch 65/100

Epoch 00065: val_loss did not improve from 0.47221
Epoch 66/100

Epoch 00066: val_loss improved from 0.47221 to 0.47216, saving 


Epoch 00083: val_loss did not improve from 0.47209
Epoch 84/100

Epoch 00084: val_loss did not improve from 0.47209
Epoch 85/100

Epoch 00085: val_loss did not improve from 0.47209
Epoch 86/100

Epoch 00086: val_loss did not improve from 0.47209
Epoch 87/100

Epoch 00087: val_loss did not improve from 0.47209
Epoch 88/100

Epoch 00088: val_loss did not improve from 0.47209
Epoch 89/100

Epoch 00089: val_loss did not improve from 0.47209
Epoch 90/100

Epoch 00090: val_loss did not improve from 0.47209
Epoch 91/100

Epoch 00091: val_loss did not improve from 0.47209
Epoch 92/100

Epoch 00092: val_loss did not improve from 0.47209
Epoch 93/100

Epoch 00093: val_loss did not improve from 0.47209
Epoch 94/100

Epoch 00094: val_loss did not improve from 0.47209
Epoch 95/100

Epoch 00095: val_loss did not improve from 0.47209
Epoch 96/100

Epoch 00096: val_loss did not improve from 0.47209
Epoch 97/100

Epoch 00097: val_loss did not improve from 0.47209
Epoch 98/100

Epoch 00098: val_loss di

In [18]:
# Restore the best production checkpoint, then report the argmax accuracy on
# the validation split together with the predicted class index of every test
# row. The validation rows were part of the data this model was fit on, so the
# accuracy shown here is not a held-out estimate.
model.load_weights("../output/best_prod_model.hdf5")

valid_pred = model.predict(valid_local_X).argmax(axis=1)
valid_true = valid_local_Y.argmax(axis=1)
test_pred = model.predict(test_content_embed).argmax(axis=1)

accuracy_score(valid_pred, valid_true), test_pred

(0.5645161290322581,
 array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 

In [47]:
# Per-class scores for every test row (kept as raw scores, not argmax).
predicted_output = model.predict(test_content_embed)

# The model outputs follow the training target order; the submission uses a
# different column order and needs the test IDs in front.
class_columns = ['Alcohol', 'Depression', 'Drugs', 'Suicide']
submission_columns = ['ID', 'Depression', 'Alcohol', 'Suicide', 'Drugs']

final_output = (
    pd.DataFrame(predicted_output, columns=class_columns)
    .assign(ID=test_DF.ID)
    .loc[:, submission_columns]
)
final_output

Unnamed: 0,ID,Depression,Alcohol,Suicide,Drugs
0,02V56KMO,0.273115,0.073476,0.187834,0.125925
1,03BMGTOK,0.978315,0.004156,0.024107,0.032699
2,03LZVFM6,0.999711,0.001608,0.000134,0.002577
3,0EPULUM5,0.997606,0.002513,0.002343,0.009697
4,0GM4C5GD,0.212696,0.150022,0.097651,0.117226
...,...,...,...,...,...
304,Z9A6ACLK,0.739813,0.008243,0.548140,0.212157
305,ZDUOIGKN,0.816554,0.011764,0.103487,0.079620
306,ZHQ60CCH,0.124871,0.183525,0.210371,0.193468
307,ZVIJMA4O,0.050327,0.340298,0.217203,0.232231


In [17]:
# Select the submission column order (already in place if the cell that builds
# final_output has run) and write the CSV without the index column.
final_output = final_output[['ID', 'Depression', 'Alcohol', 'Suicide', 'Drugs']]
final_output.to_csv('../output/sub_42_GRU.csv', index = False)