In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from pandas.io.json import json_normalize
import pandas as pd
import numpy as np

import xgboost as xgb
import re 

from random import seed
from tqdm import tqdm

import tensorflow as tf    
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, GRU
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras import backend as K

import sympound 
from autocorrect import Speller
import splitter

from spellchecker import SpellChecker

import gensim
import nltk
from nltk.corpus import stopwords
from gensim.models import word2vec as w2v
LabeledSentence = gensim.models.doc2vec.LabeledSentence
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')


Using TensorFlow backend.


In [2]:
import os 
os.getcwd()

'C:\\Users\\thanisb\\Documents\\Competition\\Zindi\\Tech4MentalHealth\\Notebook'

In [3]:
train_DF = pd.read_csv('../data/train_corrected.csv')
test_DF = pd.read_csv('../data/test_corrected.csv')

In [4]:
# Convert the label to OHE
train_DF = pd.concat([train_DF[['ID', 'text']], pd.get_dummies(train_DF.label)], axis = 1)
labels = ['Alcohol', 'Depression', 'Drugs', 'Suicide']
train_DF

Unnamed: 0,ID,text,Alcohol,Depression,Drugs,Suicide
0,SUAVK39Z,i feel that it was better i dream happy,0,1,0,0
1,9JDAGUV3,why do i get hallucinations,0,0,1,0
2,419WR1LQ,i am stressed due to lack of financial support...,0,1,0,0
3,6UY7DX6Q,why is life important,0,0,0,1
4,FYC0FTFB,how could i be helped to go through the depres...,0,1,0,0
...,...,...,...,...,...,...
611,BOHSNXCN,what should i do to stop alcoholism,1,0,0,0
612,GVDXRQPY,how to become my oneself again,0,0,0,1
613,IO4JHIQS,how can someone stop it,1,0,0,0
614,1DS3P1XO,i feel unworthy,0,1,0,0


In [5]:
def preprocessing(text, remove_stopwords = True):
    try:
        #print(text)
        processed_text = text.lower()
        processed_text = re.sub("[^a-zA-Z]"," ",processed_text)
        processed_text = processed_text.lower().split()
        #print("processed", processed_text)
        words = processed_text
        if remove_stopwords:
                stops = set(stopwords.words("english"))     
                words = [w for w in processed_text if not w in stops]
    except AttributeError:  # handling the case where the token is empty
        words = ''
    
    return words

def review_sentences(review, tokenizer, remove_stopwords=True):
    # 1. Using nltk tokenizer
    try:
        raw_sentences = tokenizer.tokenize(review.strip())
        
    except AttributeError:  # handling the case where the token is empty
        raw_sentences = ''

    if len(raw_sentences) > 1 : raw_sentences = [" ".join(raw_sentences)]

    sentences = []
    # 2. Loop for each sentence
    for raw_sentence in raw_sentences:
        if len(raw_sentence)>0:
            sentences.append(preprocessing(raw_sentence, remove_stopwords))

    # This returns the list of lists
    return sentences



In [6]:
# Training a Word2Vec model
def w2vec_model(text, feature_embed):
    num_features = feature_embed  # Word vector dimensionality
    min_word_count = 1  # Minimum word count
    num_workers = 4     # Number of parallel threads
    context = 10        # Context window size
    downsampling = 1e-3 # (0.001) Downsample setting for frequent words

    print("Training model....")
    model = w2v.Word2Vec(text,
                         workers=num_workers,
                         size=num_features,
                         min_count=min_word_count,
                         window=context,
                         sample=downsampling
    )

    #model.build_vocab(sentence)
    model.train(text, total_examples= model.corpus_count, epochs=300)

    # # To make the model memory efficient
    # model.init_sims(replace=True)

    # # Saving the model for later use. Can be loaded using Word2Vec.load()
    # model_name = "300features_40minwords_10context"
    # model.save(model_name)

    print("Vocabulary shape", model.wv.syn0.shape)
    return model

In [7]:
def featureVecMethod_v2(words, model, features_dim):
    # Pre-initialising empty numpy array for speed
    featureVec = np.empty(shape=[0, features_dim],  dtype="float32")
    
    #Converting Index2Word which is a list to a set for better speed in the execution.
    index2word_set = set(model.wv.index2word)
    
    for word in words:
        if word in index2word_set:
            word_vector = model[word]
            featureVec = np.concatenate((featureVec, word_vector.reshape(-1, features_dim)), axis=0)        
    return featureVec

def getAvgFeatureVecs_v2(reviews, model, sent_len, features_dim):
    reviewFeatureVecs = np.zeros((len(reviews), sent_len, features_dim),dtype="float32")
    for i, review in enumerate(reviews):
        vectors = featureVecMethod_v2(review, model, features_dim)
        reviewFeatureVecs[i] = vectors
        
    return reviewFeatureVecs

In [8]:
Feature_dimension = 300
Max_Len = 196

train_content_sentence = []
# TOkenizing all the text to
for i, sent in enumerate(train_DF.text):
    content = review_sentences(sent, tokenizer, remove_stopwords=False)
    pad_len = Max_Len - len(content[0])
    content = list(content[0] + (' PAD' * pad_len).split())
    train_content_sentence.append(content)
    #train_content_sentence += content
print(len(train_content_sentence))

test_content_sentence = []
for i, sent in enumerate(test_DF.text):
    content = review_sentences(sent, tokenizer, remove_stopwords=False)
    pad_len = Max_Len - len(content[0])
    content = list(content[0] + (' PAD' * pad_len).split())
    test_content_sentence.append(content)
    #test_content_sentence += content
print(len(test_content_sentence))
    

Overall_content_sentence = train_content_sentence + test_content_sentence
print(len(Overall_content_sentence))

model = w2vec_model(Overall_content_sentence, feature_embed = Feature_dimension)


616
309
925
Training model....
Vocabulary shape (1000, 300)




In [9]:
train_content_embed = getAvgFeatureVecs_v2(train_content_sentence, 
                                           model, 
                                           sent_len = Max_Len, 
                                           features_dim = Feature_dimension)
print(train_content_embed.shape)

test_content_embed = getAvgFeatureVecs_v2(test_content_sentence, 
                                          model, 
                                          sent_len = Max_Len,
                                          features_dim = Feature_dimension)
print(test_content_embed.shape)

  # Remove the CWD from sys.path while we load stuff.


(616, 196, 300)
(309, 196, 300)


In [10]:
dep = labels
indep = train_DF.columns.difference(labels + ['ID'])

np.random.seed(100)
train_local_X, valid_local_X, train_local_Y, valid_local_Y = train_test_split(train_content_embed,
                                                                              train_DF[dep].values,
                                                                              test_size = 0.2,
                                                                              random_state = 100)
print(train_local_X.shape, valid_local_X.shape, train_local_Y.shape, valid_local_Y.shape)

(492, 196, 300) (124, 196, 300) (492, 4) (124, 4)


In [11]:
K.clear_session()

tf.random.set_seed(100)
np.random.seed(100)
seed(100)

model = Sequential()
model.add(GRU(100, return_sequences = True, input_shape = (train_local_X.shape[1], train_local_X.shape[2]), activation = 'relu'))
model.add(GRU(50, return_sequences = False, activation = 'relu'))
# model.add(Dense(10, activation = 'relu'))
model.add(Dense(len(dep), activation = 'sigmoid'))

model.compile(loss= 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy', 'binary_crossentropy'])


In [12]:
epochs = 100
batch_size = 64

early_stop = EarlyStopping(monitor='val_loss'
                           ,verbose = True
                           ,mode = 'min'
                           ,patience = 15
                           #,min_delta=0.0001
                          )
checkpoint = ModelCheckpoint(monitor = 'val_loss',
                             mode = 'min',
                             filepath = '../output/best_local_model.hdf5', 
                             verbose = 1, 
                             save_best_only = True)

history = model.fit(train_local_X, train_local_Y, 
                    epochs=epochs, 
                    batch_size= batch_size,
                    #validation_split= 0.2,
                    validation_data= (valid_local_X, valid_local_Y),
                    callbacks=[early_stop, checkpoint])

Train on 492 samples, validate on 124 samples
Epoch 1/100

Epoch 00001: val_loss improved from inf to 0.48053, saving model to ../output/best_local_model.hdf5
Epoch 2/100

Epoch 00002: val_loss improved from 0.48053 to 0.46601, saving model to ../output/best_local_model.hdf5
Epoch 3/100

Epoch 00003: val_loss improved from 0.46601 to 0.42013, saving model to ../output/best_local_model.hdf5
Epoch 4/100

Epoch 00004: val_loss improved from 0.42013 to 0.38254, saving model to ../output/best_local_model.hdf5
Epoch 5/100

Epoch 00005: val_loss improved from 0.38254 to 0.34695, saving model to ../output/best_local_model.hdf5
Epoch 6/100

Epoch 00006: val_loss improved from 0.34695 to 0.31252, saving model to ../output/best_local_model.hdf5
Epoch 7/100

Epoch 00007: val_loss improved from 0.31252 to 0.29452, saving model to ../output/best_local_model.hdf5
Epoch 8/100

Epoch 00008: val_loss improved from 0.29452 to 0.27663, saving model to ../output/best_local_model.hdf5
Epoch 9/100

Epoch 000

In [15]:
# Running the prediction
model.load_weights("../output/best_local_model.hdf5")
model.predict(valid_local_X).argmax(axis = 1), model.predict(test_content_embed).argmax(axis = 1)

(array([1, 1, 0, 2, 1, 0, 1, 1, 3, 1, 3, 0, 0, 1, 0, 2, 0, 0, 1, 3, 1, 1,
        1, 0, 1, 0, 1, 1, 2, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 3,
        1, 1, 1, 1, 0, 1, 1, 1, 0, 3, 0, 1, 1, 2, 1, 1, 1, 0, 0, 1, 1, 2,
        1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 3, 0, 1, 3, 3, 3, 1, 3, 1, 1, 1,
        1, 1, 0, 1, 1, 1, 3, 1, 1, 1, 1, 2, 1, 3, 1, 1, 1, 1, 0, 1, 1, 1,
        0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1], dtype=int64),
 array([2, 1, 1, 1, 0, 1, 1, 1, 0, 2, 1, 0, 1, 1, 0, 1, 1, 0, 1, 3, 1, 1,
        2, 1, 3, 1, 1, 1, 1, 0, 1, 1, 3, 1, 1, 0, 1, 3, 1, 0, 1, 1, 0, 1,
        0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 3, 1, 0, 0, 0, 1, 1, 1, 2,
        3, 1, 3, 1, 1, 3, 2, 0, 1, 1, 0, 3, 1, 3, 3, 1, 3, 0, 1, 1, 0, 0,
        1, 3, 1, 3, 1, 1, 0, 1, 1, 1, 1, 1, 1, 3, 3, 3, 1, 3, 1, 1, 0, 0,
        1, 1, 1, 1, 3, 0, 3, 0, 1, 0, 3, 1, 1, 3, 0, 1, 3, 0, 1, 1, 0, 0,
        1, 3, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 3, 2, 1, 0, 1, 1,
        0, 3, 1, 0, 1, 1, 0, 3, 0, 2, 0, 3, 1, 

In [16]:
predicted_output = model.predict(test_content_embed)#.argmax(axis = 1)

final_output = pd.DataFrame(predicted_output)
final_output.columns = ['Alcohol', 'Depression', 'Drugs', 'Suicide']
final_output['ID'] = test_DF.ID

final_output = final_output[['ID', 'Depression', 'Alcohol', 'Suicide', 'Drugs']]
final_output

Unnamed: 0,ID,Depression,Alcohol,Suicide,Drugs
0,02V56KMO,0.049698,0.312891,0.322518,0.330993
1,03BMGTOK,0.978126,0.000456,0.001767,0.003244
2,03LZVFM6,0.960782,0.003269,0.014660,0.017179
3,0EPULUM5,0.394141,0.021279,0.064595,0.075042
4,0GM4C5GD,0.028937,0.371191,0.257797,0.358780
...,...,...,...,...,...
304,Z9A6ACLK,0.934757,0.011615,0.053264,0.044841
305,ZDUOIGKN,0.853093,0.023830,0.089158,0.056398
306,ZHQ60CCH,0.628944,0.057558,0.166942,0.091595
307,ZVIJMA4O,0.009613,0.488795,0.127024,0.294661


In [17]:
final_output = final_output[['ID', 'Depression', 'Alcohol', 'Suicide', 'Drugs']]
final_output.to_csv('../output/sub_42_GRU.csv', index = False)