<a href="https://colab.research.google.com/github/tabaraei/depression-detection/blob/master/baseline_replication.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

- The baseline authors merge all the existing data and run K-fold cross-validation (k=3) to produce the train/test splits.
- This code is designed to augment or resample data points based on certain conditions (whether an index is in audio_dep_idxs_tmp) by generating permutations of features and adding them to audio_features and corresponding labels (audio_targets). The augmented data points are then tracked using train_idxs. This approach likely aims to increase the diversity of training data by creating variations of existing data points.

In [1]:
from google.colab import drive


# Folder on the mounted Drive that holds the EATD-Corpus sample folders
# (read by the feature-extraction cells).
DATASET_DIR = '/content/drive/MyDrive/Data/DepressionDetection/EATD-Corpus'
# Folder on the mounted Drive where this notebook writes features, train
# indices and trained models.
BASELINE_DIR = '/content/drive/MyDrive/Data/DepressionDetection/Baseline'
# Mount Google Drive so that the two paths above are reachable.
drive.mount('/content/drive')

Mounted at /content/drive


In [9]:
# Create the output folder layout under BASELINE_DIR. Folders that already
# exist are left untouched (exist_ok=True).
for _subdir in (
    'Features/AudioWhole',
    'Features/TextWhole',
    'Features/TrainIdx',
    'Model/ClassificationWhole/Audio',
    'Model/ClassificationWhole/Text',
    'Model/ClassificationWhole/Fuse',
    'Model/ELMoForManyLangs',
):
    os.makedirs(f'{BASELINE_DIR}/{_subdir}', exist_ok=True)

In [None]:
# Optional cleanup of the extracted features (disabled by default):
# path_to_remove_audio_features = f'{BASELINE_DIR}/Features/AudioWhole'
# os.system(f'rm -rf {path_to_remove_audio_features}/*')

# path_to_remove_text_features = f'{BASELINE_DIR}/Features/TextWhole'
# os.system(f'rm -rf {path_to_remove_text_features}/*')

# Wipe the saved train-index files and trained models from earlier runs.
# The shell glob `*` is expanded by the shell, so the folders themselves stay.
path_to_remove_train_indecs = f'{BASELINE_DIR}/Features/TrainIdx'
path_to_remove_trained_models = f'{BASELINE_DIR}/Model/ClassificationWhole'
for _stale_dir in (path_to_remove_train_indecs, path_to_remove_trained_models):
    os.system(f'rm -rf {_stale_dir}/*')

# Audio

## `NetVLAD`
Used to enforce same-length audio features extracted as clusters

In [None]:
import math
import tensorflow as tf
# import tensorflow.contrib.slim as slim
import numpy as np
from keras import initializers, layers
import keras.backend as K
import sys

In [None]:
class NetVLAD(layers.Layer):
    """NetVLAD pooling layer.

    Softly assigns every input frame to `cluster_size` clusters, aggregates
    the residuals per cluster and projects the flattened, L2-normalised
    result to `output_dim` values, so inputs with any number of frames end up
    as a fixed-length vector.

    Args:
        feature_size: size of each input frame (last input dimension).
        max_samples: number of frames per example.
        cluster_size: number of clusters.
        output_dim: size of the pooled output vector.
        **kwargs: forwarded to `layers.Layer`.
    """
    def __init__(self, feature_size, max_samples, cluster_size, output_dim, **kwargs):

        self.feature_size = feature_size
        self.max_samples = max_samples
        self.output_dim = output_dim
        self.cluster_size = cluster_size
        super(NetVLAD, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create the trainable weights of the layer."""
        # Soft-assignment projection and bias: frame -> cluster logits.
        self.cluster_weights = self.add_weight(name='kernel_W1',
                                      shape=(self.feature_size, self.cluster_size),
                                      initializer=tf.random_normal_initializer(stddev=1 / math.sqrt(self.feature_size)),
                                      trainable=True)
        self.cluster_biases = self.add_weight(name='kernel_B1',
                                      shape=(self.cluster_size,),
                                      initializer=tf.random_normal_initializer(stddev=1 / math.sqrt(self.feature_size)),
                                      trainable=True)
        # Per-cluster weights scaled by the assignment mass and subtracted
        # from the aggregated features in `call`.
        self.cluster_weights2 = self.add_weight(name='kernel_W2',
                                      shape=(1,self.feature_size, self.cluster_size),
                                      initializer=tf.random_normal_initializer(stddev=1 / math.sqrt(self.feature_size)),
                                      trainable=True)
        # Final linear projection of the flattened descriptor to output_dim.
        self.hidden1_weights = self.add_weight(name='kernel_H1',
                                      shape=(self.cluster_size*self.feature_size, self.output_dim),
                                      initializer=tf.random_normal_initializer(stddev=1 / math.sqrt(self.cluster_size)),
                                      trainable=True)

        super(NetVLAD, self).build(input_shape)  # Be sure to call this at the end

    def call(self, reshaped_input):
        """Forward pass of a NetVLAD block.

        Args:
        reshaped_input: If your input is in that form:
        'batch_size' x 'max_samples' x 'feature_size'
        It should be reshaped in the following form:
        'batch_size*max_samples' x 'feature_size'
        by performing:
        reshaped_input = tf.reshape(input, [-1, features_size])

        Returns:
        vlad: the pooled vector of size: 'batch_size' x 'output_dim'
        """
        # Matrix-multiplication rule of thumb followed in this layer:
        #   1) K.dot(A, B)     when A carries the batch dimension and B doesn't
        #   2) tf.matmul(A, B) when both A and B carry the batch dimension
        # Using tf.matmul with a rank-3 A and a rank-2 B fails with
        # "Shape must be rank 2 but is rank 3".

        # Soft assignment of every frame to the clusters.
        activation = K.dot(reshaped_input, self.cluster_weights)
        activation += self.cluster_biases
        activation = tf.nn.softmax(activation)

        # -> [batch, max_samples, cluster_size]
        activation = tf.reshape(activation,
                [-1, self.max_samples, self.cluster_size])

        # Total assignment mass per cluster. `keepdims` is the supported
        # spelling; `keep_dims` is deprecated in tf.compat.v1 and is not part
        # of the TF2 `tf.reduce_sum` signature.
        a_sum = tf.reduce_sum(activation, -2, keepdims=True)

        a = tf.multiply(a_sum, self.cluster_weights2)

        # -> [batch, cluster_size, max_samples]
        activation = tf.transpose(activation, perm=[0, 2, 1])

        # -> [batch, max_samples, feature_size]
        reshaped_input = tf.reshape(reshaped_input, [-1,
            self.max_samples, self.feature_size])

        # Assignment-weighted sum of frames, laid out as
        # [batch, feature_size, cluster_size] after the transpose.
        vlad = tf.matmul(activation, reshaped_input)
        vlad = tf.transpose(vlad, perm=[0, 2, 1])
        vlad = tf.subtract(vlad, a)
        # Normalise along axis 1, flatten, normalise the whole descriptor.
        vlad = tf.nn.l2_normalize(vlad, 1)
        vlad = tf.reshape(vlad, [-1, self.cluster_size*self.feature_size])
        vlad = tf.nn.l2_normalize(vlad, 1)
        vlad = K.dot(vlad, self.hidden1_weights)

        return vlad

    def compute_output_shape(self, input_shape):
        """Return the static output shape `(None, output_dim)`."""
        return tuple([None, self.output_dim])

## `audio_features_whole.py`
Extracts the audio features and stores them

In [None]:
import os
import numpy as np
import pandas as pd
import wave
import librosa
# from python_speech_features import *
import sys
import pickle
import tensorflow.compat.v1 as tf
# import vggish.vggish_input as vggish_input
# import vggish.vggish_params as vggish_params
# import vggish.vggish_postprocess as vggish_postprocess
# import vggish.vggish_slim as vggish_slim
# import loupe_keras as lpk
# from allennlp.commands.elmo import ElmoEmbedder
from tqdm.notebook import trange, tqdm

In [None]:
# sys.path.append('/Users/linlin/Desktop/depression/classfication')

# Eager mode is needed because wav2vlad reads the layer output with `.numpy()`.
tf.enable_eager_execution()

# elmo = ElmoEmbedder()

# os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"

# prefix = os.path.abspath(os.path.join(os.getcwd(), "."))

# # Paths to downloaded VGGish files.
# checkpoint_path =os.path.join(os.getcwd(),  'vggish/vggish_model.ckpt')
# pca_params_path = os.path.join(os.getcwd(), 'vggish/vggish_pca_params.npz')

# Number of NetVLAD clusters; wav2vlad uses cluster_size * 16 as output size.
cluster_size = 16
# Running shortest / longest clip duration in seconds, updated by
# extract_features.
# NOTE(review): min_len starts at 100, so it stays wrong if every clip is
# longer than 100 s — confirm this is acceptable for the corpus.
min_len = 100
max_len = -1

In [None]:
# def to_vggish_embedds(x, sr):
#     # x is the input audio, sr is the sample rate
#     input_batch = vggish_input.waveform_to_examples(x, sr)
#     with tf.Graph().as_default(), tf.Session() as sess:
#       vggish_slim.define_vggish_slim()
#       vggish_slim.load_vggish_slim_checkpoint(sess, checkpoint_path)

#       features_tensor = sess.graph.get_tensor_by_name(vggish_params.INPUT_TENSOR_NAME)
#       embedding_tensor = sess.graph.get_tensor_by_name(vggish_params.OUTPUT_TENSOR_NAME)
#       [embedding_batch] = sess.run([embedding_tensor],
#                                    feed_dict={features_tensor: input_batch})

#     # Postprocess the results to produce whitened quantized embeddings.
#     pproc = vggish_postprocess.Postprocessor(pca_params_path)
#     postprocessed_batch = pproc.postprocess(embedding_batch)

#     return tf.cast(postprocessed_batch, dtype='float32')

In [None]:
def wav2vlad(wave_data, sr):
    """Convert a waveform into a fixed-length NetVLAD descriptor.

    An 80-band mel spectrogram is computed, floored at 1e-6 and
    log-compressed, then passed through a NetVLAD layer that is constructed
    anew on every call.

    Args:
        wave_data: audio samples passed to librosa as `y`.
        sr: sample rate of `wave_data`.

    Returns:
        The layer output converted with `.numpy()`; its last dimension is
        `cluster_size * 16`.
    """
    global cluster_size
    log_mel = librosa.feature.melspectrogram(y=wave_data, n_mels=80, sr=sr)
    # Frames along axis 0, mel bands along axis 1.
    log_mel = log_mel.astype(np.float32).T
    log_mel = np.log(np.maximum(1e-6, log_mel))

    pooling = NetVLAD(feature_size=log_mel.shape[1],
                      max_samples=log_mel.shape[0],
                      cluster_size=cluster_size,
                      output_dim=cluster_size * 16)
    feat = pooling(tf.convert_to_tensor(log_mel))

    # NOTE(review): the value is read eagerly via `.numpy()`; the session and
    # initializer are kept as in the baseline code — confirm they are needed.
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        return feat.numpy()

In [None]:
def _read_wav(path):
    """Read a WAV file and return `(samples, sample_rate, duration_seconds)`.

    Frames are decoded as `np.short` and converted to float. The file is
    closed as soon as the frames are read.
    """
    with wave.open(path) as wav_file:
        sr = wav_file.getframerate()
        nframes = wav_file.getnframes()
        samples = np.frombuffer(wav_file.readframes(nframes), dtype=np.short).astype(float)
    return samples, sr, nframes / sr


def extract_features(audio_features, targets, folder):
    """Append the NetVLAD features and binary label of one sample folder.

    Reads `positive_out.wav`, `neutral_out.wav` and `negative_out.wav` plus
    `new_label.txt` from `DATASET_DIR/folder`. Nothing is appended when
    `positive_out.wav` does not exist.

    Args:
        audio_features: list that receives `[positive, neutral, negative]`
            descriptors (one `wav2vlad` output each).
        targets: list that receives 1 when the label score is >= 53, else 0.
        folder: sample folder name relative to `DATASET_DIR`.

    Side effects:
        Updates the module-level `max_len` / `min_len` clip durations.
    """
    global max_len, min_len
    sample_dir = f'{DATASET_DIR}/{folder}'
    if not os.path.exists(f'{sample_dir}/positive_out.wav'):
        return

    clips = [_read_wav(f'{sample_dir}/{emotion}_out.wav')
             for emotion in ('positive', 'neutral', 'negative')]

    for _, _, duration in clips:
        if duration > max_len:
            max_len = duration
        if duration < min_len:
            min_len = duration

    with open(f'{sample_dir}/new_label.txt') as fli:
        target = float(fli.readline())

    descriptors = []
    for samples, sr, _ in clips:
        if samples.shape[0] < 1:
            # Empty recording: substitute 5 seconds of a tiny constant signal
            # so the spectrogram can still be computed.
            samples = np.array([1e-4]*sr*5)
        descriptors.append(wav2vlad(samples, sr))

    audio_features.append(descriptors)
    targets.append(1 if target >= 53 else 0)
    # targets.append(target)

In [None]:
# Collect features/labels for folders t_1..t_114 first, then v_1..v_114.
# Folders without a positive_out.wav are skipped by extract_features.
audio_features = []
audio_targets = []

for prefix in ('t', 'v'):
    for index in trange(114):
        extract_features(audio_features, audio_targets, f'{prefix}_{index+1}')

  0%|          | 0/114 [00:00<?, ?it/s]

  0%|          | 0/114 [00:00<?, ?it/s]

In [None]:
print("Saving npz file locally...")
# Both arrays are stored under the default key 'arr_0'; the file name carries
# the descriptor size (cluster_size*16).
np.savez(f'{BASELINE_DIR}/Features/AudioWhole/whole_samples_clf_{cluster_size*16}.npz', audio_features)
np.savez(f'{BASELINE_DIR}/Features/AudioWhole/whole_labels_clf_{cluster_size*16}.npz', audio_targets)

# Longest and shortest clip duration (seconds) seen during extraction.
print(max_len, min_len)

Saving npz file locally...
111.02 0.0


In [None]:
# Sanity check: number of extracted samples and the shape of one sample.
len(audio_features), np.array(audio_features[0]).shape

(162, (3, 1, 256))

## `audio_gru_whole.py`
The main training loop and network architecture for audio. The model class is named `AudioBiLSTM`, but it is built around a unidirectional `nn.GRU`.

In [2]:
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.nn import functional as F
import torch.optim as optim
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

import numpy as np
import pandas as pd
import os
import pickle
import random
import itertools
from tqdm.notebook import trange, tqdm

In [3]:
# Load the saved descriptors and drop the singleton axis 2, then load labels.
audio_features = np.squeeze(np.load(f'{BASELINE_DIR}/Features/AudioWhole/whole_samples_clf_256.npz')['arr_0'], axis=2)
audio_targets = np.load(f'{BASELINE_DIR}/Features/AudioWhole/whole_labels_clf_256.npz')['arr_0']
# Indices of samples labelled 1 (depressed) and 0 (non-depressed).
audio_dep_idxs_tmp = np.where(audio_targets == 1)[0]
audio_non_idxs = np.where(audio_targets == 0)[0]
audio_features.shape, audio_targets.shape

((162, 3, 256), (162,))

In [None]:
class AudioBiLSTM(nn.Module):
    """Sequence classifier over per-clip audio descriptors.

    Despite the name, the recurrent core is an `nn.GRU` constructed without a
    `bidirectional` argument; `config['bidirectional']` is stored on the
    instance but not used when building the network.

    Expected `config` keys: 'num_classes', 'learning_rate', 'dropout',
    'hidden_dims', 'rnn_layers', 'embedding_size', 'bidirectional'.
    """

    def __init__(self, config):
        super().__init__()
        for key in ('num_classes', 'learning_rate', 'dropout', 'hidden_dims',
                    'rnn_layers', 'embedding_size', 'bidirectional'):
            setattr(self, key, config[key])

        self.build_model()
        # self.init_weight()

    def init_weight(net):
        """Zero all biases and Xavier-initialise all weights, skipping any
        parameter whose name contains 'ln'."""
        for name, param in net.named_parameters():
            if 'ln' in name:
                continue
            if 'bias' in name:
                nn.init.constant_(param, 0.0)
            elif 'weight' in name:
                nn.init.xavier_uniform_(param)

    def build_model(self):
        """Instantiate the sub-modules (creation order is kept as in the
        baseline so parameter initialisation consumes the RNG identically)."""
        hidden = self.hidden_dims

        # Attention projection; only used by attention_net_with_w.
        self.attention_layer = nn.Sequential(
            nn.Linear(hidden, hidden),
            nn.ReLU(inplace=True),
        )

        # Recurrent encoder (a GRU, see class docstring).
        self.lstm_net_audio = nn.GRU(
            self.embedding_size,
            hidden,
            num_layers=self.rnn_layers,
            dropout=self.dropout,
            batch_first=True,
        )

        # Normalises the input descriptors before the GRU.
        self.ln = nn.LayerNorm(self.embedding_size)

        # Fully connected classification head ending in a softmax.
        head = [
            nn.Dropout(self.dropout),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Dropout(self.dropout),
            nn.Linear(hidden, self.num_classes),
            nn.Softmax(dim=1),
        ]
        self.fc_audio = nn.Sequential(*head)

    def attention_net_with_w(self, lstm_out, lstm_hidden):
        '''
        Attention pooling over the time axis (not called by forward).

        :param lstm_out:    [batch_size, len_seq, n_hidden * 2]
        :param lstm_hidden: [batch_size, num_layers * num_directions, n_hidden]
        :return: [batch_size, n_hidden]
        '''
        # Fold the two halves of the feature axis together:
        # h is [batch_size, time_step, hidden_dims].
        halves = torch.chunk(lstm_out, 2, -1)
        h = halves[0] + halves[1]

        # Collapse layers/directions, then project: [batch_size, 1, hidden_dims].
        query = self.attention_layer(torch.sum(lstm_hidden, dim=1).unsqueeze(1))

        # Scores over time steps: [batch_size, 1, time_step].
        scores = torch.bmm(query, torch.tanh(h).transpose(1, 2))
        weights = F.softmax(scores, dim=-1)

        # Weighted sum of h: [batch_size, hidden_dims].
        return torch.bmm(weights, h).squeeze(1)

    def forward(self, x):
        """Layer-normalise `x`, encode it with the GRU, average the outputs
        over dim 1 and return the class probabilities of the head."""
        normed = self.ln(x)
        encoded, _ = self.lstm_net_audio(normed)
        return self.fc_audio(encoded.mean(dim=1))

In [None]:
# Hyper-parameters shared by the model, the optimizer and the training loop.
config = {
    'num_classes': 2,        # size of the classifier output
    'dropout': 0.5,          # used in the GRU and in the classification head
    'rnn_layers': 2,         # number of stacked GRU layers
    'embedding_size': 256,   # size of each input descriptor
    'batch_size': 8,         # mini-batch size used by train()
    'epochs': 170,           # upper bound of the trange(1, epochs) loop
    'learning_rate': 6e-6,   # AdamW learning rate
    'hidden_dims': 256,      # GRU hidden size / width of the head
    'bidirectional': False,  # stored on the model; not passed to nn.GRU
    'cuda': False            # move model and batches to the GPU when True
}

In [None]:
def save(model, filename):
    """Write `model` with `torch.save` to `<filename>.pt` and print the path."""
    target_path = '{}.pt'.format(filename)
    torch.save(model, target_path)
    print('Saved as %s' % target_path)

def standard_confusion_matrix(y_test, y_test_pred):
    """
    Make confusion matrix with format:
                  -----------
                  | TP | FP |
                  -----------
                  | FN | TN |
                  -----------
    Parameters
    ----------
    y_test : torch.Tensor - true labels (moved to CPU and converted to numpy)
    y_test_pred : ndarray - predicted labels

    Returns
    -------
    ndarray - 2D
    """
    # labels=[0, 1] pins the 2x2 layout. Without it sklearn returns a 1x1
    # matrix when only one class occurs in both inputs and the unpacking
    # below fails.
    (tn, fp), (fn, tp) = confusion_matrix(
        y_test.cpu().numpy(), y_test_pred, labels=[0, 1])
    return np.array([[tp, fp], [fn, tn]])

def model_performance(y_test, y_test_pred_proba):
    """
    Turn class scores into hard predictions and print the confusion matrix.

    Returns the predicted class indices (argmax over dim 1, kept as a column)
    together with the [[TP, FP], [FN, TN]] matrix.
    """
    _, y_test_pred = y_test_pred_proba.data.max(1, keepdim=True)

    # Confusion matrix for the evaluated samples.
    conf_matrix = standard_confusion_matrix(y_test, y_test_pred.numpy())
    print("Confusion Matrix:")
    print(conf_matrix)

    return y_test_pred, conf_matrix

def train(epoch, train_idxs):
    """Run one training epoch over `train_idxs`, in the given order.

    Reads the module-level `model`, `optimizer`, `criterion`, `config`,
    `audio_features` and `audio_targets`, and stores the number of correctly
    classified training samples in the module-level `train_acc`.

    Args:
        epoch: epoch counter, only used for logging (printed as epoch + 1).
        train_idxs: indices into `audio_features` / `audio_targets`.
    """
    global train_acc
    model.train()
    batch_size = config['batch_size']
    X_train = audio_features[train_idxs]
    Y_train = audio_targets[train_idxs]
    n_samples = X_train.shape[0]
    total_loss = 0
    correct = 0
    for start in range(0, n_samples, batch_size):
        # Slicing clips at the array end, so the last batch may be shorter.
        stop = start + batch_size
        x = torch.from_numpy(X_train[start:stop]).type(torch.FloatTensor)
        y = torch.from_numpy(Y_train[start:stop]).long()
        if config['cuda']:
            x, y = x.cuda(), y.cuda()

        # Reset the parameter gradients.
        optimizer.zero_grad()
        output = model(x)
        pred = output.data.max(1, keepdim=True)[1]
        # Accumulate as a plain int so the log shows a number, not `tensor(n)`.
        correct += pred.eq(y.data.view_as(pred)).sum().item()
        loss = criterion(output, y)
        # Back-propagate and update the network parameters.
        loss.backward()
        optimizer.step()
        # loss.item() extracts the scalar value from the tensor.
        total_loss += loss.item()

    train_acc = correct
    accuracy_pct = 100. * correct / n_samples if n_samples else 0.0
    # The learning rate is printed in scientific notation: with fixed-point
    # `.4f` a rate such as 6e-6 was displayed as 0.0000.
    print(
        'Train Epoch: {:2d}\t Learning rate: {:.2e}\tLoss: {:.6f}\t Accuracy: {}/{} ({:.0f}%)\n '
        .format(epoch + 1, config['learning_rate'], total_loss, correct,
                n_samples, accuracy_pct))


def evaluate(model, test_idxs, fold, train_idxs_tmp, train_idxs):
    """Evaluate `model` on `test_idxs` and checkpoint it when it improves.

    Prints the confusion matrix, accuracy, precision, recall and F1. The
    model and `train_idxs_tmp` are saved when F1 is at least the best F1 seen
    so far (module-level `max_f1`), F1 exceeds 0.5 and the module-level
    `train_acc` exceeds 90% of `len(train_idxs)`.

    Args:
        model: network to evaluate.
        test_idxs: indices into `audio_features` / `audio_targets`.
        fold: fold number, used in the saved file names.
        train_idxs_tmp: un-augmented training indices of this fold; saved
            next to the checkpoint.
        train_idxs: augmented training indices; only their count is used.

    Returns:
        The loss on the evaluated samples as a float.
    """
    global max_f1, max_acc, max_prec, max_rec
    model.eval()
    with torch.no_grad():
        x = torch.from_numpy(audio_features[test_idxs]).type(torch.FloatTensor)
        y = torch.from_numpy(audio_targets[test_idxs]).long()
        if config['cuda']:
            x, y = x.cuda(), y.cuda()

        output = model(x)
        total_loss = criterion(output, y).item()

        y_test_pred, conf_matrix = model_performance(y, output.cpu())
        # conf_matrix is laid out as [[TP, FP], [FN, TN]].
        tp, fp = int(conf_matrix[0][0]), int(conf_matrix[0][1])
        fn, tn = int(conf_matrix[1][0]), int(conf_matrix[1][1])
        n_total = tp + fp + fn + tn

        # Guard every ratio: a model that predicts no positives (or an empty
        # fold) yields 0.0 instead of nan plus runtime warnings.
        accuracy = float(tp + tn) / n_total if n_total else 0.0
        precision = float(tp) / (tp + fp) if (tp + fp) else 0.0
        recall = float(tp) / (tp + fn) if (tp + fn) else 0.0
        if precision + recall:
            f1_score = 2 * (precision * recall) / (precision + recall)
        else:
            f1_score = 0.0
        print("Accuracy: {}".format(accuracy))
        print("Precision: {}".format(precision))
        print("Recall: {}".format(recall))
        print("F1-Score: {}\n".format(f1_score))
        print('=' * 89)

        if max_f1 <= f1_score and train_acc > len(train_idxs)*0.90  and f1_score > 0.5:
            max_f1 = f1_score
            max_acc = accuracy
            max_rec = recall
            max_prec = precision
            mode ='gru'
            save(model, f"{BASELINE_DIR}/Model/ClassificationWhole/Audio/BiLSTM_{mode}_vlad{config['embedding_size']}_{config['hidden_dims']}_{max_f1:.2f}_{fold}")
            np.save(f'{BASELINE_DIR}/Features/TrainIdx/train_idxs_{f1_score:.2f}_{fold}.npy', train_idxs_tmp)
            print('*' * 64)
            print('model saved: f1: {}\tacc: {}'.format(max_f1, max_acc))
            print('*' * 64)

    return total_loss

def get_param_group(model):
    """Split the parameters of `model` into two optimizer groups.

    Parameters whose name contains 'ln' get no weight decay; all others get a
    weight decay of 1e-5. The decayed group comes first in the returned list.
    """
    no_decay, decayed = [], []
    for name, param in model.named_parameters():
        bucket = no_decay if 'ln' in name else decayed
        bucket.append(param)
    return [
        {'params': decayed, 'weight_decay': 1e-5},
        {'params': no_decay, 'weight_decay': 0},
    ]

In [None]:
# 3-fold cross-validation. shuffle=True without a random_state means the
# splits differ from run to run.
kf = KFold(n_splits=3, shuffle=True)
fold = 1
for train_idxs_tmp, test_idxs_tmp in kf.split(audio_features):
# Alternative: replay the splits saved by an earlier run instead of KFold.
# train_idxs_tmps = [
#     np.load(f'{BASELINE_DIR}/Features/TrainIdx/train_idxs_0.63_1.npy', allow_pickle=True),
#     np.load(f'{BASELINE_DIR}/Features/TrainIdx/train_idxs_0.60_2.npy', allow_pickle=True),
#     np.load(f'{BASELINE_DIR}/Features/TrainIdx/train_idxs_0.60_3.npy', allow_pickle=True)
# ]
# for idx_idx, train_idxs_tmp in enumerate(train_idxs_tmps):
#     fold = idx_idx + 1
#     # if idx_idx != 1:
#     #     continue
#     test_idxs_tmp = list(set(list(audio_dep_idxs_tmp)+list(audio_non_idxs)) - set(train_idxs_tmp))
    train_idxs, test_idxs = [], []
    # Positions (in itertools.permutations order) of the orderings to keep;
    # for training all six orderings of the three descriptors are kept.
    resample_idxs = [0,1,2,3,4,5]
    # depression data augmentation
    # Each depressed training sample is replaced by permuted copies that are
    # appended to the module-level audio_features / audio_targets; train_idxs
    # then points at the copies. Non-depressed samples keep their index.
    # NOTE: the appended rows are never removed, so the arrays keep growing
    # from fold to fold.
    for idx in train_idxs_tmp:
        if idx in audio_dep_idxs_tmp:
            feat = audio_features[idx]
            count = 0
            for i in itertools.permutations(feat, feat.shape[0]):
                if count in resample_idxs:
                    audio_features = np.vstack((audio_features, np.expand_dims(list(i), 0)))
                    audio_targets = np.hstack((audio_targets, 1))
                    train_idxs.append(len(audio_features)-1)
                count += 1
        else:
            train_idxs.append(idx)

    # The depressed test samples are augmented too, but with only four of the
    # six orderings (positions 0, 1, 4, 5).
    for idx in test_idxs_tmp:
        if idx in audio_dep_idxs_tmp:
            feat = audio_features[idx]
            count = 0
            # resample_idxs = random.sample(range(6), 4)
            resample_idxs = [0,1,4,5]
            for i in itertools.permutations(feat, feat.shape[0]):
                if count in resample_idxs:
                    audio_features = np.vstack((audio_features, np.expand_dims(list(i), 0)))
                    audio_targets = np.hstack((audio_targets, 1))
                    test_idxs.append(len(audio_features)-1)
                count += 1
        else:
            test_idxs.append(idx)
        # test_idxs.append(idx)

    # A fresh model, optimizer and loss per fold.
    model = AudioBiLSTM(config)

    if config['cuda']:
        model = model.cuda()

    param_group = get_param_group(model)
    optimizer = optim.AdamW(param_group, lr=config['learning_rate'])
    criterion = nn.CrossEntropyLoss()
    # criterion = FocalLoss(class_num=2)
    # Per-fold best metrics, read and updated by evaluate(); train_acc is
    # written by train().
    max_f1 = -1
    max_acc = -1
    max_rec = -1
    max_prec = -1
    train_acc = -1

    # trange(1, epochs) runs epochs - 1 iterations. The model is evaluated on
    # the test fold after every epoch, and evaluate() checkpoints on that
    # same fold.
    for ep in trange(1, config['epochs']):
        train(ep, train_idxs)
        tloss = evaluate(model, test_idxs, fold, train_idxs_tmp, train_idxs)
    fold += 1

## `AudioModelChecking.py`
Only loads the previously learnt "BiLSTM_gru_vlad256_256_0" models and reports the performances

In [None]:
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.nn import functional as F
import torch.optim as optim
from sklearn.metrics import confusion_matrix
import numpy as np
import pandas as pd
import wave
import re
import os
import tensorflow.compat.v1 as tf
import random
import itertools
# from audio_gru_whole import AudioBiLSTM

from sklearn.preprocessing import StandardScaler
import pickle

In [None]:
class BiLSTM(nn.Module):
    """GRU encoder followed by a softmax classification head.

    Args:
        rnn_layers: number of stacked GRU layers.
        dropout: dropout probability for the GRU and the head.
        num_classes: size of the output distribution.
        audio_hidden_dims: GRU hidden size and width of the head.
        audio_embed_size: size of each input descriptor.
    """

    def __init__(self, rnn_layers, dropout, num_classes, audio_hidden_dims, audio_embed_size):
        super().__init__()

        self.lstm_net_audio = nn.GRU(
            audio_embed_size,
            audio_hidden_dims,
            num_layers=rnn_layers,
            dropout=dropout,
            batch_first=True,
        )

        head = [
            nn.Dropout(dropout),
            nn.Linear(audio_hidden_dims, audio_hidden_dims),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(audio_hidden_dims, num_classes),
            nn.Softmax(dim=1),
        ]
        self.fc_audio = nn.Sequential(*head)

    def forward(self, x):
        """Encode `x` with the GRU, sum the outputs over dim 1 and return the
        class probabilities of the head."""
        encoded, _ = self.lstm_net_audio(x)
        return self.fc_audio(encoded.sum(dim=1))

In [None]:
# prefix = os.path.abspath(os.path.join(os.getcwd(), "."))
# audio_features = np.squeeze(np.load(os.path.join(prefix, 'Features/Audio/whole_samples_clf_avid256.npz'))['arr_0'], axis=2)
# audio_targets = np.load(os.path.join(prefix, 'Features/Audio/whole_labels_clf_avid256.npz'))['arr_0']

# Reload the saved descriptors (dropping the singleton axis 2) and labels.
audio_features = np.squeeze(np.load(f'{BASELINE_DIR}/Features/AudioWhole/whole_samples_clf_256.npz')['arr_0'], axis=2)
audio_targets = np.load(f'{BASELINE_DIR}/Features/AudioWhole/whole_labels_clf_256.npz')['arr_0']

# Indices of samples labelled 1 (depressed) and 0 (non-depressed).
audio_dep_idxs = np.where(audio_targets == 1)[0]
audio_non_idxs = np.where(audio_targets == 0)[0]

In [None]:
def standard_confusion_matrix(y_test, y_test_pred):
    """
    Make confusion matrix with format:
                  -----------
                  | TP | FP |
                  -----------
                  | FN | TN |
                  -----------
    Parameters
    ----------
    y_test : array-like - true labels
    y_test_pred : array-like - predicted labels

    Returns
    -------
    ndarray - 2D
    """
    # labels=[0, 1] pins the 2x2 layout. Without it sklearn returns a 1x1
    # matrix when only one class occurs in both inputs and the unpacking
    # below fails.
    (tn, fp), (fn, tp) = confusion_matrix(y_test, y_test_pred, labels=[0, 1])
    return np.array([[tp, fp], [fn, tn]])

def model_performance(y_test, y_test_pred_proba):
    """
    Print the confusion matrix for already-decided predictions.

    Here `y_test_pred_proba` holds predicted labels (no argmax is applied);
    it is returned unchanged together with the [[TP, FP], [FN, TN]] matrix.
    """
    # y_test_pred = y_test_pred_proba.data.max(1, keepdim=True)[1]
    conf_matrix = standard_confusion_matrix(y_test, y_test_pred_proba)
    print("Confusion Matrix:")
    print(conf_matrix)

    return y_test_pred_proba, conf_matrix

In [None]:
# Settings for the checkpoint-evaluation cells. The evaluate() defined in
# this section reads only 'batch_size' and 'cuda'; the remaining keys mirror
# the training configuration.
config = {
    'num_classes': 2,
    'dropout': 0.5,
    'rnn_layers': 2,
    'embedding_size': 256,
    'batch_size': 4,
    'epochs': 100,
    'learning_rate': 1e-5,
    'hidden_dims': 256,
    'bidirectional': False,
    'cuda': False
}

In [None]:
# audio_lstm_model = torch.load(os.path.join(prefix, 'Model/Classification/Audio/BiLSTM_gru_vlad256_256_0.80.pt'))
# audio_lstm_model = torch.load(os.path.join(prefix, 'Model/Classification/Audio3/BiLSTM_gru_vlad256_256_0.89.pt'))
# audio_lstm_model = torch.load(os.path.join(prefix, 'Model/Classification/Audio2/BiLSTM_gru_vlad256_256_0.65.pt'))

# model = BiLSTM(config['rnn_layers'], config['dropout'], config['num_classes'], \
#          config['hidden_dims'], config['embedding_size'])

# model_state_dict = {}
# model_state_dict['lstm_net_audio.weight_ih_l0'] = audio_lstm_model.state_dict()['lstm_net_audio.weight_ih_l0']
# model_state_dict['lstm_net_audio.weight_hh_l0'] = audio_lstm_model.state_dict()['lstm_net_audio.weight_hh_l0']
# model_state_dict['lstm_net_audio.bias_ih_l0'] = audio_lstm_model.state_dict()['lstm_net_audio.bias_ih_l0']
# model_state_dict['lstm_net_audio.bias_hh_l0'] = audio_lstm_model.state_dict()['lstm_net_audio.bias_hh_l0']

# model_state_dict['lstm_net_audio.weight_ih_l1'] = audio_lstm_model.state_dict()['lstm_net_audio.weight_ih_l1']
# model_state_dict['lstm_net_audio.weight_hh_l1'] = audio_lstm_model.state_dict()['lstm_net_audio.weight_hh_l1']
# model_state_dict['lstm_net_audio.bias_ih_l1'] = audio_lstm_model.state_dict()['lstm_net_audio.bias_ih_l1']
# model_state_dict['lstm_net_audio.bias_hh_l1'] = audio_lstm_model.state_dict()['lstm_net_audio.bias_hh_l1']

# model_state_dict['fc_audio.1.weight'] = audio_lstm_model.state_dict()['fc_audio.1.weight']
# model_state_dict['fc_audio.1.bias'] = audio_lstm_model.state_dict()['fc_audio.1.bias']
# model_state_dict['fc_audio.4.weight'] = audio_lstm_model.state_dict()['fc_audio.4.weight']
# model_state_dict['fc_audio.4.bias'] = audio_lstm_model.state_dict()['fc_audio.4.bias']
# model_state_dict = audio_lstm_model.state_dict()
# model.load_state_dict(model_state_dict, strict=False)

In [None]:
def evaluate(model, test_idxs):
    """Evaluate a trained model on `test_idxs` and print its metrics.

    Reads the module-level `audio_features`, `audio_targets` and `config`.
    Predictions are computed batch by batch and scored once at the end.

    Args:
        model: trained network returning per-class scores.
        test_idxs: indices into `audio_features` / `audio_targets`.

    Returns:
        Tuple `(precision, recall, f1_score)`; each is 0.0 when its
        denominator is zero.
    """
    model.eval()
    batch_size = config['batch_size']
    X_test = audio_features[test_idxs]
    Y_test = audio_targets[test_idxs]

    # Collect per-batch predictions in a list instead of concatenating onto
    # an uninitialised placeholder tensor that has to be sliced off later.
    batch_preds = []
    with torch.no_grad():
        for start in range(0, X_test.shape[0], batch_size):
            # Slicing clips at the array end, so the last batch may be shorter.
            x = torch.from_numpy(X_test[start:start + batch_size]).type(torch.FloatTensor)
            if config['cuda']:
                x = x.cuda()
            # squeeze(2) is kept from the baseline; it only has an effect
            # when dimension 2 has size 1.
            output = model(x.squeeze(2))
            # Move to CPU so concatenation also works when running on CUDA.
            batch_preds.append(output.max(1, keepdim=True)[1].cpu())

    if batch_preds:
        pred = torch.cat(batch_preds)
    else:
        pred = torch.empty(0, 1).type(torch.LongTensor)

    y_test_pred, conf_matrix = model_performance(Y_test, pred)
    print('Calculating additional test metrics...')
    # conf_matrix is laid out as [[TP, FP], [FN, TN]].
    tp, fp = int(conf_matrix[0][0]), int(conf_matrix[0][1])
    fn, tn = int(conf_matrix[1][0]), int(conf_matrix[1][1])
    n_total = tp + fp + fn + tn

    # Guard every ratio so degenerate predictions give 0.0 instead of nan.
    accuracy = float(tp + tn) / n_total if n_total else 0.0
    precision = float(tp) / (tp + fp) if (tp + fp) else 0.0
    recall = float(tp) / (tp + fn) if (tp + fn) else 0.0
    if precision + recall:
        f1_score = 2 * (precision * recall) / (precision + recall)
    else:
        f1_score = 0.0
    print("Accuracy: {}".format(accuracy))
    print("Precision: {}".format(precision))
    print("Recall: {}".format(recall))
    print("F1-Score: {}\n".format(f1_score))
    print('='*89)
    return precision, recall, f1_score

In [None]:
# evaluate(audio_features_test, fuse_targets_test, audio_lstm_model)
# evaluate(model)

# File names produced by an earlier training run, one per fold (the number
# before the fold index is the F1 value embedded by the training evaluate()).
idxs_paths = ['train_idxs_0.63_1.npy', 'train_idxs_0.65_2.npy', 'train_idxs_0.60_3.npy']
audio_model_paths = ['BiLSTM_gru_vlad256_256_0.67_1.pt', 'BiLSTM_gru_vlad256_256_0.67_2.pt', 'BiLSTM_gru_vlad256_256_0.63_3.pt']
ps, rs, fs = [], [], []
for fold in range(3):
    # The test fold is every original sample that is not in the saved
    # training indices.
    train_idxs_tmp = np.load(f'{BASELINE_DIR}/Features/TrainIdx/{idxs_paths[fold]}', allow_pickle=True)
    test_idxs_tmp = list(set(list(audio_dep_idxs)+list(audio_non_idxs)) - set(train_idxs_tmp))
    # Loads the whole pickled model object saved by save().
    # NOTE(review): this presumably needs the AudioBiLSTM class to be
    # defined in the session, and newer PyTorch releases may require
    # weights_only=False — confirm against the installed version.
    audio_lstm_model = torch.load(f'{BASELINE_DIR}/Model/ClassificationWhole/Audio/{audio_model_paths[fold]}')

    train_idxs, test_idxs = [], []
    # Same augmentation as during training: depressed training samples are
    # replaced by all six orderings of their descriptors, appended to the
    # module-level arrays (which therefore grow from fold to fold).
    for idx in train_idxs_tmp:
        if idx in audio_dep_idxs:
            feat = audio_features[idx]
            count = 0
            resample_idxs = [0,1,2,3,4,5]
            for i in itertools.permutations(feat, feat.shape[0]):
                if count in resample_idxs:
                    audio_features = np.vstack((audio_features, np.expand_dims(list(i), 0)))
                    audio_targets = np.hstack((audio_targets, 1))
                    train_idxs.append(len(audio_features)-1)
                count += 1
        else:
            train_idxs.append(idx)

    # Depressed test samples are replaced by four of the six orderings
    # (positions 0, 1, 4, 5 in itertools.permutations order).
    for idx in test_idxs_tmp:
        if idx in audio_dep_idxs:
            feat = audio_features[idx]
            count = 0
            # resample_idxs = random.sample(range(6), 4)
            resample_idxs = [0,1,4,5]
            for i in itertools.permutations(feat, feat.shape[0]):
                if count in resample_idxs:
                    audio_features = np.vstack((audio_features, np.expand_dims(list(i), 0)))
                    audio_targets = np.hstack((audio_targets, 1))
                    test_idxs.append(len(audio_features)-1)
                count += 1
        else:
            test_idxs.append(idx)
    p, r, f = evaluate(audio_lstm_model, test_idxs)
    ps.append(p)
    rs.append(r)
    fs.append(f)
# Report the metrics averaged over the three folds.
print('precison: {} \n recall: {} \n f1 score: {}'.format(np.mean(ps), np.mean(rs), np.mean(fs)))

## `AudioTraditionalClassifiers.py`
Tries "DecisionTreeClassifier", "LogisticRegression", "SVC", and "RandomForestClassifier" on the same folds to compare against the proposed GRU

In [None]:
from sklearn.model_selection import KFold
import numpy as np
import pandas as pd
import os
import pickle
import random
import itertools
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

In [None]:
# Load the saved audio features and labels; 'arr_0' is the key np.savez assigns to an
# array passed positionally. The size-1 third axis of the features is dropped.
audio_features = np.load(f'{BASELINE_DIR}/Features/AudioWhole/whole_samples_clf_256.npz')['arr_0'].squeeze(axis=2)
audio_targets = np.load(f'{BASELINE_DIR}/Features/AudioWhole/whole_labels_clf_256.npz')['arr_0']

# Row indices of the samples labelled 1 and of the samples labelled 0, respectively.
audio_dep_idxs_tmp, audio_non_idxs = (np.nonzero(audio_targets == label)[0] for label in (1, 0))

In [None]:
def model_performance(y_test, y_test_pred_proba):
    """
    Print and return the confusion matrix for a set of predictions.

    The second argument is used directly as the predicted labels (no
    arg-max is taken), and is returned unchanged together with the matrix
    produced by ``standard_confusion_matrix``.
    """
    conf_matrix = standard_confusion_matrix(y_test, y_test_pred_proba)
    print("Confusion Matrix:", conf_matrix, sep="\n")
    return y_test_pred_proba, conf_matrix

def standard_confusion_matrix(y_test, y_test_pred):
    """
    Return the binary confusion matrix laid out as [[TP, FP], [FN, TN]].

    Parameters
    ----------
    y_test : 1D array-like of true labels (0 or 1)
    y_test_pred : 1D array-like of predicted labels (0 or 1)

    Returns
    -------
    ndarray - 2D, shape (2, 2)
    """
    # labels=[0, 1] keeps sklearn's result 2x2 even when only one class occurs in both
    # inputs; without it the result collapses to 1x1 and the unpacking below raises.
    [[tn, fp], [fn, tp]] = confusion_matrix(y_test, y_test_pred, labels=[0, 1])
    return np.array([[tp, fp], [fn, tn]])

In [None]:
# Fit a classical classifier on each of the three saved folds and report the mean
# precision / recall / F1 over the folds.
# One array of training indices per fold, loaded from the saved .npy files.
train_idxs_tmps = [
    np.load(f'{BASELINE_DIR}/Features/TrainIdx/train_idxs_0.63_1.npy', allow_pickle=True),
    np.load(f'{BASELINE_DIR}/Features/TrainIdx/train_idxs_0.65_2.npy', allow_pickle=True),
    np.load(f'{BASELINE_DIR}/Features/TrainIdx/train_idxs_0.60_3.npy', allow_pickle=True)
]
precs, recs, f1s = [], [], []
for idx_idx, train_idxs_tmp in enumerate(train_idxs_tmps):
    # Test set = every labelled index that is not in this fold's training indices.
    test_idxs_tmp = list(set(list(audio_dep_idxs_tmp)+list(audio_non_idxs)) - set(train_idxs_tmp))
    train_idxs, test_idxs = [], []
    # depression data augmentation
    # Label-1 samples are replaced by re-ordered copies of their first-axis rows; each copy
    # is appended to audio_features / audio_targets and referenced by its new row index.
    # audio_features / audio_targets are rebound at module level, so rows appended for one
    # fold are still present when the next fold is processed.
    for idx in train_idxs_tmp:
        if idx in audio_dep_idxs_tmp:
            feat = audio_features[idx]
            count = 0
            # Keep permutations number 0..5 in itertools order (number 0 is the original order).
            # That is every permutation if feat has three rows - TODO confirm the row count.
            resample_idxs = [0,1,2,3,4,5]
            for i in itertools.permutations(feat, feat.shape[0]):
                if count in resample_idxs:
                    audio_features = np.vstack((audio_features, np.expand_dims(list(i), 0)))
                    audio_targets = np.hstack((audio_targets, 1))
                    train_idxs.append(len(audio_features)-1)
                count += 1
        else:
            train_idxs.append(idx)

    # Same augmentation for the test split, keeping only permutations 0, 1, 4 and 5.
    for idx in test_idxs_tmp:
        if idx in audio_dep_idxs_tmp:
            feat = audio_features[idx]
            count = 0
            # resample_idxs = random.sample(range(6), 4)
            resample_idxs = [0,1,4,5]
            for i in itertools.permutations(feat, feat.shape[0]):
                if count in resample_idxs:
                    audio_features = np.vstack((audio_features, np.expand_dims(list(i), 0)))
                    audio_targets = np.hstack((audio_targets, 1))
                    test_idxs.append(len(audio_features)-1)
                count += 1
        else:
            test_idxs.append(idx)

    X_train = audio_features[train_idxs]
    Y_train = audio_targets[train_idxs]
    X_test = audio_features[test_idxs]
    Y_test = audio_targets[test_idxs]

    # Exactly one classifier is active at a time; the alternatives are kept commented out.
    # Decision Tree
    # from sklearn import tree
    # clf = tree.DecisionTreeClassifier(max_depth=20)

    # svm
    # from sklearn.svm import SVC
    # clf = SVC(kernel='sigmoid')

    # rf
    from sklearn.ensemble import RandomForestClassifier
    clf = RandomForestClassifier(n_estimators=50)

    # lr
    # from sklearn.linear_model import LogisticRegression
    # clf = LogisticRegression(solver='newton-cg')

    # Each sample is flattened to a single 1D feature vector before fitting / predicting.
    clf.fit([f.flatten() for f in X_train], Y_train)
    pred = clf.predict([f.flatten() for f in X_test])
    # clf.fit([f.sum(axis=0) for f in X_train], Y_train)
    # pred = clf.predict([f.sum(axis=0) for f in X_test])

    y_test_pred, conf_matrix = model_performance(Y_test, pred)

    # custom evaluation metrics
    # conf_matrix is laid out as [[TP, FP], [FN, TN]] (see standard_confusion_matrix).
    print('Calculating additional test metrics...')
    accuracy = float(conf_matrix[0][0] + conf_matrix[1][1]) / np.sum(conf_matrix)
    precision = float(conf_matrix[0][0]) / (conf_matrix[0][0] + conf_matrix[0][1])
    recall = float(conf_matrix[0][0]) / (conf_matrix[0][0] + conf_matrix[1][0])
    f1_score = 2 * (precision * recall) / (precision + recall)
    print("Accuracy: {}".format(accuracy))
    print("Precision: {}".format(precision))
    print("Recall: {}".format(recall))
    print("F1-Score: {}\n".format(f1_score))
    print('='*89)
    # A NaN metric (presumably from a zero denominator in the NumPy division above) is
    # recorded as 0 so that it does not turn the fold average into NaN.
    precs.append(0 if np.isnan(precision) else precision)
    recs.append(0 if np.isnan(recall) else recall)
    f1s.append(0 if np.isnan(f1_score) else f1_score)
    # precs.append(precision)
    # recs.append(recall)
    # f1s.append(f1_score)
# Mean precision, recall and F1 over the folds.
print(np.mean(precs), np.mean(recs), np.mean(f1s))

# Text

## `text_features_whole.py`
Extracts textual features from the interview transcripts using ELMoForManyLangs.

- From the [GitHub repository](https://github.com/HIT-SCIR/ELMoForManyLangs?tab=readme-ov-file) download the model and place in the Google Drive.
- If using the latest version of `overrides`, delete the decorator from `/content/ELMoForManyLangs/elmoformanylangs/modules/highway.py`.
- Inside `ELMoForManyLangs/zhs.model`, set `"config_path"` to `"/content/ELMoForManyLangs/elmoformanylangs/configs/cnn_50_100_512_4096_sample.json"`

In [None]:
%%capture
!pip install overrides
!git clone https://github.com/HIT-SCIR/ELMoForManyLangs.git
!python ELMoForManyLangs/setup.py install
!pip install overrides==4.1.2

In [None]:
import numpy as np
import pandas as pd
import wave
import librosa
import re
from tqdm.notebook import trange, tqdm
# from allennlp.commands.elmo import ElmoEmbedder
import os
from ELMoForManyLangs.elmoformanylangs import Embedder
# import pkuseg
# import thulac
# from pyhanlp import HanLP
import jieba
# seg = pkuseg.pkuseg()
# thu1 = thulac.thulac(seg_only=True)

In [None]:
# ELMo embedder loaded from the model directory stored on Google Drive.
elmo = Embedder(f'{BASELINE_DIR}/Model/ELMoForManyLangs/zhs.model')
# One transcript file (<topic>.txt) is read per topic for every subject.
topics = ['positive', 'neutral', 'negative']
# Accumulators filled by extract_features: tokenized answers per subject id,
# per-subject feature lists and per-subject binary labels.
answers = dict()
text_features, text_targets = [], []

INFO:elmoformanylangs:char embedding size: 6169
INFO:elmoformanylangs:word embedding size: 71222
INFO:elmoformanylangs:Model(
  (token_embedder): ConvTokenEmbedder(
    (word_emb_layer): EmbeddingLayer(
      (embedding): Embedding(71222, 100, padding_idx=3)
    )
    (char_emb_layer): EmbeddingLayer(
      (embedding): Embedding(6169, 50, padding_idx=6166)
    )
    (convolutions): ModuleList(
      (0): Conv1d(50, 32, kernel_size=(1,), stride=(1,))
      (1): Conv1d(50, 32, kernel_size=(2,), stride=(1,))
      (2): Conv1d(50, 64, kernel_size=(3,), stride=(1,))
      (3): Conv1d(50, 128, kernel_size=(4,), stride=(1,))
      (4): Conv1d(50, 256, kernel_size=(5,), stride=(1,))
      (5): Conv1d(50, 512, kernel_size=(6,), stride=(1,))
      (6): Conv1d(50, 1024, kernel_size=(7,), stride=(1,))
    )
    (highways): Highway(
      (_layers): ModuleList(
        (0-1): 2 x Linear(in_features=2048, out_features=4096, bias=True)
      )
    )
    (projection): Linear(in_features=2148, out_fea

In [None]:
def extract_features(text_features, text_targets, folder, num_subjects=114, threshold=53):
    """
    Append one feature list and one binary label per subject found on disk.

    For every ``i`` in ``1..num_subjects`` for which the directory
    ``{DATASET_DIR}/{folder}{i}`` exists, the first line of each topic
    transcript is tokenized with jieba, embedded with the module-level
    ``elmo`` embedder and averaged over its first axis. The module-level
    ``answers`` dict is updated with the tokenized answers as a side effect.

    Parameters
    ----------
    text_features : list, extended in place with one list (one vector per
        topic) for each subject
    text_targets : list, extended in place with 1 when the score read from
        ``new_label.txt`` is >= ``threshold`` and 0 otherwise
    folder : directory-name prefix such as ``'t_'`` or ``'v_'``
    num_subjects : highest subject number to probe (default 114)
    threshold : score cut-off for the positive label (default 53)

    Raises
    ------
    ValueError
        If a transcript file is empty.
    """
    for index in trange(num_subjects):
        subject_id = index + 1
        subject_dir = f'{DATASET_DIR}/{folder}{subject_id}'
        if not os.path.isdir(subject_dir):
            continue

        answers[subject_id] = []
        for topic in topics:
            transcript_path = f'{subject_dir}/{topic}.txt'
            # Decode explicitly as UTF-8 so that reading the Chinese transcripts does not
            # depend on the platform's default locale encoding.
            with open(transcript_path, 'r', encoding='utf-8') as f:
                lines = f.readline()
            if not lines:
                # Fail with the offending path instead of a bare IndexError.
                raise ValueError(f'empty transcript file: {transcript_path}')
            # Only the first line of the transcript is used.
            answers[subject_id].append(list(jieba.cut(lines, cut_all=False)))

        with open(f'{subject_dir}/new_label.txt', encoding='utf-8') as fli:
            target = float(fli.readline())
        text_targets.append(1 if target >= threshold else 0)
        # One averaged vector per answer (presumably the mean over tokens - confirm
        # against the shape returned by elmo.sents2elmo).
        text_features.append([np.array(item).mean(axis=0) for item in elmo.sents2elmo(answers[subject_id])])

In [None]:
# Collect features and labels from the 't_*' folders first, then from the 'v_*' folders.
for folder_prefix in ('t_', 'v_'):
    extract_features(text_features, text_targets, folder_prefix)

# Persist both accumulators; np.savez stores a positional array under the key 'arr_0'.
print("Saving npz file locally...")
np.savez(f'{BASELINE_DIR}/Features/TextWhole/whole_samples_clf_avg.npz', text_features)
np.savez(f'{BASELINE_DIR}/Features/TextWhole/whole_labels_clf_avg.npz', text_targets)

  0%|          | 0/114 [00:00<?, ?it/s]

Building prefix dict from the default dictionary ...
DEBUG:jieba:Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
DEBUG:jieba:Dumping model to file cache /tmp/jieba.cache
Loading model cost 0.700 seconds.
DEBUG:jieba:Loading model cost 0.700 seconds.
Prefix dict has been built successfully.
DEBUG:jieba:Prefix dict has been built successfully.
INFO:elmoformanylangs:1 batches, avg len: 70.3
INFO:elmoformanylangs:1 batches, avg len: 19.7
INFO:elmoformanylangs:1 batches, avg len: 33.3
INFO:elmoformanylangs:1 batches, avg len: 12.3
INFO:elmoformanylangs:1 batches, avg len: 32.7
INFO:elmoformanylangs:1 batches, avg len: 109.3
INFO:elmoformanylangs:1 batches, avg len: 62.3
INFO:elmoformanylangs:1 batches, avg len: 45.7
INFO:elmoformanylangs:1 batches, avg len: 150.0
INFO:elmoformanylangs:1 batches, avg len: 37.3
INFO:elmoformanylangs:1 batches, avg len: 87.7
INFO:elmoformanylangs:1 batches, avg len: 21.3
INFO:elmoformanylangs:1 batches, avg len

  0%|          | 0/114 [00:00<?, ?it/s]

INFO:elmoformanylangs:1 batches, avg len: 28.0
INFO:elmoformanylangs:1 batches, avg len: 17.7
INFO:elmoformanylangs:1 batches, avg len: 27.0
INFO:elmoformanylangs:1 batches, avg len: 13.3
INFO:elmoformanylangs:1 batches, avg len: 84.0
INFO:elmoformanylangs:1 batches, avg len: 171.3
INFO:elmoformanylangs:1 batches, avg len: 68.0
INFO:elmoformanylangs:1 batches, avg len: 25.0
INFO:elmoformanylangs:1 batches, avg len: 26.7
INFO:elmoformanylangs:1 batches, avg len: 35.7
INFO:elmoformanylangs:1 batches, avg len: 58.0
INFO:elmoformanylangs:1 batches, avg len: 20.0
INFO:elmoformanylangs:1 batches, avg len: 39.3
INFO:elmoformanylangs:1 batches, avg len: 28.3
INFO:elmoformanylangs:1 batches, avg len: 48.3
INFO:elmoformanylangs:1 batches, avg len: 22.7
INFO:elmoformanylangs:1 batches, avg len: 14.7
INFO:elmoformanylangs:1 batches, avg len: 128.7
INFO:elmoformanylangs:1 batches, avg len: 9.3
INFO:elmoformanylangs:1 batches, avg len: 55.0
INFO:elmoformanylangs:1 batches, avg len: 25.0
INFO:elmofor

Saving npz file locally...


## `text_bilstm_whole.py`
Trains a BiLSTM on the extracted textual features, once for each of the 3 folds (each fold is defined by its saved training indices).

In [None]:
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.nn import functional as F
import torch.optim as optim
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split

import numpy as np
import pandas as pd
import os
import pickle
import random
import itertools

In [None]:
# Load the saved text features and labels ('arr_0' is the key np.savez assigns to an
# array passed positionally).
text_features = np.load(f'{BASELINE_DIR}/Features/TextWhole/whole_samples_clf_avg.npz')['arr_0']
text_targets = np.load(f'{BASELINE_DIR}/Features/TextWhole/whole_labels_clf_avg.npz')['arr_0']
# Row indices of the samples labelled 1 and of the samples labelled 0, respectively.
text_dep_idxs_tmp, text_non_idxs = (np.nonzero(text_targets == label)[0] for label in (1, 0))

In [None]:
class TextBiLSTM(nn.Module):
    """
    LSTM text classifier with attention pooling over the time steps.

    Hyper-parameters are read from ``config`` (keys ``num_classes``,
    ``learning_rate``, ``dropout``, ``hidden_dims``, ``rnn_layers``,
    ``embedding_size`` and ``bidirectional``) and stored as attributes of the
    same name.
    """

    def __init__(self, config):
        super().__init__()
        for key in ('num_classes', 'learning_rate', 'dropout', 'hidden_dims',
                    'rnn_layers', 'embedding_size', 'bidirectional'):
            setattr(self, key, config[key])

        self.build_model()
        self.init_weight()

    def init_weight(self):
        """Zero every bias and Xavier-initialize every weight, skipping parameters whose name contains 'ln'."""
        for name, param in self.named_parameters():
            if 'ln' in name:
                # Layer-norm parameters keep their default initialization.
                continue
            if 'bias' in name:
                nn.init.constant_(param, 0.0)
            elif 'weight' in name:
                nn.init.xavier_uniform_(param)

    def build_model(self):
        """Create the attention projection, the LSTM, the classification head and two layer norms."""
        hidden, dropout = self.hidden_dims, self.dropout

        # Projection applied to the summed final hidden states to form the attention query.
        self.attention_layer = nn.Sequential(
            nn.Linear(hidden, hidden),
            nn.ReLU(inplace=True),
        )

        # Stacked LSTM; built with the default batch_first=False, i.e. sequence-first input.
        self.lstm_net = nn.LSTM(
            self.embedding_size,
            hidden,
            num_layers=self.rnn_layers,
            dropout=dropout,
            bidirectional=self.bidirectional,
        )

        # Fully connected head ending in a softmax over the classes.
        # NOTE(review): the training cell feeds this output to nn.CrossEntropyLoss, which
        # applies log-softmax itself - confirm the extra Softmax is intended.
        self.fc_out = nn.Sequential(
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden, self.num_classes),
            nn.Softmax(dim=1),
        )

        # Registered but not applied in forward (their call sites are disabled).
        self.ln1 = nn.LayerNorm(self.embedding_size)
        self.ln2 = nn.LayerNorm(hidden)

    def attention_net_with_w(self, lstm_out, lstm_hidden):
        '''
        Pool the LSTM outputs over time with attention weights derived from the final hidden states.

        :param lstm_out:    [batch_size, len_seq, n_hidden * 2]
        :param lstm_hidden: [batch_size, num_layers * num_directions, n_hidden]
        :return: [batch_size, n_hidden]
        '''
        # Split the last axis in two halves and add them: [batch_size, len_seq, n_hidden].
        halves = torch.chunk(lstm_out, 2, -1)
        h = halves[0] + halves[1]
        # Sum the final hidden states over layers/directions: [batch_size, 1, n_hidden].
        summed_hidden = torch.sum(lstm_hidden, dim=1, keepdim=True)
        # Attention query: [batch_size, 1, n_hidden].
        query = self.attention_layer(summed_hidden)
        # Score of every time step: [batch_size, 1, len_seq].
        scores = torch.bmm(query, torch.tanh(h).transpose(1, 2))
        weights = F.softmax(scores, dim=-1)
        # Weighted sum of the time steps: [batch_size, n_hidden].
        return torch.bmm(weights, h).squeeze(1)

    def forward(self, x):
        """
        Classify a batch of embedded sequences.

        :param x: [batch_size, len_seq, embedding_size]; the first two axes are swapped
                  before the LSTM because it expects sequence-first input
        :return: [batch_size, num_classes] softmax output
        """
        seq_first = x.permute(1, 0, 2)
        output, (final_hidden_state, _) = self.lstm_net(seq_first)
        # Back to batch-first for the attention step.
        atten_out = self.attention_net_with_w(
            output.permute(1, 0, 2),
            final_hidden_state.permute(1, 0, 2),
        )
        return self.fc_out(atten_out)

In [None]:
def save(model, filename):
    """Serialize ``model`` with ``torch.save`` to ``<filename>.pt`` and print the path written."""
    save_filename = f'{filename}.pt'
    torch.save(model, save_filename)
    print(f'Saved as {save_filename}')

def standard_confusion_matrix(y_test, y_test_pred):
    """
    Make confusion matrix with format:
                  -----------
                  | TP | FP |
                  -----------
                  | FN | TN |
                  -----------
    Parameters
    ----------
    y_test : ndarray - 1D, true labels (0 or 1)
    y_test_pred : ndarray - 1D, predicted labels (0 or 1)

    Returns
    -------
    ndarray - 2D
    """
    # labels=[0, 1] keeps sklearn's result 2x2 even when only one class occurs in both
    # inputs; without it the result collapses to 1x1 and the unpacking below raises.
    [[tn, fp], [fn, tp]] = confusion_matrix(y_test, y_test_pred, labels=[0, 1])
    return np.array([[tp, fp], [fn, tn]])

def model_performance(y_test, y_test_pred_proba):
    """
    Turn per-class scores into predicted labels and report the confusion matrix.

    The predicted label of each row is the index of its largest score along
    dimension 1 (kept as a column). Returns the predicted labels together
    with the matrix produced by ``standard_confusion_matrix``.
    """
    _, y_test_pred = y_test_pred_proba.data.max(1, keepdim=True)

    conf_matrix = standard_confusion_matrix(y_test, y_test_pred)
    print("Confusion Matrix:", conf_matrix, sep="\n")

    return y_test_pred, conf_matrix

def train(epoch, train_idxs):
    """
    Run one training epoch of the module-level ``model`` over ``train_idxs``.

    Uses the module-level ``config``, ``optimizer``, ``criterion``,
    ``text_features`` and ``text_targets``. Samples are visited in the order
    given by ``train_idxs`` in batches of ``config['batch_size']``. The number
    of correctly classified training samples is stored in the global
    ``train_acc`` and a summary line is printed.

    Parameters
    ----------
    epoch : epoch counter, used only in the printed summary (shown as ``epoch + 1``)
    train_idxs : row indices into ``text_features`` / ``text_targets``
    """
    global train_acc
    model.train()
    total_loss = 0
    correct = 0
    X_train = text_features[train_idxs]
    Y_train = text_targets[train_idxs]
    batch_size = config['batch_size']
    for start in range(0, X_train.shape[0], batch_size):
        # Slicing clamps at the end of the array, so the final short batch needs no special case.
        x = torch.from_numpy(X_train[start:start + batch_size]).float()
        # CrossEntropyLoss needs int64 class indices whatever integer dtype the labels were saved with.
        y = torch.from_numpy(Y_train[start:start + batch_size]).long()
        if config['cuda']:
            x, y = x.cuda(), y.cuda()

        # Reset the parameter gradients before the new backward pass.
        optimizer.zero_grad()
        output = model(x)
        pred = output.data.max(1, keepdim=True)[1]
        correct += pred.eq(y.data.view_as(pred)).cpu().sum()
        loss = criterion(output, y)
        # Back-propagate, then update the parameters from the gradients.
        loss.backward()
        optimizer.step()
        # loss.item() extracts the Python scalar from the loss tensor.
        total_loss += loss.item()

    train_acc = correct
    print(
        'Train Epoch: {:2d}\t Learning rate: {:.4f}\tLoss: {:.6f}\t Accuracy: {}/{} ({:.0f}%)\n '
        .format(epoch + 1, config['learning_rate'], total_loss, correct,
                X_train.shape[0], 100. * correct / X_train.shape[0]))


def evaluate(model, test_idxs, fold, train_idxs):
    """
    Evaluate ``model`` on ``test_idxs`` in a single batch and checkpoint it when it improves.

    Uses the module-level ``config``, ``criterion``, ``text_features``,
    ``text_targets`` and ``train_acc``. Accuracy, precision, recall and F1 are
    computed from the [[TP, FP], [FN, TN]] matrix returned by
    ``model_performance`` and printed. The model is saved, and the globals
    ``max_f1`` / ``max_acc`` / ``max_rec`` / ``max_prec`` are updated, when F1
    is at least the best so far, exceeds 0.5, and more than 90% of the
    training samples were classified correctly in the last training epoch.

    Parameters
    ----------
    model : network to evaluate
    test_idxs : row indices into ``text_features`` / ``text_targets``
    fold : fold number, embedded in the checkpoint file name
    train_idxs : training indices; only their count is used

    Returns
    -------
    float - loss on the test batch
    """
    global max_f1, max_acc, max_prec, max_rec
    model.eval()
    total_loss = 0
    with torch.no_grad():
        x = torch.from_numpy(text_features[test_idxs]).float()
        # Cast the targets to int64 on both the CPU and the CUDA path.
        y = torch.from_numpy(text_targets[test_idxs]).long()
        if config['cuda']:
            x, y = x.cuda(), y.cuda()

        output = model(x)
        loss = criterion(output, y)
        total_loss += loss.item()
        # The confusion matrix is computed by sklearn, which needs host-side data.
        y_test_pred, conf_matrix = model_performance(y.cpu(), output.cpu())
        accuracy = float(conf_matrix[0][0] + conf_matrix[1][1]) / np.sum(conf_matrix)
        precision = float(conf_matrix[0][0]) / (conf_matrix[0][0] + conf_matrix[0][1])
        recall = float(conf_matrix[0][0]) / (conf_matrix[0][0] + conf_matrix[1][0])
        f1_score = 2 * (precision * recall) / (precision + recall)
        print("Accuracy: {}".format(accuracy))
        print("Precision: {}".format(precision))
        print("Recall: {}".format(recall))
        print("F1-Score: {}\n".format(f1_score))
        print('=' * 89)

        if max_f1 <= f1_score and train_acc > len(train_idxs)*0.9 and f1_score > 0.5:
            max_f1 = f1_score
            max_acc = accuracy
            max_rec = recall
            max_prec = precision
            save(model, f"{BASELINE_DIR}/Model/ClassificationWhole/Text/BiLSTM_{config['hidden_dims']}_{max_f1:.2f}_{fold}")
            print('*' * 64)
            print('model saved: f1: {}\tacc: {}'.format(max_f1, max_acc))
            print('*' * 64)

    return total_loss

def get_param_group(model):
    """
    Split the parameters of ``model`` into two optimizer parameter groups.

    Parameters whose name contains 'ln' go into the second group with
    ``weight_decay`` 0; all others go into the first group with
    ``weight_decay`` 1e-5.
    """
    decayed, undecayed = [], []
    for name, param in model.named_parameters():
        bucket = undecayed if 'ln' in name else decayed
        bucket.append(param)
    return [
        dict(params=decayed, weight_decay=1e-5),
        dict(params=undecayed, weight_decay=0),
    ]

In [None]:
# Hyper-parameters shared by the text model, the training loop and the evaluation code.
config = dict(
    num_classes=2,
    dropout=0.5,
    rnn_layers=2,
    embedding_size=1024,
    batch_size=4,
    epochs=150,
    learning_rate=1e-5,
    hidden_dims=128,
    bidirectional=True,
    cuda=False,  # when True, batches are moved to the GPU with .cuda()
)

In [None]:
# Train a fresh TextBiLSTM on each of the three saved folds; evaluate() saves a
# checkpoint whenever its criteria are met.
# One array of training indices per fold, loaded from the saved .npy files.
train_idxs_tmps = [
    np.load(f'{BASELINE_DIR}/Features/TrainIdx/train_idxs_0.63_1.npy', allow_pickle=True),
    np.load(f'{BASELINE_DIR}/Features/TrainIdx/train_idxs_0.60_2.npy', allow_pickle=True),
    np.load(f'{BASELINE_DIR}/Features/TrainIdx/train_idxs_0.60_3.npy', allow_pickle=True)
]
# 1-based fold counter, passed to evaluate() for the checkpoint file name.
fold = 1

for idx_idx, train_idxs_tmp in enumerate(train_idxs_tmps):
    # if idx_idx != 2:
    #     continue
    # Test set = every labelled index that is not in this fold's training indices.
    test_idxs_tmp = list(set(list(text_dep_idxs_tmp)+list(text_non_idxs)) - set(train_idxs_tmp))
    train_idxs, test_idxs = [], []
    # depression data augmentation
    # Label-1 samples are replaced by re-ordered copies of their first-axis rows; each copy
    # is appended to text_features / text_targets and referenced by its new row index.
    # text_features / text_targets are rebound at module level, so rows appended for one
    # fold are still present when the next fold is processed.
    for idx in train_idxs_tmp:
        if idx in text_dep_idxs_tmp:
            feat = text_features[idx]
            count = 0
            # Keep permutations number 0..5 in itertools order (number 0 is the original order).
            # That is every permutation if feat has three rows - TODO confirm the row count.
            resample_idxs = [0,1,2,3,4,5]
            for i in itertools.permutations(feat, feat.shape[0]):
                if count in resample_idxs:
                    text_features = np.vstack((text_features, np.expand_dims(list(i), 0)))
                    text_targets = np.hstack((text_targets, 1))
                    train_idxs.append(len(text_features)-1)
                count += 1
        else:
            train_idxs.append(idx)

    # Same augmentation for the test split, keeping only permutations 0, 1, 4 and 5.
    for idx in test_idxs_tmp:
        if idx in text_dep_idxs_tmp:
            feat = text_features[idx]
            count = 0
            # resample_idxs = random.sample(range(6), 4)
            resample_idxs = [0,1,4,5]
            for i in itertools.permutations(feat, feat.shape[0]):
                if count in resample_idxs:
                    text_features = np.vstack((text_features, np.expand_dims(list(i), 0)))
                    text_targets = np.hstack((text_targets, 1))
                    test_idxs.append(len(text_features)-1)
                count += 1
        else:
            test_idxs.append(idx)

    # model, optimizer, criterion and the best-score trackers are module-level names that
    # train() and evaluate() read and update; they are reset for every fold.
    model = TextBiLSTM(config)

    param_group = get_param_group(model)
    optimizer = optim.AdamW(param_group, lr=config['learning_rate'])
    criterion = nn.CrossEntropyLoss()
    max_f1 = -1
    max_acc = -1
    max_rec = -1
    max_prec = -1
    train_acc = -1

    # NOTE(review): range(1, epochs) runs epochs - 1 iterations, and train() prints
    # epoch + 1, so the log starts at "Epoch 2" - confirm this is intended.
    for ep in range(1, config['epochs']):
        train(ep, train_idxs)
        tloss = evaluate(model, test_idxs, fold, train_idxs)
    fold += 1

## `TextModelChecking.py`
Evaluates the saved `BiLSTM_128_0.*` text models, one on each of the 3 folds.

In [None]:
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.nn import functional as F
import torch.optim as optim
from sklearn.metrics import confusion_matrix
import numpy as np
import pandas as pd
import wave
import re
import os
import tensorflow.compat.v1 as tf
import random
import itertools

from sklearn.preprocessing import StandardScaler
import pickle

In [None]:
# prefix = os.path.abspath(os.path.join(os.getcwd(), "."))
# text_features = np.load(os.path.join(prefix, 'Features/Text/whole_samples_clf_avg.npz'))['arr_0']
# text_targets = np.load(os.path.join(prefix, 'Features/Text/whole_labels_clf_avg.npz'))['arr_0']

# audio_dep_idxs = np.where(text_targets == 1)[0]
# audio_non_idxs = np.where(text_targets == 0)[0]
# # train_dep_idxs_tmp = np.load(os.path.join(prefix, 'Features/Text/train_dep_idxs_0.80.npy'), allow_pickle=True)
# # train_non_idxs = list(np.load(os.path.join(prefix, 'Features/Text/train_non_idxs_0.80.npy'), allow_pickle=True))
# # train_dep_idxs_tmp = np.load(os.path.join(prefix, 'Features/Text/train_dep_idxs_0.65_2.npy'), allow_pickle=True)
# # train_non_idxs = list(np.load(os.path.join(prefix, 'Features/Text/train_non_idxs_0.65_2.npy'), allow_pickle=True))
# train_dep_idxs_tmp = np.load(os.path.join(prefix, 'Features/Text/train_dep_idxs_0.89_3.npy'), allow_pickle=True)
# train_non_idxs = list(np.load(os.path.join(prefix, 'Features/Text/train_non_idxs_0.89_3.npy'), allow_pickle=True))

# test_dep_idxs_tmp = list(set(audio_dep_idxs) - set(train_dep_idxs_tmp))
# test_non_idxs = list(set(audio_non_idxs) - set(train_non_idxs))

# Load the saved text features and labels ('arr_0' is the key np.savez assigns to an
# array passed positionally).
text_features = np.load(f'{BASELINE_DIR}/Features/TextWhole/whole_samples_clf_avg.npz')['arr_0']
text_targets = np.load(f'{BASELINE_DIR}/Features/TextWhole/whole_labels_clf_avg.npz')['arr_0']
# Row indices of the samples labelled 1 and of the samples labelled 0, respectively.
text_dep_idxs_tmp, text_non_idxs = (np.nonzero(text_targets == label)[0] for label in (1, 0))

In [None]:
# # training data augmentation
# train_dep_idxs = []
# for idx in train_dep_idxs_tmp:
#     feat = text_features[idx]
#     for i in itertools.permutations(feat, feat.shape[0]):
#         text_features = np.vstack((text_features, np.expand_dims(list(i), 0)))
#         text_targets = np.hstack((text_targets, 1))
#         train_dep_idxs.append(len(text_features)-1)

#         text_features = np.vstack((text_features, np.expand_dims(list(i), 0)))
#         text_targets = np.hstack((text_targets, 1))
#         train_dep_idxs.append(len(text_features)-1)

# # test data augmentation
# test_dep_idxs = []
# for idx in test_dep_idxs_tmp:
#     feat = text_features[idx]
#     for i in itertools.permutations(feat, feat.shape[0]):
#         text_features = np.vstack((text_features, np.expand_dims(list(i), 0)))
#         text_targets = np.hstack((text_targets, 1))
#         test_dep_idxs.append(len(text_features)-1)

In [None]:
def standard_confusion_matrix(y_test, y_test_pred):
    """
    Make confusion matrix with format:
                  -----------
                  | TP | FP |
                  -----------
                  | FN | TN |
                  -----------
    Parameters
    ----------
    y_test : ndarray - 1D, true labels (0 or 1)
    y_test_pred : ndarray - 1D, predicted labels (0 or 1)

    Returns
    -------
    ndarray - 2D
    """
    # labels=[0, 1] keeps sklearn's result 2x2 even when only one class occurs in both
    # inputs; without it the result collapses to 1x1 and the unpacking below raises.
    [[tn, fp], [fn, tp]] = confusion_matrix(y_test, y_test_pred, labels=[0, 1])
    return np.array([[tp, fp], [fn, tn]])


def model_performance(y_test, y_test_pred_proba):
    """
    Print and return the confusion matrix for a set of predictions.

    The second argument is used directly as the predicted labels (no
    arg-max is taken), and is returned unchanged together with the matrix
    produced by ``standard_confusion_matrix``.
    """
    conf_matrix = standard_confusion_matrix(y_test, y_test_pred_proba)
    print("Confusion Matrix:", conf_matrix, sep="\n")
    return y_test_pred_proba, conf_matrix

In [None]:
class TextBiLSTM(nn.Module):
    """
    LSTM text classifier with attention pooling over the time steps.

    Hyper-parameters are read from ``config`` (keys ``num_classes``,
    ``learning_rate``, ``dropout``, ``hidden_dims``, ``rnn_layers``,
    ``embedding_size`` and ``bidirectional``) and stored as attributes of the
    same name.
    """

    def __init__(self, config):
        super().__init__()
        for key in ('num_classes', 'learning_rate', 'dropout', 'hidden_dims',
                    'rnn_layers', 'embedding_size', 'bidirectional'):
            setattr(self, key, config[key])

        self.build_model()
        self.init_weight()

    def init_weight(self):
        """Zero every parameter whose name contains 'bias'; Xavier-initialize the others whose name contains 'weight'."""
        for name, param in self.named_parameters():
            if 'bias' in name:
                nn.init.constant_(param, 0.0)
                continue
            if 'weight' in name:
                nn.init.xavier_uniform_(param)

    def build_model(self):
        """Create the attention projection, the LSTM and the classification head."""
        hidden, dropout = self.hidden_dims, self.dropout

        # Projection applied to the summed final hidden states to form the attention query.
        self.attention_layer = nn.Sequential(
            nn.Linear(hidden, hidden),
            nn.ReLU(inplace=True),
        )

        # Stacked LSTM; built with the default batch_first=False, i.e. sequence-first input.
        self.lstm_net = nn.LSTM(
            self.embedding_size,
            hidden,
            num_layers=self.rnn_layers,
            dropout=dropout,
            bidirectional=self.bidirectional,
        )

        # Fully connected head: dropout, hidden layer, dropout, class scores, softmax.
        self.fc_out = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden, self.num_classes),
            nn.Softmax(dim=1),
        )

    def attention_net_with_w(self, lstm_out, lstm_hidden):
        '''
        Pool the LSTM outputs over time with attention weights derived from the final hidden states.

        :param lstm_out:    [batch_size, len_seq, n_hidden * 2]
        :param lstm_hidden: [batch_size, num_layers * num_directions, n_hidden]
        :return: [batch_size, n_hidden]
        '''
        # Split the last axis in two halves and add them: [batch_size, len_seq, n_hidden].
        halves = torch.chunk(lstm_out, 2, -1)
        h = halves[0] + halves[1]
        # Sum the final hidden states over layers/directions: [batch_size, 1, n_hidden].
        summed_hidden = torch.sum(lstm_hidden, dim=1, keepdim=True)
        # Attention query: [batch_size, 1, n_hidden].
        query = self.attention_layer(summed_hidden)
        # Score of every time step: [batch_size, 1, len_seq].
        scores = torch.bmm(query, torch.tanh(h).transpose(1, 2))
        weights = F.softmax(scores, dim=-1)
        # Weighted sum of the time steps: [batch_size, n_hidden].
        return torch.bmm(weights, h).squeeze(1)

    def forward(self, x):
        """
        Classify a batch of embedded sequences.

        :param x: [batch_size, len_seq, embedding_size]; the first two axes are swapped
                  before the LSTM because it expects sequence-first input
        :return: [batch_size, num_classes] softmax output
        """
        seq_first = x.permute(1, 0, 2)
        output, (final_hidden_state, final_cell_state) = self.lstm_net(seq_first)
        # Back to batch-first for the attention step.
        atten_out = self.attention_net_with_w(
            output.permute(1, 0, 2),
            final_hidden_state.permute(1, 0, 2),
        )
        return self.fc_out(atten_out)

class BiLSTM(nn.Module):
    """
    Bidirectional LSTM text classifier with attention pooling.

    Takes its hyper-parameters as individual arguments rather than a config
    dict; the LSTM is always bidirectional.
    """

    def __init__(self, rnn_layers, dropout, num_classes, text_hidden_dims, text_embed_size):
        super().__init__()

        self.text_embed_size = text_embed_size
        self.text_hidden_dims = text_hidden_dims
        self.rnn_layers = rnn_layers
        self.dropout = dropout
        self.num_classes = num_classes

        hidden = text_hidden_dims

        # Projection applied to the summed final hidden states to form the attention query.
        self.attention_layer = nn.Sequential(
            nn.Linear(hidden, hidden),
            nn.ReLU(inplace=True),
        )

        # Stacked bidirectional LSTM; built with the default batch_first=False.
        self.lstm_net = nn.LSTM(
            text_embed_size,
            hidden,
            num_layers=rnn_layers,
            dropout=dropout,
            bidirectional=True,
        )

        # Fully connected head: dropout, hidden layer, dropout, class scores, softmax.
        self.fc_out = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden, num_classes),
            nn.Softmax(dim=1),
        )

    def attention_net_with_w(self, lstm_out, lstm_hidden):
        '''
        Pool the LSTM outputs over time with attention weights derived from the final hidden states.

        :param lstm_out:    [batch_size, len_seq, n_hidden * 2]
        :param lstm_hidden: [batch_size, num_layers * num_directions, n_hidden]
        :return: [batch_size, n_hidden]
        '''
        # Add the two halves of the last axis: [batch_size, len_seq, n_hidden].
        halves = torch.chunk(lstm_out, 2, -1)
        h = halves[0] + halves[1]
        # Sum the final hidden states over layers/directions: [batch_size, 1, n_hidden].
        summed_hidden = torch.sum(lstm_hidden, dim=1, keepdim=True)
        # Attention query: [batch_size, 1, n_hidden].
        query = self.attention_layer(summed_hidden)
        # Score of every time step: [batch_size, 1, len_seq].
        scores = torch.bmm(query, torch.tanh(h).transpose(1, 2))
        weights = F.softmax(scores, dim=-1)
        # Weighted sum of the time steps: [batch_size, n_hidden].
        return torch.bmm(weights, h).squeeze(1)

    def forward(self, x_text):
        """
        Classify a batch of embedded sequences.

        :param x_text: [batch_size, len_seq, text_embed_size]; the first two axes are
                       swapped before the LSTM because it expects sequence-first input
        :return: [batch_size, num_classes] softmax output
        """
        seq_first = x_text.permute(1, 0, 2)
        output, (final_hidden_state, _) = self.lstm_net(seq_first)
        # Back to batch-first for the attention step.
        atten_out = self.attention_net_with_w(
            output.permute(1, 0, 2),
            final_hidden_state.permute(1, 0, 2),
        )
        return self.fc_out(atten_out)

In [None]:
def evaluate(model, test_idxs):
    """
    Run `model` on the test split and print / return its metrics.

    Reads the module-level `text_features`, `text_targets` and `config`.

    :param model:     trained network; called on float batches after
                      `squeeze(2)` and expected to return per-class scores
    :param test_idxs: indices into `text_features` / `text_targets`
    :return: (precision, recall, f1_score) for the positive class
    """
    model.eval()
    batch_size = config['batch_size']
    X_test = text_features[test_idxs]
    Y_test = text_targets[test_idxs]

    batch_preds = []
    for start in range(0, X_test.shape[0], batch_size):
        # Slicing clamps at the end of the array, so the final (possibly
        # shorter) batch needs no special case.
        x = torch.from_numpy(
            X_test[start:start + batch_size]).type(torch.FloatTensor)
        if config['cuda']:
            x = x.cuda()
        with torch.no_grad():
            output = model(x.squeeze(2))
        # Keep predictions on the CPU: they are concatenated and handed to
        # the confusion-matrix helper, so CUDA and CPU tensors must not mix.
        batch_preds.append(output.max(1, keepdim=True)[1].cpu())

    if batch_preds:
        pred = torch.cat(batch_preds)
    else:
        pred = torch.empty(0, 1, dtype=torch.long)

    # conf_matrix layout (see standard_confusion_matrix): [[TP, FP], [FN, TN]]
    y_test_pred, conf_matrix = model_performance(Y_test, pred)
    print('Calculating additional test metrics...')
    accuracy = float(conf_matrix[0][0] +
                     conf_matrix[1][1]) / np.sum(conf_matrix)
    precision = float(conf_matrix[0][0]) / \
        (conf_matrix[0][0] + conf_matrix[0][1])
    recall = float(conf_matrix[0][0]) / (conf_matrix[0][0] + conf_matrix[1][0])
    f1_score = 2 * (precision * recall) / (precision + recall)
    print("Accuracy: {}".format(accuracy))
    print("Precision: {}".format(precision))
    print("Recall: {}".format(recall))
    print("F1-Score: {}\n".format(f1_score))
    print('='*89)
    return precision, recall, f1_score

In [None]:
# Saved text models, one per fold, evaluated against the matching train split.
text_model_paths = ['BiLSTM_128_0.64_1.pt', 'BiLSTM_128_0.66_2.pt', 'BiLSTM_128_0.66_3.pt']
# NOTE(review): fold 2 is loaded from train_idxs_0.60_2.npy here, while the
# traditional-classifier cell further down loads train_idxs_0.65_2.npy.
# Confirm which file belongs to the checkpoints listed above.
train_idxs_tmps = [
    np.load(f'{BASELINE_DIR}/Features/TrainIdx/train_idxs_0.63_1.npy', allow_pickle=True),
    np.load(f'{BASELINE_DIR}/Features/TrainIdx/train_idxs_0.60_2.npy', allow_pickle=True),
    np.load(f'{BASELINE_DIR}/Features/TrainIdx/train_idxs_0.60_3.npy', allow_pickle=True)
]

# Settings read by `evaluate` (module-level dict); identical for every fold.
config = {
    'num_classes': 2,
    'dropout': 0.5,
    'rnn_layers': 2,
    'embedding_size': 1024,
    'batch_size': 4,
    'epochs': 100,
    'learning_rate': 2e-5,
    'hidden_dims': 128,
    'bidirectional': True,
    'cuda': False,
}

fold = 1
ps, rs, fs = [], [], []
for idx_i, train_idxs_tmp in enumerate(train_idxs_tmps):
    # Everything that is not a training index of this fold is a test index.
    test_idxs_tmp = list(
        set(list(text_dep_idxs_tmp)+list(text_non_idxs)) - set(train_idxs_tmp))
    train_idxs, test_idxs = [], []

    # Depression data augmentation: each positive sample is replaced by
    # permutations of its rows, appended to the end of text_features.
    for idx in train_idxs_tmp:
        if idx in text_dep_idxs_tmp:
            feat = text_features[idx]
            # Set here rather than once before the loop: the test branch
            # below rebinds resample_idxs, which previously leaked into the
            # training augmentation of every fold after the first.
            resample_idxs = [0, 1, 2, 3, 4, 5]
            for count, perm in enumerate(itertools.permutations(feat, feat.shape[0])):
                if count in resample_idxs:
                    text_features = np.vstack(
                        (text_features, np.expand_dims(list(perm), 0)))
                    text_targets = np.hstack((text_targets, 1))
                    train_idxs.append(len(text_features)-1)
        else:
            train_idxs.append(idx)

    for idx in test_idxs_tmp:
        if idx in text_dep_idxs_tmp:
            feat = text_features[idx]
            # resample_idxs = random.sample(range(6), 4)
            resample_idxs = [0, 1, 4, 5]
            for count, perm in enumerate(itertools.permutations(feat, feat.shape[0])):
                if count in resample_idxs:
                    text_features = np.vstack(
                        (text_features, np.expand_dims(list(perm), 0)))
                    text_targets = np.hstack((text_targets, 1))
                    test_idxs.append(len(text_features)-1)
        else:
            test_idxs.append(idx)

    # `prefix` must have been defined by an earlier cell.
    # NOTE(review): the other cells build paths from BASELINE_DIR - confirm
    # that `prefix` points at the same directory.
    # With 'cuda' disabled the batches stay on the CPU, so the checkpoint is
    # mapped to the CPU as well (it may have been saved from a GPU run).
    text_lstm_model = torch.load(
        os.path.join(
            prefix, 'Model/ClassificationWhole/Text/{}'.format(text_model_paths[idx_i])),
        map_location=None if config['cuda'] else 'cpu')

    # Freshly initialised network. It is not used by the evaluation below;
    # the commented-out alternative was to copy text_lstm_model.state_dict()
    # into it with model.load_state_dict(...) and evaluate `model` instead.
    model = BiLSTM(config['rnn_layers'], config['dropout'], config['num_classes'],
                   config['hidden_dims'], config['embedding_size'])

    p, r, f = evaluate(text_lstm_model, test_idxs)
    ps.append(p)
    rs.append(r)
    fs.append(f)
# Mean of the per-fold metrics.
print('precison: {} \n recall: {} \n f1 score: {}'.format(np.mean(ps), np.mean(rs), np.mean(fs)))

## `TextTraditionalClassifiers.py`
Tries "DecisionTreeClassifier", "LogisticRegression", "SVC", and "RandomForestClassifier" on the same folds to compare against the proposed BiLSTM

In [None]:
from sklearn.model_selection import KFold
import numpy as np
import pandas as pd
import os
import pickle
import random
import itertools
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

In [None]:
# np.load on an .npz archive returns an NpzFile that keeps the file open;
# read the arrays inside `with` blocks so the handles are closed right away.
with np.load(f'{BASELINE_DIR}/Features/TextWhole/whole_samples_clf_avg.npz') as samples_npz:
    text_features = samples_npz['arr_0']
with np.load(f'{BASELINE_DIR}/Features/TextWhole/whole_labels_clf_avg.npz') as labels_npz:
    text_targets = labels_npz['arr_0']

# Indices of the samples labelled 1 and 0 respectively.
text_dep_idxs_tmp = np.where(text_targets == 1)[0]
text_non_idxs = np.where(text_targets == 0)[0]

In [None]:
def model_performance(y_test, y_test_pred_proba):
    """
    Print the confusion matrix for a set of predictions and return it.

    Despite its name, `y_test_pred_proba` is used unchanged as the predicted
    labels (the arg-max step is commented out in the original version).

    Returns the tuple (predicted labels, confusion matrix).
    """
    conf_matrix = standard_confusion_matrix(y_test, y_test_pred_proba)
    # One call, newline-separated: same output as two consecutive prints.
    print("Confusion Matrix:", conf_matrix, sep="\n")
    return y_test_pred_proba, conf_matrix

def standard_confusion_matrix(y_test, y_test_pred):
    """
    Build the binary confusion matrix in the layout [[TP, FP], [FN, TN]].

    y_test      : true labels (0 / 1)
    y_test_pred : predicted labels (0 / 1)
    """
    # Pin the label set: without it sklearn returns a 1x1 matrix when only
    # one class occurs in both inputs, and the 2x2 unpacking below fails.
    [[tn, fp], [fn, tp]] = confusion_matrix(y_test, y_test_pred, labels=[0, 1])
    return np.array([[tp, fp], [fn, tn]])

In [None]:
# Training indices saved for each of the three folds.
train_idxs_tmps = [
    np.load(f'{BASELINE_DIR}/Features/TrainIdx/train_idxs_0.63_1.npy', allow_pickle=True),
    np.load(f'{BASELINE_DIR}/Features/TrainIdx/train_idxs_0.65_2.npy', allow_pickle=True),
    np.load(f'{BASELINE_DIR}/Features/TrainIdx/train_idxs_0.60_3.npy', allow_pickle=True)
]
precs, recs, f1s = [], [], []

for idx_idx, train_idxs_tmp in enumerate(train_idxs_tmps):
    # Every sample that is not in this fold's training split is a test sample.
    test_idxs_tmp = list(set(list(text_dep_idxs_tmp)+list(text_non_idxs)) - set(train_idxs_tmp))
    train_idxs, test_idxs = [], []

    # Depression data augmentation (training split): a positive sample is
    # replaced by the selected permutations of its rows, which are appended
    # to text_features; negative samples are kept as they are.
    for idx in train_idxs_tmp:
        if idx not in text_dep_idxs_tmp:
            train_idxs.append(idx)
            continue
        feat = text_features[idx]
        resample_idxs = [0,1,2,3,4,5]
        for count, i in enumerate(itertools.permutations(feat, feat.shape[0])):
            if count not in resample_idxs:
                continue
            text_features = np.vstack((text_features, np.expand_dims(list(i), 0)))
            text_targets = np.hstack((text_targets, 1))
            train_idxs.append(len(text_features)-1)

    # Same augmentation for the test split, with a smaller permutation subset.
    for idx in test_idxs_tmp:
        if idx not in text_dep_idxs_tmp:
            test_idxs.append(idx)
            continue
        feat = text_features[idx]
        # resample_idxs = random.sample(range(6), 4)
        resample_idxs = [0,1,4,5]
        for count, i in enumerate(itertools.permutations(feat, feat.shape[0])):
            if count not in resample_idxs:
                continue
            text_features = np.vstack((text_features, np.expand_dims(list(i), 0)))
            text_targets = np.hstack((text_targets, 1))
            test_idxs.append(len(text_features)-1)
    # train_idxs = train_idxs_tmp
    # test_idxs = test_idxs_tmp

    X_train, Y_train = text_features[train_idxs], text_targets[train_idxs]
    X_test, Y_test = text_features[test_idxs], text_targets[test_idxs]

    # Pick exactly one classifier; the alternatives are left commented out.
    # Decision Tree
    from sklearn import tree
    clf = tree.DecisionTreeClassifier(max_depth=20)

    # svm
    # from sklearn.svm import SVC
    # clf = SVC(kernel='rbf', gamma='auto')

    # rf
    # from sklearn.ensemble import RandomForestClassifier
    # clf = RandomForestClassifier(n_estimators=10, max_depth=20)

    # lr
    # from sklearn.linear_model import LogisticRegression
    # clf = LogisticRegression()

    # Each sample is flattened to a single feature vector.
    clf.fit([f.flatten() for f in X_train], Y_train)
    pred = clf.predict([f.flatten() for f in X_test])
    # clf.fit([f.sum(axis=0) for f in X_train], Y_train)
    # pred = clf.predict([f.sum(axis=0) for f in X_test])

    y_test_pred, conf_matrix = model_performance(Y_test, pred)

    # custom evaluation metrics; conf_matrix layout is [[TP, FP], [FN, TN]]
    print('Calculating additional test metrics...')
    tp, fp = conf_matrix[0][0], conf_matrix[0][1]
    fn, tn = conf_matrix[1][0], conf_matrix[1][1]
    accuracy = float(tp + tn) / np.sum(conf_matrix)
    precision = float(tp) / (tp + fp)
    recall = float(tp) / (tp + fn)
    f1_score = 2 * (precision * recall) / (precision + recall)
    print("Accuracy: {}".format(accuracy))
    print("Precision: {}".format(precision))
    print("Recall: {}".format(recall))
    print("F1-Score: {}\n".format(f1_score))
    print('='*89)
    # precs.append(0 if np.isnan(precision) else precision)
    # recs.append(0 if np.isnan(recall) else recall)
    # f1s.append(0 if np.isnan(f1_score) else f1_score)
    precs.append(precision)
    recs.append(recall)
    f1s.append(f1_score)
# Mean precision, recall and F1 over the folds.
print(np.mean(precs), np.mean(recs), np.mean(f1s))

# Fusion

## `fuse_net_whole.py`
Fuses the two modalities: using the saved 3-fold indices, it combines the textual "BiLSTM" features with the audio "BiLSTM-GRU-VLAD" features and their corresponding saved models.

In [None]:
%%capture
!pip install python_speech_features

In [None]:
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.nn import functional as F
import torch.optim as optim
from sklearn.metrics import confusion_matrix
import numpy as np
import pandas as pd
import wave
import librosa
from python_speech_features import *
import re
# from allennlp.commands.elmo import ElmoEmbedder
import os
import tensorflow.compat.v1 as tf
import itertools

In [None]:

# np.load on an .npz archive returns an NpzFile that keeps the file open;
# read the arrays inside `with` blocks so the handles are closed right away.
with np.load(f'{BASELINE_DIR}/Features/TextWhole/whole_samples_clf_avg.npz') as text_samples_npz:
    text_features = text_samples_npz['arr_0']
with np.load(f'{BASELINE_DIR}/Features/TextWhole/whole_labels_clf_avg.npz') as text_labels_npz:
    text_targets = text_labels_npz['arr_0']

with np.load(f'{BASELINE_DIR}/Features/AudioWhole/whole_samples_clf_256.npz') as audio_samples_npz:
    # Drop the singleton axis 2 of the stored audio features.
    audio_features = np.squeeze(audio_samples_npz['arr_0'], axis=2)
with np.load(f'{BASELINE_DIR}/Features/AudioWhole/whole_labels_clf_256.npz') as audio_labels_npz:
    audio_targets = audio_labels_npz['arr_0']

# One [audio, text] pair per sample; the text labels serve as fusion labels.
fuse_features = [[audio_features[i], text_features[i]] for i in range(text_features.shape[0])]
fuse_targets = text_targets

# Indices of the samples labelled 1 and 0 respectively.
fuse_dep_idxs = np.where(text_targets == 1)[0]
fuse_non_idxs = np.where(text_targets == 0)[0]

In [None]:
def save(model, filename):
    """Serialise `model` with torch.save to '<filename>.pt' and report the path."""
    target_path = '{}.pt'.format(filename)
    torch.save(model, target_path)
    print('Saved as %s' % target_path)

def standard_confusion_matrix(y_test, y_test_pred):
    """
    Make confusion matrix with format:
                  -----------
                  | TP | FP |
                  -----------
                  | FN | TN |
                  -----------
    Parameters
    ----------
    y_test : true binary labels (0 / 1), 1D
    y_test_pred : predicted binary labels (0 / 1)

    Returns
    -------
    ndarray - 2D
    """
    # Pin the label set: without it sklearn returns a 1x1 matrix when only
    # one class occurs in both inputs, and the 2x2 unpacking below fails.
    [[tn, fp], [fn, tp]] = confusion_matrix(y_test, y_test_pred, labels=[0, 1])
    return np.array([[tp, fp], [fn, tn]])

def model_performance(y_test, y_test_pred_proba):
    """
    Print the confusion matrix for a set of predictions and return it.

    Despite its name, `y_test_pred_proba` is used unchanged as the predicted
    labels (the arg-max step is commented out in the original version).

    Returns the tuple (predicted labels, confusion matrix).
    """
    conf_matrix = standard_confusion_matrix(y_test, y_test_pred_proba)
    # One call, newline-separated: same output as two consecutive prints.
    print("Confusion Matrix:", conf_matrix, sep="\n")
    return y_test_pred_proba, conf_matrix

In [None]:
class TextBiLSTM(nn.Module):
    """
    LSTM text classifier with attention pooling over the time steps.

    Hyper-parameters are read from the `config` dict: 'num_classes',
    'learning_rate', 'dropout', 'hidden_dims', 'rnn_layers',
    'embedding_size' and 'bidirectional'.
    """

    def __init__(self, config):
        super().__init__()
        self.num_classes = config['num_classes']
        self.learning_rate = config['learning_rate']
        self.dropout = config['dropout']
        self.hidden_dims = config['hidden_dims']
        self.rnn_layers = config['rnn_layers']
        self.embedding_size = config['embedding_size']
        self.bidirectional = config['bidirectional']

        # Create the layers first, then overwrite their default initialisation.
        self.build_model()
        self.init_weight()

    def init_weight(self):
        """Set every bias to zero and Xavier-initialise every weight."""
        for param_name, tensor in self.named_parameters():
            if 'bias' in param_name:
                nn.init.constant_(tensor, 0.0)
                continue
            if 'weight' in param_name:
                nn.init.xavier_uniform_(tensor)

    def build_model(self):
        """Create the attention layer, the LSTM and the classification head."""
        # attention layer
        self.attention_layer = nn.Sequential(
            nn.Linear(self.hidden_dims, self.hidden_dims),
            nn.ReLU(inplace=True),
        )

        # multi-layer LSTM (sequence-first input, since batch_first is not set)
        self.lstm_net = nn.LSTM(
            self.embedding_size,
            self.hidden_dims,
            num_layers=self.rnn_layers,
            dropout=self.dropout,
            bidirectional=self.bidirectional,
        )

        # fully connected head ending in a softmax over the classes
        self.fc_out = nn.Sequential(
            nn.Dropout(self.dropout),
            nn.Linear(self.hidden_dims, self.hidden_dims),
            nn.ReLU(),
            nn.Dropout(self.dropout),
            nn.Linear(self.hidden_dims, self.num_classes),
            nn.Softmax(dim=1),
        )

    def attention_net_with_w(self, lstm_out, lstm_hidden):
        '''
        Pool the LSTM outputs over time, using the final hidden state as
        the attention query.

        :param lstm_out:    [batch_size, len_seq, n_hidden * 2]
        :param lstm_hidden: [batch_size, num_layers * num_directions, n_hidden]
        :return: [batch_size, n_hidden]
        '''
        # Add the two halves of the last axis: [batch_size, len_seq, n_hidden]
        halves = torch.chunk(lstm_out, 2, dim=-1)
        summed_out = halves[0] + halves[1]
        # One query vector per sample: [batch_size, 1, n_hidden]
        query = torch.sum(lstm_hidden, dim=1).unsqueeze(1)
        query = self.attention_layer(query)
        # Score each time step against the query: [batch_size, 1, len_seq]
        scores = torch.bmm(query, torch.tanh(summed_out).transpose(1, 2))
        weights = F.softmax(scores, dim=-1)
        # Attention-weighted sum of the time steps: [batch_size, n_hidden]
        return torch.bmm(weights, summed_out).squeeze(1)

    def forward(self, x):
        # Swap the first two axes so the LSTM receives sequence-first input:
        # [len_seq, batch_size, embedding_dim]
        seq_major = x.permute(1, 0, 2)
        lstm_out, (hidden, _cell) = self.lstm_net(seq_major)
        # Swap back so that axis 0 is the batch again:
        #   lstm_out : [batch_size, len_seq, n_hidden * 2]
        #   hidden   : [batch_size, num_layers * num_directions, n_hidden]
        pooled = self.attention_net_with_w(
            lstm_out.permute(1, 0, 2), hidden.permute(1, 0, 2))
        return self.fc_out(pooled)

class AudioBiLSTM(nn.Module):
    """
    Audio classifier: layer norm, GRU, mean over time, fully connected head.

    Hyper-parameters are read from the `config` dict: 'num_classes',
    'learning_rate', 'dropout', 'hidden_dims', 'rnn_layers',
    'embedding_size' and 'bidirectional'. The recurrent layer is a
    unidirectional GRU; 'bidirectional' is stored but not passed to it.
    """

    def __init__(self, config):
        super().__init__()
        self.num_classes = config['num_classes']
        self.learning_rate = config['learning_rate']
        self.dropout = config['dropout']
        self.hidden_dims = config['hidden_dims']
        self.rnn_layers = config['rnn_layers']
        self.embedding_size = config['embedding_size']
        self.bidirectional = config['bidirectional']

        # Custom initialisation is disabled; PyTorch defaults are kept.
        self.build_model()
        # self.init_weight()

    def init_weight(self):
        """Zero biases and Xavier-initialise weights, skipping names containing 'ln'."""
        for param_name, tensor in self.named_parameters():
            if 'ln' in param_name:
                continue
            if 'bias' in param_name:
                nn.init.constant_(tensor, 0.0)
            elif 'weight' in param_name:
                nn.init.xavier_uniform_(tensor)

    def build_model(self):
        """Create the attention layer, the GRU, the layer norm and the head."""
        # attention layer (only used by attention_net_with_w, which forward
        # does not call)
        self.attention_layer = nn.Sequential(
            nn.Linear(self.hidden_dims, self.hidden_dims),
            nn.ReLU(inplace=True),
        )

        # batch-first GRU; an earlier bidirectional LSTM variant was replaced by it
        self.lstm_net_audio = nn.GRU(
            self.embedding_size,
            self.hidden_dims,
            num_layers=self.rnn_layers,
            dropout=self.dropout,
            batch_first=True,
        )

        self.ln = nn.LayerNorm(self.embedding_size)

        # fully connected head ending in a softmax over the classes
        self.fc_audio = nn.Sequential(
            nn.Dropout(self.dropout),
            nn.Linear(self.hidden_dims, self.hidden_dims),
            nn.ReLU(),
            nn.Dropout(self.dropout),
            nn.Linear(self.hidden_dims, self.num_classes),
            nn.Softmax(dim=1),
        )

    def attention_net_with_w(self, lstm_out, lstm_hidden):
        '''
        Pool recurrent outputs over time, using the final hidden state as
        the attention query.

        :param lstm_out:    [batch_size, len_seq, n_hidden * 2]
        :param lstm_hidden: [batch_size, num_layers * num_directions, n_hidden]
        :return: [batch_size, n_hidden]
        '''
        # Add the two halves of the last axis: [batch_size, len_seq, n_hidden]
        halves = torch.chunk(lstm_out, 2, dim=-1)
        summed_out = halves[0] + halves[1]
        # One query vector per sample: [batch_size, 1, n_hidden]
        query = torch.sum(lstm_hidden, dim=1).unsqueeze(1)
        query = self.attention_layer(query)
        # Score each time step against the query: [batch_size, 1, len_seq]
        scores = torch.bmm(query, torch.tanh(summed_out).transpose(1, 2))
        weights = F.softmax(scores, dim=-1)
        # Attention-weighted sum of the time steps: [batch_size, n_hidden]
        return torch.bmm(weights, summed_out).squeeze(1)

    def forward(self, x):
        # Normalise the input features, run the GRU, then average its
        # outputs over the time axis before the classification head.
        gru_out, _ = self.lstm_net_audio(self.ln(x))
        return self.fc_audio(gru_out.mean(dim=1))

class fusion_net(nn.Module):
    """
    Fusion network holding a text branch (LSTM + attention), an audio branch
    (layer norm + GRU) and a final linear + softmax layer over the
    concatenated branch features.

    `pretrained_feature` extracts the two branch features without gradients;
    `forward` only applies the final layer to an already concatenated input.
    """

    def __init__(self, text_embed_size, text_hidden_dims, rnn_layers, dropout,
                 num_classes, audio_hidden_dims, audio_embed_size):
        super().__init__()
        self.text_embed_size = text_embed_size
        self.audio_embed_size = audio_embed_size
        self.text_hidden_dims = text_hidden_dims
        self.audio_hidden_dims = audio_hidden_dims
        self.rnn_layers = rnn_layers
        self.dropout = dropout
        self.num_classes = num_classes

        # ------------------------------ text branch ------------------------------
        # attention layer
        self.attention_layer = nn.Sequential(
            nn.Linear(self.text_hidden_dims, self.text_hidden_dims),
            nn.ReLU(inplace=True),
        )
        # bidirectional multi-layer LSTM (sequence-first input)
        self.lstm_net = nn.LSTM(
            self.text_embed_size,
            self.text_hidden_dims,
            num_layers=self.rnn_layers,
            dropout=self.dropout,
            bidirectional=True,
        )
        # fully connected layer; unlike the standalone text model there is
        # no class projection here, the output stays at text_hidden_dims
        self.fc_out = nn.Sequential(
            nn.Dropout(self.dropout),
            nn.Linear(self.text_hidden_dims, self.text_hidden_dims),
            nn.ReLU(),
            nn.Dropout(self.dropout),
        )

        # ------------------------------ audio branch -----------------------------
        self.lstm_net_audio = nn.GRU(
            self.audio_embed_size,
            self.audio_hidden_dims,
            num_layers=self.rnn_layers,
            dropout=self.dropout,
            bidirectional=False,
            batch_first=True,
        )
        self.fc_audio = nn.Sequential(
            nn.Dropout(self.dropout),
            nn.Linear(self.audio_hidden_dims, self.audio_hidden_dims),
            nn.ReLU(),
            nn.Dropout(self.dropout),
        )
        self.ln = nn.LayerNorm(self.audio_embed_size)

        # ------------------------------ fusion layers ----------------------------
        fused_dims = self.text_hidden_dims + self.audio_hidden_dims
        # modal attention; created but not applied in forward (its use is
        # commented out there)
        self.modal_attn = nn.Linear(fused_dims, fused_dims, bias=False)
        self.fc_final = nn.Sequential(
            nn.Linear(fused_dims, self.num_classes, bias=False),
            nn.Softmax(dim=1),
        )

    def attention_net_with_w(self, lstm_out, lstm_hidden):
        '''
        Pool the LSTM outputs over time, using the final hidden state as
        the attention query.

        :param lstm_out:    [batch_size, len_seq, n_hidden * 2]
        :param lstm_hidden: [batch_size, num_layers * num_directions, n_hidden]
        :return: [batch_size, n_hidden]
        '''
        # Add the two halves of the last axis: [batch_size, len_seq, n_hidden]
        halves = torch.chunk(lstm_out, 2, dim=-1)
        summed_out = halves[0] + halves[1]
        # One query vector per sample: [batch_size, 1, n_hidden]
        query = torch.sum(lstm_hidden, dim=1).unsqueeze(1)
        query = self.attention_layer(query)
        # Score each time step against the query: [batch_size, 1, len_seq]
        scores = torch.bmm(query, torch.tanh(summed_out).transpose(1, 2))
        weights = F.softmax(scores, dim=-1)
        # Attention-weighted sum of the time steps: [batch_size, n_hidden]
        return torch.bmm(weights, summed_out).squeeze(1)

    def pretrained_feature(self, x):
        """
        Compute the text and audio branch features for a batch.

        :param x: sequence of samples; element [0] of each sample is the
                  audio input and element [1] the text input
        :return: (text_feature, audio_feature), both computed under
                 torch.no_grad()
        """
        with torch.no_grad():
            text_batch, audio_batch = [], []
            for sample in x:
                text_batch.append(sample[1])
                audio_batch.append(sample[0])
            # torch.tensor already yields tensors that do not require grad,
            # so no Variable wrapper is needed.
            x_text = torch.tensor(text_batch).type(torch.FloatTensor)
            x_audio = torch.tensor(audio_batch).type(torch.FloatTensor)

            # ---- text branch ----
            # sequence-first for the LSTM: [len_seq, batch_size, embedding_dim]
            lstm_out, (hidden, _) = self.lstm_net(x_text.permute(1, 0, 2))
            # back to batch-first:
            #   lstm_out : [batch_size, len_seq, n_hidden * 2]
            #   hidden   : [batch_size, num_layers * num_directions, n_hidden]
            pooled = self.attention_net_with_w(
                lstm_out.permute(1, 0, 2), hidden.permute(1, 0, 2))
            text_feature = self.fc_out(pooled)

            # ---- audio branch ----
            gru_out, _ = self.lstm_net_audio(self.ln(x_audio))
            # NOTE(review): the GRU outputs are summed over time here, whereas
            # AudioBiLSTM.forward averages them - confirm this is intended.
            audio_feature = self.fc_audio(gru_out.sum(dim=1))

        return (text_feature, audio_feature)

    def forward(self, x):
        # x is the concatenated branch feature vector; batch norm and modal
        # attention were tried here and are currently disabled.
        return self.fc_final(x)

class MyLoss(nn.Module):
    """
    Sum of two cross-entropy losses, one per modality.

    The weight matrix of the model's final linear layer is split column-wise
    into a text part and an audio part; each part scores its own modality's
    features, and both scores are compared against the same targets.
    """

    def __init__(self):
        super(MyLoss, self).__init__()

    def forward(self, text_feature, audio_feature, target, model):
        """
        :param text_feature:  [batch_size, text_dims] text branch features
        :param audio_feature: [batch_size, audio_dims] audio branch features
        :param target:        class indices (sequence or tensor)
        :param model:         network whose `fc_final[0].weight` has
                              text_dims + audio_dims columns, text first
        :return: scalar loss tensor
        """
        weight = model.fc_final[0].weight
        # Split at the width of the text features instead of reading a
        # module-level config: the text columns come first because the
        # features are concatenated as (text, audio).
        text_dims = text_feature.shape[-1]
        pred_text = F.linear(text_feature, weight[:, :text_dims])
        pred_audio = F.linear(audio_feature, weight[:, text_dims:])
        # as_tensor accepts lists and tensors alike, without the copy (and
        # warning) torch.tensor produces for tensor input.
        target = torch.as_tensor(target, device=pred_text.device)
        # A norm penalty on both weight parts (scaled by a 'lambda' setting)
        # and a BCE-with-logits variant were tried and are disabled.
        return F.cross_entropy(pred_text, target) + F.cross_entropy(pred_audio, target)

In [None]:
# Hyper-parameters for the fusion experiment (module-level, read by the
# model construction, training and evaluation code below).
config = {
    'num_classes': 2,          # output size of the final classification layer
    'dropout': 0.3,            # dropout probability passed to fusion_net
    'rnn_layers': 2,           # num_layers for both the text LSTM and the audio GRU
    'audio_embed_size': 256,   # input feature size of the audio GRU
    'text_embed_size': 1024,   # input feature size of the text LSTM
    'batch_size': 2,           # samples per batch in train() / evaluate()
    'epochs': 100,             # presumably the number of training epochs; the loop is not shown here
    'learning_rate': 8e-6,     # learning rate passed to the Adam optimizer
    'audio_hidden_dims': 256,  # hidden size of the audio branch
    'text_hidden_dims': 128,   # hidden size of the text branch
    'cuda': False,             # when True, train() / evaluate() try to move batches to the GPU
    'lambda': 1e-5,            # regularisation weight; no active code visible here reads it - TODO confirm
}

In [None]:
# Build the fusion network from the shared hyper-parameters.
model = fusion_net(
    text_embed_size=config['text_embed_size'],
    text_hidden_dims=config['text_hidden_dims'],
    rnn_layers=config['rnn_layers'],
    dropout=config['dropout'],
    num_classes=config['num_classes'],
    audio_hidden_dims=config['audio_hidden_dims'],
    audio_embed_size=config['audio_embed_size'],
)

# Adam over all model parameters, with the per-modality loss as criterion
# (plain nn.CrossEntropyLoss on the fused output was the alternative).
optimizer = optim.Adam(model.parameters(), lr=config['learning_rate'])
criterion = MyLoss()

In [None]:
def train(epoch, train_idxs):
    """
    Run one training epoch over the samples selected by `train_idxs`.

    Uses the module-level `model`, `optimizer`, `criterion`, `config`,
    `fuse_features` and `fuse_targets`, and stores the number of correct
    predictions in the globals `max_train_acc` and `train_acc`.

    :param epoch:      epoch number, only used in the progress message
    :param train_idxs: indices into `fuse_features` / `fuse_targets`
    """
    global max_train_acc, train_acc
    model.train()
    total_loss = 0
    correct = 0
    X_train = [fuse_features[idx] for idx in train_idxs]
    Y_train = [fuse_targets[idx] for idx in train_idxs]
    batch_size = config['batch_size']
    for start in range(0, len(X_train), batch_size):
        # List slicing clamps at the end, so the final (possibly shorter)
        # batch needs no special case.
        x = X_train[start:start + batch_size]
        y = Y_train[start:start + batch_size]
        if config['cuda']:
            # NOTE(review): x and y are Python lists here, which
            # torch.from_numpy does not accept - confirm the CUDA path.
            x, y = Variable(torch.from_numpy(x).type(torch.FloatTensor), requires_grad=True).cuda(), Variable(torch.from_numpy(y)).cuda()
        # reset the parameter gradients
        optimizer.zero_grad()
        text_feature, audio_feature = model.pretrained_feature(x)
        # text features first, then audio features
        concat_x = torch.cat((text_feature, audio_feature), dim=1)
        output = model(concat_x)
        # predicted class = index of the largest output per sample
        pred = output.data.max(1, keepdim=True)[1]
        correct += pred.eq(torch.tensor(y).data.view_as(pred)).cpu().sum()
        # the loss is computed from the branch features, not from `output`
        loss = criterion(text_feature, audio_feature, y, model)
        # back-propagate, then update the parameters from the gradients
        loss.backward()
        optimizer.step()
        # loss.item() extracts the scalar value from the loss tensor
        total_loss += loss.item()
    max_train_acc = correct
    train_acc = correct
    print('Train Epoch: {:2d}\t Learning rate: {:.4f}\tLoss: {:.6f}\t Accuracy: {}/{} ({:.0f}%)\n '.format(
                epoch, config['learning_rate'], total_loss/len(X_train), correct, len(X_train),
        100. * correct / len(X_train)))


def evaluate(model, test_idxs, fold, train_idxs):
    """Evaluate ``model`` on ``test_idxs`` and checkpoint it when it improves.

    Reads the module-level ``config``, ``criterion``, ``fuse_features``,
    ``fuse_targets`` and ``max_train_acc``. When the F1 score beats the
    module-level ``max_f1``, exceeds 0.61, and ``max_train_acc`` is at least
    90% of ``len(train_idxs)``, the model is saved and ``max_f1`` / ``max_acc``
    are updated.

    Args:
        model: Fusion model exposing ``pretrained_feature`` and ``eval``.
        test_idxs: Indices into ``fuse_features`` / ``fuse_targets`` to test on.
        fold: Fold identifier; only used in the checkpoint file name.
        train_idxs: Training indices; only their count is used, for the
            training-accuracy threshold.

    Returns:
        The loss summed over all test batches.
    """
    global max_train_acc, max_acc, max_f1
    model.eval()
    batch_size = config['batch_size']
    total_loss = 0
    X_test = [fuse_features[idx] for idx in test_idxs]
    Y_test = [fuse_targets[idx] for idx in test_idxs]
    batch_preds = []
    for start in range(0, len(X_test), batch_size):
        # List slicing clamps at the end, so the last short batch is handled.
        x = X_test[start:start + batch_size]
        y = Y_test[start:start + batch_size]
        if config['cuda']:
            # NOTE(review): x and y are Python lists here, while
            # torch.from_numpy expects a numpy.ndarray - this branch looks
            # like it would raise when config['cuda'] is true. Confirm before
            # enabling CUDA.
            x, y = Variable(torch.from_numpy(x).type(torch.FloatTensor), requires_grad=True).cuda(), Variable(torch.from_numpy(y)).cuda()
        text_feature, audio_feature = model.pretrained_feature(x)
        with torch.no_grad():
            # Fuse the two modalities by concatenation, text features first.
            concat_x = torch.cat((text_feature, audio_feature), dim=1)
            output = model(concat_x)
        loss = criterion(text_feature, audio_feature, y, model)
        batch_preds.append(output.data.max(1, keepdim=True)[1])
        total_loss += loss.item()

    # Collect per-batch predictions in a list and concatenate once, rather
    # than seeding the result with an uninitialised placeholder tensor.
    if batch_preds:
        pred = torch.cat(batch_preds)
    else:
        pred = torch.empty(0, 1).type(torch.LongTensor)

    _, conf_matrix = model_performance(Y_test, pred)

    # Guard the average so an empty test set does not raise ZeroDivisionError.
    print('\nTest set: Average loss: {:.4f}'.format(total_loss/max(len(X_test), 1)))
    # custom evaluation metrics
    # NOTE(review): the formulas below assume model_performance lays the
    # confusion matrix out as [[TP, FP], [FN, TN]] - TODO confirm.
    print('Calculating additional test metrics...')
    accuracy = float(conf_matrix[0][0] + conf_matrix[1][1]) / np.sum(conf_matrix)
    precision = float(conf_matrix[0][0]) / (conf_matrix[0][0] + conf_matrix[0][1])
    recall = float(conf_matrix[0][0]) / (conf_matrix[0][0] + conf_matrix[1][0])
    f1_score = 2 * (precision * recall) / (precision + recall)
    print("Accuracy: {}".format(accuracy))
    print("Precision: {}".format(precision))
    print("Recall: {}".format(recall))
    print("F1-Score: {}\n".format(f1_score))
    print('='*89)

    # Checkpoint only when the F1 score improves, clears the 0.61 floor, and
    # the last training epoch got at least 90% of the training set right.
    if max_f1 < f1_score and max_train_acc >= len(train_idxs)*0.9 and f1_score > 0.61:
        max_f1 = f1_score
        max_acc = accuracy
        save(model, f'{BASELINE_DIR}/Model/ClassificationWhole/Fuse/fuse_{max_f1:.2f}_{fold}')
        print('*'*64)
        print('model saved: f1: {}\tacc: {}'.format(max_f1, max_acc))
        print('*'*64)
    return total_loss

In [None]:
# Per-fold artefacts: saved training indices and the pretrained unimodal
# models whose weights initialise the fusion model.
idxs_paths = ['train_idxs_0.63_1.npy', 'train_idxs_0.65_2.npy', 'train_idxs_0.60_3.npy']
text_model_paths = ['BiLSTM_128_0.64_1.pt', 'BiLSTM_128_0.66_2.pt', 'BiLSTM_128_0.62_3.pt']
audio_model_paths = ['BiLSTM_gru_vlad256_256_0.67_1.pt', 'BiLSTM_gru_vlad256_256_0.67_2.pt', 'BiLSTM_gru_vlad256_256_0.63_3.pt']

# Parameters copied from the pretrained audio model into the fusion model.
AUDIO_BRANCH_KEYS = (
    'lstm_net_audio.weight_ih_l0',
    'lstm_net_audio.weight_hh_l0',
    'lstm_net_audio.bias_ih_l0',
    'lstm_net_audio.bias_hh_l0',
    'lstm_net_audio.weight_ih_l1',
    'lstm_net_audio.weight_hh_l1',
    'lstm_net_audio.bias_ih_l1',
    'lstm_net_audio.bias_hh_l1',
    'fc_audio.1.weight',
    'fc_audio.1.bias',
    'fc_audio.4.weight',
    'fc_audio.4.bias',
    'ln.weight',
    'ln.bias',
)

# A set gives constant-time membership tests instead of scanning the index
# array once per sample.
dep_idx_set = set(fuse_dep_idxs)

for fold, (idxs_path, text_model_path, audio_model_path) in enumerate(
        zip(idxs_paths, text_model_paths, audio_model_paths), start=1):
    train_idxs_tmp = np.load(f'{BASELINE_DIR}/Features/TrainIdx/{idxs_path}', allow_pickle=True)
    # Every labelled sample that is not in the training split is a test sample.
    test_idxs_tmp = list(set(list(fuse_dep_idxs)+list(fuse_non_idxs)) - set(train_idxs_tmp))

    train_idxs, test_idxs = [], []

    # Depression data augmentation: a depressed sample is replaced by copies
    # whose audio and text segments are reordered in lockstep (length-3
    # permutations). The permutations at the positions listed in
    # resample_idxs are appended as new samples labelled 1; non-depressed
    # samples are kept as they are.
    resample_idxs = list(range(6))
    for idx in train_idxs_tmp:
        if idx not in dep_idx_set:
            train_idxs.append(idx)
            continue
        feat = fuse_features[idx]
        paired_perms = zip(itertools.permutations(feat[0], 3),
                           itertools.permutations(feat[1], 3))
        for count, fuse_perm in enumerate(paired_perms):
            if count in resample_idxs:
                fuse_features.append(fuse_perm)
                fuse_targets = np.hstack((fuse_targets, 1))
                train_idxs.append(len(fuse_features)-1)

    # The test split uses a smaller subset of the permutations.
    resample_idxs = [0, 1, 4, 5]
    for idx in test_idxs_tmp:
        if idx not in dep_idx_set:
            test_idxs.append(idx)
            continue
        feat = fuse_features[idx]
        paired_perms = zip(itertools.permutations(feat[0], 3),
                           itertools.permutations(feat[1], 3))
        for count, fuse_perm in enumerate(paired_perms):
            if count in resample_idxs:
                fuse_features.append(fuse_perm)
                fuse_targets = np.hstack((fuse_targets, 1))
                test_idxs.append(len(fuse_features)-1)

    # NOTE(review): torch.load unpickles whole model objects here; recent
    # PyTorch releases default to weights_only=True, which may reject these
    # files - confirm against the installed version.
    text_lstm_model = torch.load(f'{BASELINE_DIR}/Model/ClassificationWhole/Text/{text_model_path}')
    audio_lstm_model = torch.load(f'{BASELINE_DIR}/Model/ClassificationWhole/Audio/{audio_model_path}')

    # Take only the selected audio parameters; state_dict() is built once
    # instead of once per key.
    audio_state = audio_lstm_model.state_dict()
    model_state_dict = {key: audio_state[key] for key in AUDIO_BRANCH_KEYS}

    # strict=False: each load fills in only the keys it provides. The audio
    # subset is loaded second, so it wins for any key both provide.
    model.load_state_dict(text_lstm_model.state_dict(), strict=False)
    model.load_state_dict(model_state_dict, strict=False)

    # Freeze everything, then unfreeze only the weight of the first fc_final
    # layer.
    # NOTE(review): `model` is not re-created per fold, so unless the loaded
    # state dicts also contain fc_final, the weights learned in the previous
    # fold carry over into this one - confirm this is intended.
    for param in model.parameters():
        param.requires_grad = False

    model.fc_final[0].weight.requires_grad = True

    # Reset the best-so-far trackers that train() / evaluate() read and write.
    max_f1 = -1
    max_acc = -1
    max_train_acc = -1

    # NOTE(review): the range starts at 1, so this runs config['epochs'] - 1
    # epochs - confirm that is intended.
    for ep in range(1, config['epochs']):
        train(ep, train_idxs)
        tloss = evaluate(model, test_idxs, fold, train_idxs)

## `FuseModelChecking.py`
Testing module that evaluates all the stored models and features together.

In [None]:
# from fuse_net_whole import fusion_net, config, model_performance
import os
import numpy as np
import torch
from torch.autograd import Variable
import itertools

In [None]:
# Per-fold artefacts produced by the training cells.
idxs_paths = ['train_idxs_0.63_1.npy', 'train_idxs_0.65_2.npy', 'train_idxs_0.60_3.npy']
text_model_paths = ['BiLSTM_128_0.67_1.pt', 'BiLSTM_128_0.66_2.pt', 'BiLSTM_128_0.66_3.pt']
audio_model_paths = ['BiLSTM_gru_vlad256_256_0.63_1.pt', 'BiLSTM_gru_vlad256_256_0.65_2.pt', 'BiLSTM_gru_vlad256_256_0.60_3.pt']
fuse_model_paths = ['fuse_0.69_1.pt', 'fuse_0.68_2.pt', 'fuse_0.62_3.pt']


def _load_npz_array(path):
    """Return the ``arr_0`` array stored in the ``.npz`` archive at ``path``."""
    # np.load returns an NpzFile that keeps the archive open until it is
    # closed; the context manager releases it as soon as the array is read.
    with np.load(path) as archive:
        return archive['arr_0']


text_features = _load_npz_array(f'{BASELINE_DIR}/Features/TextWhole/whole_samples_clf_avg.npz')
text_targets = _load_npz_array(f'{BASELINE_DIR}/Features/TextWhole/whole_labels_clf_avg.npz')
# Drop the singleton third axis of the stored audio features.
audio_features = np.squeeze(_load_npz_array(f'{BASELINE_DIR}/Features/AudioWhole/whole_samples_clf_256.npz'), axis=2)
audio_targets = _load_npz_array(f'{BASELINE_DIR}/Features/AudioWhole/whole_labels_clf_256.npz')

# Each fused sample is an [audio, text] pair taken at the same index.
fuse_features = [[audio_features[i], text_features[i]] for i in range(text_features.shape[0])]
# NOTE(review): the fused labels come from the text targets only; presumably
# audio_targets holds the same labels in the same order - verify.
fuse_targets = text_targets

# Indices of the samples labelled 1 and 0 respectively.
fuse_dep_idxs = np.where(text_targets == 1)[0]
fuse_non_idxs = np.where(text_targets == 0)[0]

In [None]:
def evaluate(model, test_idxs):
    """Evaluate a stored fusion model on ``test_idxs`` and print its metrics.

    Reads the module-level ``config``, ``fuse_features`` and ``fuse_targets``.

    Args:
        model: Fusion model exposing ``pretrained_feature`` and ``eval``.
        test_idxs: Indices into ``fuse_features`` / ``fuse_targets`` to test on.

    Returns:
        A ``(precision, recall, f1_score)`` tuple.
    """
    model.eval()
    batch_size = config['batch_size']
    X_test = [fuse_features[idx] for idx in test_idxs]
    Y_test = [fuse_targets[idx] for idx in test_idxs]
    batch_preds = []
    for start in range(0, len(X_test), batch_size):
        # List slicing clamps at the end, so the last short batch is handled.
        x = X_test[start:start + batch_size]
        y = Y_test[start:start + batch_size]
        if config['cuda']:
            # NOTE(review): x and y are Python lists here, while
            # torch.from_numpy expects a numpy.ndarray - this branch looks
            # like it would raise when config['cuda'] is true. Confirm before
            # enabling CUDA.
            x, y = Variable(torch.from_numpy(x).type(torch.FloatTensor), requires_grad=True).cuda(), Variable(torch.from_numpy(y)).cuda()
        text_feature, audio_feature = model.pretrained_feature(x)
        with torch.no_grad():
            # Fuse the two modalities by concatenation, text features first.
            concat_x = torch.cat((text_feature, audio_feature), dim=1)
            output = model(concat_x)
        batch_preds.append(output.data.max(1, keepdim=True)[1])

    # Collect per-batch predictions in a list and concatenate once, rather
    # than seeding the result with an uninitialised placeholder tensor.
    if batch_preds:
        pred = torch.cat(batch_preds)
    else:
        pred = torch.empty(0, 1).type(torch.LongTensor)

    _, conf_matrix = model_performance(Y_test, pred)
    # custom evaluation metrics
    # NOTE(review): the formulas below assume model_performance lays the
    # confusion matrix out as [[TP, FP], [FN, TN]] - TODO confirm.
    print('Calculating additional test metrics...')
    accuracy = float(conf_matrix[0][0] + conf_matrix[1][1]) / np.sum(conf_matrix)
    precision = float(conf_matrix[0][0]) / (conf_matrix[0][0] + conf_matrix[0][1])
    recall = float(conf_matrix[0][0]) / (conf_matrix[0][0] + conf_matrix[1][0])
    f1_score = 2 * (precision * recall) / (precision + recall)
    print("Accuracy: {}".format(accuracy))
    print("Precision: {}".format(precision))
    print("Recall: {}".format(recall))
    print("F1-Score: {}\n".format(f1_score))
    print('='*89)

    return precision, recall, f1_score

In [None]:
# Per-fold precision, recall and F1 of the stored fusion models.
ps, rs, fs = [], [], []

# A set gives constant-time membership tests instead of scanning the index
# array once per sample.
dep_idx_set = set(fuse_dep_idxs)

for idxs_path, fuse_model_path in zip(idxs_paths, fuse_model_paths):
    train_idxs_tmp = np.load(f'{BASELINE_DIR}/Features/TrainIdx/{idxs_path}', allow_pickle=True)
    # Every labelled sample that is not in the training split is a test sample.
    test_idxs_tmp = list(set(list(fuse_dep_idxs)+list(fuse_non_idxs)) - set(train_idxs_tmp))
    train_idxs, test_idxs = [], []

    # Depression data augmentation: a depressed sample is replaced by copies
    # whose audio and text segments are reordered in lockstep (length-3
    # permutations). The permutations at the positions listed in
    # resample_idxs are appended as new samples labelled 1; non-depressed
    # samples are kept as they are.
    # train_idxs is rebuilt here the same way as in training, although only
    # test_idxs is passed to evaluate() below.
    resample_idxs = list(range(6))
    for idx in train_idxs_tmp:
        if idx not in dep_idx_set:
            train_idxs.append(idx)
            continue
        feat = fuse_features[idx]
        paired_perms = zip(itertools.permutations(feat[0], 3),
                           itertools.permutations(feat[1], 3))
        for count, fuse_perm in enumerate(paired_perms):
            if count in resample_idxs:
                fuse_features.append(fuse_perm)
                fuse_targets = np.hstack((fuse_targets, 1))
                train_idxs.append(len(fuse_features)-1)

    # The test split uses a smaller subset of the permutations.
    resample_idxs = [0, 1, 4, 5]
    for idx in test_idxs_tmp:
        if idx not in dep_idx_set:
            test_idxs.append(idx)
            continue
        feat = fuse_features[idx]
        paired_perms = zip(itertools.permutations(feat[0], 3),
                           itertools.permutations(feat[1], 3))
        for count, fuse_perm in enumerate(paired_perms):
            if count in resample_idxs:
                fuse_features.append(fuse_perm)
                fuse_targets = np.hstack((fuse_targets, 1))
                test_idxs.append(len(fuse_features)-1)

    # NOTE(review): torch.load unpickles a whole model object here; recent
    # PyTorch releases default to weights_only=True, which may reject these
    # files - confirm against the installed version.
    fuse_model = torch.load(f'{BASELINE_DIR}/Model/ClassificationWhole/Fuse/{fuse_model_path}')
    p, r, f = evaluate(fuse_model, test_idxs)
    ps.append(p)
    rs.append(r)
    fs.append(f)

# Report the metrics averaged over the folds.
print('precision: {} \n recall: {} \n f1 score: {}'.format(np.mean(ps), np.mean(rs), np.mean(fs)))