In [1]:
import sys
import os
import collections
import csv
import argparse
import random
import re
import emoji
import pickle

import xml.etree.ElementTree as et
import numpy as np
import pandas as pd

#import tensorflow as tf
#import torch
#import transformers

from datetime import datetime
from dateutil.parser import parse
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from load import *

#from transformers import AutoModel, BertTokenizerFast
#from transformers import BertTokenizer, BertModel
#from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
#                              TensorDataset)


In [None]:
# Load the pre-trained 'bert-base-uncased' encoder and its matching fast
# tokenizer; both objects are kept as notebook-level names.
# NOTE(review): the `from transformers import AutoModel, BertTokenizerFast`
# line in the import cell is commented out, so unless `from load import *`
# happens to re-export these names this cell raises NameError on a fresh
# kernel -- confirm before relying on it.
#import bert pre-trained model
bert = AutoModel.from_pretrained('bert-base-uncased')
#load the bert tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')


In [2]:
# Dataset edition; every directory below is derived from it.
year = 2018

# Per-year prefixes shared by the raw and the processed trees.
_raw_year_root = "/projets/sig/mullah/nlp/depression/data/raw/" + str(year)
_processed_year_root = "/projets/sig/mullah/nlp/depression/data/processed/" + str(year)

# Raw input directories (these paths end with a trailing slash).
POS_DIR = _raw_year_root + "/train/positive_examples_anonymous_chunks/"
NEG_DIR = _raw_year_root + "/train/negative_examples_anonymous_chunks/"
TEST_DIR = _raw_year_root + "/test/"

# Processed output directories (no trailing slash).
Processed_Train_DIR = _processed_year_root + "/train"
Processed_Test_DIR = _processed_year_root + "/test"


In [None]:
#loading training data and computing statistics
# Reports, over all training users: the number of users/labels/writings, how
# many writings carry only a title, only a text, both, or neither, and the
# token counts (split on single spaces) of titles and texts.
chunks = [i for i in range(1, 11)]
train_user_data = load_train_data(chunks, NEG_DIR, POS_DIR)
print ("Total users in trained set: {}".format(len(train_user_data)))

#statistics
#users, label, only title, only text, both title + text, no title + text
users = []
labels = []
writings_count = []
total_writings = 0
count_title = 0
count_text = 0
count_title_text = 0
count_no_title_text = 0
count_text_tokens = []
count_title_tokens = []

for idx in range(0, len(train_user_data)):
    user_data = train_user_data[idx]

    users.append(user_data['uid'])
    labels.append(user_data['class'])
    writings = merge_writings(user_data)
    writings_count.append(len(writings))

    for m in writings:
        # Per the notebook's own comments, m[0] is the title and m[2] the text.
        has_title = m[0] != ''
        has_text = m[2] != ''

        # Token lengths are recorded for every non-empty title / text, so a
        # writing with both contributes to both lists and an empty writing
        # contributes to neither (an empty title is not a one-token title).
        if has_title:
            count_title_tokens.append(len(m[0].split(' ')))
        if has_text:
            count_text_tokens.append(len(m[2].split(' ')))

        # Exactly one of the four category counters is bumped per writing.
        if has_title and has_text:
            count_title_text = count_title_text + 1
        elif has_title:
            count_title = count_title + 1
        elif has_text:
            count_text = count_text + 1
        else:
            count_no_title_text = count_no_title_text + 1

total_writings = sum(writings_count)

print ("Total users: {}".format(len(users)))
print ("Total labels: {}".format(len(labels)))
print ("Total writings: {}".format(total_writings))

print ("#only title: {}, #only text: {}, #both title and text: {}, #no titel and text: {}".format(count_title, count_text, count_title_text, count_no_title_text))
# Guard the means so an empty corpus yields nan without a numpy RuntimeWarning.
avg_text_tokens = np.mean(count_text_tokens) if count_text_tokens else float('nan')
print ("Average token in text: {}".format(avg_text_tokens))
# Number of texts longer than 128 tokens.
print (np.sum(np.asarray(count_text_tokens)>128))
avg_title_tokens = np.mean(count_title_tokens) if count_title_tokens else float('nan')
print ("Average token in title: {}".format(avg_title_tokens))


In [None]:
#loading and sampling training data
# Flattens every training user's writings into (label, text) rows and stores
# them as a CSV named 'train_texts' under Processed_Train_DIR.
chunks = [i for i in range(1, 11)]
train_user_data = load_train_data(chunks, NEG_DIR, POS_DIR)
print ("Total users in trained set: {}".format(len(train_user_data)))

labels = []
texts = []

for idx in range(0, len(train_user_data)):
    user_data = train_user_data[idx]

    # Class 'p' is encoded as 1, anything else as 0.
    label = user_data['class']
    label_code = 1 if label == 'p' else 0
    writings = merge_writings(user_data)

    for m in writings:
        # Only writings with a non-empty text (m[2]) are kept; the title
        # (m[0]) is prepended when present.
        # NOTE(review): title-only writings are skipped here, whereas the
        # test-data cell keeps them -- confirm this asymmetry is intended.
        if m[2] != '':
            if m[0] != '':
                text = str(m[0]) + " " + str(m[2]) #title + text
            else:
                text = m[2] #just the text (no title)
            labels.append(label_code)
            texts.append(text)

print ("Total labels: {}".format(len(labels)))
print ("Total texts: {}".format(len(texts)))

data_labels_texts = list(zip(labels, texts))
df_labels_texts = pd.DataFrame(data_labels_texts, columns=["labels", "texts"])
print (df_labels_texts.head())

# Create the output directory on first use so to_csv does not fail on a
# fresh data tree (same guard the test-data cell uses for its chunk folders).
if not os.path.exists(Processed_Train_DIR):
    os.makedirs(Processed_Train_DIR)

trained_datapath = os.path.join(Processed_Train_DIR, 'train_texts')
df_labels_texts.to_csv(trained_datapath, index=False)
print ('Done.')

In [None]:
#loading and preparing testing data
# For every test user and every chunk of that user, writes a CSV of
# (label, text) rows to <Processed_Test_DIR>/chunk <n>/<uid>.csv.
chunks = [i for i in range(1, 11)]
test_user_data = load_test_data(chunks, year, TEST_DIR)
print ("Total users in test set: {}".format(len(test_user_data)))

for idx in range(0, len(test_user_data)):
    user_data = test_user_data[idx]

    uid = user_data['uid']
    print ("User id: {}".format(uid))
    # Class 'p' is encoded as 1, anything else as 0.
    label = user_data['class']
    label_code = 1 if label == 'p' else 0

    for chunk_data in user_data["data"]:
        chunk_number = chunk_data["chunk"]
        writings = chunk_data["writings"]

        labels = []
        texts = []

        for m in writings:
            # m[0] is the title, m[2] the text; use whichever parts exist.
            if m[0] != '' and m[2] != '':
                text = str(m[0]) + " " + str(m[2]) #title + text
            elif m[0] != '':
                text = m[0] #just the title
            elif m[2] != '':
                text = m[2] #just the text (no title)
            else:
                continue #nothing to keep for an empty writing
            labels.append(label_code)
            texts.append(text)

        print ("Total labels: {}".format(len(labels)))
        print ("Total texts: {}".format(len(texts)))

        data_labels_texts = list(zip(labels, texts))
        df_labels_texts = pd.DataFrame(data_labels_texts, columns=["labels", "texts"])

        #chunk dir (built once, created on first use)
        chunk_dir = os.path.join(Processed_Test_DIR, "chunk "+str(chunk_number))
        if not os.path.exists(chunk_dir):
            os.makedirs(chunk_dir)

        user_test_datapath = os.path.join(chunk_dir, str(uid)+".csv")
        print (user_test_datapath)
        df_labels_texts.to_csv(user_test_datapath, index=False)

# Reported once, after every user has been written (not once per user).
print ('Done.')

In [3]:
#combined test data cumulatively
# For chunk n, each user's cumulative file is the cumulative file of chunk
# n-1 followed by the user's own chunk-n rows; chunk 1 is copied unchanged.
# Results go to <Processed_Test_DIR>/cumulative/chunk <n>/<filename>.
chunks = [i for i in range(1, 11)]

for chunk_number in chunks:
    chunk_processed_test_dir = os.path.join(Processed_Test_DIR, "chunk "+str(chunk_number))
    chunk_cumulative_test_dir = os.path.join(Processed_Test_DIR, "cumulative", "chunk "+str(chunk_number))

    if not os.path.exists(chunk_cumulative_test_dir):
        os.makedirs(chunk_cumulative_test_dir)

    for root, subdirs, files in os.walk(chunk_processed_test_dir):
        for filename in files:
            file_path = os.path.join(root, filename)
            data_df = pd.read_csv(file_path)

            if chunk_number != 1:
                # Prepend everything accumulated up to the previous chunk.
                previous_path = os.path.join(Processed_Test_DIR, "cumulative", "chunk "+str(chunk_number-1), filename)
                previous_df = pd.read_csv(previous_path)

                # pd.concat replaces DataFrame.append, which was removed in
                # pandas 2.0; row order and re-indexing are the same.
                data_df = pd.concat([previous_df, data_df], ignore_index=True)

            chunk_cumulative_test_datapath = os.path.join(chunk_cumulative_test_dir, filename)
            data_df.to_csv(chunk_cumulative_test_datapath, index=False)
print ('Done.')

Done.
