In [1]:
import pandas as pd
import numpy as np
import sklearn as sk
from ast import literal_eval

In [2]:
# Read all six input tables in one pass; the first CSV column becomes the index of each frame.
ten_p, ten_n, n_tst, p_tst, n_tr, p_tr = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        './data/positive_ten.csv',
        './data/negative_ten.csv',
        './data/negative_test.csv',
        './data/positive_test.csv',
        './data/negative_train.csv',
        './data/positive_train.csv',
    )
)

In [3]:
# Display the column names of the frame loaded from positive_ten.csv.
ten_p.columns

Index(['statuses_count', 'followers_count', 'friends_count', 'listed_count',
       'favourites_count', 'favorite_count', 'retweet_count', 'days',
       'posts_number_all', 'posts_number', 'text_length', 'reply_ratio',
       'time', 'positive_words_count', 'negative_words_count',
       'positive_emoji_count', 'negative_emoji_count', 'vad',
       'antidepressant_words_count', 'depression_symptoms_count', 'lda'],
      dtype='object')

In [4]:
# Group the loaded frames for concatenation: the two "ten" frames, and the four train/test frames.
frames_ten = [ten_p,ten_n]
frames_full = [n_tst,n_tr,p_tst,p_tr]

In [5]:
# Stack each group of frames row-wise into a single frame (pd.concat default axis).
ten_concat = pd.concat(frames_ten)
whole_concat = pd.concat(frames_full)

## Simple sanity check

In [6]:
# Row count of the concatenated train/test frames.
len(whole_concat)

6348

In [7]:
# Row count of the concatenated "ten" frames.
len(ten_concat)

6348

In [8]:
# Both concatenated frames must contain exactly the same set of index labels.
set(ten_concat.index.tolist()) == set(whole_concat.index.tolist())

True

## Remove biased features

In [23]:
# Compare the two datasets sample by sample: a feature whose value is identical in both
# datasets is most probably biased and has to be removed.
# To be sure, the list of matching features is checked for consistency on the first 1k samples.

def _matching_features(sample_id):
    """Return the feature names whose values for `sample_id` are equal in both datasets.

    Reads the module-level frames `whole_concat` and `ten_concat`.
    NOTE(review): assumes `sample_id` is a unique index label in both frames, so that
    `.loc` returns a single row - confirm.
    """
    row_whole = whole_concat.loc[sample_id]
    row_ten = ten_concat.loc[sample_id]
    return [feat for feat in row_whole.index if row_whole[feat] == row_ten[feat]]


# Reference sample (position 102) that defines the list of biased features.
examp_id = whole_concat.iloc[102].name
biased_feats = _matching_features(examp_id)

# Validate the reference list against other samples.
# Fix: the loop previously re-read position 102 on every iteration, so nothing was validated;
# it now walks positions 0..999 (bounded by the frame length).
for ii in range(min(1000, len(whole_concat))):
    examp_id = whole_concat.iloc[ii].name
    bb = _matching_features(examp_id)
    if bb != biased_feats:
        print('there is a problem with biased features: ',bb)
        print('it does not agree with: ',biased_feats)

In [11]:
clean_ten = ten_concat.drop(labels=biased_feats,axis=1)
clean_whole = whole_concat.drop(labels=biased_feats,axis=1)

## Understanding features, processing lists, etc.

In [12]:
# Display the columns that remain after dropping the biased features.
clean_ten.columns

Index(['favorite_count', 'retweet_count', 'days', 'posts_number_all',
       'text_length', 'reply_ratio', 'time', 'positive_words_count',
       'negative_words_count', 'positive_emoji_count', 'vad',
       'depression_symptoms_count', 'lda'],
      dtype='object')

#### vad - always 3 values
#### time - always 24
#### depression_symptoms_count - always 9
#### LDA - always 25

In [13]:
# Spot check on a single row (position 101): drop the first and last character of the stored
# string (presumably the list brackets), split on commas and count the float values in `time`.
len(np.array(clean_ten.time.iloc[101][1:-1].split(',')).astype(float))

24

In [14]:
# Spot check on a single row (position 111): drop the first and last character of the stored
# string (presumably the list brackets), split on commas and count the float values in `lda`.
len(np.array(clean_ten.lda.iloc[111][1:-1].split(',')).astype(float))

25

In [15]:
# Spot check on a single row (position 101): drop the first and last character of the stored
# string (presumably the list brackets), split on commas and count the float values in
# `depression_symptoms_count`.
len(np.array(clean_ten.depression_symptoms_count.iloc[101][1:-1].split(',')).astype(float))

9

In [16]:
# Spot check on a single row (position 101): drop the first and last character of the stored
# string (presumably the list brackets), split on commas and count the float values in `vad`.
len(np.array(clean_ten.vad.iloc[101][1:-1].split(',')).astype(float))

3

## Feature stationarization
#### retweet_count should be normalized by posts_number_all (or possibly by text_length?)
#### text_length should be normalized by posts_number_all
#### reply_ratio should be normalized by retweet_count (assuming retweet_count correlates with the number of comments)

In [72]:
# Add ratio features to both frames.
# Building them in one loop guarantees that both frames get identical column names.
# Fix: clean_ten previously received a misspelled 'negavite_words_freq' column while
# clean_whole received 'negative_words_freq', so the two outputs had different schemas.
# NOTE: rows where posts_number_all or text_length is 0 yield inf/NaN; this is not handled here.
for frame in (clean_ten, clean_whole):
    # Shared denominator of the word/emoji frequencies.
    denominator = frame.posts_number_all * frame.text_length

    frame['retweet_count_div_by_posts_number_all'] = frame.retweet_count / frame.posts_number_all
    frame['positive_words_freq'] = frame.positive_words_count / denominator
    frame['negative_words_freq'] = frame.negative_words_count / denominator
    frame['positive_emoji_freq'] = frame.positive_emoji_count / denominator

#clean_ten['reply_ratio_div_by_retweet_count_plus_one'] = clean_ten.reply_ratio/(clean_ten.retweet_count+1.0)
#clean_whole['reply_ratio_div_by_retweet_count_plus_one'] = clean_whole.reply_ratio/(clean_whole.retweet_count+1.0)

In [24]:
# Raw count columns that are dropped from both frames in the following cell.
nonstat_to_remove = [
    'retweet_count',
    'positive_words_count',
    'negative_words_count',
    'positive_emoji_count',
]

### Removing non-stationary features

In [25]:
# Drop the columns listed in nonstat_to_remove from both frames.
clean_ten = clean_ten.drop(columns=nonstat_to_remove)
clean_whole = clean_whole.drop(columns=nonstat_to_remove)

### Transform lists into separate features

In [26]:
# Persist both intermediate frames to CSV; the following cell re-reads these exact files
# with per-column converters.
clean_ten.to_csv('./data/intermediate_ten.csv')
clean_whole.to_csv('./data/intermediate_whole.csv')

In [27]:
def converter(x):
    """Parse a stringified Python literal into a float64 array.

    The text is evaluated with ``ast.literal_eval``, converted to a float64
    NumPy array, and every length-1 axis is removed before returning.
    """
    parsed = literal_eval(x)
    return np.squeeze(np.array(parsed, dtype=np.float64))

# Every list-valued column is parsed by the same converter while the CSV is read back.
converters = dict.fromkeys(['vad', 'time', 'depression_symptoms_count', 'lda'], converter)

clean_ten = pd.read_csv(
    './data/intermediate_ten.csv',
    sep=',',
    index_col=0,
    converters=converters,
)
clean_whole = pd.read_csv(
    './data/intermediate_whole.csv',
    sep=',',
    index_col=0,
    converters=converters,
)

In [28]:
# Column names for the expanded list features: <feature>_<position>.
vad_ls = ['vad_' + str(pos) for pos in range(3)]

time_ls = ['time_' + str(pos) for pos in range(24)]

depression_symptoms_count_ls = ['depression_symptoms_count_' + str(pos) for pos in range(9)]

lda_ls = ['lda_' + str(pos) for pos in range(25)]

### Merge all the features into final df's and save the dataset

In [29]:
# Expand each array-valued column of clean_ten into a frame with one float64 column per element,
# sharing the original index, then join those frames with the remaining columns.
vad_df_10 = pd.DataFrame(
    np.array(clean_ten.vad.tolist(), dtype=np.float64),
    index=clean_ten.index.tolist(),
    columns=vad_ls,
)
time_df_10 = pd.DataFrame(
    np.array(clean_ten.time.tolist(), dtype=np.float64),
    index=clean_ten.index.tolist(),
    columns=time_ls,
)
depression_symptoms_count_df_10 = pd.DataFrame(
    np.array(clean_ten.depression_symptoms_count.tolist(), dtype=np.float64),
    index=clean_ten.index.tolist(),
    columns=depression_symptoms_count_ls,
)
lda_df_10 = pd.DataFrame(
    np.array(clean_ten.lda.tolist(), dtype=np.float64),
    index=clean_ten.index.tolist(),
    columns=lda_ls,
)
FINAL_df_10 = pd.concat(
    [
        clean_ten.drop(columns=['vad', 'lda', 'depression_symptoms_count', 'time']),
        vad_df_10,
        time_df_10,
        depression_symptoms_count_df_10,
        lda_df_10,
    ],
    axis=1,
)

In [30]:
# Expand each array-valued column of clean_whole into a frame with one float64 column per element,
# sharing the original index, then join those frames with the remaining columns.
vad_df_whole = pd.DataFrame(
    np.array(clean_whole.vad.tolist(), dtype=np.float64),
    index=clean_whole.index.tolist(),
    columns=vad_ls,
)
time_df_whole = pd.DataFrame(
    np.array(clean_whole.time.tolist(), dtype=np.float64),
    index=clean_whole.index.tolist(),
    columns=time_ls,
)
depression_symptoms_count_df_whole = pd.DataFrame(
    np.array(clean_whole.depression_symptoms_count.tolist(), dtype=np.float64),
    index=clean_whole.index.tolist(),
    columns=depression_symptoms_count_ls,
)
lda_df_whole = pd.DataFrame(
    np.array(clean_whole.lda.tolist(), dtype=np.float64),
    index=clean_whole.index.tolist(),
    columns=lda_ls,
)
FINAL_df_whole = pd.concat(
    [
        clean_whole.drop(columns=['vad', 'lda', 'depression_symptoms_count', 'time']),
        vad_df_whole,
        time_df_whole,
        depression_symptoms_count_df_whole,
        lda_df_whole,
    ],
    axis=1,
)

In [31]:
# Write the two final flattened feature tables to CSV.
FINAL_df_10.to_csv('./data/final_10.csv')
FINAL_df_whole.to_csv('./data/final_whole.csv')