In [1]:
import pandas as pd
import numpy as np
import sklearn as sk
from ast import literal_eval

In [2]:
# Read the six input tables in one pass; the first CSV column becomes the index
# of every frame. Order of the targets matches the order of the paths below.
ten_p, ten_n, n_tst, p_tst, n_tr, p_tr = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        './data/positive_ten.csv',
        './data/negative_ten.csv',
        './data/negative_test.csv',
        './data/positive_test.csv',
        './data/negative_train.csv',
        './data/positive_train.csv',
    )
)

In [5]:
# Show the column labels of the positive "ten" frame to see which features are available.
ten_p.columns

Index(['statuses_count', 'followers_count', 'friends_count', 'listed_count',
       'favourites_count', 'favorite_count', 'retweet_count', 'days',
       'posts_number_all', 'posts_number', 'text_length', 'reply_ratio',
       'time', 'positive_words_count', 'negative_words_count',
       'positive_emoji_count', 'negative_emoji_count', 'vad',
       'antidepressant_words_count', 'depression_symptoms_count', 'lda'],
      dtype='object')

In [6]:
# Group the loaded frames by dataset so each group can be concatenated row-wise.
frames_ten = [
    ten_p,
    ten_n,
]
frames_full = [
    n_tst,
    n_tr,
    p_tst,
    p_tr,
]

In [8]:
# Stack each group of frames on top of each other (row-wise, original index labels kept).
ten_concat = pd.concat(frames_ten, axis=0)
whole_concat = pd.concat(frames_full, axis=0)

## Simple sanity check

In [13]:
# Number of rows in the concatenated "whole" frame.
whole_concat.shape[0]

6348

In [14]:
# Number of rows in the concatenated "ten" frame.
ten_concat.shape[0]

6348

In [15]:
# Both datasets should contain exactly the same set of index labels.
# Note: comparing sets ignores duplicates and ordering.
index_labels_whole = set(whole_concat.index.tolist())
index_labels_ten = set(ten_concat.index.tolist())
index_labels_whole == index_labels_ten

True

## Remove biased features

In [36]:
# Index label of the row at position 12, used as the single example to compare below.
examp_id = whole_concat.index[12]

In [38]:
# Compare one example row across the two datasets: a feature whose value is
# identical in both is most probably biased and has to be removed.
c1, c2 = whole_concat.loc[examp_id], ten_concat.loc[examp_id]
biased_feats = [feat for feat in c1.index if c1[feat] == c2[feat]]
for feat in biased_feats:
    print(feat)

statuses_count
followers_count
friends_count
listed_count
favourites_count
posts_number
negative_emoji_count
antidepressant_words_count


In [41]:
# Remove the columns flagged as biased from both datasets (new frames; inputs untouched).
clean_ten = ten_concat.drop(columns=biased_feats)
clean_whole = whole_concat.drop(columns=biased_feats)

## Understanding features, processing lists, etc.

In [42]:
# Show which columns remain in clean_ten after dropping the biased features.
clean_ten.columns

Index(['favorite_count', 'retweet_count', 'days', 'posts_number_all',
       'text_length', 'reply_ratio', 'time', 'positive_words_count',
       'negative_words_count', 'positive_emoji_count', 'vad',
       'depression_symptoms_count', 'lda'],
      dtype='object')

#### vad - always 3 values
#### time - always 24
#### depression_symptoms_count - always 9
#### LDA - always 25

In [52]:
# Strip the surrounding brackets of the stringified list at row 101, parse each
# comma-separated token as a float, and count the entries of the `time` feature.
len([float(token) for token in clean_ten['time'].iloc[101][1:-1].split(',')])

24

In [58]:
# Strip the surrounding brackets of the stringified list at row 111, parse each
# comma-separated token as a float, and count the entries of the `lda` feature.
len([float(token) for token in clean_ten['lda'].iloc[111][1:-1].split(',')])

25

In [59]:
# Strip the surrounding brackets of the stringified list at row 101, parse each
# comma-separated token as a float, and count the entries of `depression_symptoms_count`.
len([float(token) for token in clean_ten['depression_symptoms_count'].iloc[101][1:-1].split(',')])

9

In [60]:
# Strip the surrounding brackets of the stringified list at row 101, parse each
# comma-separated token as a float, and count the entries of the `vad` feature.
len([float(token) for token in clean_ten['vad'].iloc[101][1:-1].split(',')])

3

In [63]:
# Display clean_ten (ten_concat without the biased columns) to inspect the remaining
# features; the list-valued columns are still stored as strings at this point.
clean_ten

Unnamed: 0_level_0,favorite_count,retweet_count,days,posts_number_all,text_length,reply_ratio,time,positive_words_count,negative_words_count,positive_emoji_count,vad,depression_symptoms_count,lda
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2423026075,1,385,2,5,13,0.400000,"[0.0, 0.2, 0.4, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",2,2,0,"[4.53, 5.43, 4.61]","[0, 0, 0, 0, 0, 0, 0, 0, 0]","[0.016666666666666847, 0.016666666666666847, 0..."
121839389,3,17,2,6,13,0.166667,"[0.167, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0...",9,4,0,"[3.14, 2.69, 2.73]","[1, 0, 0, 0, 0, 0, 0, 0, 0]","[0.015777777777778366, 0.24355555555554473, 0...."
521751014,4,314,3,8,14,0.500000,"[0.125, 0.125, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0...",2,7,4,"[7.21, 6.46, 6.42]","[0, 0, 0, 0, 0, 0, 0.5, 0, 0]","[0.01216666666666694, 0.01216666666666694, 0.0..."
3138827800,0,238,2,3,19,0.666667,"[0.0, 0.0, 0.333, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0...",0,1,0,"[0.0, 0.0, 0.0]","[0, 0, 0, 0, 0, 0, 0, 0, 0]","[0.00901587301587328, 0.00901587301587328, 0.0..."
2739726394,0,1075,2,3,25,0.666667,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",2,4,0,"[5.42, 5.12, 5.0]","[0, 0, 0, 0, 0, 0, 1, 0, 0]","[0.005460317460317744, 0.23879365079365888, 0...."
1344064586,0,0,1,1,25,0.000000,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1,1,0,"[9.44, 9.0, 8.86]","[0, 0, 0, 0, 0, 0, 0, 0, 0]","[0.0066666666666672855, 0.1733333333332544, 0...."
1283765972,0,122,5,14,14,0.214286,"[0.071, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0...",11,4,0,"[6.29, 5.36, 5.28]","[2, 0, 0, 0, 0, 0, 0.2, 0, 0]","[0.047868480725621686, 0.06215419501132284, 0...."
66702042,1,2281,1,7,15,0.285714,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",6,5,0,"[3.51, 3.76, 3.51]","[0, 1, 0, 0, 0, 0, 0, 0.5, 0]","[0.06005442176870744, 0.07365986394558302, 0.0..."
471714582,5,1944,4,14,11,0.285714,"[0.0, 0.0, 0.286, 0.071, 0.0, 0.0, 0.0, 0.0, 0...",3,6,0,"[1.63, 2.16, 1.86]","[1, 0, 1, 0, 1.0, 0, 0, 0, 0]","[0.01628571428571479, 0.01628571428571479, 0.0..."
1911548394,5,6,3,17,17,0.235294,"[0.118, 0.059, 0.118, 0.059, 0.0, 0.0, 0.0, 0....",12,1,0,"[3.68, 2.69, 3.02]","[0, 0, 0, 0, 0, 0, 0, 0, 0]","[0.03652576181987916, 0.01551735845853217, 0.0..."


## Features stationarization (normalization)
#### retweet_count - should be considered relative to posts_number_all (or possibly text_length?)
#### text_length - relative to posts_number_all
#### reply_ratio - relative to retweet_count + 1 (assuming retweet_count correlates with the number of comments; the +1 avoids division by zero)

In [72]:
# Add the same three ratio features to both datasets, in place.
for _frame in (clean_ten, clean_whole):
    # Retweets per post.
    _frame['retweet_count_div_by_posts_number_all'] = _frame['retweet_count'] / _frame['posts_number_all']

    # Text length per post.
    _frame['text_length_div_by_posts_number_all'] = _frame['text_length'] / _frame['posts_number_all']

    # Reply ratio scaled by retweets; the +1.0 keeps the denominator non-zero.
    _frame['reply_ratio_div_by_retweet_count_plus_one'] = _frame['reply_ratio'] / (_frame['retweet_count'] + 1.0)

### Transform lists into separate features

In [84]:
# Persist both cleaned frames (index included) as intermediate CSV files.
clean_ten.to_csv(path_or_buf='./data/intermediate_ten.csv')
clean_whole.to_csv(path_or_buf='./data/intermediate_whole.csv')

In [126]:
def converter(x):
    """Parse the string form of a Python literal into a squeezed float64 array.

    `x` is evaluated with `ast.literal_eval` (literals only, no arbitrary code),
    converted to a float64 numpy array, and stripped of any length-1 axes.
    """
    parsed = literal_eval(x)
    values = np.array(parsed, dtype=np.float64)
    return values.squeeze()

# Columns that hold stringified lists; each is parsed into a float64 array on load.
converters = {
    'vad': converter,
    'time': converter,
    'depression_symptoms_count': converter,
    'lda': converter,
}

# Reload the intermediate files written by this notebook. The previous paths
# ('final_ten.csv' / 'final_whole.csv') are never produced in this form here:
# 'final_ten.csv' is not written at all and 'final_whole.csv' is overwritten at
# the end with the expanded schema (no list columns), so a fresh
# Restart & Run All would fail or read the wrong data.
clean_ten = pd.read_csv('./data/intermediate_ten.csv', sep=',', converters=converters, index_col=0)
clean_whole = pd.read_csv('./data/intermediate_whole.csv', sep=',', converters=converters, index_col=0)

In [133]:
# Column names for the expanded list features: one '<feature>_<position>' label
# per list element (3 for vad, 24 for time, 9 for symptoms, 25 for lda).
vad_ls = ['vad_'+str(pos) for pos in range(3)]

time_ls = ['time_'+str(pos) for pos in range(24)]

depression_symptoms_count_ls = ['depression_symptoms_count_'+str(pos) for pos in range(9)]

lda_ls = ['lda_'+str(pos) for pos in range(25)]

### Merge all the features into final df's and save the dataset

In [139]:
# Turn every list-valued column of clean_ten into a block of float64 columns
# (one per list element), reusing the frame's index labels so the blocks line
# up when they are concatenated column-wise with the scalar features.
ten_index_labels = clean_ten.index.tolist()
vad_df_10 = pd.DataFrame(
    np.array(clean_ten['vad'].values.tolist(), dtype=np.float64),
    index=ten_index_labels,
    columns=vad_ls,
)
time_df_10 = pd.DataFrame(
    np.array(clean_ten['time'].values.tolist(), dtype=np.float64),
    index=ten_index_labels,
    columns=time_ls,
)
depression_symptoms_count_df_10 = pd.DataFrame(
    np.array(clean_ten['depression_symptoms_count'].values.tolist(), dtype=np.float64),
    index=ten_index_labels,
    columns=depression_symptoms_count_ls,
)
lda_df_10 = pd.DataFrame(
    np.array(clean_ten['lda'].values.tolist(), dtype=np.float64),
    index=ten_index_labels,
    columns=lda_ls,
)
# Scalar features first, then the expanded blocks in vad, time, symptoms, lda order.
FINAL_df_10 = pd.concat(
    [
        clean_ten.drop(columns=['vad', 'lda', 'depression_symptoms_count', 'time']),
        vad_df_10,
        time_df_10,
        depression_symptoms_count_df_10,
        lda_df_10,
    ],
    axis=1,
)

In [141]:
# Turn every list-valued column of clean_whole into a block of float64 columns
# (one per list element), reusing the frame's index labels so the blocks line
# up when they are concatenated column-wise with the scalar features.
whole_index_labels = clean_whole.index.tolist()
vad_df_whole = pd.DataFrame(
    np.array(clean_whole['vad'].values.tolist(), dtype=np.float64),
    index=whole_index_labels,
    columns=vad_ls,
)
time_df_whole = pd.DataFrame(
    np.array(clean_whole['time'].values.tolist(), dtype=np.float64),
    index=whole_index_labels,
    columns=time_ls,
)
depression_symptoms_count_df_whole = pd.DataFrame(
    np.array(clean_whole['depression_symptoms_count'].values.tolist(), dtype=np.float64),
    index=whole_index_labels,
    columns=depression_symptoms_count_ls,
)
lda_df_whole = pd.DataFrame(
    np.array(clean_whole['lda'].values.tolist(), dtype=np.float64),
    index=whole_index_labels,
    columns=lda_ls,
)
# Scalar features first, then the expanded blocks in vad, time, symptoms, lda order.
FINAL_df_whole = pd.concat(
    [
        clean_whole.drop(columns=['vad', 'lda', 'depression_symptoms_count', 'time']),
        vad_df_whole,
        time_df_whole,
        depression_symptoms_count_df_whole,
        lda_df_whole,
    ],
    axis=1,
)

In [142]:
# Save the fully expanded feature tables (index included) as the final datasets.
FINAL_df_10.to_csv(path_or_buf='./data/final_10.csv')
FINAL_df_whole.to_csv(path_or_buf='./data/final_whole.csv')