In [134]:
import pandas as pd
from spacy.lang.en import English
import itertools

In [135]:
# Load the post table and the comment table; the rows of the two files are
# paired by position further down.
post_df = pd.read_csv(
    '../predicting-satisfaction-using-graphs/csv/dataset/liwc_post.csv',
    encoding='UTF-8',
)
comment_df = pd.read_csv(
    '../predicting-satisfaction-using-graphs/csv/dataset/liwc_comment.csv',
    encoding='UTF-8',
)

In [136]:
# Raw satisfaction score of every post, as a plain Python list.
satisfactions_float = list(post_df['satisfaction'])

In [137]:
# Blank English pipeline plus the `sentencizer` component, which is what the
# `.sents` calls below rely on. `add_pipe` returns the component, so it is
# echoed as the cell output.
nlp = English()
nlp.add_pipe("sentencizer")

<spacy.pipeline.sentencizer.Sentencizer at 0x7f03cdbc1c00>

In [138]:
# Post and comment texts as plain lists; entries are paired by position.
post_contents = list(post_df['content'])
comment_bodies = list(comment_df['content'])

# Sentence-split each post/comment pair into a list of sentence strings.
# zip() stops at the shorter input, so both results have the same length.
post_sequences = []
comment_sequences = []
for post_text, comment_text in zip(post_contents, comment_bodies):
    post_sequences.append([str(sentence) for sentence in nlp(post_text).sents])
    comment_sequences.append([str(sentence) for sentence in nlp(comment_text).sents])

# satisfaction score (y)
satisfactions_float = list(post_df['satisfaction'])

# Coarse 3-way label: score < 3.5 -> 0, score < 5 -> 1, everything else -> 2.
satisfactions = [
    0 if score < 3.5 else 1 if score < 5 else 2
    for score in satisfactions_float
]

# Fine 5-way label used for the contrastive pairs; upper bounds are inclusive:
# <= 2.5 -> 0, <= 3.5 -> 1, <= 4.5 -> 2, <= 5.5 -> 3, everything else -> 4.
for_contrastive = [
    0 if score <= 2.5
    else 1 if score <= 3.5
    else 2 if score <= 4.5
    else 3 if score <= 5.5
    else 4
    for score in satisfactions_float
]

# One record per post/comment pair, with a running integer id in front.
data = [
    [row_id, post, comment, coarse_label, fine_label, raw_score]
    for row_id, (post, comment, coarse_label, fine_label, raw_score) in enumerate(
        zip(post_contents, comment_bodies, satisfactions, for_contrastive, satisfactions_float)
    )
]

columns = ['index', 'post_contents', 'comment_contents', 'label', 'label_for_contrastive', 'score']
df = pd.DataFrame(data, columns=columns)

In [109]:
# Start from an empty frame that already carries the six dataset column names.
train_df = pd.DataFrame([], columns=columns)

In [110]:
# Display the (still empty) training frame.
train_df

Unnamed: 0,index,post_contents,comment_contents,label,label_for_contrastive,score


In [111]:
# Stratified training sample: 45 rows from each of the five contrastive classes.
# The frame is rebuilt in one concat instead of appending to the existing
# `train_df`, so re-running this cell cannot stack duplicate rows, and the
# sampling is seeded so the split is reproducible.
# `sample` raises ValueError if a class has fewer than 45 rows.
train_df = pd.concat(
    [df[df.label_for_contrastive == i].sample(45, random_state=42) for i in range(5)]
)

In [112]:
# Shape check: 5 classes x 45 rows = 225 rows, 6 columns.
train_df.shape

(225, 6)

In [113]:
# Everything not picked for training. Rows are dropped by index label, so this
# relies on `train_df` still carrying `df`'s original index labels.
remain_df = df.drop(train_df.index)

In [114]:
# Rows left over after removing the training sample.
remain_df.shape

(775, 6)

In [118]:
# Display the held-out rows.
remain_df

Unnamed: 0,index,post_contents,comment_contents,label,label_for_contrastive,score
1,1,"I don't really know why, and maybe it's just m...",I use to love taking cold showers when I was y...,1,2,4.25
2,2,I am 30 years old and my girlfriend is 24 are ...,"Depression is not an excuse for this behavior,...",2,3,5.35
3,3,"Almost every day for months, I've gone to a se...",Then you have nothing to lose. Tell her dude.,0,1,3.30
4,4,Can you love when you're depressed? When depre...,"Yes, of course. I think it is because we love ...",1,2,3.80
5,5,"Hello. I have had insecurities, anxiety, depre...",I didn't mean necessarily trying harder. Just ...,2,3,5.05
...,...,...,...,...,...,...
994,994,So I take three different meds for my depressi...,Do research on your meds. Also that sounds rea...,1,2,4.10
996,996,"Yeah, how great, how lucky I am. When I finall...",Why do you think you are sensitive and cursed?...,1,2,4.30
997,997,"I've felt reluctant to post here, given that o...",Life is so unfair but you need to get your shi...,1,2,4.30
998,998,Iâm a 32 year old man. I have been extremely...,Same for me. Im too ugly for a girl to love me.,1,2,4.25


In [119]:
# Empty frame with the dataset columns, to be filled with the test sample.
test_df = pd.DataFrame([], columns=columns)

In [120]:
# Stratified test sample: 5 rows per contrastive class, drawn only from the
# rows that were not used for training.
# The frame is rebuilt in one concat instead of appending to the existing
# `test_df`, so re-running this cell cannot stack duplicate rows, and the
# sampling is seeded so the split is reproducible.
test_df = pd.concat(
    [remain_df[remain_df.label_for_contrastive == i].sample(5, random_state=42) for i in range(5)]
)

In [122]:
# Shape check: 5 classes x 5 rows = 25 rows, 6 columns.
test_df.shape

(25, 6)

In [123]:
# Every unordered pair of row ids (values of the 'index' column) inside the
# training sample and inside the test sample, as lists of 2-tuples.
train_comb = list(itertools.combinations(train_df['index'], 2))
test_comb = list(itertools.combinations(test_df['index'], 2))

In [124]:
# Number of training pairs: C(225, 2) = 25200.
len(train_comb)

25200

In [125]:
# Number of test pairs: C(25, 2) = 300.
len(test_comb)

300

In [126]:
def make_comb_for_contrastive_learner(comb_list, df):
    """Build one wide row per pair by placing the two referenced rows of ``df`` side by side.

    Parameters
    ----------
    comb_list : iterable of 2-tuples
        Pairs of index labels of ``df``; each element is looked up in ``df.index``.
    df : pandas.DataFrame
        Source frame. ``df.index`` is expected to be unique, so that every
        label resolves to exactly one row.

    Returns
    -------
    pandas.DataFrame
        One row per pair, in the order given, with a fresh 0..n-1 index. The
        columns are ``df``'s columns twice over (first element of the pair,
        then the second), so the names are duplicated until the caller
        renames them. An empty ``comb_list`` yields an empty frame with those
        same columns.

    Raises
    ------
    KeyError
        If a label in ``comb_list`` is not present in ``df.index``.
    """
    pairs = list(comb_list)
    first_labels = [pair[0] for pair in pairs]
    second_labels = [pair[1] for pair in pairs]

    # Two vectorised label lookups and a single concat replace the per-pair
    # "filter whole frame + concat onto the result" loop, which copied the
    # growing result once per pair (quadratic in the number of pairs).
    left = df.loc[first_labels].reset_index(drop=True)
    right = df.loc[second_labels].reset_index(drop=True)
    return pd.concat([left, right], axis=1)

In [127]:
# Materialise the pair tables: one wide row per (id1, id2) pair. Rows are
# looked up in the full `df`, so the pair ids must be labels of `df.index`.
train_result_df = make_comb_for_contrastive_learner(train_comb, df)
test_result_df = make_comb_for_contrastive_learner(test_comb, df)

In [129]:
# Unique names for the 12 columns of a pair row: suffix 1 marks the first
# element of the pair, suffix 2 the second.
new_columns = ['index1', 'post_contents1', 'comment_contents1', 'label1', 'label_for_contrastive1', 'score1', 'index2', 'post_contents2', 'comment_contents2', 'label2', 'label_for_contrastive2', 'score2']

In [130]:
# Replace the duplicated column names coming out of
# make_comb_for_contrastive_learner with the suffixed, unique ones.
train_result_df.columns = new_columns
test_result_df.columns = new_columns

In [131]:
# Persist the 45-per-class training pairs and the 5-per-class test pairs.
# `to_csv` is called with its defaults, so the row index is written as the
# first (unnamed) column.
train_result_df.to_csv('../predicting-satisfaction-using-graphs/csv/dataset/df_for_contrastive_learner_train_45.csv')
test_result_df.to_csv('../predicting-satisfaction-using-graphs/csv/dataset/df_for_contrastive_learner_test_5.csv')

In [89]:
# Shrink the held-out pool to 100 randomly chosen rows.
# NOTE(review): this overwrites `remain_df` in place and is unseeded, so the
# selection differs on every run and the full pool is lost until the cell
# that creates `remain_df` is re-run.
remain_df = remain_df.sample(100)

In [90]:
# All unordered pairs among the 100 held-out rows: C(100, 2) = 4950.
test_comb = list(itertools.combinations(remain_df['index'], 2))
len(test_comb)

4950

In [91]:
# Build the wide pair table for the held-out pairs.
# NOTE(review): `test_df` is reused here for pair rows (12 columns), whereas
# elsewhere in the notebook it holds single dataset rows (6 columns).
test_df = make_comb_for_contrastive_learner(test_comb, df)

In [92]:
# Keep a random subset of 1238 pair rows (unseeded, so it changes per run).
# NOTE(review): 1238 is presumably ~25% of the 4950 pairs — confirm.
test_df = test_df.sample(1238)

In [93]:
# Give the 12 pair columns their unique, suffixed names.
test_df.columns = new_columns

In [94]:
# Persist the subsampled held-out pairs (row index included, per the
# `to_csv` default).
test_df.to_csv('../predicting-satisfaction-using-graphs/csv/dataset/df_for_contrastive_learner_test.csv')

In [141]:
# Reset the training frame to empty (dataset columns only) before drawing the
# 50-per-class split.
train_df = pd.DataFrame([], columns=columns)

In [142]:
# Stratified training sample: 50 rows from each of the five contrastive classes.
# The frame is rebuilt in one concat instead of appending to the existing
# `train_df`, so re-running this cell cannot stack duplicate rows, and the
# sampling is seeded so the split is reproducible.
# `sample` raises ValueError if a class has fewer than 50 rows.
train_df = pd.concat(
    [df[df.label_for_contrastive == i].sample(50, random_state=42) for i in range(5)]
)

In [143]:
# Shape check: 5 classes x 50 rows = 250 rows, 6 columns.
train_df.shape

(250, 6)

In [148]:
# Every unordered pair of row ids inside the 250-row training sample.
train_comb = list(itertools.combinations(train_df['index'], 2))

In [149]:
# Number of training pairs: C(250, 2) = 31125.
print(len(train_comb))

31125


In [144]:
# Everything not picked for the 50-per-class training sample. Rows are dropped
# by index label, so `train_df` must still carry `df`'s original index labels.
remain_df = df.drop(train_df.index)

In [146]:
# Rows left over after removing the 250 training rows.
remain_df.shape

(750, 6)

In [172]:
# Reset the test frame to empty (dataset columns only) before sampling.
test_df = pd.DataFrame([], columns=columns)

In [173]:
# Unstratified test sample: 110 rows drawn from the rows not used for training.
# Seeded so the split is reproducible, and assigned directly rather than
# appended to the existing `test_df`, so re-running this cell cannot stack
# duplicate rows.
sub_df = remain_df.sample(110, random_state=42)
test_df = sub_df.copy()

In [174]:
# Shape check: 110 sampled rows, 6 columns.
test_df.shape

(110, 6)

In [175]:
# Every unordered pair of row ids inside the 110-row test sample.
test_comb = list(itertools.combinations(test_df['index'], 2))

In [176]:
# Number of test pairs: C(110, 2) = 5995.
len(test_comb)

5995

In [177]:
# Materialise the pair tables for the 50-per-class split: one wide row per
# (id1, id2) pair, with both rows looked up in the full `df` by index label.
train_result_df = make_comb_for_contrastive_learner(train_comb, df)
test_result_df = make_comb_for_contrastive_learner(test_comb, df)

In [178]:
# Replace the duplicated column names with the suffixed, unique ones.
train_result_df.columns = new_columns
test_result_df.columns = new_columns

In [179]:
# Persist the 50-per-class training pairs and the pairs built from the
# 110-row held-out sample (row index included, per the `to_csv` default).
train_result_df.to_csv('../predicting-satisfaction-using-graphs/csv/dataset/df_for_contrastive_learner_train_50.csv')
test_result_df.to_csv('../predicting-satisfaction-using-graphs/csv/dataset/df_for_contrastive_learner_test_remain.csv')