In [311]:
import pandas as pd
from spacy.lang.en import English
import itertools

In [312]:
# Read the post table and the comment table from the sibling project's dataset folder.
post_df = pd.read_csv(
    filepath_or_buffer='../predicting-satisfaction-using-graphs/csv/dataset/liwc_post.csv',
    encoding='UTF-8',
)
comment_df = pd.read_csv(
    filepath_or_buffer='../predicting-satisfaction-using-graphs/csv/dataset/liwc_comment.csv',
    encoding='UTF-8',
)

In [313]:
# Raw satisfaction scores, one per row of the post table.
satisfactions_float = post_df['satisfaction'].tolist()

In [314]:
# English spaCy pipeline used below only to split texts into sentences.
nlp = English()
# Register the "sentencizer" component; later code reads `.sents` from the
# pipeline's output. The returned component object is what the cell displays.
nlp.add_pipe("sentencizer")

<spacy.pipeline.sentencizer.Sentencizer at 0x7f03c3749dc0>

In [315]:
# Raw text columns of the two tables.
post_contents = post_df['content'].tolist()
comment_bodies = comment_df['content'].tolist()


def _to_sentences(text):
    """Run ``nlp`` on ``text`` and return its sentences as plain strings."""
    return [str(sentence) for sentence in nlp(text).sents]


# Sentence-split every post together with the comment at the same position.
# zip() stops at the shorter of the two lists.
post_sequences, comment_sequences = [], []
for post_text, comment_text in zip(post_contents, comment_bodies):
    post_sequences.append(_to_sentences(post_text))
    comment_sequences.append(_to_sentences(comment_text))



# satisfaction score (y)
satisfactions_float = post_df['satisfaction'].tolist()


def _coarse_label(score):
    """3-way label: 0 below 3.5, 1 from 3.5 up to (but excluding) 5, otherwise 2."""
    if score < 3.5:
        return 0
    return 1 if score < 5 else 2


def _contrastive_label(score):
    """5-way label with inclusive upper bounds 2.5 / 3.5 / 4.5 / 5.5; anything else is 4."""
    for bucket, upper_bound in enumerate((2.5, 3.5, 4.5, 5.5)):
        if score <= upper_bound:
            return bucket
    return 4


satisfactions = [_coarse_label(score) for score in satisfactions_float]
for_contrastive = [_contrastive_label(score) for score in satisfactions_float]

# print(satisfactions_float)
# print(satisfactions)

columns = ['index', 'post_contents', 'comment_contents', 'label', 'label_for_contrastive', 'score']

# One record per post/comment pair; the first field is the record's running position.
data = [
    [row_id, *fields]
    for row_id, fields in enumerate(
        zip(post_contents, comment_bodies, satisfactions, for_contrastive, satisfactions_float)
    )
]
df = pd.DataFrame(data, columns=columns)

In [316]:
def make_comb_for_contrastive_learner(comb_list, df):
    """Build one wide row per index pair by placing the two rows of ``df`` side by side.

    Both halves are fetched with a single label lookup each and joined once,
    instead of filtering ``df`` and re-concatenating the result for every pair.

    Parameters
    ----------
    comb_list : iterable of 2-item sequences
        Each item holds two labels of ``df.index`` (first row, second row).
    df : pandas.DataFrame
        Source rows, looked up by index label.

    Returns
    -------
    pandas.DataFrame
        One row per pair, in the order of ``comb_list``, with the columns of
        ``df`` appearing twice (first row's values, then second row's values)
        and a fresh 0..n-1 index. Empty (but with the doubled columns) when
        ``comb_list`` is empty.

    Raises
    ------
    ValueError
        If ``df.index`` contains duplicate labels (the two halves could no
        longer be aligned row by row).
    KeyError
        If a label in ``comb_list`` is not present in ``df.index``.

    NOTE(review): a later cell defines another function with this same name;
    when the cells run in order that later definition is the one in effect.
    """
    pairs = list(comb_list)
    if not pairs:
        return pd.DataFrame([], columns=list(df.columns) * 2)

    if not df.index.is_unique:
        raise ValueError('df.index must be unique to pair rows by label')

    first_labels = [pair[0] for pair in pairs]
    second_labels = [pair[1] for pair in pairs]

    # Drop the original labels so the two halves align positionally in concat.
    first_rows = df.loc[first_labels].reset_index(drop=True)
    second_rows = df.loc[second_labels].reset_index(drop=True)

    return pd.concat([first_rows, second_rows], axis=1)

In [317]:
def make_comb_for_contrastive_learner(comb_list, df):
    """Return the index pairs of ``comb_list`` as a two-column frame.

    The previous body called ``.index`` on a Python list (i.e. ``list.index``)
    and handed that to ``pd.concat``, so it raised ``TypeError`` on the first
    pair. The pairs are now collected and turned into a frame in one step.

    Parameters
    ----------
    comb_list : iterable of 2-item sequences
        Each item holds two labels of ``df.index``.
    df : pandas.DataFrame
        Frame the labels refer to; only its index is consulted.

    Returns
    -------
    pandas.DataFrame
        Columns ``index1`` and ``index2``, one row per pair in input order.

    Raises
    ------
    KeyError
        If a label in ``comb_list`` is not present in ``df.index``.

    NOTE(review): this shares its name with the 12-column builder defined in
    the previous cell, so running the cells in order leaves only this version
    bound. ``conduct_train_test_set`` assigns 12 column names to the result
    and cannot work with this 2-column output - confirm which variant is meant
    to survive.
    """
    columns = ['index1', 'index2']
    pairs = [(comb[0], comb[1]) for comb in comb_list]

    unknown = {label for pair in pairs for label in pair}.difference(df.index)
    if unknown:
        raise KeyError(f'labels not found in df.index: {sorted(unknown, key=str)}')

    return pd.DataFrame(pairs, columns=columns)

In [318]:
def conduct_train_test_set(df, train_size, test_size, random_state=None):
    """Sample a class-balanced train/test split, pair the sampled rows, and write both pair tables to CSV.

    For each value 0..4 of ``label_for_contrastive``, ``train_size`` rows are
    drawn for training; from the rows that remain, ``test_size`` rows per
    value are drawn for testing. Every 2-combination of the sampled ``index``
    values is then expanded with ``make_comb_for_contrastive_learner`` and the
    resulting frames are saved.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the ``index`` and ``label_for_contrastive`` columns.
    train_size, test_size : int
        Number of rows to draw per label value.
    random_state : optional
        Forwarded to ``DataFrame.sample`` for reproducible draws; the default
        ``None`` keeps the draws unseeded.

    Returns
    -------
    None
        The results are written to
        ``df_for_contrastive_learner_train_{train_size * 5}.csv`` and
        ``df_for_contrastive_learner_test_{test_size * 5}.csv``.

    Raises
    ------
    ValueError
        From ``DataFrame.sample`` when a label value has fewer rows than requested.

    NOTE(review): the 12 column names assigned below require
    ``make_comb_for_contrastive_learner`` to return a 12-column frame.
    """
    def _sample_per_class(frame, size):
        # One draw per label value 0..4, stacked in label order.
        draws = [
            frame[frame.label_for_contrastive == class_id].sample(size, random_state=random_state)
            for class_id in range(5)
        ]
        return pd.concat(draws)

    train_df = _sample_per_class(df, train_size)

    # Exclude the training rows before sampling the test rows.
    remain_df = df.drop(train_df.index)
    test_df = _sample_per_class(remain_df, test_size)

    train_comb = list(itertools.combinations(train_df['index'], 2))
    test_comb = list(itertools.combinations(test_df['index'], 2))

    train_result_df = make_comb_for_contrastive_learner(train_comb, df)
    test_result_df = make_comb_for_contrastive_learner(test_comb, df)

    new_columns = ['index1', 'post_contents1', 'comment_contents1', 'label1', 'label_for_contrastive1', 'score1', 'index2',
                   'post_contents2', 'comment_contents2', 'label2', 'label_for_contrastive2', 'score2']

    train_result_df.columns = new_columns
    test_result_df.columns = new_columns

    train_result_df.to_csv(f'../predicting-satisfaction-using-graphs/csv/dataset/df_for_contrastive_learner_train_{train_size * 5}.csv')
    test_result_df.to_csv(f'../predicting-satisfaction-using-graphs/csv/dataset/df_for_contrastive_learner_test_{test_size * 5}.csv')

In [367]:
def conduct_train_test_set_product(df, train_size=0, test_size=0, random_state=None):
    """Build all ordered row pairs of ``df`` and optionally sample a balanced train/test split from them.

    The cross product of ``df`` with itself is formed, the 12 columns are
    renamed to ``<name>1`` / ``<name>2`` and rows pairing a record with itself
    (``index1 == index2``) are removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the six columns ``index``, ``post_contents``,
        ``comment_contents``, ``label``, ``label_for_contrastive``, ``score``
        (in that order, since the product's columns are renamed positionally).
    train_size, test_size : int, default 0
        Rows to draw for each of the 5 x 5 combinations of
        (``label_for_contrastive1``, ``label_for_contrastive2``). When both
        are 0 no sampling or writing happens.
    random_state : optional
        Forwarded to ``DataFrame.sample`` for reproducible draws; the default
        ``None`` keeps the draws unseeded.

    Returns
    -------
    pandas.DataFrame or None
        The full pair table when both sizes are 0; otherwise ``None`` after
        writing the train and test samples to CSV.

    Raises
    ------
    ValueError
        From ``DataFrame.sample`` when a label combination has fewer rows than requested.

    NOTE(review): the file names use ``size * 5`` although ``25 * size`` rows
    are written, and they coincide with the names used by
    ``conduct_train_test_set`` - kept unchanged for compatibility; confirm.
    """
    new_columns = ['index1', 'post_contents1', 'comment_contents1', 'label1', 'label_for_contrastive1', 'score1', 'index2',
                   'post_contents2', 'comment_contents2', 'label2', 'label_for_contrastive2', 'score2']

    # Cross product of the frame with itself, minus the self-pairs.
    product_df = df.merge(df, how='cross')
    product_df.columns = new_columns
    product_df = product_df[product_df.index1 != product_df.index2].reset_index(drop=True)

    if train_size == 0 and test_size == 0:
        return product_df

    def conduct_sets(origin_df, size):
        # One draw per (label1, label2) combination, stacked in (i, j) order.
        draws = [
            origin_df[(origin_df.label_for_contrastive1 == i) & (origin_df.label_for_contrastive2 == j)]
            .sample(size, random_state=random_state)
            for i in range(5)
            for j in range(5)
        ]
        return pd.concat(draws)

    train_df = conduct_sets(product_df, train_size)
    # The sampled rows still carry product_df's labels, so they can be dropped by index.
    remain_df = product_df.drop(train_df.index)
    test_df = conduct_sets(remain_df, test_size)

    # Only after remain_df has been derived is it safe to reset the indexes.
    train_df = train_df.reset_index(drop=True)
    test_df = test_df.reset_index(drop=True)

    # Write the frames built above (the previous code referenced undefined
    # train_result_df / test_result_df here and raised NameError).
    train_df.to_csv(f'../predicting-satisfaction-using-graphs/csv/dataset/df_for_contrastive_learner_train_{train_size * 5}.csv')
    test_df.to_csv(f'../predicting-satisfaction-using-graphs/csv/dataset/df_for_contrastive_learner_test_{test_size * 5}.csv')

In [368]:
# With both sizes left at their default of 0, the function returns the full
# pair table (cross product minus self-pairs) and writes no files.
product_df = conduct_train_test_set_product(df)

In [369]:
# Per-column value frequencies of the pair table (Series.value_counts replaces
# the deprecated top-level pd.value_counts).
counts = product_df.apply(pd.Series.value_counts)

# Number of pairs for every (label_for_contrastive1, label_for_contrastive2) combination.
mini_df = (
    product_df[['label_for_contrastive1', 'label_for_contrastive2']]
    .groupby(['label_for_contrastive1', 'label_for_contrastive2'])
    .size()
    .reset_index(name='count')
)

# DataFrame.pivot only accepts keyword arguments on pandas >= 2.0.
table = mini_df.pivot(index='label_for_contrastive1', columns='label_for_contrastive2', values='count')

In [370]:
# Rows: label_for_contrastive1, columns: label_for_contrastive2, cells: number of pairs.
table

label_for_contrastive2,0,1,2,3,4
label_for_contrastive1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,2550,9027,17187,15300,6885
1,9027,31152,59649,53100,23895
2,17187,59649,113232,101100,45495
3,15300,53100,101100,89700,40500
4,6885,23895,45495,40500,18090


In [372]:
# Request 2,000 training and 500 test pairs for each of the 5 x 5
# (label_for_contrastive1, label_for_contrastive2) combinations.
conduct_train_test_set_product(df, 2000, 500)