In [1]:
import pandas as pd
import os
from importlib import reload
from helpers import constants; reload(constants)
import re

In [2]:
# Pull the project-wide settings from helpers.constants into short local names.
# NOTE(review): the meaning of each directory is inferred from the constant
# names only - confirm against helpers/constants.py.
raw_data_path = constants.RAW_TEXT_DIR  # not referenced by the cells visible in this notebook
base_data_path = constants.BASE_DATA_DIR  # destination directory of post_df_short.pickle
max_len = constants.MAX_SEQUENCE_LENGTH  # passed to truncate_text as length_keep
intermediate_path = constants.ITM_DATA_DIR  # directory post_df.pickle is read from

In [3]:
# Load the post table written by an earlier stage of the pipeline.
# NOTE: unpickling can execute arbitrary code - only read files you trust.
# os.path.join avoids hand-built separators (and a doubled '/' when the
# configured directory already ends with one).
df_posts = pd.read_pickle(os.path.join(intermediate_path, 'post_df.pickle'))

In [5]:
# Print the post stored under row label 100 as plain text (print avoids the
# quoted/escaped repr) to eyeball what the raw text looks like.
print(df_posts.loc[100,'post'])

   Err... i think i get his point. Sorry 'bout that...  Well, maybe for you it won't be a critical status to be "psychedelic", but when you're in there knowing nothing about what's going to happen next, then "kabloosh..." =P   But the reason i smoke? I really don't know. But now all i know is that i have this one "judgement stick" left in my W-Lights' case. If i take that one, i won't smoke again... darn.  


In [31]:
# (number of rows, number of columns) of the loaded post table.
df_posts.shape

(409798, 6)

In [32]:
# attach one column recording the length of the post
def post_length(text):
    '''
    Count the word tokens in a post.
    A token is a maximal run of regex "word" characters (letters, digits,
    underscore), so punctuation splits tokens: "it's" counts as two.
    Argument:
    text: the post to measure
    Return:
    The number of tokens found (0 for an empty string)
    '''
    return sum(1 for _ in re.finditer(r'\w+', text))

# Token count of every post, in the same order as the rows of df_posts.
length_list = [post_length(post) for post in df_posts.loc[:, 'post']]

# Stored positionally, so the list must have exactly one entry per row.
df_posts['len'] = length_list

In [33]:
# Preview the first rows, including the 'len' column attached above.
df_posts.head()

Unnamed: 0,post,id,gender,age,industry,sign,len
0,Hello to anyone who's reading this. I've be...,2973911,male,17,Student,Sagittarius,143
1,"Gosh, Jer. This is a blog. Not a chatroom. ...",2973911,male,17,Student,Sagittarius,92
2,Hey! people please come here! I'm dying......,2973911,male,17,Student,Sagittarius,19
3,"Is that ""mandough"" the mando I know? Or so...",2973911,male,17,Student,Sagittarius,37
4,"Omigodessess!!!! What the fork! Hey, doofu...",2973911,male,17,Student,Sagittarius,18


In [34]:
def divide_text(text, length_keep):
    '''
    Cut a post down to two excerpts: its first length_keep words and its
    last length_keep words. Words are whitespace-separated tokens, and each
    excerpt is re-joined with single spaces.
    The two excerpts overlap when the post has fewer than 2 * length_keep
    words, and are identical when it has length_keep words or fewer.
    Argument:
    text: the post that need to be divided
    length_keep: the maximum number of words kept in each excerpt
                 (expected to be a non-negative integer)
    Return:
    A list of two strings: [leading excerpt, trailing excerpt]
    '''
    # str.split() without a separator collapses whitespace runs and never
    # yields empty strings, so no extra filtering of '' is needed.
    word_list = text.split()
    first_half = ' '.join(word_list[:length_keep])
    # word_list[-0:] is the whole list, so a zero budget needs its own case
    # to yield an empty trailing excerpt.
    tail_words = word_list[-length_keep:] if length_keep else []
    second_half = ' '.join(tail_words)
    return [first_half, second_half]

In [35]:
def combine_text_label(post_list, label_list):
    '''
    Build one row per post by putting the post text in front of the labels.
    Argument:
    post_list: the texts, one output row is produced for each
    label_list: the list of label values shared by every row
    Return:
    A list of new lists, each being [post] followed by the labels
    '''
    rows = []
    for post in post_list:
        rows.append([post] + label_list)
    return rows

In [36]:
def truncate_text(df_posts, length_keep):
    '''
    Transform the post dataframe into a new dataframe which contains the truncated version of the post.
    A post whose 'len' value is at most length_keep is copied unchanged.
    A longer post is divided with divide_text into a leading and a trailing
    excerpt; each distinct excerpt becomes its own row and inherits every
    other column (including 'len', which keeps the original post's length).
    Argument:
    df_posts: dataframe with at least the columns 'post' and 'len'
    length_keep: the maximum number of words kept per row
    Return:
    A new dataframe with the same columns as df_posts and a fresh 0..n-1
    index; df_posts itself is not modified. An empty input gives an empty
    dataframe with the same columns.
    '''
    col_name = df_posts.columns
    # Collect plain records and build the frame once at the end: creating and
    # concatenating one small DataFrame per post is prohibitively slow on
    # hundreds of thousands of rows and fails outright on an empty input.
    rows = []
    for record in df_posts.to_dict('records'):
        if record['len'] <= length_keep: # if the post is short enough, no truncation is needed
            rows.append(record)
            continue
        # 'len' counts regex word tokens while divide_text counts
        # whitespace-separated words, so a post can exceed length_keep by the
        # former and still fit by the latter; both excerpts are then the same
        # text. dict.fromkeys drops that duplicate while keeping the order.
        for piece in dict.fromkeys(divide_text(record['post'], length_keep)):
            row = dict(record)
            # Replace the text by column name rather than by position, so the
            # result does not depend on 'post' being the first column.
            row['post'] = piece
            rows.append(row)
    return pd.DataFrame(rows, columns=col_name)

In [37]:
# Cap every post at max_len words; over-long posts contribute their leading
# and trailing excerpts as separate rows.
df_posts_short = truncate_text(df_posts=df_posts, length_keep=max_len)

In [38]:
# 'len' was only needed to decide which posts to split. Rebind instead of
# mutating in place, and ignore an already-missing column, so re-running this
# cell is a no-op rather than a KeyError.
df_posts_short = df_posts_short.drop(columns='len', errors='ignore')

In [39]:
# Persist the truncated posts for the downstream stages. os.path.join avoids
# hand-built separators (and a doubled '/' when the configured directory
# already ends with one).
df_posts_short.to_pickle(os.path.join(base_data_path, 'post_df_short.pickle'))