In [4]:
from process_util import *
from random import sample

In [6]:
# read jsonl dataset
src = "/home/ubuntu/cs224u/raw_reddit/tldr-training-data.jsonl"
reader = jsonlines.open(src)

# choose a subreddit
input_subreddit = "relationships"

# create the directory to this corresponding dataset
dst = "/home/ubuntu/cs224u/new_relationships"+'/'+input_subreddit+'_story(small)'
os.mkdir(dst)

In [19]:
def create_preprocessed_story_file(input_dict, save_dir):
    '''
    input:
    input_dict: input dictionary, include information about its id, content, summary etc
    save_dir: a directory about where to save the story files
    reference: https://medium.com/@datamonsters/text-preprocessing-in-python-steps-tools-and-examples-bf025f872908
    here we preprocessed the content and the summary of the story by:
    1) get rid of extra space tab
    2) filter out those whose summary is too short/content is too short
    3) delete special characters like [...]
    4) [potential] Stemming (spend/spent/spends...)
    5) [potential] Lemmatization (do/done/did)
    '''
    dic_id = input_dict["id"]
    content = input_dict["content"]
    summary = input_dict['summary']
    if(summary.split() >3):
        # get rid of extra space tab
        content = re.sub('\s+', ' ', content).strip()
        summary = re.sub('\s+', ' ', summary).strip()    
        # get rid of words inside special characters
        content = re.sub("[\(\[].*?[\)\]]", "", content)
        summary = re.sub("[\(\[].*?[\)\]]", "", summary)

        filename = os.path.join(save_dir, dic_id + ".story")
        file1 = open(filename,"w")
        file1.writelines(content+'\n')
        file1.writelines('@highlight \n')
        file1.writelines(summary)
        file1.close()

In [20]:
# get a corresponding dataset
count = 0
dic_list = []
for dic in reader:
    if("subreddit" in dic.keys() and dic["subreddit"] == input_subreddit and 
       isEnglish(dic["summary"]) == True and  isEnglish(dic["content"]) == True ):
        dic_list.append(dic)

In [21]:
# create a small dataset
sample_list = sample(dic_list,100)
for dic in sample_list:
    create_preprocessed_story_file(dic, dst)        

In [22]:
dst_whole = "/home/ubuntu/cs224u/new_relationships"+'/'+input_subreddit+'_story(whole)'
os.mkdir(dst_whole)

In [24]:
# create the whole dataset 
#sample_list = sample(dic_list,100)
for dic in dic_list:
    create_preprocessed_story_file(dic, dst_whole)        

# get corresponding list

In [11]:
input_subreddit = "relationships"
dst = "/home/ubuntu/cs224u/new_relationships"+'/'+input_subreddit+'_story'
result_list = os.listdir(dst)
np.random.shuffle(result_list)
size = len(result_list)

train_list = result_list[0:int(0.8*size)-1]
train_str = "\n".join(x for x in train_list)

dev_list = result_list[int(0.8*size):int(0.9*size)-1]
dev_str = "\n".join(x for x in dev_list)

test_list = result_list[int(0.9*size): int(size)-1]
test_str = "\n".join(x for x in test_list)

print (len(train_list))
print (len(dev_list))
print (len(test_list))

169945
21242
21243


In [12]:
def create_list(subreddit_type, input_type, input_str):
    filename = os.path.join("/home/ubuntu/cs224u/new_"+ subreddit_type+"/", subreddit_type + input_type + "list.txt")
    print (filename)
    f = open(filename,"w")
    f.writelines(input_str)
    f.close() 

In [13]:
# create three lists
create_list(input_subreddit, "_train", train_str)
create_list(input_subreddit, "_val", dev_str)
create_list(input_subreddit, "_test", test_str)


/home/ubuntu/cs224u/new_relationships/relationships_trainlist.txt
/home/ubuntu/cs224u/new_relationships/relationships_vallist.txt
/home/ubuntu/cs224u/new_relationships/relationships_testlist.txt


# create a baseline result for the test set

In [43]:
# create the directory to the corresponding baseline result
make_dir = '/home/ubuntu/cs224u/processed_' +input_subreddit+'/baseline' 
os.mkdir(make_dir)

In [44]:
make_dec_dir = make_dir + '/decoded'
os.mkdir(make_dec_dir)

make_ref_dir = make_dir + '/reference'
os.mkdir(make_ref_dir)

In [51]:
# get the name of the test list
test_name_list = [x[:-6] for x in test_list]
test_name_list

['t3_1udtww',
 '6ejvy9',
 't3_3yzftu',
 't3_4bcuhi',
 'di3h5hh',
 '64tr5t',
 't3_29tth3',
 'd62d0ho',
 'd5axozy']

In [60]:
reader = jsonlines.open(src)
# create corresponding baseline summarization
for dic in reader:
    if("subreddit" in dic.keys() and dic["subreddit"] == input_subreddit and isEnglish(dic["content"]) == True):
        if(dic["id"] in test_name_list):
            print(dic["id"])
            create_baseline_summarization_file(dic, make_dec_dir)
            create_reference_file(dic, make_ref_dir)
        

t3_4bcuhi
t3_1udtww
t3_3yzftu
t3_29tth3
d62d0ho
d5axozy
di3h5hh
6ejvy9
64tr5t


# create an example.story (not relavant to here)

In [7]:
for dic in reader:
    if("subreddit" in dic.keys() and dic["subreddit"] == "AskReddit" and isEnglish(dic["summary"]) == True and  isEnglish(dic["content"]) == True ):
        print(dic)
        break

{u'body': u"I want to say this was about two weeks ago, could be less, but I haven't been sleeping very much lately and the days are starting to blur together.  My dream started off quite normal, going to familiar places and seeing people I knew.  However, I then went through a door and ended up on the middle of the interstate outside my current hometown, traffic not even bothering to attempt to avoid me.  It felt like Frogger.  Anyway, I came to the conclusion that I should just walk home to my old house which is about 180 miles from where I live now.  The entire way back, I began making up stories to keep myself entertained and somehow came upon the idea of creating a new background for my last name, coming up with traits and their stories for each letter in my last name.  Somehow I ended up in my childhood neighborhood in North Carolina, but my house in Florida had replaced the house there.  Every light in the neighborhood was off.  No streetlamps, no porch lights, the stars and moo

In [9]:
dic["content"]

u"I want to say this was about two weeks ago, could be less, but I haven't been sleeping very much lately and the days are starting to blur together.  My dream started off quite normal, going to familiar places and seeing people I knew.  However, I then went through a door and ended up on the middle of the interstate outside my current hometown, traffic not even bothering to attempt to avoid me.  It felt like Frogger.  Anyway, I came to the conclusion that I should just walk home to my old house which is about 180 miles from where I live now.  The entire way back, I began making up stories to keep myself entertained and somehow came upon the idea of creating a new background for my last name, coming up with traits and their stories for each letter in my last name.  Somehow I ended up in my childhood neighborhood in North Carolina, but my house in Florida had replaced the house there.  Every light in the neighborhood was off.  No streetlamps, no porch lights, the stars and moon were gon

In [18]:
len(dic["summary"].split())

3

In [None]:
# cwd = os.getcwd()
filename = os.path.join(cwd, "example.story")
file1 = open(filename,"w")
file1.writelines(dic["content"]+'\n')
#file1.writelines('@hightlight \n')
#file1.writelines(input_dict['summary'])
file1.close()

In [None]:
dic["content"]

In [None]:
dic["summary"]