In [18]:
import pandas as pd
import pathlib
import numpy as np

Load the data. We want to use the words level file, since it preserves the original sentence ordering. NOTE: might take a bit of time to load (more than a few seconds), since the file is quite large.

In [9]:
words_path = pathlib.Path.cwd().joinpath('data', 'annotations_word-level_all-to-date-2018-4-25.xlsx')

In [10]:
words = pd.read_excel(words_path)

In [12]:
words.count()

_id                     387548
content                 387525
docID                   387548
globalPsn               387548
highlightsBackground    387548
highlightsFindings      387548
highlightsMechanism     387548
highlightsMethod        387548
highlightsPurpose       387548
sentenceID              387548
sequence                387548
winningHighlight        349903
paperID                 387548
dtype: int64

In [17]:
print("There are", len(set(words['paperID'])), "unique docs")

There are 2071 unique docs


Then I think we want to group by paperID, make sure that it's sorted by globalPsn, and then make:
1. A new csv that has a coherent segment as a row, with order preserved.
2. Pseudo-docs that join these rows with a "fake" separator that Snorkel can use to split into candidates.

In [41]:
validTypes = set(['Method', 'Background', 'Findings', 'Mechanism', 'Purpose'])

In [37]:
print(set(words['winningHighlight']))

{nan, 'Method', 'None', 'Background', 'Findings', 'Mechanism', 'Purpose'}


In [43]:
segmented = []

for paperID, paperData in words.groupby('paperID'):
    
    # make sure everything is sorted first
    paperData = paperData.sort_values(by="globalPsn")
    
    # get the first row first
    # we're going to switch if the current highlight is different from the last one
    firstRow = paperData.iloc[0]
    lastType = firstRow['winningHighlight']
    segmentWords = [str(firstRow['content'])]
    sequence = 1
    #secondRowOnwards = paperData.iloc[1:]
    
    for index, row in paperData.iloc[1:].iterrows():
        
        # get currentType
        currentType = row['winningHighlight']
        # propagate past empty ones (this is mostly the periods)
        if currentType not in validTypes:
            currentType = lastType
        
        # compare with lastType
        if currentType != lastType:
            # we've switched types
            # process the last segment group
            joined = " ".join(segmentWords)
            segmented.append({
                'content': joined,
                'highlightType': lastType,
                'paperID': paperID,
                'sequence': sequence
            })
            # start a new segment group
            segmentWords = [str(row['content'])]
            sequence += 1
        else:
            # we're still in the same type
            # add to the current segment group
            segmentWords.append(str(row['content']))
            
        # set lastType for next comparison
        lastType = currentType
        
segmented = pd.DataFrame(segmented)
segmented.head(25)

Unnamed: 0,content,highlightType,paperID,sequence
0,Online communities have the potential to be su...,Background,2017_1,1
1,Using millions of messages sent in Twitch chat...,Method,2017_1,2
2,we explore the effectiveness of methods for en...,Purpose,2017_1,3
3,Consistent with aspects of imitation theory an...,Background,2017_1,4
4,users imitated examples of behavior that they ...,Findings,2017_1,5
5,Recent research has demonstrated that ( a ) gr...,Background,2017_10,1
6,The current study examines whether these resul...,Purpose,2017_10,2
7,In this study of teams playing the online game...,Method,2017_10,3
8,Forming work teams involves matching people wi...,Purpose,2017_11,1
9,"We introduce team dating, where people interac...",Mechanism,2017_11,2


Ok let's check a few to see what segments come out!

In [45]:
for paperID, paperData in segmented.head(25).groupby("paperID"):
    print("Paper:", paperID)
    print("*"*30)
    paperData = paperData.sort_values(by="sequence")
    for index, row in paperData.iterrows():
        print(row['highlightType'], row['content'])

Paper: 2017_1
******************************
Background Online communities have the potential to be supportive, cruel, or anywhere in between . The development of positive norms for interaction can help users build bonds, grow, and learn .
Method Using millions of messages sent in Twitch chatrooms,
Purpose we explore the effectiveness of methods for encouraging and discouraging specific behaviors, including taking advantage of imitation effects through setting positive examples and using moderation tools to discourage antisocial behaviors .
Background Consistent with aspects of imitation theory and deterrence theory,
Findings users imitated examples of behavior that they saw, and more so for behaviors from high status users . Proactive moderation tools, such as chat modes which restricted the ability to post certain content, proved effective at discouraging spam behaviors, while reactive bans were able to discourage a wider variety of behaviors .
Paper: 2017_10
************************

Now let's create Snorkel-style segments, split by periods.

In [52]:
segmented_periodSplit = []
for paperID, segmentedPaper in segmented.groupby("paperID"):
    sequence = 1
    for index, row in segmentedPaper.iterrows():
        for split in row['content'].split('.'):
            if len(split) > 1: # skip blank content
                segmented_periodSplit.append({
                    'content': split.strip(), # remove leading/trailing spaces from the period split
                    'highlightType': row['highlightType'],
                    'paperID': paperID,
                    'sequence': sequence
                })
            sequence += 1
segmented_periodSplit = pd.DataFrame(segmented_periodSplit)

In [53]:
segmented_periodSplit.head(50)

Unnamed: 0,content,highlightType,paperID,sequence
0,Online communities have the potential to be su...,Background,2017_1,1
1,The development of positive norms for interact...,Background,2017_1,2
2,Using millions of messages sent in Twitch chat...,Method,2017_1,4
3,we explore the effectiveness of methods for en...,Purpose,2017_1,5
4,Consistent with aspects of imitation theory an...,Background,2017_1,7
5,users imitated examples of behavior that they ...,Findings,2017_1,8
6,"Proactive moderation tools, such as chat modes...",Findings,2017_1,9
7,Recent research has demonstrated that ( a ) gr...,Background,2017_10,1
8,The current study examines whether these resul...,Purpose,2017_10,3
9,In this study of teams playing the online game...,Method,2017_10,5


In [58]:
out_path = pathlib.Path.cwd().joinpath('data', 'annotations_label-level_all-to-date-2018-4-25-WithTitle.labelled.originalSegments.csv')
segmented_periodSplit.to_csv(out_path, columns=['content', 'highlightType', 'paperID'], index=False, header=False)