# Text features

Construct prior factors associated with response types 

In [2]:
#setup
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import urllib
import json
import time
import itertools
import random
import requests
import re
import os
import nltk
import string
import warnings
import tqdm
import seaborn as sns
from collections import defaultdict
from operator import itemgetter
from datetime import datetime, timedelta
from scipy import stats
from dateutil import parser
from pathlib import Path
from matplotlib import rcParams
from collections import ChainMap
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

warnings.filterwarnings('ignore')

rcParams['figure.figsize'] = (10, 6)
rcParams['figure.dpi'] = 150
rcParams['lines.linewidth'] = 2
rcParams['axes.grid'] = False
rcParams['axes.facecolor'] = 'white'
rcParams['font.size'] = 14
rcParams['patch.edgecolor'] = 'none'

def remove_border(axes=None, top=False, right=False, left=True, bottom=True):
    """
    Minimize chartjunk by stripping out unnecessary plot borders and axis ticks
    
    The top/right/left/bottom keywords toggle whether the corresponding plot border is drawn
    """
    ax = axes or plt.gca()
    ax.spines['top'].set_visible(top)
    ax.spines['right'].set_visible(right)
    ax.spines['left'].set_visible(left)
    ax.spines['bottom'].set_visible(bottom)
    
    #turn off all ticks
    ax.yaxis.set_ticks_position('none')
    ax.xaxis.set_ticks_position('none')
    
    #now re-enable visibles
    if top:
        ax.xaxis.tick_top()
    if bottom:
        ax.xaxis.tick_bottom()
    if left:
        ax.yaxis.tick_left()
    if right:
        ax.yaxis.tick_right()

What does the dataset look like?

It should be at the comment/conversatio level. There is only one comment coming from each conversation. there should be identifiers of conversations and page ids in case I want to do fixed effects. 


**About users**
* author_anon: author is anonoymous, not registered
* author_admin: author is admin or not
* author experience: 
    * author_edits_talk: how many edits on this talk page
    * author_edits_article: how many edits on this article
    * author_edits_wiki: use the edit counts in API (inaccurate), or query user's all edits?
    * author_tenure: how long since registered - in months
* num_speakers: how many speakers involved prior to the focal comment
* has_admin: is there an admin in prior comments? 



**About talk page community**
* page_year: in which year was the page created?
* page_edits_talk: how many edits by the time of focal comment on talk page
* page_edits_article: how many edits so far on article
* page_freq_talk: average time between new edits on the talk page
* page_freq_article: average tiem between edits on article



**About networks**
* build interaction network prior to focal comment. try several definitions of ties: 

(1) reply-to, 
(2) bipartite of user-conversation, 
(3) consecutive edits on talk pages

* network position of the author (centrality)
    * author_central_degree
    * author_central_between
    * author_central_close
    * author_central_eigen
* prior interaction level of users in the conversation, prior to focal comment 


* page_density: network density
* page_hierarchy: star structure. polarization of user activity. number/presence of extremely active users. imbalance
* clusters/factions


***

**Should not include**
* total length of conversation - post-toxic information shouldn't be used to predict toxic

In [3]:
# get bot list
with open('../data/bots.txt', 'r') as f:
    bots = f.readlines()
bots = [b.strip('\n') for b in bots]

In [4]:
matches = pd.read_csv('../data/matches_full2.csv', parse_dates=['timestamp'])
print(matches.shape)
matches.head()

(6863, 27)


Unnamed: 0,ancestor_id,authors,cleaned_content,content,conversation_id,id,indentation,isUnchanged,page_id,page_title,...,user_id,user_text,file_number,toxic,row_new,row_new2,toxic_new,toxic_new2,pos,mod_by_bot
0,24941859.1647.1647,ANONYMOUS:66.81.142.89,So the article is not neutral because you thi...,:: So the article is not neutral because you t...,24183500.0.0,24941859.1647.1647,2,False,2778861,Talk:A. E. Wilder-Smith,...,,66.81.142.89,52,True,True,True,True,True,4,False
1,67928757.6071.6071,691327:Steve Hart,"Well, as I'm sure you know, WP cite sources, ...",": Well, as I'm sure you know, WP cite sources,...",67888171.4644.4644,67928757.6071.6071,1,False,2778861,Talk:A. E. Wilder-Smith,...,691327.0,Steve Hart,52,False,True,True,False,False,3,False
2,258190421.9495.9485,8509347:P Cezanne,What was the cause?,What was the cause? [[User:P Cezanne|P Cezanne...,258190421.9485.9485,258190421.9495.9485,0,False,1182107,Talk:A. J. Ayer,...,8509347.0,P Cezanne,81,False,True,True,False,False,1,False
3,33407073.587.587,129409:SlimVirgin,"If it's a source, it should go in a references...","::If it's a source, it should go in a referenc...",33406513.241.241,33407073.587.587,2,False,1182107,Talk:A. J. Ayer,...,129409.0,SlimVirgin,81,True,True,True,True,True,3,False
4,532828193.32635.32635,40487:Sj,Here's a mention in the Economist: Everything...,:: Here's a mention in the Economist: [http://...,532664191.23453.23453,532828193.32635.32635,2,False,2850785,Talk:Aaron Swartz,...,40487.0,Sj,236,False,True,True,False,False,9,False


In [5]:
features = pd.read_csv('../data/bio_features0.csv', parse_dates=['timestamp'])
print(features.shape)
features.head()

(6863, 12)


Unnamed: 0,id,page_id,page_title,file_number,conversation_id,rev_id,user_id,user_text,toxic,pos,timestamp,conversation_start
0,24941859.1647.1647,2778861,Talk:A. E. Wilder-Smith,52,24183500.0.0,24941859,,66.81.142.89,True,4,2005-10-07 02:33:48+00:00,2005-09-27 20:39:03+00:00
1,67928757.6071.6071,2778861,Talk:A. E. Wilder-Smith,52,67888171.4644.4644,67928757,691327.0,Steve Hart,False,3,2006-08-06 02:13:39+00:00,2006-08-05 21:28:57+00:00
2,258190421.9495.9485,1182107,Talk:A. J. Ayer,81,258190421.9485.9485,258190421,8509347.0,P Cezanne,False,1,2008-12-15 20:24:31+00:00,2008-12-15 20:24:31+00:00
3,33407073.587.587,1182107,Talk:A. J. Ayer,81,33406513.241.241,33407073,129409.0,SlimVirgin,True,3,2005-12-31 19:58:26+00:00,2005-12-31 19:52:03+00:00
4,532828193.32635.32635,2850785,Talk:Aaron Swartz,236,532664191.23453.23453,532828193,40487.0,Sj,False,9,2013-01-13 07:45:38+00:00,2013-01-12 07:17:02+00:00


In [6]:
starts = pd.read_csv('../data/bio_talks_prior.csv', parse_dates=['timestamp'])

## Outline

* pos: turn position in the conversation, i.e. how many prior turns
* start: whether focal comment is the first comment of the conversation??? - how is it in relation to pos?
* if author has spoken before the focal comment
* *is it possible to add repeated interaction, like dyads, or triads?*
* median minutes between posts prior to focal comment. assign 0 if start==True???
* burstiness
* timestamp: year, month, weekend, part_of_day (morning, afternoong, evening, night)
* sentiment median and (max-min) prior to focal comment
* sentiment of focal comment
* politeness strategy
* question types asked 
* median length of prior posts 
* reaction time 
    * how many turns between toxic comment and the reaction comment
    * how many minutes
    * for archived comments, create a dummy has_response and assign a value 0 or max to reaction time???

## Turn position - exclude section heading

### <font color='red'>Issues</font>

* Pos includes section header or not?

In [7]:
features.pos.describe()

count    6863.000000
mean        4.905726
std         7.924538
min         0.000000
25%         1.000000
50%         2.000000
75%         5.000000
max        64.000000
Name: pos, dtype: float64

In [8]:
# topcode pos 
tc = 5
print(sum(features.pos > tc), round(np.mean(features.pos > tc), 3))

1692 0.247


In [9]:
features['pos_tc'] = np.where(features.pos > tc, tc, features.pos)
features['pos_tc'].value_counts()

1    2147
5    1966
0    1162
2     629
3     576
4     383
Name: pos_tc, dtype: int64

## Whether conversation starts with this comment

In [10]:
features['startswith'] = (features.pos == 0)
features.startswith.value_counts()

False    5701
True     1162
Name: startswith, dtype: int64

## Recount pos without section header. Recreate pos and startswith

In [11]:
results = defaultdict(list)

for i, start in starts.groupby('marker'):
    
    # remove section header if any
    start = start[start.type!='CREATION']
    
    results['id'].append(start.iloc[-1]['id'])
    results['pos2'].append(start.shape[0] - 1)

In [12]:
df = pd.DataFrame(results)
df['startswith2'] = df.pos2 == 0
df.head()

Unnamed: 0,id,pos2,startswith2
0,100056213.25366.25366,2,False
1,10011304.27024.26589,8,False
2,10011440.16241.16241,9,False
3,100161230.490.490,2,False
4,100202560.123.74,0,True


In [13]:
df.startswith2.value_counts()

False    3769
True     3094
Name: startswith2, dtype: int64

In [14]:
features = features.merge(df, on='id')

In [15]:
sum(features.pos != features.pos2)

5153

In [16]:
# removing section header is equivalent to moving later text forward one step
(features.pos - features.pos2).value_counts() 

1    5153
0    1710
dtype: int64

## Repeated exchanges

In [17]:
def get_repeated_exchange(row, window=3):
    '''
    Given a row, get the comments before (including) this row. 
    Return (1) if the conversation structure A-B-A is in the comments, 
    (2) if author of the given row is involved in the A-B-A structure.
    
    If not enough three comments, return (False, False).
    '''
    
    if row['pos2'] < window-1:
        return (False, False)
    
    author = row['user_text']
    start = starts[starts.marker==row['id']]
    
    # keep only one row per rev_id
    start = start[start.type!='CREATION']
    turn_df = start.drop_duplicates('rev_id')

    # sequence of who talked
    seq = turn_df.user_text.tolist()

    
    # get segments by the given moving window
    segments = [seq[i : i+window] for i in range(len(seq)-window+1)]
    
    if window == 3:
        
        return any([s[0]==s[2] for s in segments]), any([author in s for s in segments if s[0]==s[2]])
    
    elif window == 4:
        return any([s[0]==s[2] and s[1]==s[3] for s in segments]), any([author in s for s in segments if s[0]==s[2] and s[1]==s[3]])
    

In [18]:
double = features.apply(get_repeated_exchange, axis=1)
features['repeat3'] = list(double.map(itemgetter(0)))
features['author_in_repeat3'] = list(double.map(itemgetter(1)))

In [19]:
pd.crosstab(features.startswith2, features.repeat3)

repeat3,False,True
startswith2,Unnamed: 1_level_1,Unnamed: 2_level_1
False,2032,1737
True,3094,0


In [70]:
features[features.repeat3==True].iloc[0].user_text

'66.81.142.89'

In [67]:
talk_df = pd.read_csv('../data/talk_page_bio_select/52.csv')
df = talk_df[talk_df.conversation_id=='24183500.0.0']

In [68]:
df

Unnamed: 0,ancestor_id,authors,cleaned_content,content,conversation_id,id,indentation,isUnchanged,page_id,page_title,...,user_id,user_text,file_number,toxic,row_new,row_new2,toxic_new,toxic_new2,pos,mod_by_bot
19,24183500.0.0,108029:ACW,NPOV tag.,== NPOV tag. ==\n,24183500.0.0,24183500.0.0,-1,False,2778861,Talk:A. E. Wilder-Smith,...,108029.0,ACW,52,False,True,True,False,False,0,False
20,24183500.16.0,108029:ACW,", the NPOV tag incorporates text that says, ""S...","[[User:Duncharris|Duncharris]], the NPOV tag i...",24183500.0.0,24183500.16.0,0,False,2778861,Talk:A. E. Wilder-Smith,...,108029.0,ACW,52,False,True,True,False,False,1,False
21,24197031.541.541,47291:Duncharris,The NPOV tag stays into the problems are sort...,: The NPOV tag stays into the problems are sor...,24183500.0.0,24197031.541.541,1,False,2778861,Talk:A. E. Wilder-Smith,...,47291.0,Duncharris,52,False,True,True,False,False,2,False
22,24232384.1647.1647,108029:ACW,Thanks for the quick response. I just wanted ...,::Thanks for the quick response. I just wante...,24183500.0.0,24232384.1647.1647,2,False,2778861,Talk:A. E. Wilder-Smith,...,108029.0,ACW,52,False,True,True,False,False,3,False
23,24941859.1647.1647,ANONYMOUS:66.81.142.89,So the article is not neutral because you thi...,:: So the article is not neutral because you t...,24183500.0.0,24941859.1647.1647,2,False,2778861,Talk:A. E. Wilder-Smith,...,,66.81.142.89,52,True,True,True,True,True,4,False
24,25067154.1886.1886,108029:ACW,"The above paragraph was inserted, ''not by me...","::: The above paragraph was inserted, ''not by...",24183500.0.0,25067154.1886.1886,3,False,2778861,Talk:A. E. Wilder-Smith,...,108029.0,ACW,52,False,True,True,False,False,5,False
25,25912653.2386.2386,108670:Hob Gadling,"Well, Wilder Smith dates back to the fifties, ...",":::Well, Wilder Smith dates back to the fiftie...",24183500.0.0,25912653.2386.2386,3,False,2778861,Talk:A. E. Wilder-Smith,...,108670.0,Hob Gadling,52,False,True,True,False,False,6,False
26,30559528.2722.2722,577967:Arthurkoestler,I read one of his books about twenty years ago...,I read one of his books about twenty years ago...,24183500.0.0,30559528.2722.2722,0,False,2778861,Talk:A. E. Wilder-Smith,...,577967.0,Arthurkoestler,52,False,True,True,False,False,7,False
27,38354233.3137.3137,ANONYMOUS:80.141.125.157,The problem is that for many evolutionist ther...,The problem is that for many evolutionist ther...,24183500.0.0,38354233.3137.3137,0,False,2778861,Talk:A. E. Wilder-Smith,...,,80.141.125.157,52,False,True,True,False,False,8,False
28,38354233.3137.3137,ANONYMOUS:80.141.125.157,The problem is that for many evolutionist ther...,The problem is that for many evolutionist ther...,24183500.0.0,38354393.3137.3137,0,True,2778861,Talk:A. E. Wilder-Smith,...,,80.141.125.157,52,False,False,True,False,False,9,False


In [20]:
pd.crosstab(features.author_in_repeat3, features.repeat3)

repeat3,False,True
author_in_repeat3,Unnamed: 1_level_1,Unnamed: 2_level_1
False,5126,865
True,0,872


## Time gaps between talks

In [21]:
def get_time_between(row, unit='day'):
    '''
    Return a list of time between successive revisions
    '''
    
    if row['startswith2']:
        return []
    
    # conversation before (including) current row
    start = starts[starts.marker==row['id']]
        
    # keep only one row per rev_id
    start = start[start.type!='CREATION']
    turn_df = start.drop_duplicates('rev_id')
        
    # time gaps
    secs = turn_df.timestamp.diff().dt.total_seconds()
    secs = secs[1: ] # drop first value nan

    intervals = []
    
    if unit == 'minute':
        intervals = secs / 60
    
    elif unit == 'hour':
        intervals = secs / 3600
    
    elif unit == 'day':
        intervals = secs / 3600 / 24
        
    return intervals.tolist()

In [22]:
features['days_btw'] = features.apply(get_time_between, axis=1)
features['hours_btw'] = features.apply(lambda row: get_time_between(row, unit='hour'), axis=1)

features[['days_btw', 'hours_btw']].head()

Unnamed: 0,days_btw,hours_btw
0,"[0.1247800925925926, 0.5876388888888889, 8.533...","[2.9947222222222223, 14.103333333333333, 204.8..."
1,"[0.0004398148148148148, 0.19726851851851854]","[0.010555555555555556, 4.734444444444445]"
2,[],[]
3,"[0.002615740740740741, 0.0018171296296296297]","[0.06277777777777778, 0.043611111111111114]"
4,"[0.023240740740740742, 0.4900231481481481, 0.0...","[0.5577777777777778, 11.760555555555555, 0.485..."


In [23]:
# median days between successive revisions
num_gaps = features.days_btw.map(len)

features['med_gaps_days'] = np.where(num_gaps==0, 0, features.days_btw.map(np.median))

# median hours between successive revisions
features['med_gaps_hours'] = np.where(num_gaps==0, 0, features.hours_btw.map(np.median))

### Burstiness

In [24]:
def get_burstiness(series):
    m = np.mean(series)
    sd = np.std(series)
    return (sd - m) / (sd + m)

features['burst_days'] = np.where(num_gaps==0, 0, features.days_btw.map(get_burstiness))

features['burst_hours'] = np.where(num_gaps==0, 0, features.hours_btw.map(get_burstiness))

In [25]:
all(features.burst_hours == features.burst_days)

False

In [26]:
np.corrcoef(features.burst_days, features.burst_hours)

array([[1., 1.],
       [1., 1.]])

## Sentiment in prior comments

Not very accurate though

In [27]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer as VS
sentiment_analyzer = VS()

# for example
s = 'he smells like poo'
sentiment_analyzer.polarity_scores(s)

{'neg': 0.0, 'neu': 0.545, 'pos': 0.455, 'compound': 0.3612}

In [28]:
starts.cleaned_content = starts.cleaned_content.fillna('')

In [29]:
# sentiment analysis

def get_values(text):
    sentiment_dict = sentiment_analyzer.polarity_scores(text)
    return sentiment_dict['neg'], sentiment_dict['compound']


def negative_prior(row):
    '''
    Return number of negative comments. measured in two ways.
    '''

    if row['startswith2']:
        return (0, 0)
    
    # get prior conversation
    start = starts[starts.marker==row['id']]
    start = start[start.type!='CREATION']
    start = start.iloc[:-1]
    
    vals = start.cleaned_content.map(get_values)

    negatives = vals.map(itemgetter(0))
    compounds = vals.map(itemgetter(1))
        
    return sum(negatives > 0), sum(compounds <= -0.05)
                

In [30]:
tada = features.apply(negative_prior, axis=1)

In [31]:
features['neg_prior'] = tada.map(itemgetter(0))
features['neg_prior2'] = tada.map(itemgetter(1))

## Sentiment of current comment

In [32]:
def get_current_sentiment(text):
    vals = sentiment_analyzer.polarity_scores(text)
    return (vals['pos'], vals['neg'], vals['compound'])


matches.cleaned_content = matches.cleaned_content.fillna('')
values = matches.cleaned_content.map(get_current_sentiment)

In [33]:
matches['pos_current'] = values.map(itemgetter(0))
matches['neg_current'] = values.map(itemgetter(1))
matches['senti_current'] = values.map(itemgetter(2))

In [34]:
features = features.merge(matches[['id', 'pos_current', 'neg_current', 'senti_current']], on='id')
features

Unnamed: 0,id,page_id,page_title,file_number,conversation_id,rev_id,user_id,user_text,toxic,pos,...,hours_btw,med_gaps_days,med_gaps_hours,burst_days,burst_hours,neg_prior,neg_prior2,pos_current,neg_current,senti_current
0,24941859.1647.1647,2778861,Talk:A. E. Wilder-Smith,52,24183500.0.0,24941859,,66.81.142.89,True,4,...,"[2.9947222222222223, 14.103333333333333, 204.8...",0.587639,14.103333,0.112007,0.112007,2,1,0.177,0.071,0.4767
1,67928757.6071.6071,2778861,Talk:A. E. Wilder-Smith,52,67888171.4644.4644,67928757,691327.0,Steve Hart,False,3,...,"[0.010555555555555556, 4.734444444444445]",0.098854,2.372500,-0.002230,-0.002230,1,0,0.219,0.042,0.6924
2,258190421.9495.9485,1182107,Talk:A. J. Ayer,81,258190421.9485.9485,258190421,8509347.0,P Cezanne,False,1,...,[],0.000000,0.000000,0.000000,0.000000,0,0,0.000,0.000,0.0000
3,33407073.587.587,1182107,Talk:A. J. Ayer,81,33406513.241.241,33407073,129409.0,SlimVirgin,True,3,...,"[0.06277777777777778, 0.043611111111111114]",0.002216,0.053194,-0.694690,-0.694690,0,0,0.000,0.000,0.0000
4,532828193.32635.32635,2850785,Talk:Aaron Swartz,236,532664191.23453.23453,532828193,40487.0,Sj,False,9,...,"[0.5577777777777778, 11.760555555555555, 0.485...",0.030052,0.721250,0.176279,0.176279,5,3,0.000,0.000,0.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6858,307018439.43174.43150,3325702,Talk:Ányos Jedlik,79440,307018439.43150.43150,307018439,869314.0,Dicklyon,False,1,...,[],0.000000,0.000000,0.000000,0.000000,0,0,0.180,0.000,0.7841
6859,160525037.307.276,2467547,Talk:Évariste Galois,79603,160525037.276.276,160525037,4368562.0,SophomoricPedant,True,1,...,[],0.000000,0.000000,0.000000,0.000000,0,0,0.000,0.000,0.0000
6860,21150249.24.0,2467547,Talk:Évariste Galois,79603,21150249.0.0,21150249,84330.0,Rama,False,1,...,[],0.000000,0.000000,0.000000,0.000000,0,0,0.150,0.082,0.6015
6861,91404508.189.189,7212654,Talk:‘Abd al-Razzaq al-San‘ani,79679,91392170.32.32,91404508,146986.0,Gene Nygaard,True,1,...,[1.1194444444444445],0.046644,1.119444,-1.000000,-1.000000,0,0,0.144,0.165,-0.1010


## Politeness strategies of prior comments

In [35]:
# politeness strategies

from convokit import Corpus, User, Utterance, Parser, PolitenessStrategies

# construct corpus object

ps = PolitenessStrategies()
annotator = Parser()

def make_corpus(df):
    corpus_users = {k: User(name = k) for k in df.user_text.tolist()}

    utterance_corpus = {row['id']: 
                        Utterance(row['id'], corpus_users[row['user_text']], 
                                  row['conversation_id'], None, None, row['cleaned_content'])
                        for i, row in df.iterrows()}

    corpus = Corpus(utterances=list(utterance_corpus.values()), version=1)
    corpus = annotator.fit_transform(corpus)
    
    return corpus

In [36]:
starts.cleaned_content = starts.cleaned_content.fillna('')

In [40]:
ps_prev = pd.read_csv('../data/bio_politeness_strategies.csv')
new_comments = starts[~starts.id.isin(ps_prev.id)]
new_comments.shape

(704, 28)

In [41]:
corpus = make_corpus(new_comments)
corpus = ps.transform(corpus)

In [42]:
utterance_ids = corpus.get_utterance_ids()
rows = [corpus.get_utterance(uid).meta["politeness_strategies"] for uid in utterance_ids]
politeness_strategies = pd.DataFrame(rows, index=utterance_ids)
politeness_strategies = politeness_strategies.reset_index()
politeness_strategies = politeness_strategies.rename(columns={'index': 'id'})
politeness_strategies

Unnamed: 0,id,feature_politeness_==Please==,feature_politeness_==Please_start==,feature_politeness_==Indirect_(btw)==,feature_politeness_==Hedges==,feature_politeness_==Factuality==,feature_politeness_==Deference==,feature_politeness_==Gratitude==,feature_politeness_==Apologizing==,feature_politeness_==1st_person_pl.==,...,feature_politeness_==2nd_person==,feature_politeness_==2nd_person_start==,feature_politeness_==Indirect_(greeting)==,feature_politeness_==Direct_question==,feature_politeness_==Direct_start==,feature_politeness_==SUBJUNCTIVE==,feature_politeness_==INDICATIVE==,feature_politeness_==HASHEDGE==,feature_politeness_==HASPOSITIVE==,feature_politeness_==HASNEGATIVE==
0,103839272.4260.4260,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,103839272.4285.4260,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,1
2,107961459.21736.21736,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,107961459.21793.21736,0,0,0,0,0,0,0,0,0,...,0,0,0,1,1,0,0,1,1,1
4,107961603.21793.21793,0,0,0,0,0,0,0,0,0,...,0,0,0,1,1,0,0,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
683,93062760.0.115353,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
684,93062760.0.115370,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
685,93062760.0.116522,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
686,93062911.115353.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [43]:
politeness_strategies = politeness_strategies.append(ps_prev, ignore_index=True)

In [782]:
# politeness_strategies.to_csv('../data/bio_politeness_strategies.csv', index=False)

In [44]:
# whether each politeness strategy was ever used prior to current comment


starts_polite = starts.merge(politeness_strategies, on='id', how='left')

polite_cols = politeness_strategies.columns.tolist()
polite_cols.remove('id')

tmp_cols = politeness_strategies.columns.tolist()
tmp_cols.append('feature_politeness_all')
place_holder = pd.DataFrame(columns=tmp_cols)


for i, start in tqdm.tqdm(starts_polite.groupby('marker')):
    
    # trim
    start = start[start.type!='CREATION'][polite_cols] # no section header
    start = start.iloc[:-1] # no current row

    # column sum
    row = start.sum()
    row['feature_politeness_all'] = row.sum()
    row['id'] = i
    
    place_holder = place_holder.append(row, ignore_index=True)
    

100%|██████████| 6863/6863 [01:20<00:00, 85.26it/s]


In [45]:
new_cols = {c: 'prior_'+c for c in place_holder.columns if c != 'id' and not c.startswith('prior_')}
place_holder = place_holder.rename(columns = new_cols)
print(place_holder.shape)
place_holder.head()

(6863, 23)


Unnamed: 0,id,prior_feature_politeness_==Please==,prior_feature_politeness_==Please_start==,prior_feature_politeness_==Indirect_(btw)==,prior_feature_politeness_==Hedges==,prior_feature_politeness_==Factuality==,prior_feature_politeness_==Deference==,prior_feature_politeness_==Gratitude==,prior_feature_politeness_==Apologizing==,prior_feature_politeness_==1st_person_pl.==,...,prior_feature_politeness_==2nd_person_start==,prior_feature_politeness_==Indirect_(greeting)==,prior_feature_politeness_==Direct_question==,prior_feature_politeness_==Direct_start==,prior_feature_politeness_==SUBJUNCTIVE==,prior_feature_politeness_==INDICATIVE==,prior_feature_politeness_==HASHEDGE==,prior_feature_politeness_==HASPOSITIVE==,prior_feature_politeness_==HASNEGATIVE==,prior_feature_politeness_all
0,100056213.25366.25366,0,0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,2,1,2,7
1,10011304.27024.26589,1,0,0,3,1,0,0,1,2,...,0,0,2,4,0,0,6,6,8,47
2,10011440.16241.16241,0,0,0,2,1,0,0,0,1,...,1,0,0,1,0,0,5,7,7,38
3,100161230.490.490,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,2,0,4
4,100202560.123.74,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [46]:
features = features.merge(place_holder, on='id')
features.head()

Unnamed: 0,id,page_id,page_title,file_number,conversation_id,rev_id,user_id,user_text,toxic,pos,...,prior_feature_politeness_==2nd_person_start==,prior_feature_politeness_==Indirect_(greeting)==,prior_feature_politeness_==Direct_question==,prior_feature_politeness_==Direct_start==,prior_feature_politeness_==SUBJUNCTIVE==,prior_feature_politeness_==INDICATIVE==,prior_feature_politeness_==HASHEDGE==,prior_feature_politeness_==HASPOSITIVE==,prior_feature_politeness_==HASNEGATIVE==,prior_feature_politeness_all
0,24941859.1647.1647,2778861,Talk:A. E. Wilder-Smith,52,24183500.0.0,24941859,,66.81.142.89,True,4,...,0,0,2,2,0,1,2,1,2,20
1,67928757.6071.6071,2778861,Talk:A. E. Wilder-Smith,52,67888171.4644.4644,67928757,691327.0,Steve Hart,False,3,...,0,0,0,0,0,0,1,1,1,8
2,258190421.9495.9485,1182107,Talk:A. J. Ayer,81,258190421.9485.9485,258190421,8509347.0,P Cezanne,False,1,...,0,0,0,0,0,0,0,0,0,0
3,33407073.587.587,1182107,Talk:A. J. Ayer,81,33406513.241.241,33407073,129409.0,SlimVirgin,True,3,...,0,0,0,0,0,0,0,1,0,3
4,532828193.32635.32635,2850785,Talk:Aaron Swartz,236,532664191.23453.23453,532828193,40487.0,Sj,False,9,...,0,0,0,1,0,0,6,5,5,29


## Politeness strategy of current comment

In [47]:
features = features.merge(politeness_strategies, on='id')

## Current comment length

In [48]:
features = features.merge(matches[['id', 'cleaned_content']], on='id')

In [49]:
# length of current post
features['len_chars'] = features.cleaned_content.map(len)
features['len_words'] = features.cleaned_content.map(lambda text: len(text.split()))

In [50]:
features[['len_chars', 'len_words']].head()

Unnamed: 0,len_chars,len_words
0,236,42
1,168,35
2,21,4
3,178,37
4,62,10


## Current comment includes url, wiki rule or user name

In [51]:
from spacy.lang.en import English


parser = English()


foxes = [] # tmp dataframe

for i, start in starts.groupby('marker'):

    text = start.iloc[-1]['cleaned_content'].lower()
    
    if len(text) == 0:
        a_fox = {'id': i, 'has_url': False, 'has_wikirule': False, 'has_username': False}
        
    else:
        tokens = parser(text)
        author = start.iloc[-1]['user_text']
        users = [u.lower() for u in start.user_text.unique() if u != author and not pd.isnull(u)]

        a_fox = {'id': i, 
                 'has_url': any([token.like_url for token in tokens]),
                 'has_wikirule': any([token.orth_.startswith('wp:') for token in tokens]),
                 'has_username': any([token.orth_ in u for u in users for token in tokens])
                }
        
    foxes.append(a_fox)
    

In [52]:
f = pd.DataFrame(foxes)
f.head()

Unnamed: 0,id,has_url,has_wikirule,has_username
0,100056213.25366.25366,False,False,True
1,10011304.27024.26589,False,False,True
2,10011440.16241.16241,False,False,True
3,100161230.490.490,False,False,True
4,100202560.123.74,False,False,False


In [53]:
features = features.merge(f, on='id')

## Prior comments median and max length

In [54]:
# median length of prior posts

def prior_comments_len(row):
    '''
    Return median, max num_characters and median, max num_words together.
    '''
    
    if row['startswith2'] == 1:
        return (0, 0, 0, 0)
    
    # get prior conversation
    start = starts[starts.marker==row['id']]
    start = start[start.type!='CREATION']
    start = start.iloc[:-1]
    
        
    nchars = start.cleaned_content.map(len)
    nwords = start.cleaned_content.map(lambda text: len(text.split()))
    
    return (np.median(nchars), max(nchars), np.median(nwords), max(nwords))

In [55]:
aaa = features.apply(prior_comments_len, axis=1)

In [56]:
features['len_chars_prior_median'] = list(aaa.map(itemgetter(0)))
features['len_chars_prior_max'] = list(aaa.map(itemgetter(1)))
features['len_words_prior_median'] = list(aaa.map(itemgetter(2)))
features['len_words_prior_max'] = list(aaa.map(itemgetter(3)))
features.head()

Unnamed: 0,id,page_id,page_title,file_number,conversation_id,rev_id,user_id,user_text,toxic,pos,...,cleaned_content,len_chars,len_words,has_url,has_wikirule,has_username,len_chars_prior_median,len_chars_prior_max,len_words_prior_median,len_words_prior_max
0,24941859.1647.1647,2778861,Talk:A. E. Wilder-Smith,52,24183500.0.0,24941859,,66.81.142.89,True,4,...,So the article is not neutral because you thi...,236,42,False,False,True,478.0,1039,79.0,179
1,67928757.6071.6071,2778861,Talk:A. E. Wilder-Smith,52,67888171.4644.4644,67928757,691327.0,Steve Hart,False,3,...,"Well, as I'm sure you know, WP cite sources, ...",168,35,False,False,True,687.0,1374,117.0,234
2,258190421.9495.9485,1182107,Talk:A. J. Ayer,81,258190421.9485.9485,258190421,8509347.0,P Cezanne,False,1,...,What was the cause?,21,4,False,False,False,0.0,0,0.0,0
3,33407073.587.587,1182107,Talk:A. J. Ayer,81,33406513.241.241,33407073,129409.0,SlimVirgin,True,3,...,"If it's a source, it should go in a references...",178,37,False,False,True,103.5,125,18.0,22
4,532828193.32635.32635,2850785,Talk:Aaron Swartz,236,532664191.23453.23453,532828193,40487.0,Sj,False,9,...,Here's a mention in the Economist: Everything...,62,10,False,False,True,206.0,5549,32.5,855


## Time between comment and response

In [58]:
responses = pd.read_csv('../data/responses_bio_updated2.csv', parse_dates=['timestamp'])
responses.head()

Unnamed: 0,comment_id,id,type_dup,reply,step,ancestor_id,authors,cleaned_content,content,conversation_id,...,file_number,toxic,row_new,row_new2,toxic_new,toxic_new2,pos,mod_by_bot,user_text_comment,same_user
0,100056213.25366.25366,100058183.25366.25366,MODIFICATION,False,0,100056213.25366.25366,28381:Goethean,"This was known ages before Nietzsche, and it w...","This was known ages before Nietzsche, and it w...",87117156.23385.23385,...,24751.0,False,False,True,False,False,4.0,False,201.19.143.44,0.0
1,10011304.27024.26589,10966162.443.32875,DELETION,False,0,10011304.27024.26589,138288:Keetoowah,"As far as I can tell, you have had days and da...","::::As far as I can tell, you have had days an...",8766155.18819.18819,...,13742.0,False,False,False,False,False,17.0,False,Keetoowah,0.0
2,10011440.16241.16241,10166765.17044.17044,MODIFICATION,False,1,10011440.16241.16241,138288:Keetoowah,I don't know who wrote the response to my comm...,::I don't know who wrote the response to my co...,7580569.11639.11639,...,13742.0,False,False,True,False,False,13.0,False,Keetoowah,0.0
3,100161230.490.490,200435884.1579.1579,MODIFICATION,False,0,100161230.490.490,ANONYMOUS:75.2.252.205,"A problem in this section, it just pulls these...","--A problem in this section, it just pulls the...",60599174.9.9,...,56808.0,False,False,True,False,False,5.0,False,75.2.252.205,0.0
4,100202560.123.74,785864387.2481.2485,MODIFICATION,False,3,100202560.123.74,ANONYMOUS:24.118.107.3,This article seems to be mostly concerned with...,This article seems to be mostly concerned with...,100202560.74.74,...,60791.0,False,False,True,False,False,7.0,False,24.118.107.3,0.0


In [59]:
times = features[['id', 'timestamp', 'pos']]

times_response = responses[['comment_id', 'timestamp', 'pos']]
times_response.columns = ['id', 'timestamp_response', 'pos_response']

times = times.merge(times_response, on='id', how='left')
times

Unnamed: 0,id,timestamp,pos,timestamp_response,pos_response
0,24941859.1647.1647,2005-10-07 02:33:48+00:00,4,2005-10-08 16:39:34+00:00,5.0
1,67928757.6071.6071,2006-08-06 02:13:39+00:00,3,NaT,
2,258190421.9495.9485,2008-12-15 20:24:31+00:00,1,2013-10-03 11:29:07+00:00,2.0
3,33407073.587.587,2005-12-31 19:58:26+00:00,3,2005-12-31 20:26:30+00:00,5.0
4,532828193.32635.32635,2013-01-13 07:45:38+00:00,9,2013-01-14 22:48:10+00:00,10.0
...,...,...,...,...,...
6858,307018439.43174.43150,2009-08-09 18:55:36+00:00,1,2009-08-09 19:29:34+00:00,2.0
6859,160525037.307.276,2007-09-26 18:50:25+00:00,1,NaT,
6860,21150249.24.0,2005-08-16 17:13:25+00:00,1,2005-08-16 17:38:49+00:00,2.0
6861,91404508.189.189,2006-12-01 16:45:31+00:00,1,2006-12-02 04:09:04+00:00,2.0


In [60]:
def find_response_time(row):
    
    if pd.isnull(row['timestamp_response']):
        return None
    
    return (row['timestamp_response'] - row['timestamp']).total_seconds() / 60 / 60
    


features['respond_in_hours'] = times.apply(find_response_time, axis=1)
features['respond_in_days'] = features.respond_in_hours / 24
features['respond_in_turns'] = times.pos_response - times.pos - 1
features.head()

Unnamed: 0,id,page_id,page_title,file_number,conversation_id,rev_id,user_id,user_text,toxic,pos,...,has_url,has_wikirule,has_username,len_chars_prior_median,len_chars_prior_max,len_words_prior_median,len_words_prior_max,respond_in_hours,respond_in_days,respond_in_turns
0,24941859.1647.1647,2778861,Talk:A. E. Wilder-Smith,52,24183500.0.0,24941859,,66.81.142.89,True,4,...,False,False,True,478.0,1039,79.0,179,38.096111,1.587338,0.0
1,67928757.6071.6071,2778861,Talk:A. E. Wilder-Smith,52,67888171.4644.4644,67928757,691327.0,Steve Hart,False,3,...,False,False,True,687.0,1374,117.0,234,,,
2,258190421.9495.9485,1182107,Talk:A. J. Ayer,81,258190421.9485.9485,258190421,8509347.0,P Cezanne,False,1,...,False,False,False,0.0,0,0.0,0,42063.076667,1752.628194,0.0
3,33407073.587.587,1182107,Talk:A. J. Ayer,81,33406513.241.241,33407073,129409.0,SlimVirgin,True,3,...,False,False,True,103.5,125,18.0,22,0.467778,0.019491,1.0
4,532828193.32635.32635,2850785,Talk:Aaron Swartz,236,532664191.23453.23453,532828193,40487.0,Sj,False,9,...,False,False,True,206.0,5549,32.5,855,39.042222,1.626759,0.0


In [61]:
features.to_csv('../data/bio_features_text.csv', index=False)