# Measure whether language norms were expressed explicitly in community guidelines

Outline of notebook:

- Search text for metalanguage 
- Search for imperative sentences with set of regular expressions  (have 2nd person in it) (Not done)
- Search set of ‘style’ words taken from some corpus? (Not done)

The metal-language search already seems to provide some pretty clear examples, not sure how to operationalize this, but at the very least could count the number of times these occur


In [1]:
%load_ext dotenv
%dotenv

import numpy as np
import csv as csv
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import math
import json
from scipy import stats
from datetime import datetime

from nltk import pos_tag
from nltk.util import pad_sequence
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from termcolor import colored

import praw
import requests
import json
import os

client_id = os.environ.get("client_id")
client_secret = os.environ.get("client_secret")
user_agent = os.environ.get("user_agent")

reddit = praw.Reddit(client_id=client_id,
                     client_secret=client_secret,
                     user_agent=user_agent)


 ### Search text for metalanguage
 https://shomir.net/pdf/publications/swilson_ijcnlp_2013.pdf and more specifically https://shomir.net/pdf/publications/swilson_cicling11.pdf -- shows top ten words that appear near metalanguage, very useful since if you 
take both the words before and after, you get around 70% coverage of mentions, which I think is pretty good
 This will be useful for mentions of individualized words, can also use LIWC for mentions of style, or quotes, sicne not sure LIWC has this.

In [5]:
### Test subreddits
### Test some subreddits randomly taken from the identity paper
subs = ['Cooking', 'news', 'Seahawks', 'Babybumps', 'politics', 'science', 'mentalhealth']
# tested_subs = ['science', 'politics', 'economics', 'depression', 'Cooking', 'pics', 'Naruto', 'BabyBumps']
rows = []
for s in subs:
    sub = reddit.subreddit(s)
    rows.append({'subreddit': s , 'descr':sub.description, 'public_descr':sub.public_description})
    
df_sub = pd.DataFrame(rows)


df_sub['full_descr'] = df_sub['descr'] + df_sub['public_descr']
df_sub

Unnamed: 0,descr,public_descr,subreddit,full_descr
0,#####Please read these\n\n1. All posts must be...,/r/Cooking is a place for the cooks of reddit ...,Cooking,#####Please read these\n\n1. All posts must be...
1,>* **[/r/inthenews](/r/inthenews?hl)**\n\n>#\n...,"/r/news is: real news articles, primarily but ...",news,>* **[/r/inthenews](/r/inthenews?hl)**\n\n>#\n...
2,"# [](/r/NFL ""/r/NFL"")\n\n> * [][nfc]\n> * [][a...",A community for fans of the Seattle Seahawks. ...,Seahawks,"# [](/r/NFL ""/r/NFL"")\n\n> * [][nfc]\n> * [][a..."
3,\n###All Bump Photos belong in our Stickied D...,"A place for pregnant redditors, those who have...",Babybumps,\n###All Bump Photos belong in our Stickied D...
4,## **Welcome to /r/Politics! Please read [the ...,/r/Politics is for news and discussion about U...,politics,## **Welcome to /r/Politics! Please read [the ...
5,# [Submission Rules](https://www.reddit.com/r/...,This community is a place to share and discuss...,science,# [Submission Rules](https://www.reddit.com/r/...
6,"**Welcome!**\nThis is a safe place to discuss,...",The Mental Health subreddit is the central for...,mentalhealth,"**Welcome!**\nThis is a safe place to discuss,..."


In [12]:
word_tokenize(df_sub.loc[0]['full_descr'])

['#',
 '[',
 'Submission',
 'Rules',
 ']',
 '(',
 'https',
 ':',
 '//www.reddit.com/r/science/wiki/rules',
 '#',
 'wiki_submission_rules',
 ')',
 '1',
 '.',
 'Directly',
 'link',
 'to',
 'published',
 'peer-reviewed',
 'research',
 'or',
 'media',
 'summary',
 '2',
 '.',
 'No',
 'summaries',
 'of',
 'summaries',
 ',',
 're-hosted',
 'press',
 'releases',
 ',',
 'reviews',
 ',',
 'or',
 'reposts',
 '3',
 '.',
 'Research',
 'must',
 'be',
 'less',
 'than',
 '6',
 'months',
 'old',
 '4',
 '.',
 'No',
 'editorialized',
 ',',
 'sensationalized',
 ',',
 'or',
 'biased',
 'titles',
 '5',
 '.',
 'No',
 'blogspam',
 ',',
 'images',
 ',',
 'videos',
 ',',
 'or',
 'infographics',
 '6',
 '.',
 'All',
 'submissions',
 'must',
 'have',
 'flair',
 'assigned',
 '#',
 '[',
 'Comment',
 'Rules',
 ']',
 '(',
 'https',
 ':',
 '//www.reddit.com/r/science/wiki/rules',
 '#',
 'wiki_comment_rules',
 ')',
 '1',
 '.',
 'No',
 'off-topic',
 'comments',
 ',',
 'memes',
 ',',
 'or',
 'jokes',
 '2',
 '.',
 'No',
 '

In [10]:
#### Some functions for finding metalanguage

### following the method of the above paper, stem and POS tag the words with NLTK
stemmer = PorterStemmer()

# tokenizes, stems, pads, and pos tags phrases, also returns a tuple of the pre-tagged and tagged phrase 
def process_phrase(phrase):
    # tokenize  
    phrase = word_tokenize(phrase)
    # pad the sequence for simplying printing surrounding words for below
    phrase = list(pad_sequence(phrase, 4, pad_left=True, pad_right=True, left_pad_symbol='<pad>', right_pad_symbol='</pad>'))
    # save a phrase before stemming
    pre_stemmed_phrase = phrase 
    # stem
    phrase = [stemmer.stem(word) for word in phrase]
    # pos tag
    phrase = pos_tag(phrase)
    return(pre_stemmed_phrase, phrase)


def print_phrase(print_len, word, tag, phrase, index):
    print(phrase[index-print_len:index], end=' ')
    print(colored(phrase[index], 'magenta'), end=' ')
    print(phrase[index+1:index+print_len])


def find_meta_lang(df_row, word_list):
    (pre_stemmed_phrase, phrase) = process_phrase(df_row['full_descr'])
    # number of chars to print before and after the word, useful for context in the phrase
    print_len = 10
    for i, (word, tag) in enumerate(phrase):
        if (word, tag) in word_list:
            print('-------------------------------------')
            print('found', end=' ')
            print(colored((word, tag), 'magenta'), end=' ')
            print('in', end=' ')
            print(colored(df_row['subreddit'], 'green'))
            print('-------------------------------------')
            print_phrase(print_len, word, tag, phrase, i)
            print('++++++++++++++++')
            # print for easier reading
            print_phrase(print_len, word, tag, pre_stemmed_phrase, i)
            return phrase[i-print_len:i+print_len]
            
def find_quotes(df_row, quote_list):
    (pre_stemmed_phrase, phrase) = process_phrase(df_row['full_descr'])
    print_len = 10
    for i, (word, tag) in enumerate(phrase):
        if tag in quote_list:
            print('-------------------------------------')
            print('found', end=' ')
            print(colored((word, tag), 'magenta'), end=' ')
            print('in', end=' ')
            print(colored( df_row['subreddit'], 'green'))
            print('-------------------------------------')
            print_phrase(print_len, word, tag, phrase, i)
            print('++++++++++++++++')
            # print for easier reading
            print_phrase(print_len, word, tag, pre_stemmed_phrase, i)
    


In [20]:
      
# meta words from the paper
meta_words = [('call', 'VB'),
    ('name', 'NN'), 
    ('name', 'VB'), 
    ('say', 'VB'), 
    ('term', 'NN'), 
    ('title', 'NN'), 
    ('title', 'VB'), 
    ('word', 'NN'), 
    ('write', 'VB'), 
    ('mean', 'VB'), 
    ('refer', 'VB'), 
    ('meaning', 'NN'), 
    ('translate', 'VB'), 
    ('phrase', 'NN'), 
    ('symbol', 'NN'), 
    ('pronounce', 'VB'), 
    ('tell', 'VB'), 
    ('letter', 'NN'), 
    ('pronunciation', 'NN'), 
    ('ask', 'VB'), 
    ('sentence', 'NN')]

df_sub.apply(lambda x: find_meta_lang(x, meta_words), axis=1)

-------------------------------------
found [35m('word', 'NN')[0m in [32mpolitics[0m
-------------------------------------
[('of', 'IN'), ('the', 'DT'), ('articl', 'NN'), ('is', 'VBZ'), ('in', 'IN'), ('all', 'DT'), ('cap', 'NN'), ('or', 'CC'), ('contain', 'VB'), ('the', 'DT')] [35m('word', 'NN')[0m [("'break", 'POS'), ("'", 'POS'), ('.', '.'), ('thi', 'NNS'), ('rule', 'NN'), ('may', 'MD'), ('be', 'VB'), ('appli', 'VBN'), ('to', 'TO')]
++++++++++++++++
['of', 'the', 'article', 'is', 'in', 'all', 'caps', 'or', 'contains', 'the'] [35mword[0m ["'Breaking", "'", '.', 'This', 'rule', 'may', 'be', 'applied', 'to']
-------------------------------------
found [35m('mean', 'VB')[0m in [32mdepression[0m
-------------------------------------
[('when', 'WRB'), ('thi', 'NN'), ('subreddit', 'NN'), ('is', 'VBZ'), ('rel', 'JJ'), ('quiet', 'JJ'), ('.', '.'), ('thi', 'VB'), ('doe', 'NN'), ('not', 'RB')] [35m('mean', 'VB')[0m [('no', 'DT'), ('one', 'CD'), ('care', 'NN'), ('.', '.'), ('If', '

0                                                 None
1    [(of, IN), (the, DT), (articl, NN), (is, VBZ),...
2                                                 None
3    [(when, WRB), (thi, NN), (subreddit, NN), (is,...
4    [(relat, NN), (., .), (after, IN), (all, DT), ...
5    [(to, TO), (other, JJ), (., .), (**, JJ), (per...
6    [(>, NNP), (>, NNP), (*, NNP), (titl, VB), (yo...
7    [(,, ,), (and, CC), (anyon, NN), (who, WP), (s...
dtype: object

In [15]:
### possible other words to test, this is based on looking at the data, and ideally some papers....?
reddit_words = [('post', 'NN'),
              ('comment', 'NN'),
              ('submission', 'NN'),
              ('moderator', 'NN'),
              ('flair', 'NN'),
              ('text', 'NN'),
             ]


df_sub.apply(lambda x: find_meta_lang(x, reddit_words), axis=1)



-------------------------------------
found [35m('comment', 'NN')[0m in [32mscience[0m
-------------------------------------
[('6', 'CD'), ('.', '.'), ('all', 'DT'), ('submiss', 'JJ'), ('must', 'MD'), ('have', 'VB'), ('flair', 'VBN'), ('assign', 'JJ'), ('#', '#'), ('[', 'JJ')] [35m('comment', 'NN')[0m [('rule', 'NN'), (']', 'NNP'), ('(', '('), ('http', 'NN'), (':', ':'), ('//www.reddit.com/r/science/wiki/rul', 'JJ'), ('#', '#'), ('wiki_comment_rul', 'NN'), (')', ')')]
++++++++++++++++
['6', '.', 'All', 'submissions', 'must', 'have', 'flair', 'assigned', '#', '['] [35mComment[0m ['Rules', ']', '(', 'https', ':', '//www.reddit.com/r/science/wiki/rules', '#', 'wiki_comment_rules', ')']
-------------------------------------
found [35m('comment', 'NN')[0m in [32mscience[0m
-------------------------------------
[('http', 'NN'), (':', ':'), ('//www.reddit.com/r/science/wiki/rul', 'JJ'), ('#', '#'), ('wiki_comment_rul', 'NN'), (')', ')'), ('1', 'CD'), ('.', '.'), ('No', 'DT'), ('of

0    None
1    None
2    None
3    None
4    None
5    None
6    None
7    None
dtype: object

In [16]:
# tags for quotes from pen treebank
quote_symbols = ['"', '``', '\'\'']

df_sub.apply(lambda x: find_quotes(x, quote_symbols), axis=1)

-------------------------------------
found [35m("'", "''")[0m in [32mpolitics[0m
-------------------------------------
[(']', 'NN'), ('(', '('), ('/r/politics/wiki/index', 'JJ'), ('#', '#'), ('wiki_be_civil', 'NN'), (')', ')'), ('Do', 'VBP'), ('not', 'RB'), ('post', 'VB'), ('user', 'NN')] [35m("'", "''")[0m [('person', 'NN'), ('information.|us', 'NNS'), ('who', 'WP'), ('violat', 'VBP'), ('thi', 'NN'), ('rule', 'NN'), ('will', 'MD'), ('be', 'VB'), ('ban', 'VBN')]
++++++++++++++++
[']', '(', '/r/politics/wiki/index', '#', 'wiki_be_civil', ')', 'Do', 'not', 'post', 'users'] [35m'[0m ['personal', 'information.|Users', 'who', 'violate', 'this', 'rule', 'will', 'be', 'banned']
-------------------------------------
found [35m('``', '``')[0m in [32mpolitics[0m
-------------------------------------
[('.', '.'), (']', 'NN'), ('(', '('), ('/r/politics/wiki/index', 'JJ'), ('#', '#'), ('wiki_disallowed_submission_types.3a', 'NN'), (')', ')'), ('Do', 'VBP'), ('not', 'RB'), ('use', 'VB')

0    None
1    None
2    None
3    None
4    None
5    None
6    None
7    None
dtype: object