In [5]:
import re
import math
import string
import itertools
import pandas as pd
from itertools import chain
from statistics import mean
from string import punctuation

In [100]:
# Reading in the data.

samp = pd.read_csv('stats_sample.csv', error_bad_lines = False, encoding = 'latin1')
samp = samp.dropna()

whole_set = pd.read_csv('stats_clean.csv', error_bad_lines = False, encoding = 'latin1')
whole_set = whole_set.dropna()
data = whole_set[0:5000]

In [7]:
# Helper function which splits text input into individual sentances. Used in generation of f4.

acronyms = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
alphabets= "([A-Za-z])"
digits = "([0-9])"
prefixes = "(Mr|St|Mrs|Ms|Dr)[.]"
suffixes = "(Inc|Ltd|Jr|Sr|Co)"
starters = "(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
websites = "[.](com|net|org|io|gov)"

def split_into_sentences(text):
    text = " " + text + "  "
    text = text.replace("\n"," ")
    text = re.sub(prefixes,"\\1<prd>",text)
    text = re.sub(websites,"<prd>\\1",text)
    if "Ph.D" in text: text = text.replace("Ph.D.","Ph<prd>D<prd>")
    text = re.sub("\s" + alphabets + "[.] "," \\1<prd> ",text)
    text = re.sub(acronyms+" "+starters,"\\1<stop> \\2",text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>\\3<prd>",text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>",text)
    text = re.sub(" "+suffixes+"[.] "+starters," \\1<stop> \\2",text)
    text = re.sub(" "+suffixes+"[.]"," \\1<prd>",text)
    text = re.sub(" " + alphabets + "[.]"," \\1<prd>",text)
    text = re.sub(digits + "[.]" + digits,"\\1<prd>\\2",text) 
    if "”" in text: text = text.replace(".”","”.")
    if "\"" in text: text = text.replace(".\"","\".")
    if "!" in text: text = text.replace("!\"","\"!")
    if "?" in text: text = text.replace("?\"","\"?")
    text = text.replace(".",".<stop>")
    text = text.replace("?","?<stop>")
    text = text.replace("!","!<stop>")
    text = text.replace("<prd>",".")
    sentences = text.split("<stop>")
    sentences = sentences[:-1]
    sentences = [s.strip() for s in sentences]
    return sentences

In [104]:
def calculate_punctuation_stats(dataset):
    
    punc_stats = pd.DataFrame()
    
    thresh = 10
    for writing_sample in dataset['text']:
        
        # Get the demographic info in dictionary form:
        age = int(dataset.loc[dataset['text'] == writing_sample, 'age'].iloc[0])
        gender = dataset.loc[dataset['text'] == writing_sample, 'gender'].iloc[0]
        author = dataset.loc[dataset['text'] == writing_sample, 'author'].iloc[0]
        fulllink = dataset.loc[dataset['text'] == writing_sample, 'full_link'].iloc[0]
        dem_dict = {'Age': age, 'Gender': gender, 'Author': author, 'Full Link': fulllink}
    
        # Remove new line
        writing_sample = writing_sample.replace('\n', " ")

        # Removing all non-ASCII letters
        writing_sample = writing_sample.encode("ascii", "ignore").decode()
    
        # Remove URLs from comments
        writing_sample = re.sub(r'\(https?:\/\/.*?\)', '', writing_sample)

        # Now, we need to check for instances of elipses and assign them to a new symbol--less confusion with periods, easier analysis.
        # The tilde will represent an elipse from here on out.
        writing_sample = writing_sample.replace('...', '~')

        # We will loop over every character in the string and extract the punctuation, appending each mark to a list.
        # We will keep this punctuation sequence in both list and string form for further analysis.
        marks = ['.',',',':',';','?','!','-','(',')','"',"'",'~']
        marks_str = ''.join(map(str, marks))

        punct_list = []

        for character in writing_sample:
            if character in marks:
                punct_list.append(character)

        punct_str = ''.join(punct_list)
        
        #AHHHHH HELP
        if len(punct_list) <= thresh:
            continue
            
        # Now, we want to get our input in the following format..[2, '!', 3, '.'].
        split_by_punc = (x.split() for x in re.split('[' + marks_str + ']', writing_sample))
        wrds_btwn = list(map(len, split_by_punc))
        woven = list(chain.from_iterable(zip(wrds_btwn, punct_list)))
        
        # Develop punctuation stats for comments_text
        # Generation of F1, relative frequency of each punctuation mark of interest.
        num_period = punct_str.count('.')
        num_comma = punct_str.count(',')
        num_colon = punct_str.count(':')
        num_semicol = punct_str.count(';')
        num_question = punct_str.count('?')
        num_exclaim = punct_str.count('!')
        num_dash = punct_str.count('-')
        num_left_paren = punct_str.count('(')
        num_right_paren = punct_str.count(')')
        num_sing_quote = punct_str.count("'")
        num_doub_quote = punct_str.count('"')
        num_elipse = punct_str.count('~')

        mark_counts = [num_period, num_comma, num_colon, num_semicol, num_question, num_exclaim, num_dash, num_left_paren, num_right_paren, num_sing_quote, num_doub_quote, num_elipse] 
        F1 = [x / len(punct_list) for x in mark_counts]
    
        # Generation of F2 and F3, conditional/joint probability of observing two punctuation marks in succession.
        F2_dict = {}
        F3_dict = {}
        successive_pairs = []
    
        combos = list(itertools.product(marks_str, marks_str))
        indices = list(range(len(marks_str)))
        index_combos = list((itertools.product(indices, indices)))
        index_strings = [[str(x) for x in tup] for tup in index_combos]
        final_ind = list(map('-'.join, index_strings))
        #combos = list(itertools.product(marks_str, marks_str))
        #indices = list(range(marks_str)) 
        #index_combos = list(itertools.product(indices, indices)) 

        for i in combos:
            successive_pairs.append(''.join(i))
        
        zipped = list(zip(final_ind, successive_pairs)) 

        for (index, pair) in zipped:
            numerator = punct_str.count(pair)
            denominator = punct_str.count(pair[0])

            if numerator != 0:
                F2_dict['Conditional Prob' + ' ' + index] = (numerator/denominator)
                F3_dict['Joint Prob' + ' ' + index] = (numerator/len(punct_list))
                #F2_dict['Conditional Prob' + ' ' + index + ' ' + pair + ' '] = (numerator/denominator)
                #F3_dict['Joint Prob' + ' ' + index + ' ' + pair + ' '] = (numerator/len(punct_list))
            else:
                F2_dict['Conditional Prob' + ' ' + index] = 0
                F3_dict['Joint Prob' + ' ' + index] = 0
                #F2_dict['Conditional Prob' + ' ' + index + ' ' + pair + ' '] = 0
                #F3_dict['Joint Prob' + ' ' + index + ' ' + pair + ' '] = 0
            
        # Generation of f4, redditor's average sentence length in words.
        sent_lens = []
        max_sents = 200
        regex = "\w+('\w+)?(?<!('s))"
        sentences = split_into_sentences(writing_sample)

        for sentence in sentences:
            sent_lens.append(len(re.findall(regex, sentence)))

        if len(sent_lens) == 0:
            sent_lens.append(0)
    
        f4 = mean(sent_lens)

        # f4 as a probability distribution of sentence length--as it is in Darmon's work.
        # Cap sentence length at 199 words.
        F4 = [0]*max_sents
    
        sent_lens = [x if x < max_sents else max_sents-1 for x in sent_lens]

        for i in sent_lens:
            F4[i] += 1

        F4 = [x / len(sent_lens) for x in F4]
    
        # Generation of f5, redditor's average number of words between successive punctuation marks.
        if len(wrds_btwn) == 0:
            wrds_btwn.append(0)
        
        f5 = mean(wrds_btwn)

        # f5 as a probability distribution of number of words between successive marks--as it is in Darmon's work.
        # Cap number of in-between words at 39.
        max_wrds_btwn = 40
        wrds_btwn = [x if x < max_wrds_btwn else max_wrds_btwn-1 for x in wrds_btwn]
        F5 = [0]*max_wrds_btwn

        for i in wrds_btwn:
            F5[i] += 1

        F5 = [x / len(wrds_btwn) for x in F5]
    
        # Generation of f6, ratio of punctuation to words.
        total_punc = len(punct_list)
        total_words = len(re.findall(regex, writing_sample))

        f6 = total_punc/total_words
    
        # Output to dataframe.
        # An uppercase F signals that the feature was mimicked directly from Darmon's work.
        F1_dict_keys = ['Period Freq', 'Comma Freq', 'Colon Freq', 'Semicol Freq', 'Question Freq', 'Exclaim Freq', 'Dash Freq', 'L. Paren Freq', 'R. Paren Freq', 'S. Quote Freq', 'D. Quote Freq', 'Elipse Freq']
        F1_dict = {F1_dict_keys[i]: F1[i] for i in range(len(F1_dict_keys))} 

        F4_dict = {'Prob Sentence Len:' + ' ' + str(i): F4[i] for i in range(len(F4))}
        F5_dict = {'Prob' + ' ' + str(i) + ' ' + 'Words Between Punc': F5[i] for i in range(len(F5))}

        f4_dict = {'Avg Sentence Len': f4}
        f5_dict = {'Avg Wrds Btwn Punc': f5}
        f6_dict = {'Punc Ratio': f6}
    
        # Concatinate all the keys into a single list.
        master = {**dem_dict, **F1_dict, **F2_dict, **F3_dict, **F4_dict, **F5_dict, **f4_dict, **f5_dict, **f6_dict}
    
        punc_stats = punc_stats.append(master, ignore_index = True)
    return punc_stats  

In [105]:
out = calculate_punctuation_stats(data)

KeyboardInterrupt: 

In [98]:
out

Unnamed: 0,Age,Author,Avg Sentence Len,Avg Wrds Btwn Punc,Colon Freq,Comma Freq,Conditional Prob 0-0,Conditional Prob 0-1,Conditional Prob 0-10,Conditional Prob 0-11,...,Prob Sentence Len: 95,Prob Sentence Len: 96,Prob Sentence Len: 97,Prob Sentence Len: 98,Prob Sentence Len: 99,Punc Ratio,Question Freq,R. Paren Freq,S. Quote Freq,Semicol Freq
0,28.0,lunabuddy,21.880911,8.041833,0.011722,0.251578,0.170467,0.259501,0.355049,0.004343,...,0.0,0.0,0.0,0.0,0.000000,0.133032,0.041779,0.014728,0.245567,0.001803
1,55.0,Peenutbuttjellytime,18.305303,7.388064,0.005750,0.240656,0.230442,0.286565,0.266156,0.011905,...,0.0,0.0,0.0,0.0,0.000000,0.143850,0.040253,0.008913,0.220242,0.001725
2,24.0,DatingAdviceTA1224,14.984375,9.959184,0.040000,0.240000,0.377049,0.245902,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.000000,0.104275,0.030000,0.010000,0.020000,0.000000
3,18.0,Purple_Cherry_Hunter,20.521552,6.238520,0.024242,0.341818,0.117371,0.347418,0.103286,0.023474,...,0.0,0.0,0.0,0.0,0.000000,0.173283,0.021818,0.019394,0.110303,0.055758
4,21.0,kafkascockroachgrace,21.843866,12.660297,0.107066,0.152034,0.309524,0.158730,0.000000,0.007937,...,0.0,0.0,0.0,0.0,0.003717,0.079287,0.119914,0.089936,0.000000,0.019272
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,28.0,ottawagurl,14.625000,8.415000,0.005128,0.307692,0.272727,0.376623,0.000000,0.025974,...,0.0,0.0,0.0,0.0,0.000000,0.119048,0.051282,0.030769,0.000000,0.000000
4996,22.0,3-m-a-i-l,17.129870,7.353535,0.000000,0.055556,0.121212,0.000000,0.696970,0.030303,...,0.0,0.0,0.0,0.0,0.000000,0.146558,0.136364,0.040404,0.429293,0.000000
4997,25.0,TheSixthMambo,12.573276,5.649635,0.010545,0.244288,0.167568,0.367568,0.205405,0.000000,...,0.0,0.0,0.0,0.0,0.000000,0.195063,0.042179,0.029877,0.195079,0.001757
4998,34.0,Debate-Informal,16.711656,8.375000,0.020468,0.114035,0.221477,0.134228,0.416107,0.000000,...,0.0,0.0,0.0,0.0,0.000000,0.125551,0.000000,0.008772,0.347953,0.000000


In [99]:
out.to_csv()

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.


In [106]:
whole_set_stats = calculate_punctuation_stats(whole_set)

In [108]:
whole_set_stats.dropna()

Unnamed: 0,Age,Author,Avg Sentence Len,Avg Wrds Btwn Punc,Colon Freq,Comma Freq,Conditional Prob 0-0,Conditional Prob 0-1,Conditional Prob 0-10,Conditional Prob 0-11,...,Prob Sentence Len: 95,Prob Sentence Len: 96,Prob Sentence Len: 97,Prob Sentence Len: 98,Prob Sentence Len: 99,Punc Ratio,Question Freq,R. Paren Freq,S. Quote Freq,Semicol Freq
0,28.0,lunabuddy,21.880911,8.041833,0.011722,0.251578,0.170467,0.259501,0.355049,0.004343,...,0.0,0.0,0.0,0.0,0.000000,0.133032,0.041779,0.014728,0.245567,0.001803
1,55.0,Peenutbuttjellytime,18.305303,7.388064,0.005750,0.240656,0.230442,0.286565,0.266156,0.011905,...,0.0,0.0,0.0,0.0,0.000000,0.143850,0.040253,0.008913,0.220242,0.001725
2,24.0,DatingAdviceTA1224,14.984375,9.959184,0.040000,0.240000,0.377049,0.245902,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.000000,0.104275,0.030000,0.010000,0.020000,0.000000
3,18.0,Purple_Cherry_Hunter,20.521552,6.238520,0.024242,0.341818,0.117371,0.347418,0.103286,0.023474,...,0.0,0.0,0.0,0.0,0.000000,0.173283,0.021818,0.019394,0.110303,0.055758
4,21.0,kafkascockroachgrace,21.843866,12.660297,0.107066,0.152034,0.309524,0.158730,0.000000,0.007937,...,0.0,0.0,0.0,0.0,0.003717,0.079287,0.119914,0.089936,0.000000,0.019272
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26513,26.0,bonky24,12.526316,9.182482,0.007812,0.023438,0.418605,0.023256,0.209302,0.000000,...,0.0,0.0,0.0,0.0,0.000000,0.105960,0.023438,0.007812,0.187500,0.000000
26514,19.0,Rmonie01,17.812500,9.447368,0.000000,0.081081,0.230769,0.000000,0.769231,0.000000,...,0.0,0.0,0.0,0.0,0.000000,0.110448,0.054054,0.000000,0.486486,0.000000
26515,24.0,thiccvegan_,11.189655,5.825000,0.025000,0.191667,0.142857,0.238095,0.142857,0.047619,...,0.0,0.0,0.0,0.0,0.000000,0.183486,0.041667,0.016667,0.183333,0.008333
26516,23.0,markrediot,10.645161,6.545455,0.000000,0.166667,0.300000,0.100000,0.100000,0.000000,...,0.0,0.0,0.0,0.0,0.000000,0.163636,0.370370,0.000000,0.203704,0.000000


In [109]:
whole_set_stats

Unnamed: 0,Age,Author,Avg Sentence Len,Avg Wrds Btwn Punc,Colon Freq,Comma Freq,Conditional Prob 0-0,Conditional Prob 0-1,Conditional Prob 0-10,Conditional Prob 0-11,...,Prob Sentence Len: 95,Prob Sentence Len: 96,Prob Sentence Len: 97,Prob Sentence Len: 98,Prob Sentence Len: 99,Punc Ratio,Question Freq,R. Paren Freq,S. Quote Freq,Semicol Freq
0,28.0,lunabuddy,21.880911,8.041833,0.011722,0.251578,0.170467,0.259501,0.355049,0.004343,...,0.0,0.0,0.0,0.0,0.000000,0.133032,0.041779,0.014728,0.245567,0.001803
1,55.0,Peenutbuttjellytime,18.305303,7.388064,0.005750,0.240656,0.230442,0.286565,0.266156,0.011905,...,0.0,0.0,0.0,0.0,0.000000,0.143850,0.040253,0.008913,0.220242,0.001725
2,24.0,DatingAdviceTA1224,14.984375,9.959184,0.040000,0.240000,0.377049,0.245902,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.000000,0.104275,0.030000,0.010000,0.020000,0.000000
3,18.0,Purple_Cherry_Hunter,20.521552,6.238520,0.024242,0.341818,0.117371,0.347418,0.103286,0.023474,...,0.0,0.0,0.0,0.0,0.000000,0.173283,0.021818,0.019394,0.110303,0.055758
4,21.0,kafkascockroachgrace,21.843866,12.660297,0.107066,0.152034,0.309524,0.158730,0.000000,0.007937,...,0.0,0.0,0.0,0.0,0.003717,0.079287,0.119914,0.089936,0.000000,0.019272
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26513,26.0,bonky24,12.526316,9.182482,0.007812,0.023438,0.418605,0.023256,0.209302,0.000000,...,0.0,0.0,0.0,0.0,0.000000,0.105960,0.023438,0.007812,0.187500,0.000000
26514,19.0,Rmonie01,17.812500,9.447368,0.000000,0.081081,0.230769,0.000000,0.769231,0.000000,...,0.0,0.0,0.0,0.0,0.000000,0.110448,0.054054,0.000000,0.486486,0.000000
26515,24.0,thiccvegan_,11.189655,5.825000,0.025000,0.191667,0.142857,0.238095,0.142857,0.047619,...,0.0,0.0,0.0,0.0,0.000000,0.183486,0.041667,0.016667,0.183333,0.008333
26516,23.0,markrediot,10.645161,6.545455,0.000000,0.166667,0.300000,0.100000,0.100000,0.000000,...,0.0,0.0,0.0,0.0,0.000000,0.163636,0.370370,0.000000,0.203704,0.000000


In [110]:
whole_set_stats.isnull().values.any()

False

In [113]:
whole_set_stats.values.tolist()

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.
