In [2]:
import os
import re
import math
import string
import itertools
import pandas as pd
from itertools import chain
from statistics import mean
from string import punctuation

In [3]:
# Reading in the data.
samp = pd.read_csv('stats_sample.csv', error_bad_lines = False, encoding = 'latin1')
samp = samp.dropna()

whole_set = pd.read_csv('stats_clean.csv', error_bad_lines = False, encoding = 'latin1')
whole_set = whole_set.dropna()

# Bin the age column according to generation.
bins = [0, 24.5, 43.5, 55.5, 74.5, 100]
labels = ["GenZ", "GenY", "GenX", "BabyBoomers", "Traditionalists"]
whole_set['age'] = pd.cut(whole_set['age'], bins=bins, labels=labels)

In [12]:
# Subset the data.
five_thou = whole_set[0:5000]
three_thou = whole_set[0:3000]
two_thou = whole_set[0:2000]
one_thou = whole_set[0:1000]

In [7]:
# Helper function which splits text input into individual sentances. Used in generation of f4.

acronyms = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
alphabets= "([A-Za-z])"
digits = "([0-9])"
prefixes = "(Mr|St|Mrs|Ms|Dr)[.]"
suffixes = "(Inc|Ltd|Jr|Sr|Co)"
starters = "(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
websites = "[.](com|net|org|io|gov)"

def split_into_sentences(text):
    text = " " + text + "  "
    text = text.replace("\n"," ")
    text = re.sub(prefixes,"\\1<prd>",text)
    text = re.sub(websites,"<prd>\\1",text)
    if "Ph.D" in text: text = text.replace("Ph.D.","Ph<prd>D<prd>")
    text = re.sub("\s" + alphabets + "[.] "," \\1<prd> ",text)
    text = re.sub(acronyms+" "+starters,"\\1<stop> \\2",text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>\\3<prd>",text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>",text)
    text = re.sub(" "+suffixes+"[.] "+starters," \\1<stop> \\2",text)
    text = re.sub(" "+suffixes+"[.]"," \\1<prd>",text)
    text = re.sub(" " + alphabets + "[.]"," \\1<prd>",text)
    text = re.sub(digits + "[.]" + digits,"\\1<prd>\\2",text) 
    if "”" in text: text = text.replace(".”","”.")
    if "\"" in text: text = text.replace(".\"","\".")
    if "!" in text: text = text.replace("!\"","\"!")
    if "?" in text: text = text.replace("?\"","\"?")
    text = text.replace(".",".<stop>")
    text = text.replace("?","?<stop>")
    text = text.replace("!","!<stop>")
    text = text.replace("<prd>",".")
    sentences = text.split("<stop>")
    sentences = sentences[:-1]
    sentences = [s.strip() for s in sentences]
    return sentences

In [10]:
def calculate_punctuation_stats(dataset):
    
    punc_stats = pd.DataFrame()
    
    thresh = 10
    for writing_sample in dataset['text']:
        
        # Get the demographic info in dictionary form:
        age = dataset.loc[dataset['text'] == writing_sample, 'age'].iloc[0]
        gender = dataset.loc[dataset['text'] == writing_sample, 'gender'].iloc[0]
        author = dataset.loc[dataset['text'] == writing_sample, 'author'].iloc[0]
        fulllink = dataset.loc[dataset['text'] == writing_sample, 'full_link'].iloc[0]
        dem_dict = {'Age': age, 'Gender': gender, 'Author': author, 'Full Link': fulllink}
    
        # Remove new line
        writing_sample = writing_sample.replace('\n', " ")

        # Removing all non-ASCII letters
        writing_sample = writing_sample.encode("ascii", "ignore").decode()
    
        # Remove URLs from comments
        writing_sample = re.sub(r'\(https?:\/\/.*?\)', '', writing_sample)

        # Now, we need to check for instances of elipses and assign them to a new symbol--less confusion with periods, easier analysis.
        # The tilde will represent an elipse from here on out.
        writing_sample = writing_sample.replace('...', '~')

        # We will loop over every character in the string and extract the punctuation, appending each mark to a list.
        # We will keep this punctuation sequence in both list and string form for further analysis.
        marks = ['.',',',':',';','?','!','-','(',')','"',"'",'~']
        marks_str = ''.join(map(str, marks))

        punct_list = []

        for character in writing_sample:
            if character in marks:
                punct_list.append(character)

        punct_str = ''.join(punct_list)
        
        #AHHHHH HELP
        if len(punct_list) <= thresh:
            continue
            
        # Now, we want to get our input in the following format..[2, '!', 3, '.'].
        split_by_punc = (x.split() for x in re.split('[' + marks_str + ']', writing_sample))
        wrds_btwn = list(map(len, split_by_punc))
        woven = list(chain.from_iterable(zip(wrds_btwn, punct_list)))
        
        # Develop punctuation stats for comments_text
        # Generation of F1, relative frequency of each punctuation mark of interest.
        num_period = punct_str.count('.')
        num_comma = punct_str.count(',')
        num_colon = punct_str.count(':')
        num_semicol = punct_str.count(';')
        num_question = punct_str.count('?')
        num_exclaim = punct_str.count('!')
        num_dash = punct_str.count('-')
        num_left_paren = punct_str.count('(')
        num_right_paren = punct_str.count(')')
        num_sing_quote = punct_str.count("'")
        num_doub_quote = punct_str.count('"')
        num_elipse = punct_str.count('~')

        mark_counts = [num_period, num_comma, num_colon, num_semicol, num_question, num_exclaim, num_dash, num_left_paren, num_right_paren, num_sing_quote, num_doub_quote, num_elipse] 
        F1 = [x / len(punct_list) for x in mark_counts]
    
        # Generation of F2 and F3, conditional/joint probability of observing two punctuation marks in succession.
        F2_dict = {}
        F3_dict = {}
        successive_pairs = []
    
        combos = list(itertools.product(marks_str, marks_str))
        indices = list(range(len(marks_str)))
        index_combos = list((itertools.product(indices, indices)))
        index_strings = [[str(x) for x in tup] for tup in index_combos]
        final_ind = list(map('-'.join, index_strings))
        #combos = list(itertools.product(marks_str, marks_str))
        #indices = list(range(marks_str)) 
        #index_combos = list(itertools.product(indices, indices)) 

        for i in combos:
            successive_pairs.append(''.join(i))
        
        zipped = list(zip(final_ind, successive_pairs)) 

        for (index, pair) in zipped:
            numerator = punct_str.count(pair)
            denominator = punct_str.count(pair[0])

            if numerator != 0:
                F2_dict['Conditional Prob' + ' ' + index] = (numerator/denominator)
                F3_dict['Joint Prob' + ' ' + index] = (numerator/len(punct_list))
                #F2_dict['Conditional Prob' + ' ' + index + ' ' + pair + ' '] = (numerator/denominator)
                #F3_dict['Joint Prob' + ' ' + index + ' ' + pair + ' '] = (numerator/len(punct_list))
            else:
                F2_dict['Conditional Prob' + ' ' + index] = 0
                F3_dict['Joint Prob' + ' ' + index] = 0
                #F2_dict['Conditional Prob' + ' ' + index + ' ' + pair + ' '] = 0
                #F3_dict['Joint Prob' + ' ' + index + ' ' + pair + ' '] = 0
            
        # Generation of f4, redditor's average sentence length in words.
        sent_lens = []
        max_sents = 200
        regex = "\w+('\w+)?(?<!('s))"
        sentences = split_into_sentences(writing_sample)

        for sentence in sentences:
            sent_lens.append(len(re.findall(regex, sentence)))

        if len(sent_lens) == 0:
            sent_lens.append(0)
    
        f4 = mean(sent_lens)

        # f4 as a probability distribution of sentence length--as it is in Darmon's work.
        # Cap sentence length at 199 words.
        F4 = [0]*max_sents
    
        sent_lens = [x if x < max_sents else max_sents-1 for x in sent_lens]

        for i in sent_lens:
            F4[i] += 1

        F4 = [x / len(sent_lens) for x in F4]
    
        # Generation of f5, redditor's average number of words between successive punctuation marks.
        if len(wrds_btwn) == 0:
            wrds_btwn.append(0)
        
        f5 = mean(wrds_btwn)

        # f5 as a probability distribution of number of words between successive marks--as it is in Darmon's work.
        # Cap number of in-between words at 39.
        max_wrds_btwn = 40
        wrds_btwn = [x if x < max_wrds_btwn else max_wrds_btwn-1 for x in wrds_btwn]
        F5 = [0]*max_wrds_btwn

        for i in wrds_btwn:
            F5[i] += 1

        F5 = [x / len(wrds_btwn) for x in F5]
    
        # Generation of f6, ratio of punctuation to words.
        total_punc = len(punct_list)
        total_words = len(re.findall(regex, writing_sample))

        f6 = total_punc/total_words
    
        # Output to dataframe.
        # An uppercase F signals that the feature was mimicked directly from Darmon's work.
        F1_dict_keys = ['Period Freq', 'Comma Freq', 'Colon Freq', 'Semicol Freq', 'Question Freq', 'Exclaim Freq', 'Dash Freq', 'L. Paren Freq', 'R. Paren Freq', 'S. Quote Freq', 'D. Quote Freq', 'Elipse Freq']
        F1_dict = {F1_dict_keys[i]: F1[i] for i in range(len(F1_dict_keys))} 

        F4_dict = {'Prob Sentence Len:' + ' ' + str(i): F4[i] for i in range(len(F4))}
        F5_dict = {'Prob' + ' ' + str(i) + ' ' + 'Words Between Punc': F5[i] for i in range(len(F5))}

        f4_dict = {'Avg Sentence Len': f4}
        f5_dict = {'Avg Wrds Btwn Punc': f5}
        f6_dict = {'Punc Ratio': f6}
    
        # Concatinate all the keys into a single list.
        master = {**dem_dict, **F1_dict, **F2_dict, **F3_dict, **F4_dict, **F5_dict, **f4_dict, **f5_dict, **f6_dict}
    
        punc_stats = punc_stats.append(master, ignore_index = True)
    return punc_stats  

In [5]:
five_thou_stats = calculate_punctuation_stats(five_thou)
five_thou.to_csv(r'/gpfs/fs1/home/growe3/five_stats.csv', index = False)

In [11]:
three_thou_stats = calculate_punctuation_stats(three_thou)
three_thou_stats.to_csv(r'/gpfs/fs1/home/growe3/three_stats.csv', index = False)

In [11]:
two_thou_stats = calculate_punctuation_stats(two_thou)
two_thou_stats.to_csv(r'/gpfs/fs1/home/growe3/two_stats.csv', index = False)

In [13]:
one_thou_stats = calculate_punctuation_stats(one_thou)
one_thou_stats.to_csv(r'/gpfs/fs1/home/growe3/one_stats.csv', index = False)

In [14]:
whole_set_stats = calculate_punctuation_stats(whole_set)
whole_set_stats.to_csv(r'/gpfs/fs1/home/growe3/whole_stats.csv', index = False)

In [34]:
def seq_stats(dataset):
    
    # Initialize a DF to hold our results.
    ee_stats = pd.DataFrame()
    
    for writing_sample in dataset['text']:
        
        # Get the demographic info in dictionary form:
        age = dataset.loc[dataset['text'] == writing_sample, 'age'].iloc[0]
        gender = dataset.loc[dataset['text'] == writing_sample, 'gender'].iloc[0]
        author = dataset.loc[dataset['text'] == writing_sample, 'author'].iloc[0]
        fulllink = dataset.loc[dataset['text'] == writing_sample, 'full_link'].iloc[0]
        dem_dict = {'Age': age, 'Gender': gender, 'Author': author, 'Full Link': fulllink}
    
        # Remove new line
        writing_sample = writing_sample.replace('\n', " ")

        # Removing all non-ASCII letters
        writing_sample = writing_sample.encode("ascii", "ignore").decode()
    
        # Remove URLs from comments
        writing_sample = re.sub(r'\(https?:\/\/.*?\)', '', writing_sample)
        
        num_elip = len(re.findall(r'[\.]{2,}', writing_sample))
        num_mult_ex = len(re.findall(r'[\!]{2,}', writing_sample))
        num_mult_que = len(re.findall(r'[\?]{2,}', writing_sample))
        num_ex_que_mixed = len(re.findall(r'[\?\!]{2,}', writing_sample)) - (num_mult_ex + num_mult_que)
        
        counts_dict = {'Elipse Count': num_elip, 'Exclaimation Seq Count': num_mult_ex, 'Question Seq Count': num_mult_que, 'Ex/Que Seq Count': num_ex_que_mixed}
        master = {**dem_dict, **counts_dict}
        ee_stats = ee_stats.append(master, ignore_index = True)
    
    return ee_stats

In [35]:
whole_seq_stats = seq_stats(whole_set)
whole_seq_stats.to_csv(r'/gpfs/fs1/home/growe3/whole_seq_stats.csv', index = False)

In [36]:
whole_seq_stats

Unnamed: 0,Age,Author,Elipse Count,Ex/Que Seq Count,Exclaimation Seq Count,Full Link,Gender,Question Seq Count
0,GenY,lunabuddy,29.0,0.0,6.0,https://www.reddit.com/r/relationships/comment...,Female,5.0
1,GenX,Peenutbuttjellytime,43.0,0.0,0.0,https://www.reddit.com/r/relationships/comment...,Male,5.0
2,GenZ,DatingAdviceTA1224,1.0,0.0,0.0,https://www.reddit.com/r/relationships/comment...,Male,0.0
3,GenZ,Purple_Cherry_Hunter,10.0,0.0,0.0,https://www.reddit.com/r/relationships/comment...,Male,0.0
4,GenZ,kafkascockroachgrace,1.0,0.0,4.0,https://www.reddit.com/r/relationships/comment...,Female,2.0
5,GenZ,PreteenArsonist,2.0,0.0,1.0,https://www.reddit.com/r/relationships/comment...,Female,1.0
6,GenY,Scottishlad28,0.0,0.0,0.0,https://www.reddit.com/r/relationships/comment...,Female,0.0
7,GenZ,Reimymouse,3.0,0.0,0.0,https://www.reddit.com/r/relationships/comment...,Female,0.0
8,GenZ,ButchNagga,0.0,0.0,0.0,https://www.reddit.com/r/relationships/comment...,Male,1.0
9,GenY,sapphiressunny,0.0,0.0,0.0,https://www.reddit.com/r/relationships/comment...,Female,0.0


In [5]:
whole_set['age'].value_counts()

GenZ               15366
GenY               10976
GenX                 248
BabyBoomers           50
Traditionalists        5
Name: age, dtype: int64

In [6]:
whole_set['gender'].value_counts()

Female    18130
Male       8515
Name: gender, dtype: int64