In [142]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk import FreqDist, pos_tag
import re

import json

In [35]:
# Load the two JSON dumps from ./data. JSON is UTF-8 by specification, so the
# encoding is pinned here instead of relying on the platform default, which
# can fail or mis-decode non-ASCII text (e.g. emojis) on some systems.
with open('./data/json_pr.json', encoding='utf-8') as pr_file:
    prj = json.load(pr_file)

with open('./data/json_ra.json', encoding='utf-8') as ra_file:
    raj = json.load(ra_file)

In [36]:
# Wrap each loaded JSON payload in its own DataFrame.
df_prj, df_raj = pd.DataFrame(data=prj), pd.DataFrame(data=raj)

In [51]:
# #most up-to-date files have 0 nulls.  confirmed
# print(df_prj.isnull().sum())
# print(df_raj.isnull().sum())

# Structure analysis separate from diction:

Number of sentences, sentence length, and *rate* of punctuation use (. , ?) could be informative.  How can these be preserved?  

2 separate analyses: nlp based on Bag of Words (0 structure or punctuation retention), and one based on structure.  

Word Frequencies (see lesson 5.04) counts *everything* - punctuation, numbers, etc.  But I'd rather use WordFreq, CountVec, etc for lexicon and reduce punctuation down to counts.

New columns for total wordcount, # sentences, average sentence length, total commas / ? / ! (which can be changed into per wordcount or per sentence 'rates' later).  

### Add column with sentence count
Methods based on experimentation in scratch notebook

In [122]:
def _count_sentences(text):
    """Return how many non-blank segments `text` has when split on runs of . ! ?

    Counting non-blank segments (rather than `len(split) - 1`) means a post
    without closing punctuation still gets its final sentence counted, and
    leading or stand-alone punctuation runs are not counted as sentences.
    """
    return sum(1 for segment in re.split(r'[.!?]+', text) if segment.strip())


df_prj['num_sentences'] = df_prj['selftext'].map(_count_sentences)
df_raj['num_sentences'] = df_raj['selftext'].map(_count_sentences)

### Add column with average sentence length (by word)

In [82]:
# Sanity check: len() on a string counts every character (letters, spaces and
# punctuation alike), so it cannot be used as a word count.
length = 'this is twenty lettersS..!?,'
len(length)

28

In [224]:
#df_raj_mini['words_per_sentence'] = 

def avg_words_per_sentence(df):
    """Return the rounded average number of words per sentence for each post.

    Each string in ``df['selftext']`` is split into sentences on runs of
    ``.``, ``!`` or ``?``; words are whitespace-separated tokens.

    Parameters
    ----------
    df : DataFrame or mapping
        Must support ``df['selftext']`` and yield one string per post when
        that entry is iterated.

    Returns
    -------
    list of int
        One value per post, in input order. Posts with no words at all
        (empty text or punctuation only) get 0.
    """
    avgwordsper = []

    for post in df['selftext']:
        # re.split leaves an empty string after closing punctuation (and
        # before leading punctuation). Those blank segments are not sentences;
        # counting them as 0-word sentences would drag the average down.
        wordspersentence = [
            len(sentence.split())
            for sentence in re.split(r'[.!?]+', post)
            if sentence.strip()
        ]

        if wordspersentence:
            avgwordsper.append(int(np.mean(wordspersentence).round()))
        else:
            # Nothing to average; avoid np.mean([]) which yields NaN.
            avgwordsper.append(0)

    return avgwordsper

In [227]:
# Attach the per-post average sentence length (in words) to both frames.
for frame in (df_raj, df_prj):
    frame['avg_sentence_len'] = avg_words_per_sentence(frame)

### Add columns with total wordcount, commas, ?s, and !s

In [233]:
# Total whitespace-separated tokens per post, for both frames.
for frame in (df_raj, df_prj):
    frame['wordcount'] = frame['selftext'].map(lambda text: len(text.split()))

In [295]:
# Raw per-post totals of commas, question marks and exclamation marks.
for column, mark in (('comma_count', ','), ('qmark_count', '?'), ('exclamatios', '!')):
    # Bind `mark` as a default so each lambda counts its own character.
    df_raj[column] = df_raj['selftext'].map(lambda text, mark=mark: text.count(mark))

In [301]:
def count_punc(punc, df):
    """Return, for each post in df['selftext'], how many times `punc` occurs.

    `punc` is matched literally via str.count; the result is a list with one
    integer per post, in the same order as df['selftext'].
    """
    return [text.count(punc) for text in df['selftext']]

In [307]:
# Raw per-post totals of commas, question marks and exclamation marks.
for column, mark in (('comma_count', ','), ('qmark_count', '?'), ('exclamatios', '!')):
    df_prj[column] = count_punc(mark, df_prj)

In [280]:
## No. Use total comma count, ! count, ? count
## Can turn into 'rate' later.  Prob by sentence count, not wordcount...

# def punc_rate(punctuation, df):
    
#     puncs=[]
#     punc_rate=[]
    
#     for i in df['selftext']:
#         commas.append(len(re.findall(punctuation, i)))
    
#     for j in commas:
#         comma_rate.append(commas[j]/df['wordcount'][j])
        
#     return punc_rate

In [277]:
# df_raj['comma_rate'] = punc_rate(',', df_raj)
# df_prj['comma_rate'] = punc_rate(',', df_prj)

In [284]:
#overly complicated just use map + str.count
# run into issues with 0s with re.findall I think

# def punc_count(punc, df):
#     count = []
#     for i in df['selftext']:
#         count.append(len(re.findall(punc, i)))
#     return count

## Special characters to scrub:
* characters with alphanumerics attached: I think these whole units (or the attached text) should be removed first; otherwise we are left with 'n', 'amp', 'u username', etc. that won't be picked up by the subsequent alphanumeric filter
    * ' & amp ;
    * \n
    * u/username
    * (Mobile)
        * edit: leave mobile in place, could be differentiating feature between subreddits
* all non-alpha-numerics: \, /, &, * , "", '', -
* emojis
* any URLs:  start with https://...  
* I don't think numbers will be informative
* leave punctuation in place?  ADHD-comma anecdote, prevalence of ! ? might be informative?
    * RA likely contains ? all the time.

In [49]:
# Shape of the subset of posts whose text mentions "mobile" / "Mobile".
df_prj[df_prj['selftext'].str.contains('mobile|Mobile')].shape

(393, 10)

In [50]:
# Shape of the subset of posts whose text mentions "mobile" / "Mobile".
df_raj[df_raj['selftext'].str.contains('mobile|Mobile')].shape

(94, 10)

In [83]:
testpost.split()

['Sorry',
 'in',
 'advance',
 'if',
 'my',
 'texts',
 'are',
 'a',
 'bit',
 'sloppy..',
 "It's",
 'taken',
 'me',
 'a',
 'long',
 'while',
 'to',
 'finally',
 'write',
 'this',
 'out',
 'as',
 "I'm",
 'just',
 'very',
 'unsure.',
 'Side',
 'notes:',
 "I'm",
 'not',
 'from',
 'USA.',
 "I'm",
 'from',
 'Asia',
 '&amp;',
 'My',
 'country',
 'to',
 'hers',
 'is',
 'about',
 '30m',
 'flight',
 '&amp;',
 'cheap.',
 'Backstory;',
 'When',
 'I',
 'was',
 '14,',
 'I',
 'had',
 'my',
 'first',
 'ever',
 "'real'",
 'relationship.',
 'At',
 'the',
 'time',
 'it',
 'was',
 'solely',
 'online',
 'until',
 'I',
 'was',
 'able',
 'to',
 'fly',
 'over',
 'to',
 'her',
 'country',
 '(accompanied',
 'by',
 'my',
 "mom's",
 'friend)',
 'twice',
 'a',
 'month',
 'when',
 'I',
 'was',
 '15.',
 'Before',
 'you',
 'hammer',
 'about',
 'my',
 'parents',
 'not',
 'seeing',
 'the',
 'wrong',
 'in',
 'our',
 'age',
 'gap,',
 'she',
 'always',
 'made',
 'me',
 'tell',
 'them',
 'that',
 'we',
 'were',
 'only',
 'a