In [216]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk import FreqDist, pos_tag
import re

import json

In [19]:
with open ('./data/json_pr.json') as x:
    prj = json.load(x)
    
with open ('./data/json_ra.json') as y:
    raj = json.load(y)

In [20]:
df_prj = pd.DataFrame(prj)
df_raj = pd.DataFrame(raj)

In [21]:
# #confirm latest dfs: 0 nulls.  confirmed
# print(df_prj.isnull().sum())
# print(df_raj.isnull().sum())

# All posts left in original format, no special character elimination

# Structure analysis separate from diction:

Number of sentences, sentence length, and *rate* of punctuation use (. , ?) could be informative.  How preserve?  

2 separate analyses: nlp based on Bag of Words (0 structure or punctuation retention), and one based on structure.  

Word Frequencies (see lesson 5.04) counts *everything* - punctuation, numbers, etc.  But I'd rather use WordFreq, CountVec, etc for lexicon and reduce punctuation down to counts.

New columns for total wordcount, # sentences, average sentence length, total commas / ? / ! (which can be changed into per wordcount or per sentence 'rates' later).  

### Add column with sentence count
Methods based on experimentation in scratch notebook

In [22]:
df_prj['num_sentences'] = df_prj['selftext'].map(lambda x: len(re.split(r'[.!?]+', x))-1)
df_raj['num_sentences'] = df_raj['selftext'].map(lambda x: len(re.split(r'[.!?]+', x))-1)

### Add column with average sentence length (by word)

In [23]:
# len counts spaces and characters.  you want wordcount.
length = 'this is twenty lettersS..!?,'
len(length)

28

In [24]:
#df_raj_mini['words_per_sentence'] = 

def avg_words_per_sentence(df):
    
    all_sentences_bypost = []

    for i in df['selftext']:
        all_sentences_bypost.append(re.split(r'[.!?]+', i))
    
    avgwordsper = []

    for post in all_sentences_bypost:
        wordspersentence = []
        for i in post:
            wordspersentence.append(len(i.split()))
        avgwordsper.append(int(np.mean(wordspersentence).round()))
        
    return avgwordsper

In [25]:
df_raj['avg_words_per_sent'] = avg_words_per_sentence(df_raj)
df_prj['avg_words_per_sent'] = avg_words_per_sentence(df_prj)

### Add columns with total wordcount, commas, ?s, and !s

In [26]:
df_raj['wordcount'] = df_raj['selftext'].map(lambda x: len(x.split()))
df_prj['wordcount'] = df_prj['selftext'].map(lambda x: len(x.split()))

In [27]:
df_raj['comma_count'] = df_raj['selftext'].map(lambda x: x.count(','))
df_raj['qmark_count'] = df_raj['selftext'].map(lambda x: x.count('?'))
df_raj['exclamatios'] = df_raj['selftext'].map(lambda x: x.count('!'))

In [28]:
def count_punc(punc, df):
    perpost = []
    for i in df['selftext']:
        perpost.append(i.count(punc))
    return perpost

In [29]:
df_prj['comma_count'] = count_punc(',', df_prj)
df_prj['qmark_count'] = count_punc('?', df_prj)
df_prj['exclamatios'] = count_punc('!', df_prj)

In [30]:
## No. Use total comma count, ! count, ? count
## Can turn into 'rate' later.  Prob by sentence count, not wordcount...

# def punc_rate(punctuation, df):
    
#     puncs=[]
#     punc_rate=[]
    
#     for i in df['selftext']:
#         commas.append(len(re.findall(punctuation, i)))
    
#     for j in commas:
#         comma_rate.append(commas[j]/df['wordcount'][j])
        
#     return punc_rate

In [31]:
# df_raj['comma_rate'] = punc_rate(',', df_raj)
# df_prj['comma_rate'] = punc_rate(',', df_prj)

In [32]:
#overly complicated just use map + str.count
# run into issues with 0s with re.findall I think

# def punc_count(punc, df):
#     count = []
#     for i in df['selftext']:
#         count.append(len(re.findall(punc, i)))
#     return count

### Add column is_mobile to distinguish if posted by mobile user
Users generally announce '(on) Mobile" if they are posting from mobile phones, but there is no consistent format to this announcement.  Sometimes it's a sentence, sometimes a single word declaration.  ID'ing is_mobile by simply searching for the word 'mobile' will also flag posts containing words like 'automobile', 'immobile', 'mobile bank account', etc ('shitmobile' is one I found that I did not expect...).  
Spot check 100 posts from each df to get rate of false positives to see if an is_mobile column is informative.

---

Results from check: 
* relationship_advice had 10.6% false positives (10 of total 94)
* pettyrev had 8% false positive rate (8 of 100 random checks, total count 395)  

##### Assume a fp rate of 9%, still worth identifying.  Pretty consistent fp rate across both dataframes makes it a reliable enough metric.  Concatenate before adding column. 

In [47]:
#use ctrl-f in jq pretty-print to check if posting from mobile, or talking about mobile homes, etc. 
df_raj.loc[df_raj['selftext'].str.lower().str.contains('mobile')].to_json('./scratch/is_mobile_ra.json', orient='records')

In [48]:
# 10/94 records containing 'mobile' NOT posted from mobile user 
#is_mobile = false positive rate 10.6% for RA
df_raj.loc[df_raj['selftext'].str.lower().str.contains('mobile')].shape

(94, 16)

In [52]:
# repeat for pettyrev.  len = 395.  check first 100.
df_prj.loc[df_prj['selftext'].str.lower().str.contains('mobile')].head(100).to_json('./scratch/is_mobile_pr.json', orient='records')

In [94]:
combo_structured = pd.concat([df_prj, df_raj], axis = 0, ignore_index=True)

In [95]:
combo_structured['is_mobile'] = combo_structured['selftext'].apply(lambda x: 1 if 'mobile' in x.lower() else 0)

In [96]:
combo_structured.is_mobile.value_counts()

0    11519
1      489
Name: is_mobile, dtype: int64

In [97]:
#export csv and json versions
combo_structured.to_csv('./data/allposts_struc.csv', index=False)
combo_structured.to_json('./data/j_allposts_struc.json', orient='records')

## Special characters to scrub:
* **all URLs:  start with https://...** 
* characters with alphanum attached: I think remove these whole units first (or the attached text), otherwise left with 'n', 'amp', 'u username', etc that won't be picked up by subsequent alphanum filter?
    * ' & amp ;
    * \n
    * u/username
    * (Mobile)
        * edit: leave mobile in place, could be differentiating feature between subreddits
        * edit 2: no matter if scrub or not, already have column indicating is_mobile
* all non-alpha-numerics: \, /, &, * , "", '', -
* emojis 
* I don't think numbers will be informative
* stopwords='english' : word freq table for petty revenge showed top words were all stopwords.  Not informative.

In [220]:
bagowords = combo_structured.copy()

In [221]:
#start with urls so you don't accidentally turn them into text by ditching identifying characters
bagowords['selftext'] = bagowords['selftext'].apply(lambda x: re.sub(r'http\S+', '', x))

In [222]:
# word_tokenizer drops many special characters but keeps 'amp' from &amp;
# drop that unit specifically first:
bagowords['selftext'] = bagowords['selftext'].apply(lambda x: x.replace('&amp;',''))
#also apostrophes: just merge into one word (don't -> dont)
bagowords['selftext'] = bagowords['selftext'].apply(lambda x: x.replace("'",''))

In [223]:
test = "could've would've don't I'm isn't"
test.replace("'",'')

'couldve wouldve dont Im isnt'

In [224]:
#word_tokenize then return only alpha characters re-joined in original order
#NUMBERS removed
#special characters removed
tokenizer = RegexpTokenizer(r'\w+')

In [225]:
bagowords['selftext'] = bagowords['selftext'].apply(lambda x: ' '.join([i for i in tokenizer.tokenize(x.lower()) if i.isascii() and i.isalpha()]))

In [236]:
# %%time
# #remove stopwords
# bagowords['selftext'] = bagowords['selftext'].apply(lambda x: [i for i in x if i not in stopwords.words('english')])

CPU times: user 23min 1s, sys: 2min 58s, total: 26min
Wall time: 2h 5min 55s


In [237]:
#check results in jq pretty print
bagowords.to_json('./scratch/check_selftext_b.json', orient='records')

##### Word frequency function from lesson 5.04

In [226]:
def freq_table(text, count=None):

    freqs = FreqDist(text).most_common()
    prob = [round(x[1]/len(text),4) for x in freqs]

    freq = zip(freqs,prob)

    comp_freqs = []
    for s,p in freq:
        comp_freqs.append([s[0],s[1],p])

    comp_freqs.sort(key=lambda tup: tup[1], reverse=True) #Sort the list so it's in order, big to small

    if count == None:
        most = comp_freqs[:26]    
        hapax = comp_freqs[:-25:-1]
    else:
        most = comp_freqs[:count + 1]
        hapax = comp_freqs[:-count:-1]

    print('Most Common \t\t  Least Common')

    for i in zip(most, hapax):
        print(i[0], " "*(24-len(str(i[0]))),i[1])

In [229]:
pettyrev_text = ' '.join(list(bagowords['selftext'])).split()

In [231]:
len(pettyrev_text) #4.3M! cool

4385515

In [232]:
#all single letters even though apostrophes removed...
freq_table(pettyrev_text, count=None)

Most Common 		  Least Common
['i', 187490, 0.0428]     ['downgraded', 1, 0.0]
['and', 149921, 0.0342]   ['supine', 1, 0.0]
['the', 140221, 0.032]    ['symptomatic', 1, 0.0]
['to', 138681, 0.0316]    ['agh', 1, 0.0]
['a', 97909, 0.0223]      ['untalkable', 1, 0.0]
['my', 64050, 0.0146]     ['loong', 1, 0.0]
['of', 60956, 0.0139]     ['rifts', 1, 0.0]
['me', 57438, 0.0131]     ['sextng', 1, 0.0]
['it', 55755, 0.0127]     ['sexuakky', 1, 0.0]
['was', 55636, 0.0127]    ['turban', 1, 0.0]
['that', 54384, 0.0124]   ['overweighted', 1, 0.0]
['in', 53684, 0.0122]     ['wizard', 1, 0.0]
['he', 52457, 0.012]      ['corollary', 1, 0.0]
['she', 51416, 0.0117]    ['signup', 1, 0.0]
['her', 47235, 0.0108]    ['hotliners', 1, 0.0]
['for', 42346, 0.0097]    ['repetitions', 1, 0.0]
['with', 37952, 0.0087]   ['nevers', 1, 0.0]
['but', 36105, 0.0082]    ['yeaah', 1, 0.0]
['we', 34754, 0.0079]     ['impetuousness', 1, 0.0]
['this', 34482, 0.0079]   ['rehabbing', 1, 0.0]
['so', 34446, 0.0079]     ['subleas

In [205]:
bagowords.loc[bagowords['selftext'].str.contains('配')]

Unnamed: 0,selftext,subreddit,created_utc,is_self,score,title,author,num_comments,timestamp,year,num_sentences,avg_words_per_sent,wordcount,comma_count,qmark_count,exclamatios,is_mobile,selftext_b


In [198]:
bagowords['selftext_b'][6071]

'my friend and i met in high school when he was a foreign exchange student he barely spoke english and i didn t even know what japanese was but we connected like crazy it was years ago but we ve still kept in contact my major in college was japanese and i lived there for a short while we ve always kept in contact and we speak somewhat regularly i ve always considered him my best friend but i sent this to him two days ago psa he knew my marriage was on the fritz もう長い間 腹を割って話してないね 僕たちは分かれてから別の場所で それぞれの道のりを歩んできた 君も僕も 大変で長い道だったと思う 僕は君を本当の兄のように思って 慕っている どんなに離れて暮らしても それは変わることはない 知り合った日から つながりはずっとここにあるから それで 僕は最近離婚のことに緊張して すごく辛い 誰かに話そうと思っても そういう人がいないんだ 君にも電話しようと思ったんだけど 余計な心配をかけちゃいけないと思って 結局やめちゃった 僕はただ 友達が必要なんだ 廣敬と話したいよ loosely translated i know we haven t spoken to each other meaningfully in a long time we have both been going through so much lately in different ways but each way hard in our own respects i love you like a brother and that s never going to go away no matter how far we live fro

In [217]:
print(CountVectorizer(stop_words='english').get_stop_words())

frozenset({'wherever', 'two', 'we', 'no', 'before', 'couldnt', 'show', 'go', 'enough', 'former', 'eg', 'whom', 'those', 'very', 'move', 'its', 'anyone', 'others', 'interest', 'the', 'many', 'am', 'give', 'your', 'name', 'into', 'whose', 'otherwise', 'put', 'nor', 'she', 'as', 'how', 'full', 'almost', 'keep', 'for', 'twelve', 'forty', 'someone', 'less', 'whereas', 'detail', 'system', 'against', 'than', 'been', 'somewhere', 'moreover', 'and', 'after', 'next', 'more', 'every', 'five', 'herein', 'hundred', 'made', 'first', 'much', 'whole', 'only', 'should', 'sometimes', 'thin', 'same', 'you', 'thence', 'his', 'yours', 'meanwhile', 'thereupon', 'their', 'while', 'on', 'due', 'ten', 'rather', 'themselves', 'a', 'ever', 'ours', 'down', 'ie', 'off', 'fill', 'why', 'few', 'formerly', 'see', 'would', 'may', 'has', 'nobody', 'yourself', 'please', 'where', 'in', 'so', 'beyond', 'when', 'bill', 'over', 'whoever', 'mine', 'ltd', 'to', 'find', 'cry', 'else', 'third', 'namely', 'this', 'our', 'thru', 

In [201]:
print(cop.isascii() and cop.isalpha())

NameError: name 'cop' is not defined