In [46]:
import pathlib
import re

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

Use regular expressions to remove elements that are not words such as: html tags, latex expressions, urls, digits, line returns, …

In [132]:
RAW_DATA_DIR = pathlib.Path('../data/raw/')
PROCESSED_DATA_DIR = pathlib.Path('../data/processed/')

In [79]:
def missing(so):
    so = so.replace('', np.NaN)
    so = so.dropna(subset=['text'])
    return so

In [27]:
code_exp = re.compile(r"<pre[^>]*>.+?</pre>", re.DOTALL)
url_exp = re.compile(r"(?P<url>(http\S+))")
start_tag = re.compile(r"<[a-z][^>]*>")
end_tag = re.compile(r"</[a-z]+>")
latex_exp = re.compile(r"(?P<latex>(\$\S+\$))")
latex_exp2 = re.compile(r"\${2}.+\${2}")
newline_exp = re.compile(r"(?P<newline>(\n+))")
digit_exp = re.compile(r"(?P<digit>\d+\.*\d+)")


def regex(so):
    patterns = [
        code_exp, url_exp, start_tag, end_tag, latex_exp, latex_exp2,
        newline_exp, digit_exp
    ]

    for pattern in patterns:
        so.loc[:, 'text'] = so.text.replace(pattern, '', regex=True)
    return so



In [28]:
so = pd.read_csv(RAW_DATA_DIR / 'stackexchange_812k.csv')

In [30]:
so = regex(so)

In [31]:
so.category.unique()

array(['title', 'post', 'comment'], dtype=object)

In [32]:
sample_text = so.query("category == 'post'").text.iloc[50]
sample_text

"You tend to use the covariance matrix when the variable scales are similar and the correlation matrix when variables are on different scales.Using the correlation matrix is equivalent to standardizing each of the variables (to mean 0 and standard deviation 1). In general, PCA with and without standardizing will give different results. Especially when the scales are different.As an example, take a look at this R heptathlon data set. Some of the variables have an average value of about  (the high jump), whereas other variables (run m) are around .This outputs:Now let's do PCA on covariance and on correlation:Notice that PCA on covariance is dominated by runm and javelin: PC1 is almost equal to runm (and explains  of the variance) and PC2 is almost equal to javelin (together they explain ). PCA on correlation is much more informative and reveals some structure in the data and relationships between variables (but note that the explained variances drop to  and ).Notice also that the outlyi

In [35]:
so.isna().sum()

post_id            0
parent_id     736597
comment_id    259056
text               0
category           0
dtype: int64

In [37]:
sample_text2 = so.query("category == 'post'").text.iloc[53]
sample_text2

'A random variable is a variable whose value depends on unknown events.  We can summarize the unknown events as "state", and then the random variable is a function of the state.Example:  Suppose we have three dice rolls ().  Then the state . One random variable  is the number of 5s. This is:Another random variable  is the sum of the dice rolls. This is:'

In [71]:
so['text_lengths'] = so.text.str.len()

In [73]:
so.text_lengths.describe()

count    812132.000000
mean        343.463238
std         503.936451
min           0.000000
25%          87.000000
50%         199.000000
75%         405.000000
max       22773.000000
Name: text_lengths, dtype: float64

In [76]:
so.replace('', np.NaN)

Unnamed: 0,post_id,parent_id,comment_id,text,category,text_lengths
0,1,,,Eliciting priors from experts,title,29
1,2,,,What is normality?,title,18
2,3,,,What are some valuable Statistical Analysis op...,title,65
3,4,,,Assessing the significance of differences in d...,title,58
4,6,,,The Two Cultures: statistics vs. machine learn...,title,50
...,...,...,...,...,...,...
812127,279994,,536471.0,"It does run, and gives very valid looking esti...",comment,205
812128,279998,,536439.0,It seems to me that you are correct; the doubl...,comment,345
812129,279998,,536514.0,It wouldn't be the first time a grader has mis...,comment,128
812130,279999,,536802.0,The basic idea is to compare the clustering co...,comment,464


In [80]:
so = missing(so)

In [84]:
import nltk

In [85]:
from nltk.tokenize import word_tokenize, sent_tokenize

In [102]:
so = so[so.text_lengths < so.text_lengths.quantile(0.90)]

In [103]:
so = so[so.text_lengths > so.text_lengths.quantile(0.10)]

In [104]:
so

Unnamed: 0,post_id,parent_id,comment_id,text,category,text_lengths
2,3,,,What are some valuable Statistical Analysis op...,title,65
3,4,,,Assessing the significance of differences in d...,title,58
6,8,,,So how many staticians *does* it take to screw...,title,62
7,10,,,Under what conditions should Likert scales be ...,title,79
15,31,,,What is the meaning of p values and t values i...,title,66
...,...,...,...,...,...,...
812127,279994,,536471.0,"It does run, and gives very valid looking esti...",comment,205
812128,279998,,536439.0,It seems to me that you are correct; the doubl...,comment,345
812129,279998,,536514.0,It wouldn't be the first time a grader has mis...,comment,128
812130,279999,,536802.0,The basic idea is to compare the clustering co...,comment,464


In [None]:
def tokenize(text):
    tokens = word_tokenize(text)
    return ' '.join(tokens)

In [131]:
' '.join(word_tokenize(so.query("category == 'post'").text.iloc[56]))

"If you carved your distribution ( histogram ) outof wood , and tried to balance it onyour finger , the balance point wouldbe the mean , no matter the shape of the distribution.If you put a stick in the middle ofyour scatter plot , and attached thestick to each data point with aspring , the resting point of thestick would be your regression line . [ 1 ] [ 1 ] this would technically be principal components regression . you would have to force the springs to move only `` vertically '' to be least squares , but the example is illustrative either way ."

In [134]:
sop = pd.read_csv(PROCESSED_DATA_DIR / 'tokenized.csv')

In [136]:
sop.head()

Unnamed: 0,post_id,parent_id,comment_id,text,category
0,3,,,What are some valuable Statistical Analysis op...,title
1,4,,,Assessing the significance of differences in d...,title
2,6,,,The Two Cultures : statistics vs. machine lear...,title
3,8,,,So how many staticians *does* it take to screw...,title
4,10,,,Under what conditions should Likert scales be ...,title
