In [7]:
import re
import string

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer

# Show up to 100 characters of each value when pandas renders a Series/DataFrame preview.
pd.set_option('display.max_colwidth', 100)

In [8]:
# Fetch the NLTK data packages this notebook relies on: the stopword lists
# (used via nltk.corpus.stopwords) and WordNet (used by WordNetLemmatizer).
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/simonthorogood/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/simonthorogood/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [9]:
# Load the CSV and preview its 'cleaned' text column, which the later cells process.
raw_file = '../src/cleaned.csv'
df = pd.read_csv(raw_file)
df['cleaned'].head()

0    Live conversation via phone Hi guys I m quite confused So i started this month Since yesterday i...
1    What's a test with the most steps you've ever got I got one with 57 steps a while back and just ...
2    Preparing for Live Conversation Hi Ive been scheduled for a Live Test on Aug 27 PDT which is Aug...
3    Mobile Test Hey yallIm about to take a test on my mobile device please do yall use headphones fo...
4    Curious about message x200B  Anyone know what this ishttpspreviewredditjqfjcozz3fj51pngwidth1283...
Name: cleaned, dtype: object

In [12]:
# Lower-case the text so that later token comparisons (e.g. against the
# lower-case NLTK stopword list) are not defeated by capitalisation.
lower = df["cleaned"].str.lower()
lower.head()

0    live conversation via phone hi guys i m quite confused so i started this month since yesterday i...
1    what's a test with the most steps you've ever got i got one with 57 steps a while back and just ...
2    preparing for live conversation hi ive been scheduled for a live test on aug 27 pdt which is aug...
3    mobile test hey yallim about to take a test on my mobile device please do yall use headphones fo...
4    curious about message x200b  anyone know what this ishttpspreviewredditjqfjcozz3fj51pngwidth1283...
Name: cleaned, dtype: object

In [13]:
# ASCII punctuation characters to strip from the text (stdlib `string.punctuation`).
PUNCT_TO_REMOVE = string.punctuation
PUNCT_TO_REMOVE

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [15]:
def remove_punctuation(text):
    """Return `text` with every character listed in PUNCT_TO_REMOVE deleted."""
    deletion_table = str.maketrans('', '', PUNCT_TO_REMOVE)
    return text.translate(deletion_table)

# The function already takes a single string, so it is handed to apply directly.
lower_no_punct = lower.apply(remove_punctuation)
lower_no_punct.head()

0    live conversation via phone hi guys i m quite confused so i started this month since yesterday i...
1    whats a test with the most steps youve ever got i got one with 57 steps a while back and just di...
2    preparing for live conversation hi ive been scheduled for a live test on aug 27 pdt which is aug...
3    mobile test hey yallim about to take a test on my mobile device please do yall use headphones fo...
4    curious about message x200b  anyone know what this ishttpspreviewredditjqfjcozz3fj51pngwidth1283...
Name: cleaned, dtype: object

In [24]:
# Preview the first 20 entries of NLTK's English stopword list as one comma-separated string.
", ".join(stopwords.words('english')[:20])

"i, me, my, myself, we, our, ours, ourselves, you, you're, you've, you'll, you'd, your, yours, yourself, yourselves, he, him, his"

In [25]:
# A set gives constant-time membership checks for the per-token filter below.
STOPWORDS = set(stopwords.words('english'))

def remove_stopwords(text):
    """Drop every whitespace-separated token of `text` that appears in STOPWORDS.

    The input is passed through str() first; the surviving tokens are
    re-joined with single spaces.
    """
    kept_tokens = (token for token in str(text).split() if token not in STOPWORDS)
    return " ".join(kept_tokens)

lower_no_punct_sw = lower_no_punct.apply(remove_stopwords)
lower_no_punct_sw.head()

0    live conversation via phone hi guys quite confused started month since yesterday got 3 tests das...
1    whats test steps youve ever got got one 57 steps back 48 steps curious anyone got anything gruesome
2    preparing live conversation hi ive scheduled live test aug 27 pdt aug 28 country first time file...
3    mobile test hey yallim take test mobile device please yall use headphones mobile test u speak ph...
4    curious message x200b anyone know ishttpspreviewredditjqfjcozz3fj51pngwidth1283formatpngautowebp...
Name: cleaned, dtype: object

In [28]:
stemmer = PorterStemmer()

def stem_words(text):
    """Stem each whitespace-separated token of `text` and re-join with single spaces."""
    stems = map(stemmer.stem, text.split())
    return " ".join(stems)

lower_no_punct_sw_stem = lower_no_punct_sw.apply(stem_words)
lower_no_punct_sw_stem.head()

0    live convers via phone hi guy quit confus start month sinc yesterday got 3 test dashboard say sc...
1               what test step youv ever got got one 57 step back 48 step curiou anyon got anyth gruesom
2    prepar live convers hi ive schedul live test aug 27 pdt aug 28 countri first time file directli ...
3    mobil test hey yallim take test mobil devic pleas yall use headphon mobil test u speak phone dir...
4    curiou messag x200b anyon know ishttpspreviewredditjqfjcozz3fj51pngwidth1283formatpngautowebpscb...
Name: cleaned, dtype: object

In [35]:
lemmatizer = WordNetLemmatizer()

def lemmatize_words(text):
    """Lemmatize each whitespace-separated token of `text`.

    Parameters
    ----------
    text : str
        Text whose tokens are separated by whitespace.

    Returns
    -------
    str
        The lemmatized tokens joined by single spaces. An empty or
        whitespace-only input yields "" (always a string, never None), so
        the result can be fed straight into a vectorizer.
    """
    # Tokens come from text.split(); iterating `text` itself would walk
    # characters and produce no result at all for an empty string.
    return " ".join(lemmatizer.lemmatize(word) for word in text.split())

lower_no_punct_sw_lemma = lower_no_punct_sw.apply(lemmatize_words)
lower_no_punct_sw_lemma.head()

0    live conversation via phone hi guy quite confused started month since yesterday got 3 test dashb...
1       whats test step youve ever got got one 57 step back 48 step curious anyone got anything gruesome
2    preparing live conversation hi ive scheduled live test aug 27 pdt aug 28 country first time file...
3    mobile test hey yallim take test mobile device please yall use headphone mobile test u speak pho...
4    curious message x200b anyone know ishttpspreviewredditjqfjcozz3fj51pngwidth1283formatpngautowebp...
Name: cleaned, dtype: object

In [56]:
# Build a token-count matrix from the lemmatized text; max_features caps the
# vocabulary at 2000 terms, which is the second dimension of the shape shown below.
vectorizer = CountVectorizer(max_features=2000)
X_vec = vectorizer.fit_transform(lower_no_punct_sw_lemma)
X_vec.shape

(997, 2000)

In [57]:
# Copy the fitted vocabulary to the system clipboard for inspection elsewhere.
# get_feature_names() was removed from newer scikit-learn releases in favour of
# get_feature_names_out(); use whichever accessor this installation provides.
get_names = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
pd.Series(get_names()).to_clipboard()

  obj.to_csv(buf, sep=sep, encoding='utf-8', **kwargs)


In [23]:
import spacy

In [25]:
# TODO: 
import sys
!{sys.executable} python -m spacy download en_core_web_sm

/Users/simonthorogood/opt/anaconda3/envs/usertesting/bin/python: can't open file 'python3': [Errno 2] No such file or directory


In [29]:
# Load the pipeline in this cell so it runs top-to-bottom on a fresh kernel
# without depending on `nlp` having been created by some other cell.
nlp = spacy.load("en_core_web_sm")

# Quick sanity check on a short sentence: one line per token showing its text,
# part-of-speech tag, dependency label and lemma.
doc = nlp("$ I was running down the road")
for token in doc:
    print(token.text, token.pos_, token.dep_, token.lemma_)

$ SYM dep $
I PRON nsubj -PRON-
was AUX aux be
running VERB ROOT run
down ADP prep down
the DET det the
road NOUN pobj road


In [14]:
# Load the small English pipeline and run it over the 'cleaned' entry labelled 0.
nlp = spacy.load("en_core_web_sm")

doc = nlp(df['cleaned'][0])
print(type(doc))
# One line per token: text, part-of-speech tag, dependency label, lemma.
for token in doc:
    print(token.text, token.pos_, token.dep_, token.lemma_)

<class 'spacy.tokens.doc.Doc'>
Live ADJ amod live
conversation NOUN ROOT conversation
via ADP prep via
phone NOUN compound phone
Hi INTJ compound hi
guys NOUN ROOT guy
I PRON nsubj -PRON-
m VERB ROOT m
quite ADV advmod quite
confused ADJ ROOT confused
So ADV advmod so
i PRON nsubj i
started VERB ROOT start
this DET det this
month NOUN npadvmod month
Since SCONJ prep since
yesterday NOUN pobj yesterday
i PRON nsubj i
got VERB advcl get
3 NUM nummod 3
tests NOUN dobj test
on ADP prep on
my DET poss -PRON-
dashboard NOUN pobj dashboard
saying VERB advcl say
you PRON nsubjpass -PRON-
are AUX auxpass be
scheduled VERB ccomp schedule
for ADP prep for
a DET det a
live ADJ amod live
interviewuse NOUN pobj interviewuse
zoom NOUN acl zoom
on ADP prep on
your DET poss -PRON-
phone NOUN compound phone
bla NOUN pobj bla
bla VERB appos bla
  SPACE   
Next ADV advmod next
to ADP prep to
it PRON pobj -PRON-
there PRON expl there
is AUX ROOT be
the DET det the
phone NOUN compound phone
icon NOUN attr i