## Language Processing Library (spaCy)

In [9]:
from spacy.lang.en.stop_words import STOP_WORDS
import spacy
from spacy.lemmatizer import Lemmatizer
from spacy.lang.en import English
# Load the `en_core_web_lg` spaCy pipeline once; `nlp` is reused by later cells.
nlp=spacy.load('en_core_web_lg')
import re

## Libraries from scikit-learn

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups
from time import time

In [24]:
import numpy as np
import string
import regex as re


In [45]:
from save_load import save_obj, load_obj

## Download dataset

In [3]:
# Fetch the 20 Newsgroups corpus with headers, footers and quotes removed;
# shuffling uses a fixed random_state so the document order is repeatable.
dataset = fetch_20newsgroups(
    remove=('headers', 'footers', 'quotes'),
    shuffle=True,
    random_state=1,
)

In [5]:
# Pull the raw documents out of the fetched dataset object and check
# which container type they come in.
data_samples = dataset.data
type(data_samples)

list

In [6]:
# Number of documents in the corpus.
len(data_samples)

11314

In [7]:
# Peek at the first raw document to see what the cleaning step has to handle.
data_samples[0]

"Well i'm not sure about the story nad it did seem biased. What\nI disagree with is your statement that the U.S. Media is out to\nruin Israels reputation. That is rediculous. The U.S. media is\nthe most pro-israeli media in the world. Having lived in Europe\nI realize that incidences such as the one described in the\nletter have occured. The U.S. media as a whole seem to try to\nignore them. The U.S. is subsidizing Israels existance and the\nEuropeans are not (at least not to the same degree). So I think\nthat might be a reason they report more clearly on the\natrocities.\n\tWhat is a shame is that in Austria, daily reports of\nthe inhuman acts commited by Israeli soldiers and the blessing\nreceived from the Government makes some of the Holocaust guilt\ngo away. After all, look how the Jews are treating other races\nwhen they got power. It is unfortunate.\n"

## Render with spaCy natural language processing library

In [166]:
#!python -m spacy download en_core_web_lg
# !pip install pyldavis

In [257]:
# Run the fourth document through the pipeline and render it inline with
# displaCy's 'ent' style.
spacy.displacy.render(nlp(data_samples[3]), style='ent',jupyter=True)

In [120]:
# Show each of the first ten tokens of document 0 next to its lemma.
preview = nlp(data_samples[0])[:10]
for token in preview:
    print(token, token.lemma_)

Well well
i -PRON-
'm be
not not
sure sure
about about
the the
story story
nad nad
it -PRON-


In [19]:
# Everything the tokenizer should discard: spaCy's English stop words
# plus every character in string.punctuation.
STOPS = set(string.punctuation).union(STOP_WORDS)

## Strip Punctuation and Stop Words

In [96]:
# Sanity check of the letters-only pattern: digits and punctuation are dropped,
# and letter runs glued to digits ("3for", "five5") survive without them.
' '.join(re.findall('[a-zA-Z]+','this 3for five5 test! 45'))

'this for five test'

In [112]:
def spacy_tokenizer(corpus, min_len=3, max_len=19):
    """Clean one raw document and return its filtered lemmas.

    The text is stripped of punctuation and digits with regular
    expressions, parsed with the module-level spaCy pipeline ``nlp``,
    and each token's lemma is kept unless it is the ``-PRON-``
    placeholder, a member of ``STOPS``, or outside the length bounds.

    Parameters
    ----------
    corpus : str
        Raw text of a single document.
    min_len, max_len : int, optional
        Inclusive bounds on the length of a kept lemma. The defaults
        (3 and 19) reproduce the previously hard-coded ``2 < len < 20``.

    Returns
    -------
    list of str
        Lemmas that passed every filter, in document order.
    """
    # Delete every character that is neither a word character nor whitespace.
    # Punctuation is removed outright (not replaced by a space), so hyphenated
    # or dotted terms are fused, e.g. "pro-israeli" -> "proisraeli".
    corpus = re.sub(r'[^\w\s]', '', corpus)
    # Keep only runs of ASCII letters; digits, underscores and line breaks
    # therefore act as word separators.
    corpus = ' '.join(re.findall('[a-zA-Z]+', corpus))

    doc_list = []
    for token in nlp(corpus):
        # Pronouns are lemmatised to the "-PRON-" placeholder; skip them.
        if token.lemma_ == "-PRON-":
            continue
        word = token.lemma_.rstrip()
        # Drop stop words and lemmas outside the length bounds. The lower
        # bound also rules out empty strings.
        if word not in STOPS and min_len <= len(word) <= max_len:
            doc_list.append(word)

    return doc_list

In [113]:
t = time()
# The per-document token lists have different lengths, so the array has to be
# created with dtype=object: recent NumPy releases raise ValueError for ragged
# input without it (older ones only emitted a deprecation warning).
doc_list = np.array([spacy_tokenizer(corpus) for corpus in data_samples],
                    dtype=object)
print('Building Time : {:.2f} mins'.format((time() - t) / 60))

Building Time : 8.18 mins


In [114]:
# Inspect the token lists produced for the first three documents.
doc_list[:3]

array([list(['sure', 'story', 'nad', 'biased', 'disagree', 'statement', 'media', 'ruin', 'israels', 'reputation', 'rediculous', 'media', 'proisraeli', 'medium', 'world', 'live', 'europe', 'realize', 'incidence', 'describe', 'letter', 'occur', 'medium', 'try', 'ignore', 'subsidize', 'israels', 'existance', 'europeans', 'degree', 'think', 'reason', 'report', 'clearly', 'atrocity', 'shame', 'austria', 'daily', 'report', 'inhuman', 'act', 'commit', 'israeli', 'soldier', 'blessing', 'receive', 'government', 'holocaust', 'guilt', 'away', 'look', 'jews', 'treat', 'race', 'power', 'unfortunate']),
       list(['yeah', 'expect', 'people', 'read', 'faq', 'etc', 'actually', 'accept', 'hard', 'atheism', 'need', 'little', 'leap', 'faith', 'jimmy', 'logic', 'run', 'steam', 'jim', 'sorry', 'pity', 'jim', 'sorry', 'feeling', 'denial', 'faith', 'need', 'pretend', 'end', 'happily', 'maybe', 'start', 'new', 'newsgroup', 'altatheisthard', 'bummin', 'byebye', 'big', 'jim', 'forget', 'flintstones', 'chewabl

## Save To Pickle File

In [115]:
# Persist the per-document token lists with the project's save_obj helper
# (presumably written as a pickle under the given name — confirm in save_load).
save_obj(doc_list,'doc_list')

In [116]:
start = time()
# Collapse each document's token list back into one space-separated string.
mod_doc = list(map(' '.join, doc_list))
print('Building Time : {:.2f} mins'.format((time() - start) / 60))

Building Time : 0.00 mins


In [117]:
# Save the cleaned corpus: mod_doc is a flat list of strings (one space-joined
# document each), not a list of lists.
save_obj(mod_doc,'mod_doc')

In [52]:
# Show the tokens kept for the first document.
doc_list[0]

['sure',
 'story',
 'nad',
 'biased',
 'disagree',
 'statement',
 'media',
 'ruin',
 'israels',
 'reputation',
 'rediculous',
 'media',
 'proisraeli',
 'medium',
 'world',
 'live',
 'europe',
 'realize',
 'incidence',
 'describe',
 'letter',
 'occur',
 'medium',
 'try',
 'ignore',
 'subsidize',
 'israels',
 'existance',
 'europeans',
 'degree',
 'think',
 'reason',
 'report',
 'clearly',
 'atrocity',
 'shame',
 'austria',
 'daily',
 'report',
 'inhuman',
 'act',
 'commit',
 'israeli',
 'soldier',
 'blessing',
 'receive',
 'government',
 'holocaust',
 'guilt',
 'away',
 'look',
 'jews',
 'treat',
 'race',
 'power',
 'unfortunate']

In [53]:
# List the data/ directory in long format with human-readable sizes.
!ls -lh data/

total 7156488
-rw-r--r--  1 thein  staff     5B Feb 22 16:13 a.pkl
-rw-r--r--  1 thein  staff   320B Feb 24 22:21 coherence_vals.pkl
-rw-r--r--  1 thein  staff    16M Feb 23 21:46 doc_list.pkl
-rw-r--r--  1 thein  staff   1.0M Feb 23 00:54 lda_gensim.pkl
-rw-r--r--  1 thein  staff   2.5G Feb 22 15:54 mod_doc.pkl
-rw-r--r--  1 thein  staff   217M Feb 24 22:21 models.pkl
-rw-r--r--  1 thein  staff     0B Feb 22 15:48 obj.pkl
-rw-r--r--  1 thein  staff    24M Feb 23 17:52 word2vec.pkl
-rw-r--r--  1 thein  staff    79M Feb 23 17:57 word2vec_1000.pkl
-rw-r--r--  1 thein  staff   158M Feb 23 18:01 word2vec_2000.pkl
-rw-r--r--  1 thein  staff   393M Feb 23 18:06 word2vec_5000.pkl
