<a href="https://colab.research.google.com/github/vikrantpotnis123/DS/blob/master/spacy_nlk_gensim_basics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install spacy

In [None]:
import numpy as np
import pprint
import spacy

In [None]:
# Shared pretty-printer (4-space indent) that the cells below use to display their results.
pp = pprint.PrettyPrinter(indent=4)

### **Word Tokenize**

In [None]:
# Word tokenization with a bare English pipeline
from spacy.lang.en import English

# English() only instantiates the language class; no trained model
# (such as en_core_web_sm) is loaded in this cell.
nlp = English()

text = """When learning data science, you shouldn't get discouraged!
Challenges and setbacks aren't failures, they're just part of the journey. You've got this!"""

# Running the pipeline over the raw string produces a document that can be
# iterated token by token.
my_doc = nlp(text)

# Collect the text of every token and display the resulting list.
token_list = [tok.text for tok in my_doc]
pp.pprint(token_list)

### **Pipeline**
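For each document, print the document itself, then its named entities (text, label, lemma), then every token's text, part-of-speech tag and dependency label.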

In [None]:
import spacy

# Sample sentences to push through the pipeline.
texts = [
    "Net income was $9.4 million compared to the prior year of $2.7 million.",
    "Revenue exceeded twelve billion dollars, with a loss of $1b.",
    "Apple is looking at buying U.K. startup for $1 billion",
    "Manchester United isn't looking to sign any forward."
]

# Load the en_core_web_sm model; this rebinds `nlp` for the rest of the cell.
nlp = spacy.load("en_core_web_sm")

# nlp.pipe processes the texts as a stream and yields one document per input.
for doc in nlp.pipe(texts):
    # One row per named entity: (text, label, lemma).
    entity_rows = [(span.text, span.label_, span.lemma_) for span in doc.ents]
    # One row per token: (text, part-of-speech tag, dependency label).
    token_rows = [(word.text, word.pos_, word.dep_) for word in doc]

    pp.pprint(doc)
    pp.pprint(entity_rows)
    pp.pprint(token_rows)



### **Sentencizer**

In [None]:
# Sentence tokenization with the rule-based sentencizer

# Start from a bare English pipeline; no trained model is loaded here.
# `English` comes from the import in the word-tokenization cell above.
nlp = English()

# spaCy v3 adds built-in components by their registered string name:
# add_pipe creates the component, appends it to the pipeline and returns it.
# The older v2 idiom, nlp.add_pipe(nlp.create_pipe('sentencizer')), is
# rejected by v3 because add_pipe no longer accepts a component object.
sbd = nlp.add_pipe('sentencizer')

text = """When learning data science, you shouldn't get discouraged!
Challenges and setbacks aren't failures, they're just part of the journey. You've got this!"""

# Run the pipeline; the sentencizer sets the sentence boundaries on the doc.
doc = nlp(text)

# Collect the text of every detected sentence and display the list.
sents_list = [sent.text for sent in doc.sents]
pp.pprint(sents_list)

###  **Stop words**


In [None]:
import spacy
# Load the en_core_web_sm model under the name `sp`; later cells reuse `sp`
# to obtain the stop-word collection.
sp = spacy.load('en_core_web_sm')
# Display the stop words defined by the model's language defaults.
pp.pprint(sp.Defaults.stop_words)

### **NLTK stemmer**

Stemming refers to reducing a word to its root form.

spaCy doesn't contain any function for stemming as it relies on **lemmatization** only. 

NLTK provides several stemmers, including the Porter and Snowball stemmers; the example below uses the Porter stemmer.

In [None]:
import nltk
# Import the stemmer class by name instead of `import *`, so it stays obvious
# where every name in the notebook namespace comes from.
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()

# Stem groups of related word forms and show each word next to its stem.
for token in ['compute', 'computer', 'computed', 'computing', 'victor', 'victory', 'victim', 'victorious', 'victimized' , 'victimize']:
    pp.pprint(token + ' --> ' + stemmer.stem(token))


### **GenSim**

1.   Document: some text.
2.   Corpus: a collection of documents.
3.   Vector: a mathematically convenient representation of a document.
4.   Model: an algorithm for transforming vectors from one representation to another





In [None]:
# Inspect the container type of spaCy's stop-word collection, which a later cell uses as the stoplist.
type(sp.Defaults.stop_words)

In [None]:
import bs4 as bs
import urllib.request
import re
def download_wiki(url, debug = False):
  """Download a web page and return the letters-only text of its paragraphs.

  The page is fetched, the text of every ``<p>`` element is concatenated,
  bracketed numeric markers such as ``[12]`` are replaced by a space, every
  character other than an ASCII letter is replaced by a space, and runs of
  whitespace are collapsed to a single space.

  Args:
    url: Address of the page to fetch (passed to ``urllib.request.urlopen``).
    debug: When true, pretty-print the first 100 characters of the text
      before and after the non-letter characters are replaced.

  Returns:
    The cleaned paragraph text as one string.
  """
  # Use the response as a context manager so the connection is always
  # closed, even if reading the body raises.
  with urllib.request.urlopen(url) as wiki_data:
    wiki_page = wiki_data.read()

  parsed_wiki = bs.BeautifulSoup(wiki_page, 'lxml')
  # Join the paragraph texts in one pass instead of growing a string in a loop.
  wiki_text = "".join(p.text for p in parsed_wiki.find_all('p'))

  # Replace bracketed numeric markers with a space, then collapse whitespace.
  wiki_text = re.sub(r'\[[0-9]*\]', ' ', wiki_text)
  wiki_text = re.sub(r'\s+', ' ', wiki_text)

  if debug:
    pp.pprint(wiki_text[0:100])

  # Replace everything that is not an ASCII letter (digits, punctuation, ...).
  formatted_wiki_text = re.sub('[^a-zA-Z]', ' ', wiki_text)

  if debug:
    pp.pprint(formatted_wiki_text[0:100])

  # The substitution above can leave runs of spaces; collapse them again.
  formatted_wiki_text = re.sub(r'\s+', ' ', formatted_wiki_text)
  return formatted_wiki_text

In [None]:
# Optional alternative input: pull the text from a live Wikipedia page.
# text_corpus = download_wiki("https://en.wikipedia.org/wiki/Abraham_Lincoln")
# pp.pprint(text_corpus)
from collections import defaultdict
from gensim import corpora

# Small in-memory corpus: one short title per document.
documents = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

# Lower-case each document, split it on whitespace and drop spaCy's stop words.
stoplist = sp.Defaults.stop_words
texts = [
    [term for term in title.lower().split() if term not in stoplist]
    for title in documents
]

# Count how often each remaining token occurs across the whole corpus.
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

# Keep only the tokens that occur more than once overall.
texts = [
    [term for term in tokens if frequency[term] > 1]
    for tokens in texts
]

# Build the dictionary from the filtered texts, then turn every text into
# its bag-of-words vector.
dictionary = corpora.Dictionary(texts)
corpus = list(map(dictionary.doc2bow, texts))
pprint.pprint(corpus)


In [None]:
# Create a TF-IDF transform
from gensim import models

# Fit the TF-IDF model on the bag-of-words corpus. Without this step the name
# `tfidf` is undefined and the cell raises NameError on a fresh kernel.
tfidf = models.TfidfModel(corpus)

# Indexing the fitted model with the corpus applies the transform to each
# bag-of-words vector.
corpus_tfidf = tfidf[corpus]
for doc in corpus_tfidf:
    print(doc)