# Module 1 Get Started on spaCy

In [3]:
import spacy

## Load Language Model

In [None]:
nlp = spacy.load("en")

# Module 2 Linguistic Features

## Process text/corpus with nlp

In [None]:
text = """Dostoevsky was the son of a doctor. 
His parents were very hard-working and deeply religious people,
but so poor that they lived with their five children in only
two rooms. The father and mother spent their evenings
in reading aloud to their children, generally from books of
a serious character."""

In [None]:
doc = nlp(text)
doc

In [None]:
# Get text from file

# Use a context manager so the file handle is closed as soon as it is read
with open('sample.txt') as f:
    text = f.read()
text

In [None]:
doc = nlp(text)
doc

In [None]:
import pandas as pd

df = pd.read_csv('research_paper.csv')
df.head()


In [None]:
INFO_text = [text for text in df[df['Conference'] == 'INFOCOM']['Title']]
INFO_text
doc = nlp(INFO_text[0])
doc

## Tokenization


### Word Tokenization

In [None]:
doc = nlp(u'Today is a great day')
[word.text for word in doc]

In [None]:
# text = open('sample.txt').read()
# doc = nlp(text)

word_tokens = [token.text for token in doc]
word_tokens

In [None]:
# Tally occurrences of words in a list
from collections import Counter

cnt = Counter()
for word in ['red', 'blue', 'red', 'green', 'blue', 'blue']:
    cnt[word] += 1
cnt

#### Ex: Count the Unique Words

In [None]:
# Read the sample file, closing the handle once its content is loaded
with open('sample.txt') as f:
    text = f.read()
doc = nlp(text)

word_tokens = [token.text for token in doc]

# Build a fresh Counter: reusing the `cnt` from the previous cell would mix
# its colour tallies ('red', 'blue', 'green') into the word counts.
cnt = Counter(word_tokens)
cnt

### Regular Expression

In [None]:
import re

text= "Clutching the coin, Maria ran to the shops. She went straight to the counter and bought the sweets"
words = re.findall(r'\w+',text)
words

In [None]:
import re

# Read and lower-case the file inside a context manager so the handle is closed
with open('sample.txt') as f:
    words = re.findall(r'\w+', f.read().lower())
# Counter(words)
Counter(words).most_common(10)


### Sentence Tokenization

In [None]:
# Sentence Tokenization

doc = nlp(u"This is a sentence. This is another sentence.")
for sent in doc.sents:
    print(sent.text)

In [None]:
# Load the sample text (closing the file afterwards) and print it sentence by sentence
with open('sample.txt') as f:
    text = f.read()
doc = nlp(text)
for sent in doc.sents:
    print(sent.text)


## Stop Words

In [None]:
doc[1].is_stop

In [None]:
[word.is_stop for word in doc]

In [None]:
nlp.vocab["the"].is_stop = True

In [None]:
[word.is_stop for word in doc]

In [None]:
# Default stop words in spaCy

stopwords = nlp.Defaults.stop_words
stopwords

In [None]:
# Default stop words in NLTK

from nltk.corpus import stopwords
stopwords = stopwords.words('english')
stopwords

In [None]:
# import the punctuations

import string
punctuations = string.punctuation
punctuations

#### Ex: Stopwords

In [None]:
text= "Clutching the coin, Maria ran to the shops. She went straight to the counter and bought the sweets"
doc = nlp(text)

tokens = [tok.text for tok in doc if tok.text not in stopwords and tok.text not in punctuations]
cleaned_doc = ' '.join(tokens)
cleaned_doc

## Stemming

In [None]:
from nltk.stem import PorterStemmer, LancasterStemmer
from nltk.tokenize import word_tokenize

stemmer = PorterStemmer()
# stemmer = LancasterStemmer()

text = "I ran to the clinic with running nose"
words = word_tokenize(text)
words
[stemmer.stem(w) for w in words]

## Lemmatization

In [None]:
# doc = nlp(u'running run')
doc = nlp(u'meaning mean')
doc = nlp(u'meanness meaning mean')

for token in doc:
	print(token.text,token.lemma_)

In [None]:
doc = nlp(u"Apples and oranges are similar. Boots and hippos aren't.")

for token in doc:
    print(token, token.lemma_)

In [None]:
# text = open('sample.txt').read()
# doc = nlp(text)

for token in doc:
    print(token, token.lemma_)

### Lemma for Pronoun

In [None]:
text= "She He They We"
doc = nlp(text)
[tok.lemma_ for tok in doc]

#### Ex: Remove Pronoun

In [None]:
text= "Clutching the coin, Maria ran to the shops. She went straight to the counter and bought the sweets"
doc = nlp(text)
tokens = [tok.text for tok in doc]
print(tokens)
tokens = [tok.lemma_ for tok in doc if tok.lemma_ != '-PRON-']
print(tokens)


In [None]:
tokens = [tok for tok in tokens if tok not in stopwords and tok not in punctuations]
cleaned_doc = ' '.join(tokens)
cleaned_doc

## Part of Speech (POS)

In [None]:
# text = open('sample.txt').read()
# doc = nlp(text)

doc = nlp(u'The cat sit on the mat')
for token in doc:
    print(token.text,token.pos_)

In [None]:

for token in doc:
    print(token.text,token.tag_)

## Noun Chunk and Dependency

In [None]:
# text = open('sample.txt').read()
# doc = nlp(text)

doc = nlp(u'The cat sit on the mat')
for chunk in doc.noun_chunks:
    print(chunk.text, chunk.root.text, chunk.root.dep_, chunk.root.head.text)

In [None]:
import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp(u"Autonomous cars shift insurance liability toward manufacturers")
for chunk in doc.noun_chunks:
    print(chunk.text, chunk.root.text, chunk.root.dep_,chunk.root.head.text)

In [None]:
doc = nlp('Wall Street Journal just published an interesting piece on crypto currencies')
 
for token in doc:
    print("{0}/{1} <--{2}-- {3}/{4}".format(
        token.text, token.tag_, token.dep_, token.head.text, token.head.tag_))

 

##  Visualize Dependency and POS 

In [None]:
from spacy import displacy

doc = nlp(u'The cat sit on the mat')
displacy.serve(doc, style='dep')

In [None]:
from spacy import displacy

doc = nlp(u'The cat sit on the mat')
options = {'compact': True, 'bg': '#09a3d5',
           'color': 'white', 'font': 'Source Sans Pro'}
displacy.serve(doc, style='dep', options=options)

In [None]:
from spacy import displacy

doc = nlp(u'The cat sit on the mat')
html = displacy.render([doc], style='dep', jupyter=True)

## Named Entity Recognition (NER)

In [None]:
doc = nlp(u'Lee Kuan Yew is the prime minister for Singapore')

for ent in doc.ents:
    print(ent.text, ent.label_)

In [None]:
from spacy import displacy

doc = nlp(u'Lee Kuan Yew is the prime minister for Singapore')

doc.user_data['title'] = 'This is a title'
displacy.serve(doc, style='ent')

In [None]:
from spacy import displacy

doc = nlp(u'Lee Kuan Yew is the prime minister for Singapore')

colors = {'ORG': 'linear-gradient(90deg, #aa9cfc, #fc9ce7)'}
options = {'ents': ['ORG'], 'colors': colors}
displacy.serve(doc, style='ent', options=options)

In [None]:
from spacy import displacy

doc = nlp(u'Lee Kuan Yew is the prime minister for Singapore')

html = displacy.render([doc], style='ent', jupyter=True)

In [None]:
from spacy import displacy

doc = nlp(u'Lee Kuan Yew is the prime minister for Singapore')

colors = {'ORG': 'linear-gradient(90deg, #aa9cfc, #fc9ce7)'}
options = {'ents': ['ORG'], 'colors': colors}
html = displacy.render([doc], style='ent', jupyter=True,options=options)

# Module 3 Processing Pipelines

## Default Pipeline

In [None]:
nlp.pipeline

In [None]:
nlp.pipe_names

## Disable Components

In [None]:
nlp = spacy.load('en', disable=['parser', 'tagger'])
doc = nlp(u'The cat sit on the mat')
for token in doc:
    print(token.text,token.pos_)

In [None]:
nlp = spacy.load('en')
doc = nlp(u'The cat sit on the mat')
for token in doc:
    print(token.text,token.pos_)

In [None]:
nlp = spacy.load('en')
nlp.remove_pipe('tagger')
doc = nlp(u'The cat sit on the mat')
for token in doc:
    print(token.text,token.pos_)


## Rename Components

In [None]:
nlp = spacy.load('en')
nlp.rename_pipe('ner', 'entityrecognizer')
nlp.pipe_names

## Adding Custom Component

In [None]:
def my_component(doc):
    """Pipeline component that reports how many tokens ``doc`` holds.

    Prints the token count, adds a remark when there are fewer than ten
    tokens, and returns the same ``doc`` it received.
    """
    n_tokens = len(doc)
    print("After tokenization, this doc has %s tokens." % n_tokens)
    is_short = n_tokens < 10
    if is_short:
        print("This is a pretty short document.")
    return doc



In [None]:
nlp = spacy.load('en')
nlp.add_pipe(my_component, name='print_info', first=True)
nlp.pipe_names


In [None]:
doc = nlp(u'The cat sit on the mat')
doc

### Ex: Add Component to Pipeline

In [None]:
def my_component2(doc):
    """Pipeline component that prints one line per token.

    Each line shows the token's ``text``, ``pos_`` and ``tag_`` attributes
    separated by spaces. The ``doc`` itself is returned unchanged.
    """
    for token in doc:
        fields = (token.text, token.pos_, token.tag_)
        print(*fields)
    return doc


In [None]:
nlp.add_pipe(my_component2, name="print_pos",last=True)
nlp.pipe_names

In [None]:
nlp = spacy.load('en')
nlp.add_pipe(my_component2, name="print_pos", after='parser')
nlp.pipe_names

### Ex2 : Add A Clean Up Component to Pipeline

In [None]:
import string
def clean_text(doc):
    """Pipeline component that drops stop words, punctuation and newlines.

    Lower-cases every token text, removes the ones found in
    ``nlp.Defaults.stop_words``, in ``string.punctuation`` or equal to a
    newline, joins the remainder with single spaces and returns the result
    of ``nlp.make_doc`` on that string.
    """
    stopwords = nlp.Defaults.stop_words
    punctuations = string.punctuation
    # Lower-case BEFORE filtering: spaCy's default stop words are lower-case,
    # so testing the raw text would let capitalized stop words ("The", "She")
    # through the filter.
    words = [tok.text.lower() for tok in doc]
    kept = [
        word for word in words
        if word not in stopwords and word not in punctuations and word != '\n'
    ]
    return nlp.make_doc(' '.join(kept))

# Module 4 Vectors & Similarity

## Count Vectorization

In [None]:
import re
import string
import nltk

stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

def clean_text(text):
    """Turn ``text`` into a list of stemmed tokens.

    Removes every character found in ``string.punctuation``, lower-cases the
    rest, splits on runs of non-word characters, discards stop words and
    empty tokens, and stems what is left with the module-level stemmer ``ps``.
    """
    text = "".join([char.lower() for char in text if char not in string.punctuation])
    # Raw string: '\W' inside a normal literal is an invalid escape sequence
    # and triggers a warning on recent Python versions.
    tokens = re.split(r'\W+', text)
    # re.split yields '' when the text is empty or starts/ends with a
    # separator; skip those so they do not end up as a bogus feature.
    return [ps.stem(word) for word in tokens if word and word not in stopwords]

In [None]:
from sklearn.feature_extraction.text import CountVectorizer


data = pd.read_csv('research_paper.csv')[0:5]
     
count_vect = CountVectorizer(analyzer=clean_text)
# count_vect = CountVectorizer()
X_counts = count_vect.fit_transform(data['Title'])
print(X_counts.shape)


In [None]:
print(count_vect.get_feature_names())

In [None]:
X_counts_df = pd.DataFrame(X_counts.toarray())
X_counts_df

## Gensim Word2Vec

In [None]:
from gensim.models import Word2Vec
from nltk.corpus import gutenberg

embedding = Word2Vec(gutenberg.sents(),min_count=1, window=5, size=32)

print(embedding.most_similar('man', topn=5))
print(embedding.most_similar('woman', topn=5))


## Similarity

In [None]:
nlp = spacy.load('en_core_web_md')
tokens = nlp(u'dog cat banana afskfsd')

for token1 in tokens:
    for token2 in tokens:
        print(token1.text, token2.text, token1.similarity(token2))

## Pre-trained Word Vectors

In [5]:
nlp = spacy.load('en_core_web_lg')

dog = nlp.vocab['dog']
dog.vector

array([-4.0176e-01,  3.7057e-01,  2.1281e-02, -3.4125e-01,  4.9538e-02,
        2.9440e-01, -1.7376e-01, -2.7982e-01,  6.7622e-02,  2.1693e+00,
       -6.2691e-01,  2.9106e-01, -6.7270e-01,  2.3319e-01, -3.4264e-01,
        1.8311e-01,  5.0226e-01,  1.0689e+00,  1.4698e-01, -4.5230e-01,
       -4.1827e-01, -1.5967e-01,  2.6748e-01, -4.8867e-01,  3.6462e-01,
       -4.3403e-02, -2.4474e-01, -4.1752e-01,  8.9088e-02, -2.5552e-01,
       -5.5695e-01,  1.2243e-01, -8.3526e-02,  5.5095e-01,  3.6410e-01,
        1.5361e-01,  5.5738e-01, -9.0702e-01, -4.9098e-02,  3.8580e-01,
        3.8000e-01,  1.4425e-01, -2.7221e-01, -3.7016e-01, -1.2904e-01,
       -1.5085e-01, -3.8076e-01,  4.9583e-02,  1.2755e-01, -8.2788e-02,
        1.4339e-01,  3.2537e-01,  2.7226e-01,  4.3632e-01, -3.1769e-01,
        7.9405e-01,  2.6529e-01,  1.0135e-01, -3.3279e-01,  4.3117e-01,
        1.6687e-01,  1.0729e-01,  8.9418e-02,  2.8635e-01,  4.0117e-01,
       -3.9222e-01,  4.5217e-01,  1.3521e-01, -2.8878e-01, -2.28

In [None]:
from scipy import spatial
 
cosine_similarity = lambda x, y: 1 - spatial.distance.cosine(x, y)
 
man = nlp.vocab['man'].vector
woman = nlp.vocab['woman'].vector
queen = nlp.vocab['queen'].vector
king = nlp.vocab['king'].vector
 
# We now need to find the closest vector in the vocabulary to the result of "man" - "woman" + "queen"
maybe_king = man - woman + queen
computed_similarities = []
 
for word in nlp.vocab:
    # Ignore words without vectors
    if not word.has_vector:
        continue
 
    similarity = cosine_similarity(maybe_king, word.vector)
    computed_similarities.append((word, similarity))
 
computed_similarities = sorted(computed_similarities, key=lambda item: -item[1])
print([w[0].text for w in computed_similarities[:10]])

In [6]:
banana = nlp.vocab['banana']
dog = nlp.vocab['dog']
fruit = nlp.vocab['fruit']
animal = nlp.vocab['animal']
 
print(dog.similarity(animal), dog.similarity(fruit)) 
print(banana.similarity(fruit), banana.similarity(animal)) 

0.6618534 0.23552845
0.67148364 0.2427285


In [7]:
target = nlp("Cats are beautiful animals.")
 
doc1 = nlp("Dogs are awesome.")
doc2 = nlp("Some gorgeous creatures are felines.")
doc3 = nlp("Dolphins are swimming mammals.")
 
print(target.similarity(doc1)) 
print(target.similarity(doc2))  
print(target.similarity(doc3))  

0.8901765218466683
0.9115828449161616
0.7822956752876101


# Module 5 Machine Learning using spaCy

In [None]:
import pandas as pd

df = pd.read_csv('research_paper.csv')
df.head()

In [None]:
# Check the data shape
df.shape

In [None]:
# Check Missing values
df.isnull().sum()

In [None]:
# Check the number of unique conferences
df['Conference'].nunique()

In [None]:
# Split the data into train and test

from sklearn.model_selection import train_test_split

train, test = train_test_split(df, test_size=0.33, random_state=42)

In [None]:
# Do a quick check after split

print('Research title sample:', train['Title'].iloc[0])
print('Conference of this paper:', train['Conference'].iloc[0])
print('Training Data Shape:', train.shape)
print('Testing Data Shape:', test.shape)

In [None]:
%matplotlib inline

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

fig = plt.figure(figsize=(8,4))
# Take labels and heights from the same Series: value_counts() is ordered by
# frequency while unique() is ordered by first appearance, so pairing the two
# would attach the wrong conference name to each bar.
conference_counts = train['Conference'].value_counts()
sns.barplot(x=conference_counts.index, y=conference_counts.values)
plt.show()

In [None]:
import spacy
import string

from nltk.corpus import stopwords
stopwords = stopwords.words('english')

nlp = spacy.load('en_core_web_sm')
punctuations = string.punctuation

# Clean up texts by removing personal pronouns, stop words and punctuation
def cleanup_text(docs, logging=False):
    """Return a ``pd.Series`` holding one cleaned, space-joined string per text.

    Every text in ``docs`` is processed by ``nlp`` with the parser and NER
    disabled. Lemmas are lower-cased and stripped, tokens whose lemma is
    '-PRON-' are skipped, and anything found in ``stopwords`` or
    ``punctuations`` is dropped. When ``logging`` is true, a progress line is
    printed at every 1000th document.
    """
    cleaned = []
    for position, raw_text in enumerate(docs, start=1):
        if logging and position % 1000 == 0:
            print("Processed %d out of %d documents." % (position, len(docs)))
        parsed = nlp(raw_text, disable=['parser', 'ner'])
        lemmas = []
        for tok in parsed:
            if tok.lemma_ == '-PRON-':
                continue
            lemma = tok.lemma_.lower().strip()
            if lemma in stopwords or lemma in punctuations:
                continue
            lemmas.append(lemma)
        cleaned.append(' '.join(lemmas))
    return pd.Series(cleaned)

In [None]:
from collections import Counter

INFO_text = [text for text in train[train['Conference'] == 'INFOCOM']['Title']]

IS_text = [text for text in train[train['Conference'] == 'ISCAS']['Title']]

INFO_clean = cleanup_text(INFO_text)
INFO_clean = ' '.join(INFO_clean).split()

IS_clean = cleanup_text(IS_text)
IS_clean = ' '.join(IS_clean).split()

INFO_counts = Counter(INFO_clean)
IS_counts = Counter(IS_clean)

In [None]:
INFO_clean

In [None]:
INFO_common_words = [word[0] for word in INFO_counts.most_common(20)]
INFO_common_counts = [word[1] for word in INFO_counts.most_common(20)]

fig = plt.figure(figsize=(18,6))
sns.barplot(x=INFO_common_words, y=INFO_common_counts)
plt.title('Most Common Words used in the research papers for conference INFOCOM')
plt.show()

In [None]:
IS_common_words = [word[0] for word in IS_counts.most_common(20)]
IS_common_counts = [word[1] for word in IS_counts.most_common(20)]

fig = plt.figure(figsize=(18,6))
sns.barplot(x=IS_common_words, y=IS_common_counts)
plt.title('Most Common Words used in the research papers for conference ISCAS')
plt.show()

In [None]:
# ENGLISH_STOP_WORDS is publicly exported from sklearn.feature_extraction.text;
# the private `stop_words` module is not available in newer scikit-learn.
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk.corpus import stopwords
import string

# Union of the NLTK and scikit-learn English stop word lists
STOPLIST = set(stopwords.words('english') + list(ENGLISH_STOP_WORDS))
# Every ASCII punctuation character plus a few extra symbols; both the opening
# and the closing curly quote are listed (the closing one used to appear twice).
SYMBOLS = list(string.punctuation) + ["-", "...", "“", "”"]

In [None]:
from sklearn.base import TransformerMixin

import spacy
spacy.load('en')
from spacy.lang.en import English
parser = English()

class CleanTextTransformer(TransformerMixin):
    """Stateless pipeline step that applies ``cleanText`` to every input text."""

    def fit(self, X, y=None, **fit_params):
        # Nothing is learned from the data; hand back the transformer itself.
        return self

    def transform(self, X, **transform_params):
        # Normalize each text with the module-level cleanText helper.
        return list(map(cleanText, X))

    def get_params(self, deep=True):
        # This step exposes no parameters.
        return dict()
    
def cleanText(text):
    """Trim ``text``, replace newlines and carriage returns by spaces, lower-case it."""
    trimmed = text.strip()
    for line_break in ("\n", "\r"):
        trimmed = trimmed.replace(line_break, " ")
    return trimmed.lower()

def tokenizeText(sample):
    """Tokenize ``sample`` with ``parser`` and return the filtered lemma list.

    Each token contributes its lower-cased, stripped ``lemma_``, except tokens
    whose lemma is "-PRON-", which contribute their ``lower_`` attribute
    instead. Entries present in ``STOPLIST`` or ``SYMBOLS`` are discarded.
    """
    def normalize(tok):
        # Pronouns keep their own lower-cased form instead of the placeholder lemma.
        if tok.lemma_ == "-PRON-":
            return tok.lower_
        return tok.lemma_.lower().strip()

    candidates = map(normalize, parser(sample))
    return [item for item in candidates if item not in STOPLIST and item not in SYMBOLS]

## Vectorize the Features

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(tokenizer=tokenizeText, ngram_range=(1,1))

## SVM Classifier

In [None]:
from sklearn.svm import LinearSVC

clf = LinearSVC()


## Pipeline

In [None]:
from sklearn.pipeline import Pipeline

pipe = Pipeline([('cleanText', CleanTextTransformer()), ('vectorizer', vectorizer), ('clf', clf)])


## Training the Pipeline

In [None]:
# data
X_train = train['Title'].tolist()
y_train = train['Conference'].tolist() 

# train
pipe.fit(X_train,y_train)

## Testing the Model

In [None]:
# Held-out titles and their true conference labels
X_test = test['Title'].tolist()
y_test = test['Conference'].tolist()
# Predict with the fitted pipeline
y_pred = pipe.predict(X_test)

from sklearn.metrics import accuracy_score

# Label spelling fixed ("accuray" -> "accuracy")
print("accuracy = ",accuracy_score(y_test,y_pred))
print("Top 10 features used to predict: ")

In [None]:
def printNMostInformative(vectorizer, clf, N):
    """Print the N lowest- and N highest-weighted features of a linear model.

    Args:
        vectorizer: fitted vectorizer exposing ``get_feature_names_out()`` or
            the older ``get_feature_names()``.
        clf: fitted classifier with a ``coef_`` attribute. Only its first row
            (``clf.coef_[0]``) is inspected.
        N: number of (coefficient, feature) pairs printed per group.
    """
    # Newer scikit-learn releases dropped get_feature_names() in favour of
    # get_feature_names_out(); prefer the new name and fall back to the old one.
    get_names = getattr(vectorizer, "get_feature_names_out", None)
    if get_names is None:
        get_names = vectorizer.get_feature_names
    feature_names = get_names()
    # (coefficient, feature) pairs in ascending order of coefficient
    coefs_with_fns = sorted(zip(clf.coef_[0], feature_names))
    topClass1 = coefs_with_fns[:N]
    # Last N pairs, largest coefficient first
    topClass2 = coefs_with_fns[:-(N + 1):-1]
    print("Class 1 best: ")
    for feat in topClass1:
        print(feat)
    print("Class 2 best: ")
    for feat in topClass2:
        print(feat)


In [None]:
printNMostInformative(vectorizer, clf, 10)

In [None]:
from sklearn import metrics

# Use the labels and predictions computed in the testing cell (y_test / y_pred);
# the previous names (labelsTest1, preds) were never defined. target_names is
# omitted so each row is named after its own label rather than relying on
# df['Conference'].unique(), whose order need not match the report's label order.
print(metrics.classification_report(y_test, y_pred))