In [9]:
!pip install -q transformers

In [10]:
from __future__ import print_function
import ipywidgets as widgets
from transformers import pipeline

In [11]:
import numpy as np
import pandas as pd

In [12]:
## sentence classification - Sentiment Analysis

In [13]:
nlp_sentiment_model = pipeline('sentiment-analysis')

In [14]:
 nlp_sentiment_model('This is an excellent movie! Rellay nice plot and casting.')

[{'label': 'POSITIVE', 'score': 0.9998732805252075}]

In [15]:
nlp_sentiment_model('This movie was not so good.')

[{'label': 'NEGATIVE', 'score': 0.9997850060462952}]

In [16]:
# Token Classification - Named Entity Recognition

In [76]:
nlp_token_class = pipeline("ner")

In [81]:
text = ["LinkedIn was built to help professionals achieve more in their careers, and every day millions of people use our products to make connections, discover opportunities, and gain insights."
        "Our global reach means we get to make a direct impact on the world’s workforce in ways no other company can."
        "We’re much more than a digital resume -- we transform lives through innovative products and technology."
        "Every day, millions of posts, videos, and articles course through the LinkedIn feed, generating tens of thousands of comments every hour — and tens of millions more shares and likes."
        "We are looking for a research engineer/scientist to develop state of art vision algorithms to understand member posted visual content meaningfully."
        "You will be instrumental in improving the efficacy of communication and content exchange between millions of LinkedIn members by developing cutting edge content understanding and classification algorithms."
        "The understanding not only limited to extract features but to summarizing visual content, recognizing text and classifying it to the appropriate category. We own end to end stack from idea creation, POC, design to product deployment."
        "As part of a new and fast growing team of top-notch scientists and engineers, you will experience all the excitement and dynamism of a startup along with the scale and technology of a world-class enterprise."
  ]

In [82]:
nlp_token_class(text)

[{'word': 'Link',
  'score': 0.6165536642074585,
  'entity': 'I-ORG',
  'index': 1,
  'start': 0,
  'end': 4},
 {'word': '##ed',
  'score': 0.6635887026786804,
  'entity': 'I-ORG',
  'index': 2,
  'start': 4,
  'end': 6},
 {'word': '##I',
  'score': 0.8781384229660034,
  'entity': 'I-ORG',
  'index': 3,
  'start': 6,
  'end': 7},
 {'word': '##n',
  'score': 0.937221348285675,
  'entity': 'I-ORG',
  'index': 4,
  'start': 7,
  'end': 8},
 {'word': 'Link',
  'score': 0.6891728639602661,
  'entity': 'I-MISC',
  'index': 94,
  'start': 466,
  'end': 470},
 {'word': '##ed',
  'score': 0.6991322636604309,
  'entity': 'I-ORG',
  'index': 95,
  'start': 470,
  'end': 472},
 {'word': '##I',
  'score': 0.8263445496559143,
  'entity': 'I-ORG',
  'index': 96,
  'start': 472,
  'end': 473},
 {'word': '##n',
  'score': 0.9524123668670654,
  'entity': 'I-ORG',
  'index': 97,
  'start': 473,
  'end': 474},
 {'word': 'Link',
  'score': 0.9450805187225342,
  'entity': 'I-ORG',
  'index': 159,
  'start':

In [83]:
# Question Answering

In [84]:
nlp_na = pipeline("question-answering")

In [85]:
context= " LinkedIn was built to help professionals achieve more in their careers, and every day millions of people use our products to make connections, discover opportunities, and gain insights. Our global reach means we get to make a direct impact on the world’s workforce in ways no other company can. We’re much more than a digital resume -- we transform lives through innovative products and technology.Every day, millions of posts, videos, and articles course through the LinkedIn feed, generating tens of thousands of comments every hour — and tens of millions more shares and likes.We are looking for a research engineer/scientist to develop state of art vision algorithms to understand member posted visual content meaningfully. You will be instrumental in improving the efficacy of communication and content exchange between millions of LinkedIn members by developing cutting edge content understanding and classification algorithms. The understanding not only limited to extract features but to summarizing visual content, recognizing text and classifying it to the appropriate category. We own end to end stack from idea creation, POC, design to product deployment.As part of a new and fast growing team of top-notch scientists and engineers, you will experience all the excitement and dynamism of a startup along with the scale and technology of a world-class enterprise."

In [86]:
nlp_na(context = context, question = 'What is a research engineer?')

{'score': 0.20314958691596985,
 'start': 641,
 'end': 671,
 'answer': 'state of art vision algorithms'}

In [87]:
nlp_na(context = context, question = 'What are the requirements for research scientist?')

{'score': 0.34431007504463196,
 'start': 630,
 'end': 727,
 'answer': 'to develop state of art vision algorithms to understand member posted visual content meaningfully'}

In [88]:
nlp_na(context = context, question = 'What are the work to be done for product development?')

{'score': 0.06225580349564552,
 'start': 869,
 'end': 933,
 'answer': 'cutting edge content understanding and classification algorithms'}

In [89]:
nlp_na(context = context, question = 'How we scale a world-class enterprise?')

{'score': 0.37905916571617126,
 'start': 1246,
 'end': 1310,
 'answer': 'you will experience all the excitement and dynamism of a startup'}

In [90]:
#  Text Generation - Mask Filling

In [91]:
nlp_fill = pipeline('fill-mask')

In [92]:
nlp_fill('We are looking for ' + nlp_fill.tokenizer.mask_token)

[{'sequence': 'We are looking for answers',
  'score': 0.06869515031576157,
  'token': 5274,
  'token_str': ' answers'},
 {'sequence': 'We are looking for solutions',
  'score': 0.032083362340927124,
  'token': 2643,
  'token_str': ' solutions'},
 {'sequence': 'We are looking for volunteers',
  'score': 0.028713900595903397,
  'token': 4618,
  'token_str': ' volunteers'},
 {'sequence': 'We are looking for submissions',
  'score': 0.021905235946178436,
  'token': 18219,
  'token_str': ' submissions'},
 {'sequence': 'We are looking for recommendations',
  'score': 0.02122603915631771,
  'token': 4664,
  'token_str': ' recommendations'}]

In [93]:
corpus = ["LinkedIn was built to help professionals achieve more in their careers, and every day millions of people use our products to make connections, discover opportunities, and gain insights."
        "Our global reach means we get to make a direct impact on the world’s workforce in ways no other company can."
        "We’re much more than a digital resume -- we transform lives through innovative products and technology."
        "Every day, millions of posts, videos, and articles course through the LinkedIn feed, generating tens of thousands of comments every hour — and tens of millions more shares and likes."
        "We are looking for a research engineer/scientist to develop state of art vision algorithms to understand member posted visual content meaningfully."
        "You will be instrumental in improving the efficacy of communication and content exchange between millions of LinkedIn members by developing cutting edge content understanding and classification algorithms."
        "The understanding not only limited to extract features but to summarizing visual content, recognizing text and classifying it to the appropriate category. We own end to end stack from idea creation, POC, design to product deployment."
        "As part of a new and fast growing team of top-notch scientists and engineers, you will experience all the excitement and dynamism of a startup along with the scale and technology of a world-class enterprise."
  ]

In [94]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\thudi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\thudi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [95]:
import nltk
import re

stop_words = nltk.corpus.stopwords.words('english')

def normalize_document(doc):
    # lower case and remove special characters\whitespaces
    doc = re.sub(r'[^a-zA-Z\s]', '', doc, re.I|re.A)
    doc = doc.lower()
    doc = doc.strip()
    # tokenize document
    tokens = nltk.word_tokenize(doc)
    # filter stopwords out of document
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # re-create document from filtered tokens
    doc = ' '.join(filtered_tokens)
    return doc

normalize_corpus = np.vectorize(normalize_document)

norm_corpus = normalize_corpus(corpus)
norm_corpus

array(['linkedin built help professionals achieve careers every day millions people use products make connections discover opportunities gain insightsour global reach means get make direct impact worlds workforce ways company canwere much digital resume transform lives innovative products technologyevery day millions posts videos articles course linkedin feed generating tens thousands comments every hour tens millions shares likeswe looking research engineerscientist develop state art vision algorithms understand member posted visual content meaningfullyyou instrumental improving efficacy communication content exchange millions linkedin members developing cutting edge content understanding classification algorithmsthe understanding limited extract features summarizing visual content recognizing text classifying appropriate category end end stack idea creation poc design product deploymentas part new fast growing team topnotch scientists engineers experience excitement dynamism startup 

In [96]:
# Bag of words 

In [97]:
from sklearn.feature_extraction.text import CountVectorizer

In [98]:
cv = CountVectorizer(min_df=0., max_df=1.)
cv_matrix = cv.fit_transform(norm_corpus)
cv_matrix = cv_matrix.toarray()
cv_matrix

array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 2,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 2, 1, 1, 1,
        1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1]],
      dtype=int64)

In [99]:
# get all unique words in the corpus
vocab = cv.get_feature_names()
# show document feature vectors
pd.DataFrame(cv_matrix, columns=vocab)

Unnamed: 0,achieve,algorithms,algorithmsthe,along,appropriate,art,articles,built,canwere,careers,...,understand,understanding,use,videos,vision,visual,ways,workforce,worldclass,worlds
0,1,1,1,1,1,1,1,1,1,1,...,1,2,1,1,1,2,1,1,1,1


In [100]:
# Bag of N_Grams Models

In [112]:
# We can set N_Gram Ranage from 1,2 to unigramsas well as bigrams and more.

bv = CountVectorizer(ngram_range=(2,2))
bv_matrix = bv.fit_transform(norm_corpus)

bv_matrix = bv_matrix.toarray()
vocab = bv.get_feature_names()
pd.DataFrame(bv_matrix,columns=vocab)

Unnamed: 0,achieve careers,algorithms understand,algorithmsthe understanding,along scale,appropriate category,art vision,articles course,built help,canwere much,careers every,...,understanding classification,understanding limited,use products,videos articles,vision algorithms,visual content,ways company,workforce ways,worldclass enterprise,worlds workforce
0,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,2,1,1,1,1


In [113]:
## Lets create TD-IDF Model

In [118]:
from sklearn.feature_extraction.text import TfidfVectorizer

tv = TfidfVectorizer(min_df=0.,max_df=1.,use_idf=True)
tv_matrix = tv.fit_transform(norm_corpus)
tv_matrix = tv_matrix.toarray()

vocab = tv.get_feature_names()
pd.DataFrame(np.round(tv_matrix,2),columns=vocab)

Unnamed: 0,achieve,algorithms,algorithmsthe,along,appropriate,art,articles,built,canwere,careers,...,understand,understanding,use,videos,vision,visual,ways,workforce,worldclass,worlds
0,0.08,0.08,0.08,0.08,0.08,0.08,0.08,0.08,0.08,0.08,...,0.08,0.15,0.08,0.08,0.08,0.15,0.08,0.08,0.08,0.08


In [121]:
# Document Similarity

from sklearn.metrics.pairwise import cosine_similarity

similarity_matrix = cosine_similarity(tv_matrix)
similarity_df=pd.DataFrame(similarity_matrix)
similarity_df

Unnamed: 0,0
0,1.0
