In [13]:
from sentence_transformers import SentenceTransformer, util
from transformers import pipeline

### Vectorizing Sentences

In [2]:
model = SentenceTransformer('all-MiniLM-L6-v2')

In [4]:
model.encode('I love doing NLP')

array([ 3.45676602e-03, -1.91000011e-02,  5.86551502e-02, -9.63218044e-03,
        6.08638711e-02,  3.79803404e-03,  6.73312396e-02,  3.24859507e-02,
        6.68662861e-02,  6.60145730e-02, -2.31421590e-02, -3.50490510e-02,
       -4.55491766e-02,  8.81254449e-02,  1.72850043e-02,  7.66821429e-02,
       -7.37398043e-02,  4.43506762e-02, -3.22165005e-02, -3.21099907e-02,
       -5.51065840e-02,  1.08279273e-01,  1.68212689e-02, -7.93265700e-02,
        1.74275693e-02,  4.37984653e-02, -3.27889621e-02, -3.71682569e-02,
        2.25870535e-02, -3.24320011e-02, -3.80629525e-02,  4.58278209e-02,
        3.80485356e-02,  7.31964111e-02, -6.04659989e-02,  2.20493041e-02,
        2.88322605e-02,  1.60849672e-02,  5.09422049e-02,  4.82737795e-02,
       -3.20035703e-02, -2.74517909e-02, -1.71955694e-02,  2.07176544e-02,
        2.45474167e-02,  2.29512174e-02, -3.04011628e-02,  4.36681248e-02,
        4.85298559e-02,  6.14521420e-03, -1.44977132e-02, -1.65237151e-02,
        1.83046907e-02,  

In [5]:
len(model.encode('I love doing NLP'))

384

### Similarities

In [7]:
cosine_scores = util.cos_sim(model.encode('I love doing NLP'), model.encode('I love doing NLP'))

In [8]:
cosine_scores

tensor([[1.0000]])

In [11]:
cosine_scores = util.cos_sim(model.encode('The cat sits outside'), model.encode('The cat is layind down'))

In [12]:
cosine_scores

tensor([[0.6641]])

### Sentiment Analysis

In [14]:
text_classification_pipeline = pipeline("text-classification")

inputs = ["I love how amazingly simple ML has become!",
          "I hate doing mundane and thankless tasks. ☹️"]

results = text_classification_pipeline(inputs)
print(results)

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Downloading (…)lve/main/config.json: 100%|██████████| 629/629 [00:00<00:00, 2.29MB/s]
Downloading model.safetensors: 100%|██████████| 268M/268M [00:17<00:00, 15.0MB/s] 
Downloading (…)okenizer_config.json: 100%|██████████| 48.0/48.0 [00:00<00:00, 242kB/s]
Downloading (…)solve/main/vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 522kB/s]


[{'label': 'POSITIVE', 'score': 0.9995759129524231}, {'label': 'NEGATIVE', 'score': 0.9903519749641418}]


### Specific Model

In [15]:
pipe = pipeline(task="text-classification", model="ProsusAI/finbert")
pipe(["Stocks rallied and the British pound gained.","Stocks making the biggest moves midday: Nvidia, Palantir and more"])

Downloading (…)lve/main/config.json: 100%|██████████| 758/758 [00:00<00:00, 2.45MB/s]
Downloading pytorch_model.bin: 100%|██████████| 438M/438M [00:29<00:00, 15.0MB/s] 
Downloading (…)okenizer_config.json: 100%|██████████| 252/252 [00:00<00:00, 1.03MB/s]
Downloading (…)solve/main/vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 560kB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 112/112 [00:00<00:00, 387kB/s]


[{'label': 'positive', 'score': 0.898361325263977},
 {'label': 'neutral', 'score': 0.8062635064125061}]

### Question Pairs

In [16]:
pipe = pipeline("text-classification", model ="textattack/bert-base-uncased-QQP")
pipe("Which city is the capital of France?, Where is the capital of France?")

Downloading (…)lve/main/config.json: 100%|██████████| 475/475 [00:00<00:00, 2.30MB/s]
Downloading pytorch_model.bin: 100%|██████████| 438M/438M [00:30<00:00, 14.3MB/s] 
Downloading (…)okenizer_config.json: 100%|██████████| 48.0/48.0 [00:00<00:00, 185kB/s]
Downloading (…)solve/main/vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 561kB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 112/112 [00:00<00:00, 431kB/s]


[{'label': 'LABEL_0', 'score': 0.9988721013069153}]

### Zero Shot Classification

In [17]:
classifier = pipeline(task="zero-shot-classification",model="facebook/bart-large-mnli")
text_to_classify= "I have a problem with my iphone that needs to be resolved asap!!"
candidate_labels=["urgent", "not urgent", "phone", "tablet", "computer"]
classifier(text_to_classify, candidate_labels, multi_label=True)

Downloading (…)lve/main/config.json: 100%|██████████| 1.15k/1.15k [00:00<00:00, 1.92MB/s]
Downloading model.safetensors: 100%|██████████| 1.63G/1.63G [01:48<00:00, 15.1MB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 26.0/26.0 [00:00<00:00, 55.0kB/s]
Downloading (…)olve/main/vocab.json: 100%|██████████| 899k/899k [00:00<00:00, 1.00MB/s]
Downloading (…)olve/main/merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 3.42MB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 1.36M/1.36M [00:00<00:00, 1.46MB/s]


{'sequence': 'I have a problem with my iphone that needs to be resolved asap!!',
 'labels': ['urgent', 'phone', 'computer', 'not urgent', 'tablet'],
 'scores': [0.998576283454895,
  0.9949977993965149,
  0.13497847318649292,
  0.0006789048784412444,
  0.00041479969513602555]}

### Named Entity Recognition

In [18]:
pipe = pipeline(task="token-classification")
pipe("I am John and I live in New York City.")

No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision f2482bf (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Downloading (…)lve/main/config.json: 100%|██████████| 998/998 [00:00<00:00, 3.12MB/s]
Downloading model.safetensors: 100%|██████████| 1.33G/1.33G [01:27<00:00, 15.2MB/s]
Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model

[{'entity': 'I-PER',
  'score': 0.9974554,
  'index': 3,
  'word': 'John',
  'start': 5,
  'end': 9},
 {'entity': 'I-LOC',
  'score': 0.9992238,
  'index': 8,
  'word': 'New',
  'start': 24,
  'end': 27},
 {'entity': 'I-LOC',
  'score': 0.99931407,
  'index': 9,
  'word': 'York',
  'start': 28,
  'end': 32},
 {'entity': 'I-LOC',
  'score': 0.99942446,
  'index': 10,
  'word': 'City',
  'start': 33,
  'end': 37}]

### POS Tags

In [19]:
pipe = pipeline(task="token-classification", model="vblagoje/bert-english-uncased-finetuned-pos")
pipe("I am George and I live in Phoenix.")

Downloading (…)lve/main/config.json: 100%|██████████| 1.06k/1.06k [00:00<00:00, 3.29MB/s]
Downloading pytorch_model.bin: 100%|██████████| 438M/438M [00:29<00:00, 14.7MB/s] 
Some weights of the model checkpoint at vblagoje/bert-english-uncased-finetuned-pos were not used when initializing BertForTokenClassification: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Downloading (…)okenizer_config.json: 100%|██████████| 48.0/48.0 [00:00<00:00, 220kB/s]
Downloading (…)solve/main/vocab.txt: 100%|██████████| 

[{'entity': 'PRON',
  'score': 0.99950683,
  'index': 1,
  'word': 'i',
  'start': 0,
  'end': 1},
 {'entity': 'AUX',
  'score': 0.99707437,
  'index': 2,
  'word': 'am',
  'start': 2,
  'end': 4},
 {'entity': 'PROPN',
  'score': 0.9988508,
  'index': 3,
  'word': 'george',
  'start': 5,
  'end': 11},
 {'entity': 'CCONJ',
  'score': 0.99917895,
  'index': 4,
  'word': 'and',
  'start': 12,
  'end': 15},
 {'entity': 'PRON',
  'score': 0.99950755,
  'index': 5,
  'word': 'i',
  'start': 16,
  'end': 17},
 {'entity': 'VERB',
  'score': 0.99875176,
  'index': 6,
  'word': 'live',
  'start': 18,
  'end': 22},
 {'entity': 'ADP',
  'score': 0.99939656,
  'index': 7,
  'word': 'in',
  'start': 23,
  'end': 25},
 {'entity': 'PROPN',
  'score': 0.99888057,
  'index': 8,
  'word': 'phoenix',
  'start': 26,
  'end': 33},
 {'entity': 'PUNCT',
  'score': 0.9996618,
  'index': 9,
  'word': '.',
  'start': 33,
  'end': 34}]

### Translation

In [20]:
pipe = pipeline(task="translation_en_to_fr")
pipe("How are you?")

No model was supplied, defaulted to t5-base and revision 686f1db (https://huggingface.co/t5-base).
Using a pipeline without specifying a model name and revision in production is not recommended.
Downloading (…)lve/main/config.json: 100%|██████████| 1.21k/1.21k [00:00<00:00, 6.16MB/s]
Downloading model.safetensors: 100%|██████████| 892M/892M [00:58<00:00, 15.3MB/s] 
Downloading (…)neration_config.json: 100%|██████████| 147/147 [00:00<00:00, 216kB/s]
Downloading (…)ve/main/spiece.model: 100%|██████████| 792k/792k [00:00<00:00, 19.6MB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 1.39M/1.39M [00:00<00:00, 7.78MB/s]
For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `ma

[{'translation_text': 'Comment êtes-vous?'}]

### Summarization

In [21]:
document = """
The unanimous Declaration of the thirteen united States of America, When in the Course of human events, it becomes necessary for one people to dissolve the political bands which have connected them with another, and to assume among the powers of the earth, the separate and equal station to which the Laws of Nature and of Nature's God entitle them, a decent respect to the opinions of mankind requires that they should declare the causes which impel them to the separation.

We hold these truths to be self-evident, that all men are created equal, that they are endowed by their Creator with certain unalienable Rights, that among these are Life, Liberty and the pursuit of Happiness.--That to secure these rights, Governments are instituted among Men, deriving their just powers from the consent of the governed, --That whenever any Form of Government becomes destructive of these ends, it is the Right of the People to alter or to abolish it, and to institute new Government, laying its foundation on such principles and organizing its powers in such form, as to them shall seem most likely to effect their Safety and Happiness. Prudence, indeed, will dictate that Governments long established should not be changed for light and transient causes; and accordingly all experience hath shewn, that mankind are more disposed to suffer, while evils are sufferable, than to right themselves by abolishing the forms to which they are accustomed. But when a long train of abuses and usurpations, pursuing invariably the same Object evinces a design to reduce them under absolute Despotism, it is their right, it is their duty, to throw off such Government, and to provide new Guards for their future security.--Such has been the patient sufferance of these Colonies; and such is now the necessity which constrains them to alter their former Systems of Government. The history of the present King of Great Britain is a history of repeated injuries and usurpations, all having in direct object the establishment of an absolute Tyranny over these States. To prove this, let Facts be submitted to a candid world.
"""
print(len(document.split())) #print the length of the document
pipe = pipeline(task="summarization")
pipe(document)

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.


348


Downloading (…)lve/main/config.json: 100%|██████████| 1.80k/1.80k [00:00<00:00, 4.23MB/s]
Downloading pytorch_model.bin: 100%|██████████| 1.22G/1.22G [01:19<00:00, 15.5MB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 26.0/26.0 [00:00<00:00, 157kB/s]
Downloading (…)olve/main/vocab.json: 100%|██████████| 899k/899k [00:00<00:00, 981kB/s]
Downloading (…)olve/main/merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 7.58MB/s]


[{'summary_text': ' The unanimous Declaration of the thirteen united States of America . The Declaration of Independence declared that all men are created equal, that they are endowed by their Creator with certain unalienable Rights, that among these are Life, Liberty and the pursuit of Happiness . The U.S. Constitution was established in 17th Amendment of 18th Amendment to First Amendment to the Constitution .'}]

### Question Answering

In [22]:
qa_model = pipeline("question-answering")
question = "Where do I live?"
context = "My name is Merve and I live in İstanbul."
qa_model(question = question, context = context)

No model was supplied, defaulted to distilbert-base-cased-distilled-squad and revision 626af31 (https://huggingface.co/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.
Downloading (…)lve/main/config.json: 100%|██████████| 473/473 [00:00<00:00, 1.26MB/s]
Downloading model.safetensors: 100%|██████████| 261M/261M [00:17<00:00, 15.2MB/s] 
Downloading (…)okenizer_config.json: 100%|██████████| 29.0/29.0 [00:00<00:00, 87.6kB/s]
Downloading (…)solve/main/vocab.txt: 100%|██████████| 213k/213k [00:00<00:00, 77.1MB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 436k/436k [00:00<00:00, 17.0MB/s]


{'score': 0.9538118243217468, 'start': 31, 'end': 39, 'answer': 'İstanbul'}