In [None]:
!pip install transformers
!pip install sentence-transformers

# Welcome to our Hugging Face Demo!

- This notebook shows how we can use Hugging Face's `transformers` library to do some VERY COOL STUFF with VERY LITTLE CODE.
- The examples are drawn from the Hugging Face notebooks repository on GitHub: https://github.com/huggingface/notebooks/blob/master/transformers_doc/task_summary.ipynb

# Sentiment Analysis

In [None]:
from transformers import pipeline

# Name the checkpoint explicitly. Without `model=`, pipeline() falls back to a
# task default (and warns about it), so results could drift between library
# versions. This id is the long-standing default for the task.
classifier = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
)

In [None]:
# Baseline: a plainly positive sentence. The bare last expression makes the
# classifier's result the cell output.
demo_text = "Yu Chen is the best teacher ever"
classifier(demo_text)

In [None]:
# Same sentence with a single "not" inserted, to see how negation is scored.
demo_text = "Yu Chen is not the best teacher ever"
classifier(demo_text)

In [None]:
# Same sentence with a double negation ("not not"), a harder case for the classifier.
demo_text = "Yu Chen is not not the best teacher ever"
classifier(demo_text)

# Sentiment Analysis - Reviews

In [None]:
# Review that negates a negative verb ("did not hate").
review_text = "I did not hate anything about this movie!"
classifier(review_text)

In [None]:
# Review that negates a positive verb ("did not like one thing").
review_text = "I did not like one thing about this product!"
classifier(review_text)

# Paraphrase Detection

In [None]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# One checkpoint id feeds both loaders so the tokenizer and the model cannot
# drift apart if the id is ever edited.
MRPC_CHECKPOINT = "bert-base-cased-finetuned-mrpc"

tokenizer = AutoTokenizer.from_pretrained(MRPC_CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(MRPC_CHECKPOINT)

In [None]:
## PYTORCH CODE
sequence_0 = "This is a natural language processing class at Marshall Business School"
sequence_1 = "USC has great deep learning classes"
sequence_2 = "Marshall offers a language processing course"


def paraphrase_probabilities(first, second, tokenizer=tokenizer, model=model):
    """Score a sentence pair with the sequence-classification model.

    The tokenizer and model are captured as default arguments when this cell
    runs, so the helper keeps using them even if the notebook-level names
    `tokenizer` / `model` are rebound by a later cell.

    Args:
        first: Reference sentence.
        second: Candidate sentence compared against `first`.
        tokenizer: Callable that encodes the sentence pair into model inputs.
        model: Classifier whose output exposes `.logits`.

    Returns:
        The softmax-normalised class probabilities for the pair, as a list of
        floats. This cell reads index 1 as the "is paraphrase" probability.
    """
    # The tokenizer will automatically add any model specific separators (i.e. <CLS> and <SEP>) and tokens to
    # the sequence, as well as compute the attention masks.
    encoded_pair = tokenizer(first, second, return_tensors="pt")
    with torch.no_grad():  # inference only, so skip gradient tracking
        logits = model(**encoded_pair).logits
    return torch.softmax(logits, dim=1).tolist()[0]


print('full statement:', sequence_0)

paraphrase_results = paraphrase_probabilities(sequence_0, sequence_1)
print('\nprobability "', sequence_1, '" is paraphrase:', paraphrase_results[1])

paraphrase_results = paraphrase_probabilities(sequence_0, sequence_2)
print('\nprobability "', sequence_2, '" is paraphrase:', paraphrase_results[1])


# Extractive Question Answering

In [None]:
from transformers import pipeline

# Name the checkpoint explicitly instead of relying on the task default, so a
# fresh run keeps loading the same model. This id is the long-standing default
# for the question-answering task.
question_answerer = pipeline(
    "question-answering",
    model="distilbert-base-cased-distilled-squad",
)

In [None]:
# Passage the question-answering cells extract their answers from.
# The company name is spelled "Tesla" so it matches the question asked about it.
context = r"""
Tesla stock soared today after another positive earnings report.  
Elon Musk did some silly stuff on one of his friend's podcasts, but that only seemed to help.
"""

In [None]:
# Ask a question answered by the second sentence of `context`, then print the
# extracted answer with the score and start/end fields read from the result.
result = question_answerer(question="What did Elon Musk do?", context=context)
print(f"Answer: '{result['answer']}', score: {round(result['score'], 4)}, start: {result['start']}, end: {result['end']}")

In [None]:
# Ask a question answered by the first sentence of `context`; the print line
# reports the same four fields of the result as the previous question.
result = question_answerer(question="What happened to Tesla's stock today?", context=context)
print(f"Answer: '{result['answer']}', score: {round(result['score'], 4)}, start: {result['start']}, end: {result['end']}")

# Next-Word Prediction

In [None]:
## PYTORCH CODE
from transformers import AutoModelForCausalLM, AutoTokenizer, top_k_top_p_filtering
import torch
from torch import nn
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

In [None]:
sequence = f"This has been my favorite course so far during my graduate"
inputs = tokenizer(sequence, return_tensors="pt")
input_ids = inputs["input_ids"]
# get logits of last hidden state
next_token_logits = model(**inputs).logits[:, -1, :]
# filter
filtered_next_token_logits = top_k_top_p_filtering(next_token_logits, top_k=50, top_p=1.0)
# sample
probs = nn.functional.softmax(filtered_next_token_logits, dim=-1)
next_token = torch.multinomial(probs, num_samples=1)
generated = torch.cat([input_ids, next_token], dim=-1)
resulting_string = tokenizer.decode(generated.tolist()[0])
print(resulting_string)

# Text Generation

In [None]:
from transformers import pipeline

# Name the checkpoint explicitly instead of relying on the task default, so a
# fresh run keeps loading the same model. "gpt2" is the same checkpoint the
# next-word prediction section loads.
text_generator = pipeline("text-generation", model="gpt2")

In [None]:
# Continue the prompt. do_sample=False turns sampling off and max_length=50
# caps the length of the generated sequence.
print(text_generator("This has been my favorite class so far", max_length=50, do_sample=False))

# Summarization

In [None]:
from transformers import pipeline

# Name the checkpoint explicitly instead of relying on the task default, so a
# fresh run keeps loading the same model. This id is the long-standing default
# for the summarization task.
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")

In [None]:
# News article used as the summarization input (kept verbatim as one string).
ARTICLE = """ Tesla drivers say they have been locked out of their cars after an outage struck the carmaker's app.
Dozens of owners posted on social media about seeing an error message on the mobile app that was preventing them from connecting to their vehicles.
Tesla chief executive Elon Musk personally responded to one complaint from a driver in South Korea, saying on Twitter: "Checking."
Mr Musk later said the app was coming back online.
The Tesla app is used as a key by drivers to unlock and start their cars.
Owners posted a multitude of complaints online about not being able to use their vehicles.
"I'm stuck an hour away from home because I normally use my phone to start [my] car," one owner tweeted.
About 500 users reported an error on the app at around 16:40 ET (21:40 GMT) on Friday, according to the outage tracking site DownDetector. Five hours later, there were just over 60 reports of an error.
"Apologies, we will take measures to ensure this doesn't happen again," Mr Musk tweeted.
The app is not the only way to access the cars though, Stuart Masson, editor of The Car Expert website, told the BBC.
"""

# Summarize with sampling turned off; min_length / max_length bound the length
# of the summary (presumably counted in tokens -- confirm against the pipeline docs).
print(summarizer(ARTICLE, max_length=130, min_length=30, do_sample=False))

# Semantic Similarity

In [None]:
from sentence_transformers import SentenceTransformer, util
import numpy as np

In [None]:
# NOTE: this rebinds the notebook-level name `model`; from here on it is the
# SentenceTransformer, no longer the GPT-2 model loaded in the next-word section.
model = SentenceTransformer('stsb-roberta-large')

In [None]:
# Three probe sentences: 1 and 2 share the word "apple", 1 and 3 share a topic.
sentence1 = "Apple's earnings were affected by a recent negative outlook in the market for new headphones"
sentence2 = "That apple fell on the floor"
sentence3 = "The market for personal audio devices took a hit last week"
# Encode each sentence on its own, in order, and unpack the resulting tensors.
embedding1, embedding2, embedding3 = (
    model.encode(text, convert_to_tensor=True)
    for text in (sentence1, sentence2, sentence3)
)

In [None]:
# Cosine similarity between the embeddings of sentence 1 and sentence 2.
cosine_scores = util.pytorch_cos_sim(embedding1, embedding2)
print(f"Sentence 1: {sentence1}")
print(f"Sentence 2: {sentence2}")
print(f"Similarity score: {cosine_scores.item()}")

In [None]:
# Cosine similarity between the embeddings of sentence 1 and sentence 3.
cosine_scores = util.pytorch_cos_sim(embedding1, embedding3)
print("Sentence 1:", sentence1)
# Label the second line with the sentence actually shown (sentence3).
print("Sentence 3:", sentence3)
print("Similarity score:", cosine_scores.item())