## Import all required libraries 

In [1]:
!pip install transformers

import pandas as pd
import tensorflow as tf
import transformers
from transformers import DistilBertTokenizer
from transformers import TFDistilBertForSequenceClassification

pd.set_option('display.max_colwidth', None)
MODEL_NAME = 'distilbert-base-uncased-finetuned-sst-2-english'# This model is a fine-tune checkpoint of DistilBERT-base-uncased, fine-tuned on SST-2.
BATCH_SIZE = 16


Collecting transformers
  Downloading transformers-4.15.0-py3-none-any.whl (3.4 MB)
[K     |████████████████████████████████| 3.4 MB 5.1 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 85.0 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 69.6 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 83.1 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 5.1 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  A

In [3]:
N_EPOCHS = 5 # we can put more, because evaluation of the model shows big difference in loss with accuracy 1.0

# Load Dataset

We will take a column with not preprocecced text data for pure experiment with Hugging Face distilbert model

In [4]:

df =  pd.read_csv("train.En.csv")

# Data Cleaning

In [5]:
import re
import nltk
import nltk.corpus
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
wordnet=WordNetLemmatizer()
nltk.download('wordnet')
nltk.download('stopwords')
stop = stopwords.words('english')



b = list(df["tweet"])
corpus = []


for i in range(len(b)):
    review =re.sub(r'http\S+', ' ', str(b[i]))
    review = re.sub("\d*\.\d+","",review)
    review =re.sub(r'@\S+', ' ', review)
    
    
    review = re.sub('\[[^]]*\]', ' ', review)
    
    review = review.lower()
    

    

    corpus.append(review)
df = df.assign(clean_tweet = corpus)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## Check the shapes and split proportion 

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
X_train, X_test, y_train, y_test = train_test_split(df["clean_tweet"], df["sarcastic"], test_size=0.33, random_state=42,stratify=df.sarcastic.values)

In [8]:
print('The proportion in y_train\n',y_train.value_counts(normalize=True).mul(100))
print('The proportion in y_test\n',y_test.value_counts(normalize=True).mul(100))

The proportion in y_train
 0    74.989238
1    25.010762
Name: sarcastic, dtype: float64
The proportion in y_test
 0    75.021834
1    24.978166
Name: sarcastic, dtype: float64


## Preprocess

### Decode byte arrays into string representation. 

### Max sentence length

In [9]:
MAX_LEN = X_train.apply(lambda s: len([x for x in s.split()])).max()
MAX_LEN

38

## Encode with  DistilBertTokenizer

In [10]:
#define a tokenizer object
tokenizer = DistilBertTokenizer.from_pretrained(MODEL_NAME)

#tokenize the text (padding to max sequence in batch)
train_encodings = tokenizer(list(X_train.values), truncation=True, padding=True)
test_encodings = tokenizer(list(X_test.values), truncation=True, padding=True)

#print the first paragraph and it transformation
print(f'First paragraph: \'{X_train[:1]}\'')
print(f'Input ids: {train_encodings["input_ids"][0]}')
print(f'Attention mask: {train_encodings["attention_mask"][0]}')


Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/629 [00:00<?, ?B/s]

First paragraph: '430    hope u sleep maybe forever maybe lol
Name: clean_tweet, dtype: object'
Input ids: [101, 3246, 1057, 3637, 2672, 5091, 2672, 8840, 2140, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Attention mask: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [12]:
len(train_encodings["attention_mask"][0]) 

78

In [13]:
train_encodings

{'input_ids': [[101, 3246, 1057, 3637, 2672, 5091, 2672, 8840, 2140, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 5466, 5963, 4660, 3042, 2147, 1523, 4067, 4214, 14139, 999, 2171, 4054, 1524, 10166, 2293, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 2667, 3422, 6583, 6826, 2401, 7150, 2745, 3929, 2938, 2667, 3275, 2051, 4224, 6583, 6826, 2401, 3011, 2147, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 1523, 2045, 3618, 6569, 10866, 2135, 5278, 9154, 1524, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

###  Turn our labels and encodings into a tf.Dataset object

In [14]:
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings),
                                                    list(y_train.values)))

test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings),
                                                    list(y_test.values)))

In [15]:
train_dataset 

<TensorSliceDataset shapes: ({input_ids: (78,), attention_mask: (78,)}, ()), types: ({input_ids: tf.int32, attention_mask: tf.int32}, tf.int32)>

## Fine-tuning with native TensorFlow


In [16]:
model = TFDistilBertForSequenceClassification.from_pretrained(MODEL_NAME) # load pre trained distil bert model 

# define a optimizer
optimizerr = tf.keras.optimizers.Adam(learning_rate=5e-5) 
losss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) # Computes the crossentropy loss between the labels and predictions. 

#complile the model 
model.compile(optimizer=optimizerr,                                     
              loss=losss,
              metrics=['accuracy'])

# train our model 
model.fit(train_dataset.shuffle(len(X_train)).batch(BATCH_SIZE), 
          epochs=N_EPOCHS,
          batch_size=BATCH_SIZE)

Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFDistilBertForSequenceClassification.

All the layers of TFDistilBertForSequenceClassification were initialized from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f149027a690>

## Model Evaluation

In [17]:
# evaluation of model on test data
model.evaluate(test_dataset.shuffle(len(X_test)).batch(BATCH_SIZE), return_dict=True, batch_size=BATCH_SIZE)



{'accuracy': 0.6707423329353333, 'loss': 1.0635888576507568}

## Predict on the different text examples

In [20]:
def predict_proba(text_list, model, tokenizer):
  """
  To get array with predicted probabilities for 0 , 1 classes 
  for each paragraph in the list of strings
  :param text_list: list[str]
  :param model: transformers.models.distilbert.modeling_tf_distilbert.TFDistilBertForSequenceClassification
  :param tokenizer: transformers.models.distilbert.tokenization_distilbert.DistilBertTokenizer
  :return res: numpy.ndarray
  """
     
  encodings = tokenizer(text_list, max_length=MAX_LEN, truncation=True, padding=True)
  dataset = tf.data.Dataset.from_tensor_slices((dict(encodings))) # dataset created
  preds = model.predict(dataset.batch(1)).logits # prediction
  res = tf.nn.sigmoid(preds).numpy() 
    
  return res

We take a txt file [here](https://github.com/Galina-Blokh/ai_assignment_aidock/blob/refator/data/test_links.txt). This file contains links to the recipe pages which our model didn't saw yet. Assuming you scraped data from the first [url](https://www.loveandlemons.com/green-bean-salad-recipe/). The data you feed into your model for prediction will be looking like in the cell below. (*A list with one first string of ingredients and following three strings with instructions.)

In [21]:
strings_list =["""
                  1 pound green beans, trimmed
                  ½ head radicchio, sliced into strips
                  Scant ¼ cup thinly sliced red onion
                  Honey Mustard Dressing, for drizzling
                  2 ounces goat cheese
                  2 tablespoons chopped walnuts
                  2 tablespoons sliced almonds
                  ¼ cup tarragon
                  Flaky sea salt
                  """,
                  """
                  Bring a large pot of salted water to a boil and set a bowl of ice water nearby.
                  Drop the green beans into the boiling water and blanch for 2 minutes.
                    Remove the beans and immediately immerse in the ice water long enough 
                    to cool completely, about 15 seconds. Drain and place on paper towels to dry.
                  """,
                  """
                  Transfer the beans to a bowl and toss with the radicchio, onion, 
                  and a few spoonfuls of the dressing.
                  """,
                  """
                  Arrange on a platter and top with small dollops of goat cheese, the walnuts, 
                  almonds, and tarragon. Drizzle with more dressing, season to taste with flaky 
                  salt, and serve.
                  """]
predict_proba(strings_list, model, tokenizer)

array([[0.76201963, 0.27008307],
       [0.70485216, 0.33389962],
       [0.6144321 , 0.4021424 ],
       [0.76218677, 0.25893256]], dtype=float32)

# Prediction on test data(unlabeled data )

In [23]:
df1 = pd.read_csv("taskA.En.input.csv")

In [25]:
string1 = list(df1["text"])
a = predict_proba(string1, model, tokenizer)

In [27]:
a

array([[0.9338634 , 0.09600922],
       [0.49522942, 0.5609809 ],
       [0.9860071 , 0.03096099],
       ...,
       [0.07643391, 0.926257  ],
       [0.89797455, 0.1274918 ],
       [0.44045463, 0.58680797]], dtype=float32)

In [28]:
len(a)

1400

In [29]:
import numpy as np

In [30]:
flat_predictions = np.argmax(a, axis=1).flatten()


In [31]:
flat_predictions

array([0, 1, 0, ..., 1, 0, 1])

In [39]:
df2 = pd.DataFrame(flat_predictions,columns = ["task_a_en"])

In [40]:
df2

Unnamed: 0,task_a_en
0,0
1,1
2,0
3,0
4,0
...,...
1395,1
1396,1
1397,1
1398,0


In [41]:
df2["task_a_en"].value_counts(normalize=True)

0    0.780714
1    0.219286
Name: task_a_en, dtype: float64

In [42]:
df.to_csv("taska.csv")

# Results 

1. accuracy on validation data is 0.67