In [21]:
from pathlib import Path
import shutil
import os
import logging
import sys

import SentimNC.SentimNC

sys.path.append('..')

from textblob import TextBlob
from pprint import pprint
from sklearn.metrics import classification_report

from transformers import AutoModelForSequenceClassification

from SentimNC.SentimNC import *
import SentimNC.utils as tools

%load_ext autoreload
%autoreload 2

# Repository root: the notebook is assumed to run from a sub-directory of it.
project_dir = Path.cwd().parent
# Show full cell contents when displaying frames. `None` means "no truncation";
# the old sentinel -1 is deprecated (see the warning this cell used to emit)
# and is rejected outright by newer pandas releases.
pd.set_option('display.max_colwidth', None)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


  pd.set_option('max_colwidth', -1)


In [22]:
# Root logger: timestamped records at ERROR level.
# Note: basicConfig does nothing if the root logger already has handlers,
# so re-running this cell in a live kernel does not change the setup.
logging.basicConfig(
    level=logging.ERROR,
    datefmt='%m/%d/%Y %H:%M:%S',
    format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
)

In [23]:
# Model directory (given to Config as model_dir) and data directory (data_dir).
cl_path = project_dir.joinpath('models', 'classifier_model', 'sentimnc-sentiment')
cl_data_path = project_dir.joinpath('data', 'sentiment_data')

In [24]:
# Clean the cl_path so files from an earlier run are not mixed with this one.
# Removal stays best-effort: a missing directory is the normal first-run case;
# any other filesystem problem is reported instead of being silently ignored.
# (A bare `except:` would also hide KeyboardInterrupt and typos such as an
# undefined `cl_path`.)
try:
    shutil.rmtree(cl_path)
except FileNotFoundError:
    pass
except OSError as err:
    # logged at ERROR because the root logger is configured to drop anything lower
    logging.error("Could not clean %s: %s", cl_path, err)

# Base checkpoint loaded with a 3-label sequence-classification head.
bertmodel = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    cache_dir=None,
    num_labels=3,
)

# Training configuration, grouped by concern.
config = Config(
    # locations
    data_dir=cl_data_path,
    model_dir=cl_path,
    # model / task
    bert_model=bertmodel,
    output_mode='classification',
    max_seq_length=48,
    # optimisation schedule
    num_train_epochs=4,
    train_batch_size=32,
    learning_rate=2e-5,
    warm_up_proportion=0.2,
    discriminate=True,
    gradual_unfreeze=True,
    # runtime
    local_rank=-1,
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [25]:
# Wrap the configuration in the project's trainer object.
sentimNC = SentimNC(config)
sentimNC.base_model = 'bert-base-uncased'

# Re-assert the two fine-tuning switches on the attached config
# (the same values were already passed to Config above).
for flag in ('discriminate', 'gradual_unfreeze'):
    setattr(sentimNC.config, flag, True)

In [26]:
# Register the three sentiment labels. The list order appears to define the
# class ids: the training log shows positive -> id 0 and negative -> id 1.
sentimNC.prepare_model(label_list=['positive','negative','neutral'])

03/31/2023 11:28:43 - INFO - SentimNC.SentimNC -   device: cuda n_gpu: 1, distributed training: False, 16-bits training: False


In [30]:
# Fetch the 'train' split through the project helper; it is passed to train() below.
train_data = sentimNC.get_data('train')

In [31]:
# Build the model object that is partially frozen and then fine-tuned below.
model = sentimNC.create_the_model()



In [33]:
# Number of leading encoder layers whose weights stay fixed during training.
freeze = 6


def _freeze(module):
    """Turn off gradient tracking for every parameter of `module`."""
    for weight in module.parameters():
        weight.requires_grad = False


# Embeddings first, then encoder layers 0 .. freeze-1.
_freeze(model.bert.embeddings)
for layer_idx in range(freeze):
    _freeze(model.bert.encoder.layer[layer_idx])

In [34]:
# Fine-tune the partially frozen model on the training examples.
# NOTE(review): in the recorded run this call was stopped by a KeyboardInterrupt
# during the 4th epoch (after ~22h), so `trained_model` was never assigned in
# that session; the cells below that use it need a completed run.
trained_model = sentimNC.train(train_examples = train_data, model = model)

03/31/2023 11:33:50 - INFO - SentimNC.utils -   *** Example ***
03/31/2023 11:33:50 - INFO - SentimNC.utils -   guid: train-1
03/31/2023 11:33:50 - INFO - SentimNC.utils -   tokens: [CLS] new york giants ' michael st ##rah ##an and dr ##ei ##er ll ##p host charity golf tournament june . . . [SEP]
03/31/2023 11:33:50 - INFO - SentimNC.utils -   input_ids: 101 2047 2259 7230 1005 2745 2358 10404 2319 1998 2852 7416 2121 2222 2361 3677 5952 5439 2977 2238 1012 1012 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
03/31/2023 11:33:50 - INFO - SentimNC.utils -   attention_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
03/31/2023 11:33:50 - INFO - SentimNC.utils -   token_type_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
03/31/2023 11:33:50 - INFO - SentimNC.utils -   label: positive (id = 0)
03/31/2023 11:36:26 - INFO - SentimNC.SentimNC -   ***** Loading data *****
03/31/2

Iteration:   0%|          | 0/46388 [00:00<?, ?it/s]

03/31/2023 15:16:39 - INFO - SentimNC.utils -   *** Example ***
03/31/2023 15:16:39 - INFO - SentimNC.utils -   guid: validation-1
03/31/2023 15:16:39 - INFO - SentimNC.utils -   tokens: [CLS] fran ##shi ##on properties ( china ) limited announces 2007 annual results [SEP]
03/31/2023 15:16:39 - INFO - SentimNC.utils -   input_ids: 101 23151 6182 2239 5144 1006 2859 1007 3132 17472 2289 3296 3463 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
03/31/2023 15:16:39 - INFO - SentimNC.utils -   attention_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
03/31/2023 15:16:39 - INFO - SentimNC.utils -   token_type_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
03/31/2023 15:16:39 - INFO - SentimNC.utils -   label: negative (id = 1)
03/31/2023 15:16:55 - INFO - SentimNC.SentimNC -   ***** Loading data *****
03/31/2023 15:16:55 - INFO - SentimNC.SentimNC -     Num exa

Validating:   0%|          | 0/5155 [00:00<?, ?it/s]

Validation losses: [0.28164853206957635]
No best model found


Epoch:  25%|██▌       | 1/4 [4:01:39<12:04:59, 14499.74s/it]

Iteration:   0%|          | 0/46388 [00:00<?, ?it/s]

03/31/2023 21:13:58 - INFO - SentimNC.utils -   *** Example ***
03/31/2023 21:13:58 - INFO - SentimNC.utils -   guid: validation-1
03/31/2023 21:13:58 - INFO - SentimNC.utils -   tokens: [CLS] fran ##shi ##on properties ( china ) limited announces 2007 annual results [SEP]
03/31/2023 21:13:58 - INFO - SentimNC.utils -   input_ids: 101 23151 6182 2239 5144 1006 2859 1007 3132 17472 2289 3296 3463 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
03/31/2023 21:13:58 - INFO - SentimNC.utils -   attention_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
03/31/2023 21:13:58 - INFO - SentimNC.utils -   token_type_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
03/31/2023 21:13:58 - INFO - SentimNC.utils -   label: negative (id = 1)
03/31/2023 21:14:14 - INFO - SentimNC.SentimNC -   ***** Loading data *****
03/31/2023 21:14:14 - INFO - SentimNC.SentimNC -     Num exa

Validating:   0%|          | 0/5155 [00:00<?, ?it/s]

Validation losses: [0.28164853206957635, 0.08529077643618009]


Epoch:  50%|█████     | 2/4 [10:00:12<10:20:50, 18625.06s/it]

Iteration:   0%|          | 0/46388 [00:00<?, ?it/s]

04/01/2023 04:43:06 - INFO - SentimNC.utils -   *** Example ***
04/01/2023 04:43:06 - INFO - SentimNC.utils -   guid: validation-1
04/01/2023 04:43:06 - INFO - SentimNC.utils -   tokens: [CLS] fran ##shi ##on properties ( china ) limited announces 2007 annual results [SEP]
04/01/2023 04:43:06 - INFO - SentimNC.utils -   input_ids: 101 23151 6182 2239 5144 1006 2859 1007 3132 17472 2289 3296 3463 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
04/01/2023 04:43:06 - INFO - SentimNC.utils -   attention_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
04/01/2023 04:43:06 - INFO - SentimNC.utils -   token_type_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
04/01/2023 04:43:06 - INFO - SentimNC.utils -   label: negative (id = 1)
04/01/2023 04:43:22 - INFO - SentimNC.SentimNC -   ***** Loading data *****
04/01/2023 04:43:22 - INFO - SentimNC.SentimNC -     Num exa

Validating:   0%|          | 0/5155 [00:00<?, ?it/s]

Validation losses: [0.28164853206957635, 0.08529077643618009, 0.0507508343764943]


Epoch:  75%|███████▌  | 3/4 [17:28:14<6:13:15, 22395.15s/it] 

Iteration:   0%|          | 0/46388 [00:00<?, ?it/s]

Epoch:  75%|███████▌  | 3/4 [22:51:53<7:37:17, 27437.72s/it]


KeyboardInterrupt: 

In [None]:
# NOTE(review): leftover scratch cell; it only prints a fixed string and can be deleted.
print("hello")

In [None]:
# Fetch the 'test' split through the same helper used for the training data.
test_data = sentimNC.get_data('test')


In [None]:
# Evaluate on the test examples. `results` is used as a DataFrame below
# (columns 'labels' and 'predictions' are read from it).
# Requires `trained_model` from a training run that finished.
results = sentimNC.evaluate(examples=test_data, model=trained_model)


In [None]:
def report(df, cols=('label', 'prediction', 'logits'), class_weights=None):
    """Print weighted cross-entropy loss, accuracy and a classification report.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per evaluated example.
    cols : sequence of str
        Column names, read by position: ``cols[0]`` holds the true label ids,
        ``cols[1]`` the predicted label ids, and ``cols[2]`` the per-example
        score vectors that are fed to the loss.
    class_weights : optional
        Passed to ``CrossEntropyLoss`` as ``weight``. When omitted, the
        weights are taken from the notebook-level ``sentimNC`` object.

    Returns
    -------
    None
        Everything is printed.
    """
    if class_weights is None:
        # The previous code read `se.class_weights`, but no `se` is defined in
        # this notebook. NOTE(review): assumes the SentimNC instance exposes
        # `class_weights` - confirm against the SentimNC class.
        class_weights = sentimNC.class_weights

    label_col, pred_col, score_col = cols[0], cols[1], cols[2]

    loss_fn = CrossEntropyLoss(weight=class_weights)
    loss = loss_fn(torch.tensor(list(df[score_col])), torch.tensor(list(df[label_col])))
    print("Loss:{0:.2f}".format(loss))

    # fraction of rows where the predicted id equals the true id
    print("Accuracy:{0:.2f}".format((df[label_col] == df[pred_col]).sum() / df.shape[0]) )

    print("\nClassification Report:")
    print(classification_report(df[label_col], df[pred_col]))

In [None]:
# Predicted class id = position of the largest entry in each row's score vector.
results['prediction'] = results['predictions'].apply(
    lambda scores: np.argmax(scores, axis=0)
)


In [None]:
# Column roles, in order: true labels, argmax'd class ids (computed in the
# previous cell), raw per-class scores used for the loss.
report(results,cols=['labels','prediction','predictions'])


In [None]:
# Sample news paragraph scored by predict() and TextBlob in the cells below.
# The trailing backslashes continue the literal, so this is one single-line string.
text = "Later that day Apple said it was revising down its earnings expectations in \
the fourth quarter of 2018, largely because of lower sales and signs of economic weakness in China. \
The news rapidly infected financial markets. Apple’s share price fell by around 7% in after-hours \
trading and the decline was extended to more than 10% when the market opened. The dollar fell \
by 3.7% against the yen in a matter of minutes after the announcement, before rapidly recovering \
some ground. Asian stockmarkets closed down on January 3rd and European ones opened lower. \
Yields on government bonds fell as investors fled to the traditional haven in a market storm."

In [None]:
# Load a classifier from the model directory (presumably the one written by
# training - verify). This rebinds `model`, replacing the object used for
# fine-tuning.
cl_path = project_dir.joinpath('models', 'classifier_model', 'sentimnc-sentiment')
model = AutoModelForSequenceClassification.from_pretrained(
    cl_path,
    num_labels=3,
    cache_dir=None,
)

In [None]:
import nltk

# Only go to the network when the Punkt tokenizer data is not installed yet,
# so re-running the notebook (or running it offline) does not depend on a
# download. nltk.data.find raises LookupError when the resource is missing.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

In [None]:
# Score `text` with the reloaded model. `predict` is not defined in this
# notebook (presumably supplied by the SentimNC star-import); its result is
# used as a DataFrame below (column assignment, `.sentiment_score`).
result = predict(text,model)

In [None]:
# TextBlob polarity for each sentence of `text`, stored next to the model
# output for comparison.
blob = TextBlob(text)
textblob_scores = []
for sentence in blob.sentences:
    textblob_scores.append(sentence.sentiment.polarity)
result['textblob_prediction'] = textblob_scores
result

In [None]:
# Mean of the sentiment_score column, two decimals. One formatting mechanism
# only: the old line combined an f-prefix without placeholders with
# %-formatting. The printed text is unchanged.
print(f'Average sentiment is {result.sentiment_score.mean():.2f}.')


In [None]:
# Second sample news paragraph, scored the same way as `text` in the cells below.
# The trailing backslashes continue the literal, so this is one single-line string.
text2 = "Shares in the spin-off of South African e-commerce group Naspers surged more than 25% \
in the first minutes of their market debut in Amsterdam on Wednesday. Bob van Dijk, CEO of \
Naspers and Prosus Group poses at Amsterdam's stock exchange, as Prosus begins trading on the \
Euronext stock exchange in Amsterdam, Netherlands, September 11, 2019. REUTERS/Piroschka van de Wouw \
Prosus comprises Naspers’ global empire of consumer internet assets, with the jewel in the crown a \
31% stake in Chinese tech titan Tencent. There is 'way more demand than is even available, so that’s \
good,' said the CEO of Euronext Amsterdam, Maurice van Tilburg. 'It’s going to be an interesting \
hour of trade after opening this morning.' Euronext had given an indicative price of 58.70 euros \
per share for Prosus, implying a market value of 95.3 billion euros ($105 billion). The shares \
jumped to 76 euros on opening and were trading at 75 euros at 0719 GMT."

In [None]:
# Model scores for text2, plus TextBlob polarity for each of its sentences.
result2 = predict(text2, model)

blob = TextBlob(text2)
textblob_scores2 = []
for sentence in blob.sentences:
    textblob_scores2.append(sentence.sentiment.polarity)
result2['textblob_prediction'] = textblob_scores2

In [None]:
# Display the result2 frame (model output plus the textblob_prediction column).
result2

In [None]:
# Mean of the sentiment_score column for text2, two decimals. One formatting
# mechanism only: the old line combined an f-prefix without placeholders with
# %-formatting. The printed text is unchanged.
print(f'Average sentiment is {result2.sentiment_score.mean():.2f}.')
