In [None]:
#Reference:
#https://huggingface.co/docs/transformers/preprocessing

from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification

import pandas as pd
import numpy as np
import torch
import datetime

In [3]:
filepath = r"C:\Users\Tlhogi\Documents\Grace\Employers\Upwork\20230625_Twitter Sentiment Indicator"
filename = "\InputData_FinBERT.csv"
fullpath = filepath + filename

df = pd.read_csv(fullpath)
df.head()

Unnamed: 0,DateTime,TweetId,Text,Language
0,2023-06-21 08:04:41+00:00,1.67e+18,"Finally, a positive for South Africa as Inflat...",en
1,2023-06-19 10:11:17+00:00,1.67e+18,USD/ZAR: Rand will remain fragile on geopoliti...,en
2,2023-06-16 21:12:51+00:00,1.67e+18,South Africa's risk is real - influential Amer...,en
3,2023-06-16 18:12:57+00:00,1.67e+18,Factors that are helping mitigate against Sout...,en
4,2023-06-16 09:01:58+00:00,1.67e+18,Probably am too critical of Zanu pf government...,en


In [3]:
df_array = np.array(df)
df_list = list(df_array[:,2]) 
print(df_list[:10])

['Finally, a positive for South Africa as Inflation comes in better than expected. A pause from SARB? @DailyFXTeam @DailyFX\n#SouthAfrica #Mzansi #USDZAR #SouthAfricanRand #inflation #interestrates', 'USD/ZAR: Rand will remain fragile on geopolitics – ING https://t.co/18lKmAQ2zq #USDZAR #SouthAfrica #Banks', "South Africa's risk is real\xa0- influential American\xa0senators are pushing for RSA to be dropped from AGOA.\nFor now, some people appear to believe the rapprochement may work — the rand is the best performing currency in the world this month.  #NextAfrica @BloombergAfrica https://t.co/6wcrQm2Bgu", 'Factors that are helping mitigate against South Africa defaulting on its debt are;\n\n1) debt service costs are subtracted from budget before allocating revenue, and\n\n2) significant portion (+-90%) of SA debt is Rand-denominated (Currency issuers have a "zero default risk").', "Probably am too critical of Zanu pf government but if I may ask other than ZIMBABWE is there any country 

In [10]:
#Reference:
#https://towardsdatascience.com/does-bert-need-clean-data-part-2-classification-d29adf9f745a

#Cleaning the data to prepare for FinBERT
#Light text cleaning is employed as advised in article

import re
def text_clean(x):
    # lowercase everything
    x = x.lower() 
    
    # remove unicode characters
    x = x.encode('ascii', 'ignore').decode()
    
    # remove links
    x = re.sub(r'https*\S+', ' ', x) 
    x = re.sub(r'http*\S+', ' ', x)
    
    # cleaning up text
    x = re.sub(r'\'\w+', '', x) 
    x = re.sub(r'\w*\d+\w*', '', x)
    x = re.sub(r'\s{2,}', ' ', x)
    x = re.sub(r'\s[^\w\s]\s', '', x)
    
    return x

In [13]:
#train_df['cleaned_text'] = train_df.text.apply(text_clean)

clean_list =[]
for i in df_list:
    x = text_clean(i)
    clean_list.append(x)

print(len(clean_list))
print(clean_list[:10])

7252
['finally, a positive for south africa as inflation comes in better than expected. a pause from sarb? @dailyfxteam @dailyfx\n#southafrica #mzansi #usdzar #southafricanrand #inflation #interestrates', 'usd/zar: rand will remain fragile on geopolitics ing #usdzar #southafrica #banks', 'south africa risk is real- influential americansenators are pushing for rsa to be dropped from agoa.\nfor now, some people appear to believe the rapprochement may work the rand is the best performing currency in the world this month. #nextafrica @bloombergafrica ', 'factors that are helping mitigate against south africa defaulting on its debt are;debt service costs are subtracted from budget before allocating revenue, andsignificant portion (+-%) of sa debt is rand-denominated (currency issuers have a "zero default risk").', 'probably am too critical of zanu pf government but if i may ask other than zimbabwe is there any country in the sadc region that doesn sell fuel in their local currency? in south

In [14]:
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")

df3 = pd.DataFrame(columns = ["Tweet", "Positive", "Negative", "Neutral"])
from datetime import datetime
start_time = datetime.now()
count = 0

for tweet in clean_list:
    #tokenize text to be sent to model
    encoded_input = tokenizer(tweet, padding = True, truncation = True, return_tensors='pt')
    output = model(**encoded_input)
    prediction = torch.nn.functional.softmax(output.logits, dim=-1)
    model.config.id2label
    
    positive = prediction[:, 0].tolist()
    negative = prediction[:, 1].tolist()
    neutral = prediction[:, 2].tolist()

    new_items = {'Tweet':tweet, "Positive":positive, "Negative":negative, "Neutral":neutral}      
    df2 = pd.DataFrame(new_items, columns = ["Tweet", "Positive", "Negative", "Neutral"])
    df3 = pd.concat([df2, df3],ignore_index=True)
    
    count = count+1
    if count%500 == 0:
        end_time = datetime.now()
        print("Completed ", count, "items in ",'Duration: {}'.format(end_time - start_time))

print("-------------------------------------------------------------------------------------")
print("-------------------------------------------------------------------------------------")
print("-------------------------------------------------------------------------------------")

print(df3)

end_time = datetime.now()
print('Duration: {}'.format(end_time - start_time))

Completed  500 items in  Duration: 0:02:07.196275
Completed  1000 items in  Duration: 0:04:16.642679
Completed  1500 items in  Duration: 0:06:12.565309
Completed  2000 items in  Duration: 0:08:08.208924
Completed  2500 items in  Duration: 0:09:54.452000
Completed  3000 items in  Duration: 0:11:12.872486
Completed  3500 items in  Duration: 0:12:32.822059
Completed  4000 items in  Duration: 0:13:55.791804
Completed  4500 items in  Duration: 0:15:19.357584
Completed  5000 items in  Duration: 0:16:45.163492
Completed  5500 items in  Duration: 0:18:11.340421
Completed  6000 items in  Duration: 0:19:32.956089
Completed  6500 items in  Duration: 0:21:02.363203
Completed  7000 items in  Duration: 0:22:26.936040
-------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------
                             

In [15]:
df3.to_csv(f"FinBert_Results.csv",index=False)