In [1]:
import os

import nltk
import numpy as np
import pandas as pd
from langchain.chains import create_tagging_chain
from langchain_openai import ChatOpenAI

import sentiment_score
import util
from blob_storage import download as blob_storage

In [2]:
# Folder holding the data used by this notebook. Set the TCC_DATA_DIR
# environment variable to point at another location; when it is unset the
# original hard-coded folder is used, so existing runs behave as before.
local_path = os.environ.get(
    "TCC_DATA_DIR",
    "C:\\Users\\Vinim\\Documents\\github\\tcc_vinicius\\data_2",
)

# One-off download step, left disabled; presumably it populates local_path
# from blob storage — TODO confirm before re-enabling.
#blob_storage.download_blob_storage(local_path)

In [3]:
# Load the data found under local_path and keep only the first row for each
# distinct 'url'; the explicit copy makes `df` an independent frame that the
# cells below can add columns to.
df = (
    util.get_dataframe(local_path)
    .drop_duplicates(subset=['url'])
    .copy()
)

## LIWC sentiment score

In [4]:
def get_sentiment_score(text, dictionary):
    """Count the positive and negative dictionary terms found in a text.

    The text is tokenized with NLTK (Portuguese), lower-cased and left-joined
    against ``dictionary`` on the token, so every token is kept and tokens
    without a dictionary entry get a score of 0.

    Parameters
    ----------
    text : str
        Text to score. A non-string value (for example a missing text) is
        scored as an empty text instead of raising inside the tokenizer.
    dictionary : pandas.DataFrame
        Needs a ``DicTerm`` column (the term, matched against lower-cased
        tokens) and a ``score_liwc2015`` column in which 1 is counted as a
        positive term and -1 as a negative term.

    Returns
    -------
    str
        ``positivo:<count>`` and ``negativo:<count>`` joined by a newline
        character.
    """
    if not isinstance(text, str):
        text = ''

    tokens = nltk.word_tokenize(text, language='portuguese')

    df_tokens = pd.DataFrame({'tokens': tokens})
    df_tokens['tokens'] = df_tokens['tokens'].astype('str').str.lower()

    # NOTE(review): a left join repeats a token once per matching dictionary
    # row, so a term listed more than once in `dictionary` is counted more
    # than once — confirm that DicTerm is unique.
    result = pd.merge(df_tokens, dictionary, left_on='tokens', right_on='DicTerm', how='left')

    # Work on a new Series rather than calling fillna(..., inplace=True) on a
    # column selection: that chained in-place form is deprecated in pandas and
    # has no effect on `result` once Copy-on-Write is enabled.
    scores = result['score_liwc2015'].fillna(0)

    count_n = (scores == -1).sum()
    count_p = (scores == 1).sum()

    return f'positivo:{count_p}\nnegativo:{count_n}'

In [5]:
# Read the Brazilian Portuguese LIWC 2015 dictionary and score every article.
# NOTE(review): rows whose full_text is the 'ERROR' placeholder are only
# replaced in a later cell, so here they are scored on the placeholder itself
# — confirm this ordering is intended.
liwc2015 = sentiment_score.read_liwc('LIWC2015 Dictionary - Brazilian Portuguese.dicx')
df['score_liwic'] = df['full_text'].apply(get_sentiment_score, args=(liwc2015,))

# '\\n' is the two-character pattern backslash + n; since it is longer than
# one character pandas treats it as a regular expression, which matches the
# newline separating the two counts. The new columns keep their label text,
# e.g. 'positivo:3' and 'negativo:1'.
df[['positivo', 'negativo']] = df['score_liwic'].str.split('\\n', expand=True)

## ChatGPT

* llm model

In [6]:
# Chat model shared by the relevance and sentiment chains defined below.
# NOTE(review): "gpt-3.5-turbo-0613" is a dated model snapshot — confirm it is
# still served by the API before re-running the notebook.
llm = ChatOpenAI(model="gpt-3.5-turbo-0613", temperature=0)

### relevância

In [7]:
def classify_relevance(company, input, llm):
    """Ask the LLM whether a news text is relevant to ``company``.

    A tagging chain is created on every call with a schema holding a single
    required integer field, ``relevance``, restricted to 0 or 1; the field
    description carries the instructions, with the company name filled in.

    Parameters
    ----------
    company : str
        Company name interpolated into the instructions.
    input : str
        News text handed to the chain. The name shadows the builtin but is
        kept because callers pass it by keyword.
    llm
        Model object passed through to ``create_tagging_chain``.

    Returns
    -------
    The value stored under ``'text'`` in the chain's response.
    """
    instructions = f'Please assess whether a given news article is directly pertinent to the Brazilian company {company}. First, identify the primary topics addressed in the news, then indicate if these topics are relevant to {company}. Return 1 if they are relevant, and 0 if they are not.'

    relevance_field = {
        "type": "integer",
        "enum": [0, 1],
        "description": instructions,
    }
    tagging_schema = {
        "properties": {"relevance": relevance_field},
        "required": ["relevance"],
    }

    response = create_tagging_chain(tagging_schema, llm).invoke(input)
    return response['text']

### Análise de sentimentos

In [8]:
def classify_sentiment_chain(llm):
    """Build the tagging chain used for sentiment extraction.

    The schema declares three required fields: ``sentiment`` (string),
    ``sentiment score`` (integer from -5 to 5, higher meaning more positive)
    and ``covered topics`` (string).

    Parameters
    ----------
    llm
        Model object passed through to ``create_tagging_chain``.

    Returns
    -------
    The chain returned by ``create_tagging_chain`` for that schema.
    """
    score_field = {
        "type": "integer",
        "enum": list(range(-5, 6)),
        "description": "describes how polarized the statement sentiment is, the higher the number the more positive, and the lower the number more negative",
    }
    fields = {
        "sentiment": {"type": "string"},
        "sentiment score": score_field,
        'covered topics': {"type": "string"},
    }
    # Every declared field is required, in declaration order.
    tagging_schema = {"properties": fields, "required": list(fields)}

    return create_tagging_chain(tagging_schema, llm)

def classify_sentiment(chain, input):
    """Run ``chain`` on one text and return the ``'text'`` entry of its response.

    Parameters
    ----------
    chain
        Object exposing ``invoke``, e.g. the chain built by
        ``classify_sentiment_chain``.
    input : str
        Text handed to ``chain.invoke``. The name shadows the builtin but is
        kept because callers pass it by keyword.
    """
    return chain.invoke(input)['text']

-------------------------

## TESTE

* tratamento dos dados

In [9]:
# Standardize the company names found in the 'key' column.
df['key_standard'] = util.substituir_nomes(df_col=df['key'])

# Rows whose text could not be collected hold the 'ERROR' placeholder in
# full_text; use the value of their `name` column as the text instead.
filtro = df['full_text'].eq('ERROR')
df.loc[filtro, 'full_text'] = df.loc[filtro, 'name']

* amostra

In [17]:
# Fixed-seed sample of 50 rows, so the LLM calls below run on a small subset
# that is the same on every run.
df_sample = df.sample(n=50, random_state=31)


>Classificação da relevância

In [18]:
# Ask the LLM, row by row, whether each sampled article concerns its company.
# Only the first 3900 characters of the text are sent — presumably to stay
# within the model's context window (TODO confirm).
df_sample['raw_classif'] = df_sample.apply(
    lambda row: classify_relevance(
        company=row['key_standard'],
        input=row['full_text'][:3900],
        llm=llm,
    ),
    axis=1,
)

# Turn the raw answer into an int, keeping '' as the "no answer" marker.
# The key is looked up explicitly: checking only that the dict is non-empty
# would raise KeyError when the model answers with a different key, and
# TypeError when the value comes back as null.
df_sample['relevance'] = df_sample['raw_classif'].apply(
    lambda raw: int(raw['relevance']) if raw.get('relevance') is not None else ''
)

In [19]:
# Columns needed to inspect the relevance classification by eye.
result_sample = df_sample[[
    'name',
    'key_standard',
    'full_text',
    'relevance',
]]

In [20]:
# Display the sampled articles together with the relevance label from the LLM.
result_sample

Unnamed: 0,name,key_standard,full_text,relevance
1431,Cenário está bem para uma Selic terminal mais ...,Ticket: BPAC11 - company name: Banco BTG Pactual,,1
1229,Plano Nova Indústria Brasil deve favorecer set...,Ticket: WEGE3 - company name: WEG,Plano Nova Indústria Brasil deve favorecer set...,1
262,"Itaúsa (ITSA4) aprova R$ 3,1 bilhões em divide...",Ticket: ITSA4 - company name: Itaúsa,A semana começou com boas notícias para os aci...,1
1303,Rumor: BYD planeja ofertar o Dolphin para PcD ...,Ticket: SANB3 - company name: Banco Santander,Ele tem tomado cada vez mais as garagens e rua...,1
362,Madonna cria filme para Itaú\n\n,Ticket: ITUB4 - company name: Itaú Unibanco,A estrela internacional do pop protagoniza nov...,1
286,Petrobras (PETR4): mesmo após dividendos e pro...,Ticket: PETR4 - company name: Petrobras,A Petrobras estampou os noticiários depois de ...,1
792,BTG Pactual pretende oferecer renda fixa inter...,Ticket: BPAC11 - company name: Banco BTG Pactual,"O sócio do BTG Pactual, Renato Moritz, disse q...",1
7,Petrobras (PETR4) e Banco ABC (ABCB4) pagam di...,Ticket: PETR4 - company name: Petrobras,A Petrobras e o Banco ABC Brasil pagam provent...,1
238,Lucro de 2023 separa bancos em 2 grupos. Qual ...,Ticket: SANB3 - company name: Banco Santander,Os líderes de lucratividade foram Itaú e Banco...,1
141,"BNDES aprova financiamento de R$ 118,8 milhões...",Ticket: WEGE3 - company name: WEG,,1


>Análise de sentimentos

In [21]:
# Sentiment is only requested for the articles tagged as relevant (1).
relevance_filter = (df_sample['relevance'] == 1)

chain = classify_sentiment_chain(llm)

# NOTE(review): unlike the relevance step, the full text is sent here without
# the 3900-character cut — confirm this is intended.
df_sample.loc[relevance_filter, 'raw_sentiment'] = (
    df_sample.loc[relevance_filter, 'full_text']
    .apply(lambda text: classify_sentiment(chain, text))
)

# Spread each field of the raw answer into a column of the same name.
for sentiment_field in ('sentiment', 'sentiment score', 'covered topics'):
    df_sample.loc[relevance_filter, sentiment_field] = (
        df_sample.loc[relevance_filter, 'raw_sentiment']
        .apply(lambda answer: answer[sentiment_field])
    )

In [22]:
# Side-by-side view of the LLM outputs and the LIWC counts for the sample.
result_sample2 = df_sample[[
    'name',
    'key_standard',
    'full_text',
    'relevance',
    'sentiment',
    'sentiment score',
    'score_liwic',
    'covered topics',
]]

In [24]:
# result_sample2

precisa ter variância para ter relevância

fazer gráficos 

soma, média, mediana, contagem, menor, maior



