In [1]:
import pandas as pd
import numpy as np
from IPython.display import display
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

In [2]:
# Load the cleaned corpus and the word -> sentiment coefficient table.
final_file = pd.read_csv('cleaned_dataset.csv')
sentiment_map = pd.read_csv('sentiment_dictionary.csv')
# Pair each word with its coefficient; if a word is listed more than once,
# the last occurrence wins.
sentiment_dict = {
    word: coeff
    for word, coeff in zip(
        sentiment_map['words'].values,
        sentiment_map['sentiment_coeff'].values,
    )
}

In [3]:
# Sanity check: show the Python type of the first entry of the text column.
type(final_file.text.iloc[0])

str

In [4]:
# Work on a copy so the frame loaded from disk is left untouched.
file_weighting = final_file.copy()
# Cast the text column to unicode once; non-string entries (e.g. NaN) are
# stringified instead of making the vectorizer fail.
corpus = file_weighting.text.values.astype('U')
# Plain whitespace tokenizer. token_pattern=None because the default regex is
# ignored when a tokenizer is supplied (scikit-learn warns otherwise).
# norm=None keeps the raw tf * idf weights (no per-document normalisation).
tfidf = TfidfVectorizer(tokenizer=str.split, token_pattern=None, norm=None)
# fit_transform learns the vocabulary / idf weights and builds the document-term
# matrix in a single pass instead of tokenizing the corpus twice.
transformed = tfidf.fit_transform(corpus)
# Position i of this Series is the word behind column i of `transformed`.
features = pd.Series(tfidf.get_feature_names_out())



In [5]:
def create_tfidf_dictionary(x, transformed_file, features):
    '''
    create dictionary for each input sentence x, where each word has assigned its tfidf score

    x - row of dataframe, containing sentences, and their indexes,
        x.name is used to select the matching row of transformed_file
    transformed_file - all sentences transformed with TfidfVectorizer
    features - names of all words in corpus used in TfidfVectorizer,
               indexed by column position of transformed_file

    returns dict mapping each word with a stored entry in that row to its tfidf score
    '''
    vector_coo = transformed_file[x.name].tocoo()
    # Keep the looked-up words in a separate array instead of assigning them to
    # vector_coo.col: that attribute holds integer column indices, and recent
    # SciPy versions cast whatever is assigned to it to the index dtype, which
    # fails for strings.
    words = features.iloc[vector_coo.col].values
    return dict(zip(words, vector_coo.data))

def replace_tfidf_words(x, transformed_file, features):
    '''
    replace each word of a sentence with its tfidf score

    x - row of dataframe, containing sentences, and their indexes,
    transformed_file - all sentences transformed with TfidfVectorizer
    features - names of all words in corpus used in TfidfVectorizer

    returns list of tfidf scores, one per whitespace-separated, lower-cased
    word of x.text; raises KeyError when a word has no entry in the
    dictionary built for this row
    '''
    scores_by_word = create_tfidf_dictionary(x, transformed_file, features)
    tokens = str(x.text).lower().split()
    return [scores_by_word[token] for token in tokens]

In [6]:
# %%time
# Replace every word of every sentence with its TF-IDF score, row by row.
# This step takes around 3-4 minutes.
# NOTE(review): create_tfidf_dictionary selects the matrix row with the row's
# index label (x.name), so this presumes file_weighting has a default
# 0..n-1 index matching the row order of `transformed` - confirm.
replaced_tfidf_scores = file_weighting.apply(lambda x: replace_tfidf_words(x, transformed, features), axis=1)

In [7]:
def replace_sentiment_words(word, sentiment_dict):
    '''
    look up the sentiment score of a single word

    word - token to look up
    sentiment_dict - mapping from words to their sentiment scores

    returns the score stored for word, or 0 when the word is not in sentiment_dict
    '''
    return sentiment_dict.get(word, 0)

In [8]:
# Map every whitespace-separated token of each sentence to its sentiment
# coefficient (0 for tokens missing from sentiment_dict).
# NOTE(review): tokens are not lower-cased here, unlike in replace_tfidf_words -
# confirm the text column is already lower-case.
replaced_closeness_scores = file_weighting.text.apply(
    lambda sentence: [
        replace_sentiment_words(token, sentiment_dict)
        for token in str(sentence).split()
    ]
)

In [9]:
# Put the per-sentence sentiment coefficients, TF-IDF scores and raw text side
# by side: each Series becomes a row first, then the frame is transposed so
# they end up as columns.
replacement_df = pd.DataFrame(
    data=[replaced_closeness_scores, replaced_tfidf_scores, file_weighting.text]
).transpose()
replacement_df.columns = ['sentiment_coeff', 'tfidf_scores', 'sentence']
# Sentence score = dot product of the word sentiment coefficients with the
# word TF-IDF scores.
replacement_df['sentiment_rate'] = replacement_df.apply(
    lambda row: np.dot(np.array(row['sentiment_coeff']), np.array(row['tfidf_scores'])),
    axis=1,
)
# Label 1 when the score is strictly positive, 0 otherwise.
replacement_df['prediction'] = replacement_df['sentiment_rate'].gt(0).astype('int8')

In [14]:
# Show the rows whose sentiment_rate is not positive (prediction == 0).
replacement_df.loc[replacement_df['prediction'] == 0]

Unnamed: 0,sentiment_coeff,tfidf_scores,sentence,sentiment_rate,prediction
6,"[4.216235800301971, 1.3054374751824371, 1.0810...","[10.231498289601708, 2.708287137775497, 1.9096...",kevin_leary like elon_musk far_less successful...,-368.974481,0
34,"[-2.3897000908081174, -1.676489860141257, -1.3...","[10.636963397709872, 10.07734760977445, 5.8268...",dentre plataformas que dados pra_vender tweete...,-68.059042,0
35,"[-1.573594317260521, -1.4526839346900224, -1.2...","[10.07734760977445, 9.250669036589981, 8.65596...",us_wherever sell_perfume shipping_starts 5 oil...,-1424.241253,0
43,"[1.3064039461920518, -1.4242921640123751, -1.6...","[7.117982980391334, 7.278325630466513, 9.82603...",ah nao mas_isso e ! elon_musk por_exemplo nasc...,-175.481824,0
63,"[1.4973112055222213, -1.1511845496275368, -1.1...","[5.045044016199064, 17.11504371206007, 17.1150...",best interior interior anon musk,-5.152198,0
...,...,...,...,...,...
61235,"[-1.0499231691511863, 3.0693216607105644, -1.2...","[6.841474208537678, 10.07734760977445, 8.93221...",bpal goggles metallic needs metallic_notes ind...,-104.615326,0
61249,"[-1.172726964224266, -1.574757572603959, 1.280...","[8.526750197363283, 10.231498289601708, 6.0726...",al_rehab attars project really well shada musk...,-79.172989,0
61251,"[1.5181924346549036, 3.699953954792449, 3.4061...","[11.008220941778735, 10.636963397709872, 10.63...",thinking many_radical left_participate speedru...,-1580.682807,0
61255,"[-1.7835763851347646, -1.3176195438629958, -1....","[10.413819846395663, 7.945720314924044, 10.077...",proto sem byl v san tam taxi : : musk jak kdyz...,-321.334869,0


In [19]:
# Persist the sentence / label pairs; the DataFrame index is written as the
# first CSV column.
replacement_df.loc[:, ['sentence', 'prediction']].to_csv(path_or_buf='labeled_data.csv')