In [4]:
import pandas as pd
import numpy as np
from IPython.display import display
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

In [5]:
# Load the cleaned corpus and the word -> sentiment coefficient table from disk.
final_file = pd.read_csv('cleaned_dataset.csv')
sentiment_map = pd.read_csv('sentiment_dictionary.csv')
# Lookup table keyed by word; when a word appears in several rows the last one wins.
sentiment_dict = {
    word: coeff
    for word, coeff in zip(sentiment_map['words'].values, sentiment_map['sentiment_coeff'].values)
}

In [6]:
# Sanity check: Python type of the first entry in the text column.
type(final_file['text'].iloc[0])

str

In [7]:
# Work on a copy so the frame loaded from disk is left untouched.
file_weighting = final_file.copy()
# Cast the text column to unicode once and reuse it for fitting and transforming.
corpus = file_weighting.text.values.astype('U')
# Sentences are tokenized on whitespace only. token_pattern=None states explicitly
# that the default regex is unused (it is ignored whenever a tokenizer is given and
# would otherwise trigger a warning); str.split replaces the lambda so the fitted
# vectorizer stays picklable. norm=None keeps the raw, un-normalized tf-idf weights.
tfidf = TfidfVectorizer(tokenizer=str.split, token_pattern=None, norm=None)
# fit_transform learns the vocabulary/idf and builds the document-term matrix in a
# single pass instead of tokenizing the whole corpus twice (fit, then transform).
transformed = tfidf.fit_transform(corpus)
# Vocabulary words, positioned by their column in `transformed`.
features = pd.Series(tfidf.get_feature_names_out())



In [8]:
def create_tfidf_dictionary(x, transformed_file, features):
    '''
    create dictionary for input sentence x, where each word has assigned its tfidf score

    x - row of dataframe, containing sentences, and their indexes; x.name is used to
        pick the matching row of transformed_file, so the dataframe index labels must
        coincide with the row positions of transformed_file
    transformed_file - all sentences transformed with TfidfVectorizer (sparse matrix)
    features - pandas Series with the names of all words in corpus used in
        TfidfVectorizer, positioned by column of transformed_file

    returns a dict mapping every word stored in that row to its tfidf score
    '''
    vector_coo = transformed_file[x.name].tocoo()
    # Resolve column positions to words in a separate variable. Writing the words back
    # into vector_coo.col would put strings into the sparse matrix's integer index
    # array, which is fragile across SciPy versions and corrupts the matrix object.
    words = features.iloc[vector_coo.col].values
    return dict(zip(words, vector_coo.data))

def replace_tfidf_words(x, transformed_file, features):
    '''
    replace each word of the sentence in row x with its tfidf score, keeping word order

    x - row of dataframe, containing sentences, and their indexes,
    transformed_file - all sentences transformed with TfidfVectorizer
    features - names of all words in corpus used in TfidfVectorizer

    returns a list with one score per whitespace-separated, lower-cased word;
    a word missing from the row's tfidf dictionary raises KeyError
    '''
    scores_by_word = create_tfidf_dictionary(x, transformed_file, features)
    tokens = str(x.text).lower().split()
    return [scores_by_word[token] for token in tokens]

In [9]:
# %%time
# Row-wise pass over the whole frame; this step takes around 3-4 minutes.
replaced_tfidf_scores = file_weighting.apply(
    replace_tfidf_words,
    axis=1,
    args=(transformed, features),
)

In [10]:
def replace_sentiment_words(word, sentiment_dict):
    '''
    look up the sentiment score associated with word in sentiment_dict

    word - single token to score
    sentiment_dict - dictionary mapping words to their sentiment score

    returns the stored score, or 0 when the word has no entry
    '''
    return sentiment_dict.get(word, 0)

In [11]:
# One list of sentiment scores per sentence, aligned word-by-word with the sentence.
# NOTE(review): tokens are not lower-cased here, unlike in replace_tfidf_words —
# confirm the text column is already lower-case.
replaced_closeness_scores = file_weighting.text.apply(
    lambda sentence: [
        replace_sentiment_words(token, sentiment_dict)
        for token in str(sentence).split()
    ]
)

In [31]:
# Cut-off on the tf-idf weighted sentiment rate: rates above it are labelled 1, the rest 0.
# NOTE(review): the value -100 is taken as-is from the notebook — confirm how it was chosen.
SENTIMENT_RATE_THRESHOLD = -100

# Stack the three per-sentence series as rows, then transpose so each becomes a column.
per_sentence_series = [replaced_closeness_scores, replaced_tfidf_scores, file_weighting.text]
replacement_df = pd.DataFrame(per_sentence_series).transpose()
replacement_df.columns = ['sentiment_coeff', 'tfidf_scores', 'sentence']
# Sentence score = dot product of the word sentiment coefficients and the word tf-idf weights.
replacement_df['sentiment_rate'] = replacement_df.apply(
    lambda row: np.array(row['sentiment_coeff']) @ np.array(row['tfidf_scores']),
    axis=1,
)
replacement_df['prediction'] = replacement_df['sentiment_rate'].gt(SENTIMENT_RATE_THRESHOLD).astype('int8')

In [33]:
# Show the sentences whose prediction label is 0.
replacement_df.loc[replacement_df['prediction'].eq(0)]

Unnamed: 0,sentiment_coeff,tfidf_scores,sentence,sentiment_rate,prediction
2,"[1.7244563648614055, -0.9948794868151926, -1.9...","[8.124406690778535, 1.0427354536430666, 8.9866...",og musk duck life wall,-19.194891,1
3,"[-1.697171576418397, -1.6641501991749887, -1.5...","[7.025794402110425, 5.357389013588209, 3.49790...",dare speak way great powerful musk obviously d...,-99.530856,1
4,"[-1.449100638134386, -1.5635951194832456, -1.6...","[5.301705927111406, 5.75936858313794, 6.433900...",wait finally excuse shower douche,-36.174028,1
5,"[-0.9948794868151926, -1.307036541067213, -1.5...","[1.0427354536430666, 3.621638438408525, 5.0926...",musk try oh boy head buttock,-1.735252,1
7,"[-1.762821927266953, -1.5728633876026017, -0.9...","[4.833520329934062, 3.7777194225499624, 1.0427...",case mean musk,-15.499868,1
...,...,...,...,...,...
55866,"[-1.6970219009256653, -0.9948794868151926, -1....","[5.147743979944895, 1.0427354536430666, 4.5421...",remember musk call onion unfunny post old meme...,-61.290207,1
55869,"[-0.9948794868151926, -1.372332088661498, -1.1...","[1.0427354536430666, 5.883697001502577, 4.0593...",musk wake new black,-8.507571,1
55870,"[-1.3499090596213068, -1.50874768163437, -0.99...","[5.606710218167544, 4.747198465486402, 1.04273...",risk fire musk disagree,-24.243057,1
55875,"[-0.9948794868151926, -1.6526913989427787, -1....","[1.0427354536430666, 4.333171230027071, 5.8789...",musk run president 's office thankfully bear u...,-87.797739,1


In [34]:
# Persist each sentence together with its predicted label.
labelled_sentences = replacement_df.loc[:, ['sentence', 'prediction']]
labelled_sentences.to_csv('text_with_label.csv')

In [35]:
# Summary statistics of replacement_df, used to inspect the spread of sentiment_rate
# and the share of sentences labelled 1.
replacement_df.describe()

Unnamed: 0,sentiment_rate,prediction
count,55885.0,55885.0
mean,-251.513129,0.450568
std,1984.377863,0.497555
min,-47992.683413,0.0
25%,-298.034083,0.0
50%,-118.211886,0.0
75%,-44.909313,1.0
max,138872.633451,1.0
