In [32]:
import pandas as pd
import numpy as np
from IPython.display import display
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

In [33]:
# Load the cleaned corpus and the word -> sentiment coefficient table from disk.
final_file = pd.read_csv('cleaned_dataset.csv')
sentiment_map = pd.read_csv('sentiment_dictionary.csv')

# Plain dict keyed by the `words` column with `sentiment_coeff` as value
# (if a word appears in several rows, the last row wins).
sentiment_dict = dict(
    zip(
        sentiment_map['words'].values,
        sentiment_map['sentiment_coeff'].values,
    )
)

In [34]:
# Quick inspection: Python type of the first entry in the `text` column.
type(final_file['text'].iloc[0])

str

In [35]:
# Work on a copy so the frame loaded from disk stays untouched.
file_weighting = final_file.copy()

# Cast the text column to unicode strings once and reuse it for both fitting and
# transforming (missing values become the literal string 'nan' under this cast).
corpus = file_weighting.text.values.astype('U')

# Whitespace tokenizer; norm=None keeps the raw tf * idf weights without
# per-document normalisation. token_pattern=None because the pattern is ignored
# whenever a custom tokenizer is supplied (leaving the default in place only
# produces a warning).
tfidf = TfidfVectorizer(tokenizer=lambda y: y.split(), norm=None, token_pattern=None)

# fit_transform learns the vocabulary/idf and builds the document-term matrix in
# a single pass instead of tokenizing the corpus twice (fit, then transform).
transformed = tfidf.fit_transform(corpus)

# Position i of this Series holds the word for column i of `transformed`.
features = pd.Series(tfidf.get_feature_names_out())



In [36]:
def create_tfidf_dictionary(x, transformed_file, features):
    '''
    create dictionary for the input sentence x, where each word has assigned its tfidf score

    x - row of dataframe, containing sentences, and their indexes;
        x.name is used as the row position inside transformed_file,
    transformed_file - all sentences transformed with TfidfVectorizer
        (sparse matrix, one row per sentence)
    features - names of all words in corpus used in TfidfVectorizer
        (pandas Series; position i holds the word for column i)

    returns dict {word: tfidf score} for every stored entry of that row
    '''
    vector_coo = transformed_file[x.name].tocoo()
    # Translate column positions to words in a separate array. Writing the words
    # back into vector_coo.col would overwrite the matrix's integer index array
    # with strings, which corrupts the sparse object and may be rejected outright
    # by SciPy versions that coerce assigned indices to an integer dtype.
    words = features.iloc[vector_coo.col].values
    return dict(zip(words, vector_coo.data))

def replace_tfidf_words(x, transformed_file, features):
    '''
    turn the sentence stored in row x into the list of tfidf scores of its words, in word order

    x - row of dataframe, containing sentences, and their indexes,
    transformed_file - all sentences transformed with TfidfVectorizer
    features - names of all words in corpus used in TfidfVectorizer
    '''
    scores_by_word = create_tfidf_dictionary(x, transformed_file, features)
    # Lower-case and whitespace-split the text, then look every token up;
    # a token that is missing from the dictionary raises KeyError.
    tokens = str(x.text).lower().split()
    return [scores_by_word[token] for token in tokens]

In [37]:
# %%time
# Row-wise pass over the whole frame: one list of tfidf scores per sentence.
# NOTE: this step is slow (the original author reports around 3-4 minutes).
replaced_tfidf_scores = file_weighting.apply(
    lambda row: replace_tfidf_words(row, transformed, features),
    axis=1,
)

In [38]:
def replace_sentiment_words(word, sentiment_dict):
    '''
    look up the sentiment score of word in sentiment_dict;
    a word without an entry gets the score 0
    '''
    return sentiment_dict.get(word, 0)

In [39]:
# For every sentence: whitespace-split the text (no lower-casing at this step)
# and map each token to its sentiment score, giving one list per sentence.
replaced_closeness_scores = file_weighting.text.apply(
    lambda sentence: [
        replace_sentiment_words(token, sentiment_dict)
        for token in str(sentence).split()
    ]
)

In [48]:
# Sentences whose sentiment_rate is strictly above this value are labelled 1,
# all others 0.
SENTIMENT_RATE_THRESHOLD = 100

# Assemble the per-sentence columns directly by name (aligned on the shared
# index) instead of stacking the Series as rows, transposing, and renaming the
# columns by position.
replacement_df = pd.DataFrame({
    'sentiment_coeff': replaced_closeness_scores,
    'tfidf_scores': replaced_tfidf_scores,
    'sentence': file_weighting.text,
})

# sentiment_rate is the dot product of a sentence's sentiment coefficients with
# its tfidf scores; both lists must have the same length or np.dot raises.
replacement_df['sentiment_rate'] = [
    np.dot(coeffs, scores)
    for coeffs, scores in zip(replacement_df['sentiment_coeff'], replacement_df['tfidf_scores'])
]
replacement_df['prediction'] = (
    replacement_df['sentiment_rate'] > SENTIMENT_RATE_THRESHOLD
).astype('int8')

In [49]:
# Show the rows whose prediction label is 0.
replacement_df.loc[replacement_df['prediction'].eq(0)]

Unnamed: 0,sentiment_coeff,tfidf_scores,sentence,sentiment_rate,prediction
2,"[-0.9540358009457152, 0.9786753693078538, 1.77...","[7.512398252312913, 1.555309033361735, 9.53835...",og musk duck lives wall,29.696213,0
4,"[1.696641513583642, 1.339973693457616, 1.64246...","[7.398284945545492, 6.0046645443335285, 6.9052...",cannot_wait finally excuse shower douche,54.617788,0
5,"[0.9786753693078538, 1.521034753349722, 2.9382...","[1.555309033361735, 4.8235794131385905, 9.2506...",musk trying oh_boy head buttocks,22.023611,0
6,"[4.216235800301971, 1.3054374751824371, 1.0810...","[10.231498289601708, 2.708287137775497, 1.9096...",kevin_leary like elon_musk far_less successful...,-368.974481,0
7,"[1.718969931654016, 1.5023113265457595, 0.9786...","[5.224201896770967, 5.022923298554533, 1.55530...",case means musk,18.048383,0
...,...,...,...,...,...
61264,"[1.299418580314482, 1.3838757021566253, 0.9786...","[8.034273712265488, 2.812217631739307, 1.55530...",maher would musk cock live_tv could embarrassi...,90.440197,0
61265,"[1.4329371705631306, 1.491180094561145, 1.1214...","[4.008591388364821, 3.7128427646414712, 7.3788...",well could apartheid - raised musk woke new black,39.234013,0
61266,"[1.2758252792217688, 0.9786753693078538]","[7.61653851156551, 1.555309033361735]",kanye musk,11.239515,0
61267,"[1.5259895939725314, 1.3841192788707477, 1.353...","[3.8207753122532258, 6.302945982222352, 5.9711...",still risk fired ever say something musk disag...,56.822436,0


In [50]:
# Persist the sentence/label pairs; with the default settings the DataFrame
# index is written as the first, unnamed CSV column.
replacement_df.loc[:, ['sentence', 'prediction']].to_csv('text_with_label.csv')