In [7]:
import pandas as pd
import numpy as np
from IPython.display import display
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

In [8]:
# Load the sentence table and the word -> sentiment coefficient table.
final_file = pd.read_csv('cleaned_dataset.csv')
sentiment_map = pd.read_csv('sentiment_dictionary.csv')

# Flatten the two columns into a lookup dict; if a word is listed more than
# once, the last row wins.
sentiment_dict = dict(
    zip(
        sentiment_map['words'].to_numpy(),
        sentiment_map['sentiment_coeff'].to_numpy(),
    )
)

In [9]:
# Sanity check: Python type of the first entry in the text column.
type(final_file['text'].iloc[0])

str

In [10]:
# Work on a copy so the frame loaded from disk is left untouched.
file_weighting = final_file.copy()

# Cast to unicode once and reuse the array (the cast turns missing values into
# the literal token 'nan' instead of letting the vectorizer fail on floats).
corpus = file_weighting.text.values.astype('U')

# Whitespace tokenisation only. token_pattern=None silences the
# "token_pattern will not be used" warning that a custom tokenizer triggers,
# and str.split (unlike a lambda) keeps the vectorizer picklable.
# norm=None keeps the raw tf-idf weights instead of L2-normalising each row.
tfidf = TfidfVectorizer(tokenizer=str.split, token_pattern=None, norm=None)

# fit_transform learns the vocabulary/idf and builds the matrix in one pass,
# equivalent to fit(...) followed by transform(...) on the same corpus.
transformed = tfidf.fit_transform(corpus)

# Column position -> word, used later to label the entries of each sparse row.
features = pd.Series(tfidf.get_feature_names_out())



In [11]:
def create_tfidf_dictionary(x, transformed_file, features):
    '''
    Create a dictionary for the input sentence x, where each word has its tfidf score assigned.

    x - row of dataframe, containing sentences, and their indexes;
        x.name is used as the row position inside transformed_file, so the
        dataframe index is assumed to follow the row order of the matrix
        (e.g. a default RangeIndex) - TODO confirm against the caller
    transformed_file - all sentences transformed with TfidfVectorizer
        (sparse matrix, one row per sentence)
    features - pd.Series with the names of all words in corpus used in
        TfidfVectorizer, positioned by column number

    returns - dict {word: tfidf score} for the words stored in that row
    '''
    vector_coo = transformed_file[x.name].tocoo()
    # Keep the words in their own array instead of assigning them to
    # vector_coo.col: that attribute holds integer column indices, and recent
    # SciPy releases coerce anything assigned to it back to the index dtype.
    words = features.iloc[vector_coo.col].values
    return dict(zip(words, vector_coo.data))

def replace_tfidf_words(x, transformed_file, features):
    '''
    Replace each word of a sentence with its tfidf score, keeping word order.

    x - row of dataframe, containing sentences, and their indexes,
    transformed_file - all sentences transformed with TfidfVectorizer
    features - names of all words in corpus used in TfidfVectorizer

    returns - list of scores, one per whitespace-separated token of x.text;
              a token missing from the row's dictionary raises KeyError
    '''
    scores_by_word = create_tfidf_dictionary(x, transformed_file, features)
    # Lower-case and split on whitespace before looking the tokens up.
    tokens = str(x.text).lower().split()
    return [scores_by_word[token] for token in tokens]

In [12]:
# %%time
# Row-wise pass over the whole frame; this step takes around 3-4 minutes.
replaced_tfidf_scores = file_weighting.apply(
    replace_tfidf_words,
    axis=1,
    args=(transformed, features),
)

In [13]:
def replace_sentiment_words(word, sentiment_dict):
    '''
    Return the sentiment score stored for word in sentiment_dict.

    word - token to look up
    sentiment_dict - dict mapping words to their sentiment score

    returns - the stored score, or 0 when the word has no entry
    '''
    return sentiment_dict.get(word, 0)

In [14]:
# One list of sentiment scores per sentence, in token order.
# NOTE(review): tokens are looked up exactly as str(sentence).split() yields
# them (no lower-casing here) - confirm the text column is already lower-case.
replaced_closeness_scores = file_weighting.text.apply(
    lambda sentence: [
        replace_sentiment_words(token, sentiment_dict)
        for token in str(sentence).split()
    ]
)

In [15]:
# Put the per-sentence score lists next to the sentence they came from.
replacement_df = pd.DataFrame(
    {
        'sentiment_coeff': replaced_closeness_scores,
        'tfidf_scores': replaced_tfidf_scores,
        'sentence': file_weighting.text,
    }
)

# Sentence score: dot product of the sentiment scores and tfidf scores of its words.
replacement_df['sentiment_rate'] = replacement_df.apply(
    lambda row: np.array(row['sentiment_coeff']) @ np.array(row['tfidf_scores']),
    axis=1,
)

# Prediction is 1 when the sentence score is strictly positive, otherwise 0.
replacement_df['prediction'] = replacement_df['sentiment_rate'].gt(0).astype('int8')

In [17]:
# Show the rows whose prediction is 1 (sentiment_rate > 0).
replacement_df.loc[replacement_df['prediction'].eq(1)]

Unnamed: 0,sentiment_coeff,tfidf_scores,sentence,sentiment_rate,prediction
0,"[1.070661800630579, 1.4898866718974308, 0.9786...","[5.996326430329564, 6.992819837437327, 1.55530...",twitter prior musk takeover talking directly n...,351.171740,1
1,"[1.171212606414365, 1.6110511295479415, 1.6746...","[11.299875938505501, 7.9656217550533075, 8.216...",article say imply states feature turns crashes...,524.379199,1
2,"[-0.9540358009457152, 0.9786753693078538, 1.77...","[7.512398252312913, 1.555309033361735, 9.53835...",og musk duck lives wall,29.696213,1
3,"[3.883288813006564, 1.6448545388112097, 1.3164...","[10.07734760977445, 3.7509426108737416, 4.8348...",dare_speak way great powerful musk obviously d...,112.723930,1
4,"[1.696641513583642, 1.339973693457616, 1.64246...","[7.398284945545492, 6.0046645443335285, 6.9052...",cannot_wait finally excuse shower douche,54.617788,1
...,...,...,...,...,...
61277,"[1.3054374751824371, 3.3744355009374853, 1.385...","[2.708287137775497, 9.458308401368226, 6.13437...",like brain_dead lmao anything barely relating ...,144.924125,1
61278,"[1.5343300576417465, 4.772229403494811, 1.6039...","[6.638762696040674, 10.636963397709872, 12.599...",lying agenda_please correct errors anything sa...,305.757294,1
61279,"[1.5064479466120888, 1.456652356153979, 1.4127...","[5.071485594430851, 6.432270778318906, 3.24863...",hard disagree think parody elon_musk sure tend...,592.831585,1
61280,"[1.4606167682907296, 1.4127184184664383, 1.655...","[4.8356005947148075, 3.2486355381327656, 4.346...",yeah think many things lining right smaller_sc...,973.381514,1
