In [1]:
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords
import string
import ast
import operator
from collections import Counter
import statistics as stats

In [2]:
# Module-level NLP resources read by lemmatize_entity.
# The English stop-word list is wrapped in a set for fast membership tests.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [14]:
# Input CSV location.
# NOTE(review): this is an absolute, machine-specific path; it has to be
# adjusted before the notebook can run on another machine.
INPUT_CSV_PATH = 'C:\\Users\\Reen\\Desktop\\updated.csv'
df = pd.read_csv(INPUT_CSV_PATH)

In [15]:
#df = df[:]

In [16]:
def lemmatize_entity(entity):
    """Normalise a text mention into a list of lemmatized content words.

    The text is lower-cased, every character in ``string.punctuation`` is
    removed, the remainder is tokenized with ``nltk.word_tokenize``, tokens
    present in the module-level ``stop_words`` set are dropped, and each
    remaining token is passed through ``lemmatizer.lemmatize``.

    Parameters
    ----------
    entity : str
        Text to normalise.

    Returns
    -------
    list of str
        Lemmas in token order; duplicates are kept.
    """
    strip_punctuation = str.maketrans("", "", string.punctuation)
    cleaned = entity.lower().translate(strip_punctuation)
    return [
        lemmatizer.lemmatize(token)
        for token in nltk.word_tokenize(cleaned)
        if token not in stop_words
    ]

In [17]:
def match_entity(match_dict, entity):
    """Select the entries of ``match_dict`` whose key shares a lemma with ``entity``.

    ``entity`` is converted with ``str()`` and run through
    ``lemmatize_entity``; every key of ``match_dict`` is lemmatized the same
    way.  A key matches when the two lemma collections have at least one
    lemma in common.

    Parameters
    ----------
    match_dict : dict
        Candidate entries; keys must be strings (they are lemmatized).
    entity : object
        Value to match against; its ``str()`` form is what gets lemmatized,
        so non-string values such as a list of tokens are accepted.

    Returns
    -------
    dict
        The matching ``key -> value`` pairs, in ``match_dict`` order.  Empty
        when nothing matches.
    """
    wanted_lemmas = set(lemmatize_entity(str(entity)))
    return {
        candidate: value
        for candidate, value in match_dict.items()
        # "Not disjoint" is the same test as "intersection is non-empty".
        if not wanted_lemmas.isdisjoint(lemmatize_entity(candidate))
    }

In [18]:
def find_source_emotions(emotions_dict, tfidf_dict):
    """Collect averaged emotion values for the TF-IDF terms of one source.

    For each key of ``tfidf_dict`` the matching mentions in ``emotions_dict``
    are looked up with ``match_entity``.  When at least one mention matches,
    their emotion values are averaged per emotion (the emotion names are
    taken from the first matched mention) and the average is stored under
    the matched mention that has the most lemmas.  If that mention already
    has an entry from an earlier term, the earlier entry is kept.

    Parameters
    ----------
    emotions_dict : dict
        Maps mention text to a dict of per-emotion numeric values.
    tfidf_dict : dict
        Only its keys (term strings) are used.

    Returns
    -------
    dict
        ``representative mention -> {emotion: average value}``.
    """
    representatives = {}
    for term in tfidf_dict:
        matches = match_entity(emotions_dict, lemmatize_entity(term))
        if not matches:
            continue

        # Average every emotion over all matched mentions.
        scores = list(matches.values())
        count = float(len(scores))
        averaged = {
            emotion: sum(score[emotion] for score in scores) / count
            for emotion in scores[0]
        }

        # The mention with the most lemmas represents the group; on a tie
        # the first one in dict order wins.
        longest = max(matches, key=lambda mention: len(lemmatize_entity(mention)))

        # First writer wins: an existing entry is never overwritten.
        representatives.setdefault(longest, averaged)

    return representatives

In [21]:
def computed_sensational_score(src1_emotions, src2_emotions):
    """Score how far two sources' emotion values differ on matching entities.

    For every entity of ``src1_emotions`` the matching entities of
    ``src2_emotions`` are found with ``match_entity`` and their emotion
    values are averaged.  The per-emotion absolute difference between the
    source-1 values and that average is then taken (an emotion absent from
    the average counts as 0).  Finally the differences are averaged per
    emotion over all source-1 entities that had a match.

    Parameters
    ----------
    src1_emotions : dict
        ``entity -> {emotion: numeric value}`` for the first source.
    src2_emotions : dict
        Same structure for the second source.

    Returns
    -------
    list
        ``[mean_score, per_emotion_score]`` where ``per_emotion_score`` maps
        each emotion to its mean absolute difference and ``mean_score`` is
        the mean of those values.  ``[0, 0]`` is returned when no entity of
        ``src1_emotions`` matches anything in ``src2_emotions``, or when
        there are no emotions to compare.
    """
    per_entity_diffs = []

    for key, key_score in src1_emotions.items():
        # Entities of source 2 that share at least one lemma with this one.
        matched_key_dict = match_entity(src2_emotions, lemmatize_entity(key))
        if not matched_key_dict:
            continue

        # Combine the matched source-2 entities by averaging each emotion;
        # emotion names are taken from the first matched entity.
        emotion_values = list(matched_key_dict.values())
        matched_count = float(len(emotion_values))
        matched_score = {
            emotion: sum(val[emotion] for val in emotion_values) / matched_count
            for emotion in emotion_values[0]
        }

        per_entity_diffs.append({
            emotion: abs(value - matched_score.get(emotion, 0))
            for emotion, value in key_score.items()
        })

    if not per_entity_diffs:
        return [0, 0]

    length = float(len(per_entity_diffs))
    total_sensational_score = {
        emotion: sum(diff[emotion] for diff in per_entity_diffs) / length
        for emotion in per_entity_diffs[0]
    }

    # statistics.mean raises StatisticsError on empty data, which happens
    # when the first matched entity carries no emotions at all; treat that
    # the same as "nothing to compare".
    if not total_sensational_score:
        return [0, 0]

    mean_score = stats.mean(list(total_sensational_score.values()))
    return [mean_score, total_sensational_score]

In [22]:
#dd_emotions, ndtv_emotions, times_emotions, dd_tfidf, ndtv_tfidf, times_tfidf
def process_rows(df):
    """Compute per-row emotion difference scores of dd against ndtv and tn.

    Each row's ``dd_emotion``, ``ndtv_emotion``, ``tn_emotion``, ``dd_tfid``,
    ``ndtv_tfid`` and ``tn_tfid`` cells are parsed with ``ast.literal_eval``
    (they are expected to hold Python-literal dicts stored as text).  Each
    source's emotions are reduced to its TF-IDF terms with
    ``find_source_emotions``, then dd is compared with ndtv and with tn via
    ``computed_sensational_score``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide an ``ID`` column plus the six columns listed above.

    Returns
    -------
    pandas.DataFrame
        One row per input row with columns ``ID``,
        ``dd_ndtv_emotion_score``, ``dd_times_emotion_score``,
        ``dd_ndtv_emotion_mean_score`` and ``dd_times_emotion_mean_score``.
        An empty input yields an empty frame with these columns.
    """
    columns = ['ID','dd_ndtv_emotion_score','dd_times_emotion_score','dd_ndtv_emotion_mean_score','dd_times_emotion_mean_score']

    # Rows are collected in a plain list and the frame is built once at the
    # end: DataFrame.append no longer exists in pandas 2.x, and growing a
    # frame row by row copies it on every iteration.
    records = []

    for index, row in df.iterrows():
        print(index)  # progress indicator
        dd_emotions = ast.literal_eval(row['dd_emotion'])
        ndtv_emotions = ast.literal_eval(row['ndtv_emotion'])
        times_emotions = ast.literal_eval(row['tn_emotion'])

        dd_tfidf = ast.literal_eval(row['dd_tfid'])
        ndtv_tfidf = ast.literal_eval(row['ndtv_tfid'])
        times_tfidf = ast.literal_eval(row['tn_tfid'])

        dd_tfid_emotion_common = find_source_emotions(dd_emotions, dd_tfidf)
        ndtv_tfid_emotion_common = find_source_emotions(ndtv_emotions, ndtv_tfidf)
        times_tfid_emotion_common = find_source_emotions(times_emotions, times_tfidf)

        # computed_sensational_score returns [mean_score, per_emotion_score].
        mean_score_dd_ndtv, sensational_score_dd_ndtv = computed_sensational_score(
            dd_tfid_emotion_common, ndtv_tfid_emotion_common)
        mean_score_dd_times, sensational_score_dd_times = computed_sensational_score(
            dd_tfid_emotion_common, times_tfid_emotion_common)

        records.append({
            'ID': row['ID'],
            'dd_ndtv_emotion_score': sensational_score_dd_ndtv,
            'dd_times_emotion_score': sensational_score_dd_times,
            'dd_ndtv_emotion_mean_score': mean_score_dd_ndtv,
            'dd_times_emotion_mean_score': mean_score_dd_times,
        })

    # dtype=object stores every cell exactly as computed (two of the columns
    # hold dicts anyway), so no value is coerced to another numeric type.
    return pd.DataFrame(records, columns=columns, dtype=object)


In [3]:
# NOTE(review): `scores_df` is only assigned by the `process_rows(df)` cell that
# comes after this one, so on a fresh kernel (Restart & Run All) this line
# raises NameError. `abc` is not read anywhere else in the visible notebook.
abc = scores_df

In [1]:
# Score every row of the loaded frame; process_rows prints each row index as it goes.
scores_df = process_rows(df)

In [None]:
# Persist the per-row scores as a UTF-8 CSV, leaving out the DataFrame index.
scores_df.to_csv(
    'emotion_scores_1000_new.csv',
    index=False,
    encoding='utf-8',
)